In [279]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pathlib import Path

In [280]:
pwd_path = str(Path.cwd())

In [281]:
train = pd.read_csv(pwd_path+'/data/train.csv')
test = pd.read_csv(pwd_path+'/data/test.csv')

# checking the data completeness

In [282]:
train.isnull().sum()
# Age, Cabin and Embarked columns have null values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [283]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Cleaning

## filling null values

In [284]:
# Adding title column
train['title'] = train['Name'].apply(lambda x : x.split(',')[1].strip().split('.')[0])
test['title'] = test['Name'].apply(lambda x : x.split(',')[1].strip().split('.')[0])

# filling null values of Age column
train['Age'] = train.groupby('title')['Age'].transform(lambda x : x.fillna(x.mean()))
test['Age'] = test.groupby('title')['Age'].transform(lambda x : x.fillna(x.mean()))

# filling null values of Age column
train['Age'] = train.groupby('Sex')['Age'].transform(lambda x : x.fillna(x.mean()))
test['Age'] = test.groupby('Sex')['Age'].transform(lambda x : x.fillna(x.mean()))

def mod_title(y):
    if  y not in ('Mr','Miss','Mrs','Master') :
        return 'rare'
    else :
        return y
train['title'] = train['title'].apply(lambda x : mod_title(x))
test['title'] = test['title'].apply(lambda x : mod_title(x))


# filling null values of embarked column with the most common embarked station
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].value_counts().index[0])


# filling null value of Fare column in test data
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

# precision handling for fare column
test['Fare'] = test['Fare'].round(decimals =1)
train['Fare'] = train['Fare'].round(decimals =1)

train['Cabin'] = train['Cabin'].fillna('no')
test['Cabin'] = test['Cabin'].fillna('no')

def change_cabin(y):
    if y != 'no':
        return 'yes'
    else :
        return 'no'
train['Cabin'] = train['Cabin'].apply(lambda x : change_cabin(x))    
test['Cabin'] = test['Cabin'].apply(lambda x : change_cabin(x))

# Feature Engineering

In [285]:
# classifying the age
train['age_cat'] = pd.cut(train['Age'],bins = [-1,18,30,45,100],labels =['age1','age2','age3','age4'])
test['age_cat'] = pd.cut(test['Age'],bins = [-1,18,30,45,100],labels =['age1','age2','age3','age4'])

In [286]:
# creating a new column combining sex and Passenger class
def sex_class(x):
    if x['Sex'] == 'female':
        if x['Pclass'] == 1 :
            return 'female_1'
        elif x['Pclass'] == 2 :
            return 'female_2'
        else :
            return 'female_3'
    else :
        if x['Pclass'] == 1 :
            return 'male_1'
        elif x['Pclass'] == 2 :
            return 'male_2'
        else :
            return 'male_3' 
train['newc'] = train.apply(lambda x : sex_class(x),axis =1)
test['newc'] = test.apply(lambda x : sex_class(x),axis =1)

In [287]:
# getting the ticket count

# train1 = train
# test1 = test
# train1 = train1.drop(['Survived'],axis =1)
# total_df = pd.concat([test1,train1])
# ticket_df = total_df['Ticket'].value_counts()
# ticket_df = ticket_df.reset_index()
# ticket_df.columns = ['ticket','total_count']
# ticket_df.head()

In [288]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,age_cat,newc
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2,no,S,Mr,age2,male_3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.3,yes,C,Mrs,age3,female_1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9,no,S,Miss,age2,female_3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,yes,S,Mrs,age3,female_1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0,no,S,Mr,age3,male_3


In [289]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'title', 'age_cat',
       'newc'],
      dtype='object')

In [290]:
feature_names = ["Pclass","SibSp","Parch", "Fare"]

In [291]:
train_df = train[feature_names]
test_df = test[feature_names]

In [292]:
xtrain,xvalid,ytrain,yvalid = train_test_split(train_df[feature_names],train['Survived'],test_size = 0.3, stratify=train['Survived'],random_state=553)

In [293]:
knn = KNeighborsClassifier(n_neighbors=5,p=2)
knn.fit(xtrain,ytrain)
pred1 =  knn.predict(xvalid)
print(accuracy_score(yvalid,pred1))

0.6082089552238806


In [294]:
knn2 = KNeighborsClassifier(n_neighbors=5,p=1)
knn2.fit(xtrain,ytrain)
pred2 =  knn2.predict(xvalid)
print(accuracy_score(yvalid,pred2))

0.6082089552238806


# understand the ranking difference between manhattan distance and eculidean distance

In [297]:
df = pd.DataFrame()
df['col1'] = np.random.normal(0,0.1,1000)
df['col2'] = np.random.normal(5,0.2,1000)
df['col3'] = np.random.normal(-5,0.15,1000)

In [299]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
new_df = pd.DataFrame(ss.fit_transform(df), columns=['col1','col2','col3'])

In [303]:
test_data = [0,0,0]

In [4]:
a= "1 2 3 4 1"

In [5]:
b = [int(i) for i in a.split(" ")]
print(b)

[1, 2, 3, 4, 1]


In [7]:
from collections import Counter
coun = dict(Counter(b))
# coun.pop("1")
print(coun)


{1: 2, 2: 1, 3: 1, 4: 1}


In [10]:
coun.pop(2)

1

In [11]:
coun

{3: 1, 4: 1}

In [12]:
d = {}

In [13]:
for i in range(5): 
    d[i].append(i) 

KeyError: 0

In [35]:
def retunr_this():
    return "not present"

In [41]:
from collections import defaultdict
d = defaultdict(list)

In [42]:
d

defaultdict(list, {})

In [43]:
d[i] =3

In [44]:
d

defaultdict(list, {0: 3})

In [45]:
d[2]

[]

In [47]:
d[3]

[]

In [48]:
d

defaultdict(list, {0: 3, 2: [], 3: []})

In [52]:
names = ["Sam", "Peter", "James", "Julian", "Ann"]

In [57]:
print(','.join(names))

Sam,Peter,James,Julian,Ann


In [59]:
j = ','.join(names)

In [60]:
print(j)

Sam,Peter,James,Julian,Ann


In [61]:
j ='raa'

In [62]:
print(j)

raa


In [71]:
a = "  the  banana  is  good"

In [72]:
print(a.strip())

the  banana  is  good


In [68]:
print(a)

  the  banana  is  good


In [77]:
a.split()

['the', 'banana', 'is', 'good']

In [78]:
print(i for i in range(4))

<generator object <genexpr> at 0x7fb0d7f3bdf0>


In [85]:
a = (i for i in range(4))

[0, 1, 2, 3]

In [96]:
a, *b = [1,2,3,5]

2 3 5


In [101]:
counter = Counter(list("qwertyuiopasdfghjklzxcvbnm"))
print(counter)
# elements()
elements = counter.elements()  # doesn't return elements with count 0 or less
for value in elements:
    print(value)

Counter({'q': 1, 'w': 1, 'e': 1, 'r': 1, 't': 1, 'y': 1, 'u': 1, 'i': 1, 'o': 1, 'p': 1, 'a': 1, 's': 1, 'd': 1, 'f': 1, 'g': 1, 'h': 1, 'j': 1, 'k': 1, 'l': 1, 'z': 1, 'x': 1, 'c': 1, 'v': 1, 'b': 1, 'n': 1, 'm': 1})
q
w
e
r
t
y
u
i
o
p
a
s
d
f
g
h
j
k
l
z
x
c
v
b
n
m


In [102]:
counter.most_common(3)

[('q', 1), ('w', 1), ('e', 1)]

In [103]:
s = "qwertyuiopasdfghjklzxcvbnm"

In [106]:
cont = Counter(list(s))
iterable = sorted(list(cont.items()), key = lambda x : (-x[1],x[0]))
# print(iterable)
for key, value in iterable[:3]:
    print(key , value)

a 1
b 1
c 1


In [107]:
import calendar

In [109]:
calendar.weekday(2022, 10, 6)

3

In [110]:
import datetime
import calendar
 
def findDay(date):
    born = datetime.datetime.strptime(date, '%d %m %Y').weekday()
    return (calendar.day_name[born])
 
# Driver program
date = '03 02 2019'
print(findDay(date))

Sunday


In [112]:
born = datetime.datetime.strptime(date, '%d %m %Y').weekday()

In [113]:
born

6

In [118]:
a= 2
b=0
try :
    print(a/b)
except BaseException as err :
    print(f"Unexpected {err=}, {type(err)=}")
except ZeroDivisionError as err :
    print("zero division error")

Unexpected err=ZeroDivisionError('division by zero'), type(err)=<class 'ZeroDivisionError'>


In [126]:
class B(Exception):
    pass

class C(B):
    pass

class D(C):
    pass

for cls in [B, C, D]:
    try:
        print("value")
        raise cls()
    # except B:
    #     print("B")
    except D:
        print("D")
    except B:
        print("B")
    except C:
        print("C")
    # except B:
    #     print("B")

value
B
value
B
value
D
