#### Use titanic dataset. Handle the null values and convert the categorical values into numerical values. Make a classification model using k-nn classifier to predict the survival of a passenger on the ship. Use “gridsearchCV( )” to find the best value of ‘k’.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/rahul96rajan/sample_datasets/master/titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data.shape

(891, 12)

In [4]:
data.drop(labels=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'], axis=1, inplace=True)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Embarked  889 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 48.9+ KB


In [6]:
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      2
dtype: int64

In [7]:
data.dropna(subset=['Embarked'], inplace=True)

In [8]:
X = data.drop(labels='Survived', axis=1)
y = data['Survived']

In [9]:
y.value_counts()

0    549
1    340
Name: Survived, dtype: int64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=1, stratify=y)

In [11]:
X_train['Age'].value_counts()

22.0    22
19.0    20
24.0    19
25.0    19
30.0    19
        ..
36.5     1
63.0     1
34.5     1
53.0     1
20.5     1
Name: Age, Length: 81, dtype: int64

In [12]:
X_train[X_train['Age'].isna()]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
367,3,female,,0,0,C
223,3,male,,0,0,S
36,3,male,,0,0,C
301,3,male,,2,0,Q
825,3,male,,0,0,Q
...,...,...,...,...,...,...
76,3,male,,0,0,S
330,3,female,,2,0,Q
650,3,male,,0,0,S
347,3,female,,1,0,S


In [13]:
X_train.isna().sum()

Pclass        0
Sex           0
Age         114
SibSp         0
Parch         0
Embarked      0
dtype: int64

In [14]:
y_train[X_train['Age'].isnull()].sum() 
# i.e; 36 outof 129 poeple with missing age survived

36

In [15]:
print('Mode:{0},  Median:{1},'
      '  Mean:{2}'.format(*X_train['Age'].mode().values, X_train['Age'].median(),
                        X_train['Age'].mean()))

Mode:22.0,  Median:29.0,  Mean:29.357941787941787


In [16]:
i_median = SimpleImputer(strategy='median')
i_median.fit(X_train['Age'].values.reshape(-1, 1))

leber = LabelEncoder()
leber.fit(X_train['Sex'].values.reshape(-1, 1).ravel())

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[['Embarked']])

stanscale = StandardScaler()

def preprocess(data):
    df = data.copy()
    df.loc[:, 'Age'] = i_median.transform(df['Age'].values.reshape(-1,1))
    df.loc[:, 'Sex'] = leber.transform(df['Sex'].values.reshape(-1, 1).ravel())
    ohed_pd = pd.DataFrame(enc.transform(df[['Embarked']]).toarray(),
                           columns=['Emb_S','Emb_Q','Emb_S'], index=df.index)
    df = df.join(ohed_pd)
    df['kins'] = df['SibSp'] + df['Parch']
    df.drop(labels=['Embarked','SibSp','Parch'], axis=1, inplace=True)
    df = stanscale.fit_transform(df)
    return df


In [17]:
# Pre-Processing Training data
X_train = preprocess(X_train)

In [18]:
knn = KNeighborsClassifier(n_jobs=-1)
kf = KFold(n_splits=4, shuffle=True, random_state=42)
params = {'n_neighbors':np.arange(3,21,2),
         'leaf_size': np.arange(1,5),
         'p': [1,2]}

gs = GridSearchCV(estimator=knn, param_grid=params, scoring='f1_weighted', cv=kf)

In [19]:
gs.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=4, random_state=42, shuffle=True),
             estimator=KNeighborsClassifier(n_jobs=-1),
             param_grid={'leaf_size': array([1, 2, 3, 4]),
                         'n_neighbors': array([ 3,  5,  7,  9, 11, 13, 15, 17, 19]),
                         'p': [1, 2]},
             scoring='f1_weighted')

In [20]:
print(gs.best_estimator_)
print(gs.best_score_)

KNeighborsClassifier(leaf_size=4, n_jobs=-1, n_neighbors=15, p=1)
0.8037397021679246


In [21]:
# Pre-Processing Testing data
X_test = preprocess(X_test)

In [22]:
y_pred_test = gs.predict(X_test)
y_pred_train = gs.predict(X_train)

In [23]:
print('F1 Score(Train): {0:.4f}'.format(f1_score(y_train, y_pred_train,
                                                 average='weighted')))
print('F1 Score(Test): {0:.4f}'.format(f1_score(y_test, y_pred_test,
                                                average='weighted')))

print('\nAccuracy Score(Train): {0:.4f}'.format(accuracy_score(y_train,
                                                               y_pred_train)))
print('Accuracy Score(Test): {0:.4f}'.format(accuracy_score(y_test,
                                                            y_pred_test)))

F1 Score(Train): 0.8173
F1 Score(Test): 0.8325

Accuracy Score(Train): 0.8235
Accuracy Score(Test): 0.8367
