# K-Nearest Neighbours (K-NN)

## Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the Dataset

In [2]:
dataset = sns.load_dataset("titanic")
dataset.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Taking care of Missing Data

In [3]:
dataset.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [4]:
col_to_drop=["embark_town","alive","class","adult_male","deck"]
dataset = dataset.drop(col_to_drop,axis=1)

In [5]:
dataset.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,alone
0,0,3,male,22.0,1,0,7.25,S,man,False
1,1,1,female,38.0,1,0,71.2833,C,woman,False
2,1,3,female,26.0,0,0,7.925,S,woman,True
3,1,1,female,35.0,1,0,53.1,S,woman,False
4,0,3,male,35.0,0,0,8.05,S,man,True


In [6]:
col_to_fill = ['age']
dataset = dataset.fillna(dataset[col_to_fill].mean())

In [7]:
dataset = dataset.fillna(dataset['embarked'].ffill)

In [8]:
dataset.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
who         0
alone       0
dtype: int64

## Encoding Categorical Data

In [9]:
dataset = pd.get_dummies(dataset)
dataset.head(10)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_female,sex_male,"embarked_<bound method NDFrame.ffill of 0 S\n1 C\n2 S\n3 S\n4 S\n ..\n886 S\n887 S\n888 S\n889 C\n890 Q\nName: embarked, Length: 891, dtype: object>",embarked_C,embarked_Q,embarked_S,who_child,who_man,who_woman
0,0,3,22.0,1,0,7.25,False,0,1,0,0,0,1,0,1,0
1,1,1,38.0,1,0,71.2833,False,1,0,0,1,0,0,0,0,1
2,1,3,26.0,0,0,7.925,True,1,0,0,0,0,1,0,0,1
3,1,1,35.0,1,0,53.1,False,1,0,0,0,0,1,0,0,1
4,0,3,35.0,0,0,8.05,True,0,1,0,0,0,1,0,1,0
5,0,3,29.699118,0,0,8.4583,True,0,1,0,0,1,0,0,1,0
6,0,1,54.0,0,0,51.8625,True,0,1,0,0,0,1,0,1,0
7,0,3,2.0,3,1,21.075,False,0,1,0,0,0,1,1,0,0
8,1,3,27.0,0,2,11.1333,False,1,0,0,0,0,1,0,0,1
9,1,2,14.0,1,0,30.0708,False,1,0,0,1,0,0,1,0,0


In [10]:
x = dataset.drop("survived",axis=1)
y = dataset['survived']

In [11]:
x

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_female,sex_male,"embarked_<bound method NDFrame.ffill of 0 S\n1 C\n2 S\n3 S\n4 S\n ..\n886 S\n887 S\n888 S\n889 C\n890 Q\nName: embarked, Length: 891, dtype: object>",embarked_C,embarked_Q,embarked_S,who_child,who_man,who_woman
0,3,22.000000,1,0,7.2500,False,0,1,0,0,0,1,0,1,0
1,1,38.000000,1,0,71.2833,False,1,0,0,1,0,0,0,0,1
2,3,26.000000,0,0,7.9250,True,1,0,0,0,0,1,0,0,1
3,1,35.000000,1,0,53.1000,False,1,0,0,0,0,1,0,0,1
4,3,35.000000,0,0,8.0500,True,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.000000,0,0,13.0000,True,0,1,0,0,0,1,0,1,0
887,1,19.000000,0,0,30.0000,True,1,0,0,0,0,1,0,0,1
888,3,29.699118,1,2,23.4500,False,1,0,0,0,0,1,0,0,1
889,1,26.000000,0,0,30.0000,True,0,1,0,1,0,0,0,1,0


In [12]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

## Splitting the dataset into the Training set and Test set

In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=0)

In [14]:
x_train

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_female,sex_male,"embarked_<bound method NDFrame.ffill of 0 S\n1 C\n2 S\n3 S\n4 S\n ..\n886 S\n887 S\n888 S\n889 C\n890 Q\nName: embarked, Length: 891, dtype: object>",embarked_C,embarked_Q,embarked_S,who_child,who_man,who_woman
140,3,29.699118,0,2,15.2458,False,1,0,0,1,0,0,0,0,1
439,2,31.000000,0,0,10.5000,True,0,1,0,0,0,1,0,1,0
817,2,31.000000,1,1,37.0042,False,0,1,0,1,0,0,0,1,0
378,3,20.000000,0,0,4.0125,True,0,1,0,1,0,0,0,1,0
491,3,21.000000,0,0,7.2500,True,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,1,39.000000,1,1,83.1583,False,1,0,0,1,0,0,0,0,1
192,3,19.000000,1,0,7.8542,False,1,0,0,0,0,1,0,0,1
629,3,29.699118,0,0,7.7333,True,0,1,0,0,1,0,0,1,0
559,3,36.000000,1,0,17.4000,False,1,0,0,0,0,1,0,0,1


In [15]:
y_train

140    0
439    0
817    0
378    0
491    0
      ..
835    1
192    1
629    0
559    1
684    0
Name: survived, Length: 712, dtype: int64

## Before Applying Feature Scaling 

### Training the K-NN model on the Training set

In [16]:
from sklearn.neighbors import KNeighborsClassifier
cls = KNeighborsClassifier(n_neighbors=7,metric='minkowski',p=2)
cls.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=7)

### Model Score on Training data

In [17]:
cls.score(x_train,y_train)

0.800561797752809

### Model Score on Testing data

In [18]:
cls.score(x_test,y_test)

0.7653631284916201

### Predicting a New Result

### Predicting the Test set Results

In [19]:
y_predict = cls.predict(x_test)

### Confusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_predict)

array([[94, 16],
       [26, 43]], dtype=int64)

### Classification Report

In [21]:
from sklearn.metrics import accuracy_score,recall_score,r2_score,precision_score,f1_score
print("Accuracy Score =" ,accuracy_score(y_predict,y_test))
print("R2 Score =", r2_score(y_predict,y_test))
print("Recall Score =",recall_score(y_predict,y_test))
print("Precision Score =",precision_score(y_predict,y_test))
print("f1 Score =", f1_score(y_predict,y_test))

Accuracy Score = 0.7653631284916201
R2 Score = -0.06186440677966121
Recall Score = 0.7288135593220338
Precision Score = 0.6231884057971014
f1 Score = 0.671875


## After Applying Feature Scaling 

### Feature Scaling

In [22]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

In [23]:
x_train

array([[ 0.81925059, -0.00282437, -0.46445234, ..., -0.31963602,
        -1.24578141,  1.53059564],
       [-0.38096838,  0.09662937, -0.46445234, ..., -0.31963602,
         0.80270904, -0.65334042],
       [-0.38096838,  0.09662937,  0.41270964, ..., -0.31963602,
         0.80270904, -0.65334042],
       ...,
       [ 0.81925059, -0.00282437, -0.46445234, ..., -0.31963602,
         0.80270904, -0.65334042],
       [ 0.81925059,  0.47888431,  0.41270964, ..., -0.31963602,
        -1.24578141,  1.53059564],
       [-0.38096838,  2.31370804,  0.41270964, ..., -0.31963602,
         0.80270904, -0.65334042]])

### Training the K-NN model on the Training set

In [24]:
from sklearn.neighbors import KNeighborsClassifier
cls = KNeighborsClassifier(n_neighbors=7,metric='minkowski',p=2)
cls.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=7)

### Model Score on Training data

In [25]:
cls.score(x_train,y_train)

0.8455056179775281

### Model Score on Testing data

In [26]:
cls.score(x_test,y_test)

0.8156424581005587

### Predicting the Test set Results

In [27]:
y_predict = cls.predict(x_test)

### Confusion Matrix

In [28]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_predict)

array([[99, 11],
       [22, 47]], dtype=int64)

### Classification Report

In [29]:
from sklearn.metrics import accuracy_score,recall_score,r2_score,precision_score,f1_score
print("Accuracy Score =" ,accuracy_score(y_predict,y_test))
print("R2 Score =", r2_score(y_predict,y_test))
print("Recall Score =",recall_score(y_predict,y_test))
print("Precision Score =",precision_score(y_predict,y_test))
print("f1 Score =", f1_score(y_predict,y_test))

Accuracy Score = 0.8156424581005587
R2 Score = 0.15830721003134784
Recall Score = 0.8103448275862069
Precision Score = 0.6811594202898551
f1 Score = 0.7401574803149606
