# <font color = 'orange'><b>KNN : K-Nearest Neighbors</b></font>

In [1]:
# K Nearest Neighbors algorithm is a simple machine learning algorithm that is used for both classification and regression problems.
# It relies on the idea that the similar data points tend to have similar classes or labels. 
# KNN algorithm assumes that the similar things exist in close proximity.
# The KNN stores the entire training dataset as a reference, and when a new data point is given, it calculates the distance between the new data point and all the stored data points. 

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# import titanic dataset
df = sns.load_dataset('titanic')

In [4]:
df.sample(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
727,1,3,female,,0,0,7.7375,Q,Third,woman,False,,Queenstown,yes,True
868,0,3,male,,0,0,9.5,S,Third,man,True,,Southampton,no,True
783,0,3,male,,1,2,23.45,S,Third,man,True,,Southampton,no,False
219,0,2,male,30.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
689,1,1,female,15.0,0,1,211.3375,S,First,child,False,B,Southampton,yes,False


### <font color = 'orange'><b>Data Preprocessing</b></font>

In [5]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

In [13]:
df['age'] = imputer.fit_transform(df[['age']]).astype('int')

In [14]:
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [15]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

For simplicity and practice we are only selecting the two features `Fare` and `Age` to train the KNN model.

In [17]:
X = df[['age', 'fare']]
y = df[['survived']]

In [20]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

In [21]:
knn = KNeighborsClassifier(n_neighbors=20)

In [22]:
knn.fit(Xtrain, ytrain)

  return self._fit(X, y)


In [23]:
ypred = knn.predict(Xtest)

In [24]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest, ypred)

0.695067264573991

In [26]:
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.69      0.91      0.79       139
           1       0.70      0.33      0.45        84

    accuracy                           0.70       223
   macro avg       0.70      0.62      0.62       223
weighted avg       0.70      0.70      0.66       223



In [25]:
print(confusion_matrix(ytest, ypred))

[[127  12]
 [ 56  28]]
