In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import env
import acquire
import prepare

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
# Random seed constant for reproducibility.
# NOTE(review): SEED is not referenced by any cell visible in this notebook —
# confirm whether it should be passed to the split helper, or remove it.
SEED = 21

In [4]:
# Acquire the raw Titanic data, run the project's prep step, then drop the
# original 'sex' and 'embark_town' columns (the preview below shows that the
# encoded sex_* / embark_town_* columns remain).
df = (
    prepare.prep_titanic(acquire.get_titanic_data())
    .drop(columns=['sex', 'embark_town'])
)
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,0,1,0,0,1
1,1,1,1,0,71.2833,0,1,0,1,0,0
2,1,3,0,0,7.925,1,1,0,0,0,1
3,1,1,1,0,53.1,0,1,0,0,0,1
4,0,3,0,0,8.05,1,0,1,0,0,1


In [5]:
# Three-way split through the project helper, keyed on the 'survived' target.
# NOTE(review): how the split is sized/stratified is defined in
# prepare.split_data, which is not visible here.
train, val, test = prepare.split_data(df, target='survived')
# Show (rows, columns) for each split as the cell output.
tuple(subset.shape for subset in (train, val, test))

((711, 11), (124, 11), (54, 11))

In [6]:
# Separate the feature matrix (X) from the 'survived' label (y) for each split.
X_train, y_train = train.drop(columns='survived'), train['survived']
X_val, y_val = val.drop(columns='survived'), val['survived']
X_test, y_test = test.drop(columns='survived'), test['survived']

Create KNeighbors object and fit it

In [7]:
# Baseline model: 5 nearest neighbours with uniform (equal) vote weights.
# `fit` returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform').fit(X_train, y_train)
knn  # display the fitted estimator as the cell output

KNeighborsClassifier()

In [8]:
# In-sample predictions: hard class labels and per-class probabilities.
y_pred, y_pred_proba = knn.predict(X_train), knn.predict_proba(X_train)
# Accuracy on the training split (shown as the cell output).
knn.score(X_train, y_train)

0.8368495077355836

In [9]:
# Recompute the training predictions inside this cell instead of relying on
# `y_pred` left over from an earlier cell: later cells reassign `y_pred`
# (including to validate-set predictions), so re-running this cell out of
# order would report on the wrong data or fail on mismatched lengths.
# This matches how the n_neighbors=10 and n_neighbors=20 report cells work.
y_pred = knn.predict(X_train)
print('Train Set Score - n_neighbors=5, weights="uniform"')
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

Train Set Score - n_neighbors=5, weights="uniform"
[[391  48]
 [ 68 204]]
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       439
           1       0.81      0.75      0.78       272

    accuracy                           0.84       711
   macro avg       0.83      0.82      0.82       711
weighted avg       0.84      0.84      0.84       711



In [10]:
# Out-of-sample check: predict on the validate split, then emit the header,
# confusion matrix and classification report one per line.
y_pred = knn.predict(X_val)
print(
    'Validate Set Score - n_neighbors=5, weights="uniform"',
    confusion_matrix(y_val, y_pred),
    classification_report(y_val, y_pred),
    sep='\n',
)

Validate Set Score - n_neighbors=5, weights="uniform"
[[62 15]
 [11 36]]
              precision    recall  f1-score   support

           0       0.85      0.81      0.83        77
           1       0.71      0.77      0.73        47

    accuracy                           0.79       124
   macro avg       0.78      0.79      0.78       124
weighted avg       0.79      0.79      0.79       124



Adjust our KNN object to n_neighbors=10, weights="uniform"

In [11]:
# Second model: widen the neighbourhood to 10, still with uniform vote weights.
# `fit` returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=10, weights="uniform").fit(X_train, y_train)
knn  # display the fitted estimator as the cell output

KNeighborsClassifier(n_neighbors=10)

In [12]:
# In-sample evaluation of the current `knn` model: header, confusion matrix
# and classification report, one per line.
y_pred = knn.predict(X_train)
print(
    'Train Set Score - n_neighbors=10',
    confusion_matrix(y_train, y_pred),
    classification_report(y_train, y_pred),
    sep='\n',
)

Train Set Score - n_neighbors=10
[[392  47]
 [ 97 175]]
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       439
           1       0.79      0.64      0.71       272

    accuracy                           0.80       711
   macro avg       0.79      0.77      0.78       711
weighted avg       0.80      0.80      0.79       711



In [13]:
# Out-of-sample evaluation of the current `knn` model on the validate split.
y_pred = knn.predict(X_val)
print(
    'Validate Set Score - n_neighbors=10',
    confusion_matrix(y_val, y_pred),
    classification_report(y_val, y_pred),
    sep='\n',
)

Validate Set Score - n_neighbors=10
[[62 15]
 [14 33]]
              precision    recall  f1-score   support

           0       0.82      0.81      0.81        77
           1       0.69      0.70      0.69        47

    accuracy                           0.77       124
   macro avg       0.75      0.75      0.75       124
weighted avg       0.77      0.77      0.77       124



In [14]:
# Third model: 20 neighbours with uniform vote weights.
# `fit` returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=20, weights="uniform").fit(X_train, y_train)
knn  # display the fitted estimator as the cell output

KNeighborsClassifier(n_neighbors=20)

In [15]:
# In-sample evaluation of the current `knn` model: header, confusion matrix
# and classification report, one per line.
y_pred = knn.predict(X_train)
print(
    'Train Set Score - n_neighbors=20',
    confusion_matrix(y_train, y_pred),
    classification_report(y_train, y_pred),
    sep='\n',
)

Train Set Score - n_neighbors=20
[[382  57]
 [131 141]]
              precision    recall  f1-score   support

           0       0.74      0.87      0.80       439
           1       0.71      0.52      0.60       272

    accuracy                           0.74       711
   macro avg       0.73      0.69      0.70       711
weighted avg       0.73      0.74      0.73       711



In [16]:
# Out-of-sample evaluation of the current `knn` model on the validate split.
y_pred = knn.predict(X_val)
print(
    'Validate Set Score - n_neighbors=20',
    confusion_matrix(y_val, y_pred),
    classification_report(y_val, y_pred),
    sep='\n',
)

Validate Set Score - n_neighbors=20
[[62 15]
 [21 26]]
              precision    recall  f1-score   support

           0       0.75      0.81      0.77        77
           1       0.63      0.55      0.59        47

    accuracy                           0.71       124
   macro avg       0.69      0.68      0.68       124
weighted avg       0.70      0.71      0.71       124



What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

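As I increase the n_neighbors parameter, accuracy decreases on both the train set (84% → 80% → 74%) and the validate set (79% → 77% → 71%). n_neighbors=5 performs best on the in-sample data, which is expected: a smaller neighborhood fits the training points more closely. I'm not sure why the validate score also drops; one possible factor is that the features are not scaled, so `fare` may dominate the distance calculation. I wonder in what situations an increase would lead to better performance.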

Which model performs best on our out-of-sample data from validate?

n_neighbors=5 performs best on the validate set, with an accuracy of 79% (compared with 77% for n_neighbors=10 and 71% for n_neighbors=20).