Create a new notebook, knn_model, and work with the titanic dataset to answer the following:

In [1]:
import acquire as a
import prepare as p
import model as m

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
df = a.get_titanic_data()

File exists - reading CSV file


In [3]:
df = p.prep_titanic(df)

In [4]:
train, validate, test = p.split_data(df, 'survived')

In [5]:
train, validate, test = m.preprocess_titanic(train, validate, test)

In [6]:
train.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
580,1,2,1,1,30.0,0,0,1,0
140,0,3,0,2,15.2458,0,0,0,0
747,1,2,0,0,13.0,1,0,1,0
615,1,2,1,2,65.0,0,0,1,0
132,0,3,1,0,14.5,0,0,1,0


In [39]:
# baseline
1-train.survived.mean()

0.6161048689138577

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [7]:
X_train = train.drop(columns='survived')
y_train = train.survived

X_val = validate.drop(columns='survived')
y_val = validate.survived

X_test = test.drop(columns='survived')
y_test = test.survived

In [8]:
knn = KNeighborsClassifier(n_neighbors=5)

In [9]:
knn = knn.fit(X_train,y_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [10]:
knn.score(X_train,y_train)

0.8426966292134831

In [11]:
knn.score(X_val,y_val)

0.7640449438202247

In [12]:
y_pred = knn.predict(X_train)

In [13]:
labels = (y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,1,0
1,294,35
0,49,156


In [14]:
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88       329
           1       0.82      0.76      0.79       205

    accuracy                           0.84       534
   macro avg       0.84      0.83      0.83       534
weighted avg       0.84      0.84      0.84       534



3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [15]:
def compute(tp,tn,fp,fn):
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    true_pos_rate = tp/(tp+fn)
    false_pos_rate = fp/(fp+tn)
    true_neg_rate = tn/(tn+fp)
    false_neg_rate = fn/(tp+fn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1_score = (2*(precision * recall))/(precision + recall)
    support = tp + fn + fp + tn
    
    print(f'accuracy: {accuracy:2f}')
    print(f'true positive rate: {true_pos_rate:2f}')
    print(f'false positive rate: {false_pos_rate:2f}')
    print(f'true negative rate: {true_neg_rate:2f}')
    print(f'false negative rate: {false_neg_rate:2f}')
    print(f'precision: {precision:2f}')
    print(f'recall: {recall:2f}')
    print(f'f1-score: {f1_score:2f}')
    print(f'support: {support:2f}')

In [16]:
tp = 294
tn = 156
fp = 49
fn = 35

In [17]:
compute(tp,tn,fp,fn)

accuracy: 0.842697
true positive rate: 0.893617
false positive rate: 0.239024
true negative rate: 0.760976
false negative rate: 0.106383
precision: 0.857143
recall: 0.893617
f1-score: 0.875000
support: 534.000000


4. Run through steps 1-3 setting k to 10

In [18]:
knn = KNeighborsClassifier(n_neighbors=10)

In [19]:
knn = knn.fit(X_train,y_train)

In [20]:
knn.score(X_train,y_train)

0.7752808988764045

In [21]:
knn.score(X_val,y_val)

0.7303370786516854

In [22]:
y_pred = knn.predict(X_train)

In [23]:
labels = (y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,1,0
1,291,38
0,82,123


In [24]:
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.88      0.83       329
           1       0.76      0.60      0.67       205

    accuracy                           0.78       534
   macro avg       0.77      0.74      0.75       534
weighted avg       0.77      0.78      0.77       534



In [25]:
tp = 291
tn = 123
fp = 82
fn = 38

In [26]:
compute(tp,tn,fp,fn)

accuracy: 0.775281
true positive rate: 0.884498
false positive rate: 0.400000
true negative rate: 0.600000
false negative rate: 0.115502
precision: 0.780161
recall: 0.884498
f1-score: 0.829060
support: 534.000000


5. Run through steps 1-3 setting k to 20

In [27]:
knn = KNeighborsClassifier(n_neighbors=20)

In [28]:
knn = knn.fit(X_train,y_train)

In [29]:
knn.score(X_train,y_train)

0.7471910112359551

In [30]:
knn.score(X_val,y_val)

0.7528089887640449

In [31]:
y_pred = knn.predict(X_train)

In [32]:
labels = (y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,1,0
1,271,58
0,77,128


In [33]:
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.82      0.80       329
           1       0.69      0.62      0.65       205

    accuracy                           0.75       534
   macro avg       0.73      0.72      0.73       534
weighted avg       0.74      0.75      0.74       534



In [34]:
tp = 271
tn = 128
fp = 77
fn = 58

In [35]:
compute(tp,tn,fp,fn)

accuracy: 0.747191
true positive rate: 0.823708
false positive rate: 0.375610
true negative rate: 0.624390
false negative rate: 0.176292
precision: 0.778736
recall: 0.823708
f1-score: 0.800591
support: 534.000000


6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

> KNeighborsClassifier(n_neighbors=5) had a precision, recall, & accuracy that were quite a bit higher than the others.

> KNeighborsClassifier(n_neighbors=5) because the high precision, recall, & accuracy scores.

7. Which model performs best on our out-of-sample data from validate?

> KNeighborsClassifier(n_neighbors=20)