In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix 

import acquire_b
from prepare_b import prep_titanic_data

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Acquire: load the Titanic data through the project's acquire module
df = acquire_b.get_titanic_data()

# Prepare: prep_titanic_data hands back the train / validate / test splits
train, validate, test = prep_titanic_data(df)

# For every split, separate the features (all columns except `survived`)
# from the target (`survived`).
X_train, y_train = train.drop(columns=['survived']), train['survived']
X_validate, y_validate = validate.drop(columns=['survived']), validate['survived']
X_test, y_test = test.drop(columns=['survived']), test['survived']

# Show the feature-matrix shapes of the three splits
X_train.shape, X_validate.shape, X_test.shape

((498, 9), (214, 9), (179, 9))

In [3]:
# mode() returns a Series of the most frequent value(s), not a scalar;
# the next cell reads the majority class from it with [0].
baseline_prediction = y_train.mode()

In [4]:
# Repeat the majority class once per training row (index 0 .. n-1)
pd.Series(baseline_prediction[0], index=range(len(y_train)))

0      0
1      0
2      0
3      0
4      0
      ..
493    0
494    0
495    0
496    0
497    0
Length: 498, dtype: int64

In [5]:
# write a function to compute the baseline for a classification model

def establish_baseline(y_train):
    """
    Compute the accuracy of a baseline that always predicts the most
    frequent class in ``y_train``.

    The accuracy is the share of observations equal to the majority class,
    so the function works for binary, multi-class and single-class targets
    alike (no 2x2 confusion matrix is required).

    Parameters
    ----------
    y_train : pandas.Series or array-like
        Target values of the training set.

    Returns
    -------
    float
        Fraction of observations that belong to the majority class.

    Raises
    ------
    ValueError
        If ``y_train`` contains no non-null values, so no majority class
        can be determined.
    """
    y_train = pd.Series(y_train)

    # mode() returns a Series (ties give several rows, sorted); the first
    # entry is the value the baseline predicts for every observation.
    modes = y_train.mode()
    if modes.empty:
        raise ValueError(
            "establish_baseline requires at least one non-null target value"
        )
    majority_class = modes.iloc[0]

    # Predicting the majority class everywhere is correct exactly on the
    # rows whose actual value is the majority class.
    return (y_train == majority_class).mean()

In [6]:
# Baseline accuracy on the training target: the bar the models below must beat
establish_baseline(y_train)

0.6164658634538153

1. Fit a K-Nearest Neighbors classifier to your training sample and use it to make predictions on the training sample.

In [7]:
# MAKE and FIT: build a KNN classifier with scikit-learn's default settings
# and fit it to the training data (fit() returns the fitted estimator).
knn = KNeighborsClassifier().fit(X_train, y_train)

# USE: predict on the same rows the model was fit on (in-sample predictions)
y_train_pred = knn.predict(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [8]:
# Accuracy of the fitted model on the training set
# (for a scikit-learn classifier, score() is the mean accuracy).
train_score = knn.score(X_train, y_train)
train_score

0.7951807228915663

In [9]:
# Confusion matrix of the in-sample predictions, labelled for display:
# rows are the actual classes, columns are the predicted classes.
cm = confusion_matrix(y_train, y_train_pred)
pd.DataFrame(
    data=cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Pred 0', 'Pred 1'],
)

Unnamed: 0,Pred 0,Pred 1
Actual 0,258,49
Actual 1,53,138


In [10]:
# Classification report (precision, recall, f1-score, support per class)
# for the in-sample predictions, printed as plain text.
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.83      0.84      0.83       307
           1       0.74      0.72      0.73       191

    accuracy                           0.80       498
   macro avg       0.78      0.78      0.78       498
weighted avg       0.79      0.80      0.79       498



In [11]:
# Same report as a DataFrame, transposed so that every class / average
# becomes a row and every metric a column.
pd.DataFrame(
    classification_report(y_train, y_train_pred, output_dict=True)
).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.829582,0.840391,0.834951,307.0
1,0.737968,0.722513,0.730159,191.0
accuracy,0.795181,0.795181,0.795181,0.795181
macro avg,0.783775,0.781452,0.782555,498.0
weighted avg,0.794445,0.795181,0.79476,498.0


3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [12]:
# Unpack the 2x2 confusion matrix computed above. ravel() flattens it
# row by row, i.e. [[tn, fp], [fn, tp]] -> tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn)/(tn + fp + fn + tp)

# Rates are taken over the actual class: positives for TPR/FNR,
# negatives for FPR/TNR.
true_positive_rate = tp/(tp + fn)
false_positive_rate = fp/(fp + tn)
true_negative_rate = tn/(tn + fp)
false_negative_rate = fn/(fn + tp)

precision = tp/(tp + fp)
recall = tp/(tp + fn)  # same quantity as the true positive rate
f1_score = 2*(precision*recall)/(precision+recall)

# Support = number of actual observations in each class
support_pos = tp + fn
support_neg = fp + tn

# One label -> value mapping keeps every metric next to its score.
# It is deliberately NOT named `dict`: that would shadow the builtin for
# every cell run afterwards in this kernel.
cm_metrics = {
    'accuracy': accuracy,
    'true_positive_rate': true_positive_rate,
    'false_positive_rate': false_positive_rate,
    'true_negative_rate': true_negative_rate,
    'false_negative_rate': false_negative_rate,
    'precision': precision,
    'recall': recall,
    'f1_score': f1_score,
    'support_pos': support_pos,
    'support_neg': support_neg,
}

pd.DataFrame({
    'metric': list(cm_metrics),
    'score': list(cm_metrics.values()),
})

Unnamed: 0,metric,score
0,accuracy,0.795181
1,true_positive_rate,0.722513
2,false_positive_rate,0.159609
3,true_negative_rate,0.840391
4,false_negative_rate,0.277487
5,precision,0.737968
6,recall,0.722513
7,f1_score,0.730159
8,support_pos,191.0
9,support_neg,307.0


In [13]:
def print_cm_metrics(cm):
    """
    Summarise a binary confusion matrix as a table of evaluation metrics.

    Despite its name, this function prints nothing: it returns the table.

    Parameters
    ----------
    cm : array-like with exactly four cells
        Binary confusion matrix laid out as [[tn, fp], [fn, tp]], the
        layout sklearn.metrics.confusion_matrix produces for labels 0 and 1.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'metric' and 'score', with one row each for accuracy,
        true/false positive rate, true/false negative rate, precision,
        recall, f1_score, support_pos and support_neg. A ratio whose
        denominator is zero is reported as NaN.

    Raises
    ------
    ValueError
        If ``cm`` does not contain exactly four cells.
    """
    cells = np.asarray(cm).ravel()
    if cells.size != 4:
        raise ValueError(
            "print_cm_metrics expects a 2x2 (binary) confusion matrix, "
            f"got an array of shape {np.shape(cm)}"
        )
    tn, fp, fn, tp = cells

    def _ratio(numerator, denominator):
        # Undefined rates (zero denominator) become NaN explicitly instead
        # of relying on a divide-by-zero RuntimeWarning.
        return numerator / denominator if denominator else float('nan')

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)  # identical to the true positive rate

    # A single label -> value mapping keeps each metric next to its score
    # and avoids shadowing the builtin `dict`.
    scores = {
        'accuracy': _ratio(tp + tn, tn + fp + fn + tp),
        'true_positive_rate': recall,
        'false_positive_rate': _ratio(fp, fp + tn),
        'true_negative_rate': _ratio(tn, tn + fp),
        'false_negative_rate': _ratio(fn, fn + tp),
        'precision': precision,
        'recall': recall,
        'f1_score': _ratio(2 * (precision * recall), precision + recall),
        'support_pos': tp + fn,
        'support_neg': fp + tn,
    }

    return pd.DataFrame({
        'metric': list(scores),
        'score': list(scores.values()),
    })

4. Run through steps 1-3 setting k to 10

In [14]:
#Let's create a function to make, fit and use the model

def knn_fit_predict(k, X_train, y_train, X_validate):
    '''
    Train a K-Nearest Neighbors classifier and predict on two feature sets.

    k is handed to KNeighborsClassifier as n_neighbors, and the model is
    fit on (X_train, y_train).

    Returns a tuple of (fitted model, predictions for X_train,
    predictions for X_validate).
    '''
    # MAKE and FIT in one step; fit() returns the fitted estimator
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # USE: train predictions first, then validate predictions
    train_predictions = model.predict(X_train)
    validate_predictions = model.predict(X_validate)

    return model, train_predictions, validate_predictions

In [20]:
#Now, let's make a function to return the 
#accuracy, confusion matrix and classification report

def evaluate_clf(model, X, y, y_pred):
    '''
    Evaluate a fitted classification model on one data split.

    Takes the model, the features X, the actual target y and the model's
    predictions y_pred for X.

    Returns a 4-tuple:
      - the model's own score on (X, y)
      - the confusion matrix as a labelled DataFrame
      - the classification report as a DataFrame
      - the metrics table built by print_cm_metrics from the confusion matrix
    '''
    # score reported by the model itself
    model_score = model.score(X, y)

    # raw confusion matrix plus a labelled copy for display
    matrix = confusion_matrix(y, y_pred)
    labelled_matrix = pd.DataFrame(
        matrix,
        index=['Actual 0', 'Actual 1'],
        columns=['Pred 0', 'Pred 1'],
    )

    # classification report, one column per class / average
    report = classification_report(y, y_pred, output_dict=True)
    report_frame = pd.DataFrame(report)

    # rates derived from the raw confusion matrix
    return model_score, labelled_matrix, report_frame, print_cm_metrics(matrix)

In [21]:
#using the functions we created above, and store what's returned in variables:
k = 10
knn, y_train_pred, y_validate_pred = knn_fit_predict(k, 
                                                     X_train, 
                                                     y_train, 
                                                     X_validate)
accuracy_t, cmdf_t, crdf_t, metrics_t = evaluate_clf(knn, X_train, y_train, y_train_pred)

accuracy_v, cmdf_v, crdf_v, metrics_v = evaluate_clf(knn, X_validate, y_validate, y_validate_pred)


In [23]:
# Print the train and validate evaluation results produced by the previous
# cell (accuracy, confusion matrix, classification report, metrics table).
print(f"""KNN where K = {k}

********Train Evaluation********

Accuracy: {accuracy_t}

Confusion Matrix:
{cmdf_t}

Classification Report:
{crdf_t}

Metrics: 
{metrics_t}
 
________________________________________________

********Validate Evaluation********

Accuracy: {accuracy_v}

Confusion Matrix:
{cmdf_v}

Classification Report:
{crdf_v}

Metrics: 
{metrics_v}

""")

KNN where K = 10

********Train Evaluation********

Accuracy: 0.7449799196787149

Confusion Matrix:
          Pred 0  Pred 1
Actual 0     275      32
Actual 1      95      96

Classification Report:
                    0           1  accuracy   macro avg  weighted avg
precision    0.743243    0.750000   0.74498    0.746622      0.745835
recall       0.895765    0.502618   0.74498    0.699192      0.744980
f1-score     0.812408    0.601881   0.74498    0.707144      0.731663
support    307.000000  191.000000   0.74498  498.000000    498.000000

Metrics: 
                metric       score
0             accuracy    0.744980
1   true_positive_rate    0.502618
2  false_positive_rate    0.104235
3   true_negative_rate    0.895765
4  false_negative_rate    0.497382
5            precision    0.750000
6               recall    0.502618
7             f1_score    0.601881
8          support_pos  191.000000
9          support_neg  307.000000
 
________________________________________________

***

5. Run through steps 1-3 setting k to 20

In [25]:
# Same procedure as for k = 10, now with 20 neighbours
k = 20
knn, y_train_pred, y_validate_pred = knn_fit_predict(
    k, X_train, y_train, X_validate
)

# in-sample (train) evaluation
accuracy_t, cmdf_t, crdf_t, met_t = evaluate_clf(
    knn, X_train, y_train, y_train_pred
)

# out-of-sample (validate) evaluation
accuracy_v, cmdf_v, crdf_v, met_v = evaluate_clf(
    knn, X_validate, y_validate, y_validate_pred
)

In [26]:
# Print the train and validate evaluation results produced by the previous
# cell (accuracy, confusion matrix, classification report, metrics table).
print(f"""KNN where K = {k}

********Train Evaluation********

Accuracy: {accuracy_t}

Confusion Matrix:
{cmdf_t}

Classification Report:
{crdf_t}

Metrics: 
{met_t}
 
________________________________________________

********Validate Evaluation********

Accuracy: {accuracy_v}

Confusion Matrix:
{cmdf_v}

Classification Report:
{crdf_v}

Metrics: 
{met_v}

""")

KNN where K = 20

********Train Evaluation********

Accuracy: 0.7188755020080321

Confusion Matrix:
          Pred 0  Pred 1
Actual 0     269      38
Actual 1     102      89

Classification Report:
                    0           1  accuracy   macro avg  weighted avg
precision    0.725067    0.700787  0.718876    0.712927      0.715755
recall       0.876221    0.465969  0.718876    0.671095      0.718876
f1-score     0.793510    0.559748  0.718876    0.676629      0.703855
support    307.000000  191.000000  0.718876  498.000000    498.000000

Metrics: 
                metric       score
0             accuracy    0.718876
1   true_positive_rate    0.465969
2  false_positive_rate    0.123779
3   true_negative_rate    0.876221
4  false_negative_rate    0.534031
5            precision    0.700787
6               recall    0.465969
7             f1_score    0.559748
8          support_pos  191.000000
9          support_neg  307.000000
 
________________________________________________

***