In [51]:
import pandas as pd
import numpy as np

#modeling imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#evaulation metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import acquire
import prepare

#### Use titanic for all these models

In [52]:
df = acquire.get_titanic_data()

this file exists, reading csv


In [53]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [54]:
df.shape

(891, 13)

In [55]:
df = prepare.clean_titanic(df)

In [56]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,Southampton,1


In [57]:
df.shape

(712, 10)

In [58]:
train, validate, test = prepare.splitting_data(df, 'survived')

In [59]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
702,702,0,3,female,18.0,0,1,14.4542,Cherbourg,0
199,199,0,2,female,24.0,0,0,13.0,Southampton,1
108,108,0,3,male,38.0,0,0,7.8958,Southampton,1
872,872,0,1,male,33.0,0,0,5.0,Southampton,1
827,827,1,2,male,1.0,0,2,37.0042,Cherbourg,0


In [60]:
def preprocess_titanic(train_df, val_df, test_df):
    '''
    preprocess_titanic will take in three pandas dataframes
    of our titanic data, expected as cleaned versions of this 
    titanic data set (see documentation on acquire.py and prepare.py)
    
    output:
    encoded, ML-ready versions of our clean data, with 
    columns sex and embark_town encoded in the one-hot fashion
    return: (pd.DataFrame, pd.DataFrame, pd.DataFrame)
    '''
    # with a looping structure:
    # for df in [train_df, val_df, test_df]:
    #     df.drop(blah blah blah)
    #     df['pclass'] = df['pclass'].astype(int)
    train_df = train_df.drop(columns='passenger_id')
    train_df['pclass'] = train_df['pclass'].astype(int)
    val_df = val_df.drop(columns='passenger_id')
    val_df['pclass'] = val_df['pclass'].astype(int)
    test_df = test_df.drop(columns='passenger_id')
    test_df['pclass'] = test_df['pclass'].astype(int)
    encoding_var = ['sex', 'embark_town']
    encoded_dfs = []
    for df in [train_df, val_df, test_df]:
        df_encoded_cats = pd.get_dummies(
            df[['embark_town', 'sex']],
              drop_first=True).astype(int)
        encoded_dfs.append(pd.concat(
            [df,
            df_encoded_cats],
            axis=1).drop(columns=['sex', 'embark_town']))
    return encoded_dfs

In [61]:
train, validate, test = preprocess_titanic(train, validate, test)

In [62]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
702,0,3,18.0,0,1,14.4542,0,0,0,0
199,0,2,24.0,0,0,13.0,1,0,1,0
108,0,3,38.0,0,0,7.8958,1,0,1,1
872,0,1,33.0,0,0,5.0,1,0,1,1
827,1,2,1.0,0,2,37.0042,0,0,0,1


In [63]:
train.dtypes

survived                     int64
pclass                       int64
age                        float64
sibsp                        int64
parch                        int64
fare                       float64
alone                        int64
embark_town_Queenstown       int64
embark_town_Southampton      int64
sex_male                     int64
dtype: object

In [64]:
X_train = train.drop(columns='survived')
X_validate = validate.drop(columns='survived')
X_test = test.drop(columns='survived')

In [65]:
y_train = train.survived
y_validate = validate.survived
y_test = test.survived

## Decision Tree

#### What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [66]:
df.survived.mode()

0    0
Name: survived, dtype: int64

In [67]:
(df.survived == 0).mean()

0.5955056179775281

> my baseline prediction is 0 (not survived) and the accuracy is 62%

#### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [68]:
#create the object
tree = DecisionTreeClassifier()
tree

In [69]:
#fit the object
tree.fit(X_train, y_train) #fit only on TRAIN DATA!

In [70]:
#store my predicted values
y_pred = tree.predict(X_train)
y_pred[:10]

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0])

#### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [71]:
#accuracy score
tree.score(X_train, y_train)

0.990632318501171

In [72]:
confusion_matrix(y_train, y_pred) #rows, columns

array([[254,   0],
       [  4, 169]])

In [73]:
pd.crosstab(y_train, y_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,254,0
1,4,169


In [74]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       254
           1       1.00      0.98      0.99       173

    accuracy                           0.99       427
   macro avg       0.99      0.99      0.99       427
weighted avg       0.99      0.99      0.99       427



#### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

positive = 1 (survived)

In [78]:
counts = pd.crosstab(y_train, y_pred)
counts

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,254,0
1,4,169


In [79]:
tp = counts.iloc[1,1]
tn = counts.iloc[0,0]
fp = counts.iloc[0,1]
fn = counts.iloc[1,0]

In [80]:
tp, tn, fp, fn

(169, 254, 0, 4)

In [81]:
def compute_class_metrics(y_train, y_pred):
    
    counts = pd.crosstab(y_train, y_pred)
    TP = counts.iloc[1,1]
    TN = counts.iloc[0,0]
    FP = counts.iloc[0,1]
    FN = counts.iloc[1,0]
    
    
    all_ = (TP + TN + FP + FN)

    accuracy = (TP + TN) / all_

    TPR = recall = TP / (TP + FN)
    FPR = FP / (FP + TN)

    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)

    precision =  TP / (TP + FP)
    f1 =  2 * ((precision * recall) / ( precision + recall))

    support_pos = TP + FN
    support_neg = FP + TN
    
    print(f"Accuracy: {accuracy}\n")
    print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
    print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
    print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
    print(f"False Negative Rate/Miss Rate: {FNR}\n")
    print(f"Precision/PPV: {precision}")
    print(f"F1 Score: {f1}\n")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")

In [82]:
compute_class_metrics(y_train, y_pred)

Accuracy: 0.990632318501171

True Positive Rate/Sensitivity/Recall/Power: 0.976878612716763
False Positive Rate/False Alarm Ratio/Fall-out: 0.0
True Negative Rate/Specificity/Selectivity: 1.0
False Negative Rate/Miss Rate: 0.023121387283236993

Precision/PPV: 1.0
F1 Score: 0.9883040935672515

Support (0): 173
Support (1): 254


In [83]:
pd.crosstab(y_train, y_pred, normalize='index')

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,0.0
1,0.023121,0.976879


#### Run through steps 2-4 using a different max_depth value.

In [84]:
for x in range(1,21):
    print(x)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [85]:
for x in range(1,21):
    #create the object
    tree = DecisionTreeClassifier(max_depth=x)
    
    #fit the object
    tree.fit(X_train, y_train) #fit only on TRAIN DATA!
    
    #calculate the accuracy
    acc = tree.score(X_train, y_train)
    
    print(f'with a max depth of {x}, the accuracy is {round(acc,2)}')

with a max depth of 1, the accuracy is 0.79
with a max depth of 2, the accuracy is 0.8
with a max depth of 3, the accuracy is 0.81
with a max depth of 4, the accuracy is 0.84
with a max depth of 5, the accuracy is 0.85
with a max depth of 6, the accuracy is 0.87
with a max depth of 7, the accuracy is 0.89
with a max depth of 8, the accuracy is 0.91
with a max depth of 9, the accuracy is 0.93
with a max depth of 10, the accuracy is 0.94
with a max depth of 11, the accuracy is 0.96
with a max depth of 12, the accuracy is 0.97
with a max depth of 13, the accuracy is 0.98
with a max depth of 14, the accuracy is 0.99
with a max depth of 15, the accuracy is 0.99
with a max depth of 16, the accuracy is 0.99
with a max depth of 17, the accuracy is 0.99
with a max depth of 18, the accuracy is 0.99
with a max depth of 19, the accuracy is 0.99
with a max depth of 20, the accuracy is 0.99


#### Which model performs better on your in-sample data?

> model with max depth of 13 is best

#### Which model performs best on your out-of-sample data, the validate set?

In [86]:
for x in range(1,14):
    #create the object
    tree = DecisionTreeClassifier(max_depth=x)
    
    #fit the object
    tree.fit(X_train, y_train) #fit only on TRAIN DATA!
    
    #calculate the accuracy for train
    acc = tree.score(X_train, y_train)
    
    #calculate the accuracy for validate
    acc_v = tree.score(X_validate, y_validate)
    
    print(f'max depth of {x}, the accuracy train = {round(acc,2)}, val = {round(acc_v,2)}')

max depth of 1, the accuracy train = 0.79, val = 0.77
max depth of 2, the accuracy train = 0.8, val = 0.79
max depth of 3, the accuracy train = 0.81, val = 0.8
max depth of 4, the accuracy train = 0.84, val = 0.83
max depth of 5, the accuracy train = 0.85, val = 0.8
max depth of 6, the accuracy train = 0.87, val = 0.73
max depth of 7, the accuracy train = 0.89, val = 0.77
max depth of 8, the accuracy train = 0.91, val = 0.74
max depth of 9, the accuracy train = 0.93, val = 0.75
max depth of 10, the accuracy train = 0.94, val = 0.74
max depth of 11, the accuracy train = 0.96, val = 0.73
max depth of 12, the accuracy train = 0.97, val = 0.73
max depth of 13, the accuracy train = 0.99, val = 0.73


> many models overfit, the max depth of 3 performs best

## Random Forest

#### Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [87]:
rf = RandomForestClassifier(min_samples_leaf=1, max_depth=10, random_state=123)

In [88]:
rf.fit(X_train, y_train)

In [89]:
y_pred = rf.predict(X_train)
y_pred[:10]

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0])

#### Evaluate your results using the model score, confusion matrix, and classification report.

In [90]:
rf.score(X_train, y_train)

0.9648711943793911

In [91]:
pd.crosstab(y_train, y_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,253,1
1,14,159


In [92]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       254
           1       0.99      0.92      0.95       173

    accuracy                           0.96       427
   macro avg       0.97      0.96      0.96       427
weighted avg       0.97      0.96      0.96       427



#### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [93]:
compute_class_metrics(y_train, y_pred)

Accuracy: 0.9648711943793911

True Positive Rate/Sensitivity/Recall/Power: 0.9190751445086706
False Positive Rate/False Alarm Ratio/Fall-out: 0.003937007874015748
True Negative Rate/Specificity/Selectivity: 0.9960629921259843
False Negative Rate/Miss Rate: 0.08092485549132948

Precision/PPV: 0.99375
F1 Score: 0.954954954954955

Support (0): 173
Support (1): 254


#### Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [94]:
for x in range(1,11):
    rf = RandomForestClassifier(min_samples_leaf=x, max_depth=11-x, random_state=123)

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_train)

    acc = rf.score(X_train, y_train)
    
    print(f'for min leaf samples = {x} and max depth = {11-x}, the accuracy is {round(acc,2)}')

for min leaf samples = 1 and max depth = 10, the accuracy is 0.96
for min leaf samples = 2 and max depth = 9, the accuracy is 0.92
for min leaf samples = 3 and max depth = 8, the accuracy is 0.9
for min leaf samples = 4 and max depth = 7, the accuracy is 0.88
for min leaf samples = 5 and max depth = 6, the accuracy is 0.86
for min leaf samples = 6 and max depth = 5, the accuracy is 0.86
for min leaf samples = 7 and max depth = 4, the accuracy is 0.84
for min leaf samples = 8 and max depth = 3, the accuracy is 0.84
for min leaf samples = 9 and max depth = 2, the accuracy is 0.82
for min leaf samples = 10 and max depth = 1, the accuracy is 0.81


#### What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

> max depth of 10 and min leaf sample of 1 works best because its asking the most questions of the data and requiring the least amount of samples

#### After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [95]:
stats = []

for x in range(1,11):
    rf = RandomForestClassifier(min_samples_leaf=x, max_depth=11-x, random_state=123)

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_train)

    acc = rf.score(X_train, y_train)
    acc_val = rf.score(X_validate, y_validate)
    
    stats.append([x, 11-x,round(acc,2),round(acc_val,2) ]) 
    
#     print(f'min_leaf_samples = {x}, max_depth = {11-x}, accuracy train = {round(acc,2)}, val = {round(acc_val,2)}')

In [96]:
stats_df = pd.DataFrame(stats, columns =['min_leaf','max_depth','train_acc','val_acc'])
stats_df

Unnamed: 0,min_leaf,max_depth,train_acc,val_acc
0,1,10,0.96,0.79
1,2,9,0.92,0.81
2,3,8,0.9,0.8
3,4,7,0.88,0.8
4,5,6,0.86,0.8
5,6,5,0.86,0.8
6,7,4,0.84,0.79
7,8,3,0.84,0.8
8,9,2,0.82,0.75
9,10,1,0.81,0.8


In [97]:
stats_df.sort_values('val_acc', ascending=False)

Unnamed: 0,min_leaf,max_depth,train_acc,val_acc
1,2,9,0.92,0.81
2,3,8,0.9,0.8
3,4,7,0.88,0.8
4,5,6,0.86,0.8
5,6,5,0.86,0.8
7,8,3,0.84,0.8
9,10,1,0.81,0.8
0,1,10,0.96,0.79
6,7,4,0.84,0.79
8,9,2,0.82,0.75


## KNN

#### Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [98]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [99]:
y_pred = knn.predict(X_train)
y_pred[:10]

array([1, 0, 0, 0, 1, 1, 1, 0, 0, 1])

#### Evaluate your results using the model score, confusion matrix, and classification report.

In [100]:
knn.score(X_train, y_train)

0.7798594847775175

In [101]:
pd.crosstab(y_train, y_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,210,44
1,50,123


In [102]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       254
           1       0.74      0.71      0.72       173

    accuracy                           0.78       427
   macro avg       0.77      0.77      0.77       427
weighted avg       0.78      0.78      0.78       427



#### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [103]:
compute_class_metrics(y_train, y_pred)

Accuracy: 0.7798594847775175

True Positive Rate/Sensitivity/Recall/Power: 0.7109826589595376
False Positive Rate/False Alarm Ratio/Fall-out: 0.1732283464566929
True Negative Rate/Specificity/Selectivity: 0.8267716535433071
False Negative Rate/Miss Rate: 0.28901734104046245

Precision/PPV: 0.7365269461077845
F1 Score: 0.7235294117647059

Support (0): 173
Support (1): 254


#### Run through steps 1-3 setting k to 10

In [104]:
knn10 = KNeighborsClassifier(n_neighbors=10)
knn10.fit(X_train, y_train)
y_pred = knn10.predict(X_train)
compute_class_metrics(y_train, y_pred)

Accuracy: 0.7189695550351288

True Positive Rate/Sensitivity/Recall/Power: 0.47398843930635837
False Positive Rate/False Alarm Ratio/Fall-out: 0.1141732283464567
True Negative Rate/Specificity/Selectivity: 0.8858267716535433
False Negative Rate/Miss Rate: 0.5260115606936416

Precision/PPV: 0.7387387387387387
F1 Score: 0.5774647887323944

Support (0): 173
Support (1): 254


#### Run through steps 1-3 setting k to 20

In [105]:
knn20 = KNeighborsClassifier(n_neighbors=20)
knn20.fit(X_train, y_train)
y_pred = knn20.predict(X_train)
compute_class_metrics(y_train, y_pred)

Accuracy: 0.7236533957845434

True Positive Rate/Sensitivity/Recall/Power: 0.4797687861271676
False Positive Rate/False Alarm Ratio/Fall-out: 0.11023622047244094
True Negative Rate/Specificity/Selectivity: 0.889763779527559
False Negative Rate/Miss Rate: 0.5202312138728323

Precision/PPV: 0.7477477477477478
F1 Score: 0.5845070422535211

Support (0): 173
Support (1): 254


#### What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

> the model with 5 nearest neighbors performed the best

#### Which model performs best on our out-of-sample data from validate?

In [106]:
knn.score(X_train, y_train)

0.7798594847775175

In [107]:
knn10.score(X_train, y_train)

0.7189695550351288

In [108]:
knn20.score(X_train, y_train)

0.7236533957845434

> it performs best on the model with 5 nearest neighbors

## Logistic Regression

In [109]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
702,3,18.0,0,1,14.4542,0,0,0,0
199,2,24.0,0,0,13.0,1,0,1,0
108,3,38.0,0,0,7.8958,1,0,1,1
872,1,33.0,0,0,5.0,1,0,1,1
827,2,1.0,0,2,37.0042,0,0,0,1


#### Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [110]:
features = ['age','fare','pclass']
X_train[features].head()

Unnamed: 0,age,fare,pclass
702,18.0,14.4542,3
199,24.0,13.0,2
108,38.0,7.8958,3
872,33.0,5.0,1
827,1.0,37.0042,2


In [111]:
#make it
lr = LogisticRegression()

#fit it
lr.fit(X_train[features], y_train)

In [112]:
lr.score(X_train[features], y_train)

0.7353629976580797

#### Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [113]:
features = ['age','fare','pclass', 'sex_male']
X_train[features].head()

Unnamed: 0,age,fare,pclass,sex_male
702,18.0,14.4542,3,0
199,24.0,13.0,2,0
108,38.0,7.8958,3,1
872,33.0,5.0,1,1
827,1.0,37.0042,2,1


In [114]:
lr1 = LogisticRegression()
lr1.fit(X_train[features], y_train)

In [115]:
lr1.score(X_train[features],y_train)

0.8032786885245902