In [142]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.metrics import ConfusionMatrixDisplay


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pandas as pd
from pydataset import data

import prepare
import acquire

In [146]:
# acquire the data
df = acquire.get_titanic_data()
df

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [148]:
# split into train, test, and validate sections
train, validate, test = prepare.prep_titanic_data(df)

train.shape, validate.shape, test.shape

((498, 11), (214, 11), (179, 11))

In [162]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [149]:
# display head of df for reference
train.head(2)

Unnamed: 0.1,Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,0,1,36.0,0,0,40.125,1,1,0,0
165,165,1,3,9.0,0,2,20.525,0,1,0,1


In [152]:
#create a baseline prediction
baseline = y_train.mode()

In [156]:
#test the baseline model accuracy
matches_baseline = y_train == 0
matches_baseline.shape

(498,)

In [159]:
baseline_accuracy = matches_baseline.mean()
print(f'Baseline accuracy is {baseline_accuracy}')

Baseline accuracy is 0.6164658634538153


In [177]:
# initiate a random forest model
rf = rfc(min_samples_leaf=1, max_depth=1, random_state=123)

In [179]:
# fit the model
rf.fit(X_train, y_train)

In [180]:
rf.classes_

array([0, 1])

In [181]:
# Make predictions with the model
y_pred = rf.predict(X_train)

In [185]:
# classification report
scores = classification_report(y_train, y_pred, output_dict=True)
pd.DataFrame(scores).T

Unnamed: 0,precision,recall,f1-score,support
0,0.758186,0.980456,0.855114,307.0
1,0.940594,0.497382,0.650685,191.0
accuracy,0.795181,0.795181,0.795181,0.795181
macro avg,0.84939,0.738919,0.752899,498.0
weighted avg,0.828146,0.795181,0.776708,498.0


In [184]:
rf.score

<bound method ClassifierMixin.score of RandomForestClassifier(max_depth=1, random_state=123)>

In [183]:
# confusion matrix
conf = pd.DataFrame(confusion_matrix(y_pred, y_train))
conf

Unnamed: 0,0,1
0,301,96
1,6,95


In [174]:
# store values of negatives and positives
TP, TN, FP, FN = confusion_matrix(y_train, y_pred).ravel()

In [175]:
TP, TN, FP, FN

(301, 6, 96, 95)

In [189]:
all_pos_neg = TP + TN + FP + FN

In [190]:
accuracy = (TP + TN)/all_pos_neg
t_pos_rate = TP/(TP+FN)
f_pos_rate = FN/(FN+TP)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1_score = 2 * (precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

In [191]:
print(f'''
Accuracy = {accuracy}
True Positive Rate = {t_pos_rate}
False Positive Rate = {f_pos_rate}
Precision = {precision}
Recall = {recall}
F1 Score = {f1_score}
Positive Support = {support_pos}
Negative Support = {support_neg}
''')


Accuracy = 0.6164658634538153
True Positive Rate = 0.76010101010101
False Positive Rate = 0.2398989898989899
Precision = 0.7581863979848866
Recall = 0.76010101010101
F1 Score = 0.759142496847415
Positive Support = 396
Negative Support = 102



In [197]:
  
for i in range(1,11):  
    # initiate a random forest model
    rf = rfc(min_samples_leaf=i, max_depth=5, random_state=123)

    # fit the model
    rf.fit(X_train, y_train)

    # Make predictions with the model
    y_pred = rf.predict(X_train)

    # classification report
    scores = classification_report(y_train, y_pred, output_dict=True)
    pd.DataFrame(scores).T

    rf.score

    # confusion matrix
    conf = pd.DataFrame(confusion_matrix(y_pred, y_train))
    conf

    # store values of negatives and positives
    TP, TN, FP, FN = confusion_matrix(y_train, y_pred).ravel()

    TP, TN, FP, FN

    all_pos_neg = TP + TN + FP + FN

    accuracy = (TP + TN)/all_pos_neg
    t_pos_rate = TP/(TP+FN)
    f_pos_rate = FN/(FN+TP)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1_score = 2 * (precision*recall)/(precision+recall)
    support_pos = TP + FN
    support_neg = FP + TN

    print(f'''
    min_samples_leaf = {i}
    
    Accuracy = {accuracy}
    True Positive Rate = {t_pos_rate}
    False Positive Rate = {f_pos_rate}
    Precision = {precision}
    Recall = {recall}
    F1 Score = {f1_score}
    Positive Support = {support_pos}
    Negative Support = {support_neg}
    
    -----------------------------------
    ''')


    min_samples_leaf = 1
    
    Accuracy = 0.6164658634538153
    True Positive Rate = 0.6842105263157895
    False Positive Rate = 0.3157894736842105
    Precision = 0.8494318181818182
    Recall = 0.6842105263157895
    F1 Score = 0.7579214195183778
    Positive Support = 437
    Negative Support = 61
    
    -----------------------------------
    

    min_samples_leaf = 2
    
    Accuracy = 0.6164658634538153
    True Positive Rate = 0.6773455377574371
    False Positive Rate = 0.32265446224256294
    Precision = 0.8554913294797688
    Recall = 0.6773455377574371
    F1 Score = 0.7560664112388251
    Positive Support = 437
    Negative Support = 61
    
    -----------------------------------
    

    min_samples_leaf = 3
    
    Accuracy = 0.6164658634538153
    True Positive Rate = 0.6851851851851852
    False Positive Rate = 0.3148148148148148
    Precision = 0.8433048433048433
    Recall = 0.6851851851851852
    F1 Score = 0.7560664112388251
    Positive Support = 432
 

In [199]:
for i in range(1,11):  
    # initiate a random forest model
    rf = rfc(min_samples_leaf=5, max_depth=i, random_state=123)

    # fit the model
    rf.fit(X_train, y_train)

    # Make predictions with the model
    y_pred = rf.predict(X_train)

    # classification report
    scores = classification_report(y_train, y_pred, output_dict=True)
    pd.DataFrame(scores).T

    rf.score

    # confusion matrix
    conf = pd.DataFrame(confusion_matrix(y_pred, y_train))
    conf

    # store values of negatives and positives
    TP, TN, FP, FN = confusion_matrix(y_train, y_pred).ravel()

    TP, TN, FP, FN

    all_pos_neg = TP + TN + FP + FN

    accuracy = (TP + TN)/all_pos_neg
    t_pos_rate = TP/(TP+FN)
    f_pos_rate = FN/(FN+TP)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1_score = 2 * (precision*recall)/(precision+recall)
    support_pos = TP + FN
    support_neg = FP + TN

    print(f'''
    max_depth = {i}
    
    Accuracy = {accuracy}
    True Positive Rate = {t_pos_rate}
    False Positive Rate = {f_pos_rate}
    Precision = {precision}
    Recall = {recall}
    F1 Score = {f1_score}
    Positive Support = {support_pos}
    Negative Support = {support_neg}
    
    -----------------------------------
    ''')


    max_depth = 1
    
    Accuracy = 0.6164658634538153
    True Positive Rate = 0.76010101010101
    False Positive Rate = 0.2398989898989899
    Precision = 0.7581863979848866
    Recall = 0.76010101010101
    F1 Score = 0.759142496847415
    Positive Support = 396
    Negative Support = 102
    
    -----------------------------------
    

    max_depth = 2
    
    Accuracy = 0.6164658634538153
    True Positive Rate = 0.7279596977329975
    False Positive Rate = 0.27204030226700254
    Precision = 0.7768817204301075
    Recall = 0.7279596977329975
    F1 Score = 0.7516254876462939
    Positive Support = 397
    Negative Support = 101
    
    -----------------------------------
    

    max_depth = 3
    
    Accuracy = 0.6164658634538153
    True Positive Rate = 0.6900726392251816
    False Positive Rate = 0.3099273607748184
    Precision = 0.8189655172413793
    Recall = 0.6900726392251816
    F1 Score = 0.7490144546649146
    Positive Support = 413
    Negative Support = 85