In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import env
import acquire
import prepare

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
# Seed intended for the notebook's stochastic steps (e.g. an estimator's
# random_state) so that a Restart & Run All can reproduce the same numbers.
SEED = 21

In [4]:
# Load the Titanic data and clean it with the project helpers, then discard the
# raw 'sex' and 'embark_town' columns so only their encoded counterparts remain.
df = (
    prepare.prep_titanic(acquire.get_titanic_data())
    .drop(columns=['sex', 'embark_town'])
)
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,0,1,0,0,1
1,1,1,1,0,71.2833,0,1,0,1,0,0
2,1,3,0,0,7.925,1,1,0,0,0,1
3,1,1,1,0,53.1,0,1,0,0,0,1
4,0,3,0,0,8.05,1,0,1,0,0,1


In [6]:
# Three-way split into train / validate / test frames via the project helper.
# NOTE(review): SEED is not passed here; whether split_data seeds itself
# internally cannot be seen from this notebook — confirm in prepare.py.
train, val, test = prepare.split_data(df, target='survived')

# Show the (rows, columns) of each split as a sanity check.
tuple(part.shape for part in (train, val, test))

((711, 11), (124, 11), (54, 11))

In [7]:
# Separate each split into its feature matrix (everything except 'survived')
# and its target vector (the 'survived' column).
X_train, y_train = train.drop(columns='survived'), train['survived']
X_val, y_val = val.drop(columns='survived'), val['survived']
X_test, y_test = test.drop(columns='survived'), test['survived']

Create RandomForest object and fit it to training data

In [9]:
# Baseline forest: a leaf may hold a single sample and trees may grow 10 levels deep.
# random_state=SEED pins the bootstrap draws and per-split feature sampling, so the
# fitted model (and every metric computed from it) is the same on each
# Restart & Run All instead of drifting from run to run.
rf = RandomForestClassifier(min_samples_leaf=1, max_depth=10, random_state=SEED)
rf = rf.fit(X_train, y_train)

In [13]:
# In-sample predictions: score the same rows the forest was fitted on.
y_pred = rf.predict(X_train)

Create Confusion Matrix

In [14]:
# Training-split confusion matrix; scikit-learn puts the true classes on the
# rows and the predicted classes on the columns.
confusion_matrix(y_train,y_pred)

array([[427,  12],
       [ 47, 225]])

Run Classification Report

In [16]:
# Per-class precision, recall and F1 on the training split.
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.97      0.94       439
           1       0.95      0.83      0.88       272

    accuracy                           0.92       711
   macro avg       0.93      0.90      0.91       711
weighted avg       0.92      0.92      0.92       711



In [17]:
# Class-membership probabilities for the training rows.
# NOTE(review): y_pred_proba is not read by any of the cells visible here.
y_pred_proba = rf.predict_proba(X_train)

How well does the model predict the validate set?

In [21]:
# Re-bind y_pred to the validate-split predictions; the validate confusion-matrix
# cell further down relies on this re-binding having been executed.
y_pred = rf.predict(X_val)
print(classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91        77
           1       0.90      0.77      0.83        47

    accuracy                           0.88       124
   macro avg       0.88      0.86      0.87       124
weighted avg       0.88      0.88      0.88       124



Accuracy drops from 92% on the training set to 88% on the validate set. This could be a sign of overfitting, even though random forests are more resistant to it than single decision trees.

In [24]:
# Uses the y_pred computed from X_val in the preceding validate cell, so that
# cell must have been the last one to assign y_pred.
confusion_matrix(y_val,y_pred)

array([[73,  4],
       [11, 36]])

Increasing `min_samples_leaf` and decreasing `max_depth`

### min_samples_leaf = 3, max_depth = 7

Training Set

In [28]:
# More constrained forest: at least 3 samples per leaf, depth capped at 7.
# random_state=SEED keeps the fit, and therefore the scores printed here,
# identical across re-runs so the hyperparameter comparison is not confounded
# by run-to-run randomness.
rf = RandomForestClassifier(min_samples_leaf=3, max_depth=7, random_state=SEED)
rf = rf.fit(X_train, y_train)

# In-sample predictions, to compare against the validate-split scores.
y_pred = rf.predict(X_train)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[414  25]
 [ 79 193]]
              precision    recall  f1-score   support

           0       0.84      0.94      0.89       439
           1       0.89      0.71      0.79       272

    accuracy                           0.85       711
   macro avg       0.86      0.83      0.84       711
weighted avg       0.86      0.85      0.85       711



Validate Set

In [34]:
# Evaluate the current forest on the validate split: report first, matrix second.
y_pred = rf.predict(X_val)
print(
    classification_report(y_val, y_pred),
    confusion_matrix(y_val, y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.86      0.91      0.89        77
           1       0.84      0.77      0.80        47

    accuracy                           0.85       124
   macro avg       0.85      0.84      0.84       124
weighted avg       0.85      0.85      0.85       124

[[70  7]
 [11 36]]


### min_samples_leaf = 5, max_depth = 5

Training Set

In [35]:
# Most constrained forest tried here: at least 5 samples per leaf, depth capped at 5.
# random_state=SEED makes the fit deterministic, so differences from the other
# configurations reflect the hyperparameters rather than random variation.
rf = RandomForestClassifier(min_samples_leaf=5, max_depth=5, random_state=SEED)
rf = rf.fit(X_train, y_train)

# In-sample predictions, to compare against the validate-split scores.
y_pred = rf.predict(X_train)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[407  32]
 [100 172]]
              precision    recall  f1-score   support

           0       0.80      0.93      0.86       439
           1       0.84      0.63      0.72       272

    accuracy                           0.81       711
   macro avg       0.82      0.78      0.79       711
weighted avg       0.82      0.81      0.81       711



Validate Set

In [36]:
# Out-of-sample check for the forest fitted just above: classification report
# followed by the confusion matrix, both on the validate split.
y_pred = rf.predict(X_val)
print(
    classification_report(y_val, y_pred),
    confusion_matrix(y_val, y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.84      0.94      0.88        77
           1       0.87      0.70      0.78        47

    accuracy                           0.85       124
   macro avg       0.85      0.82      0.83       124
weighted avg       0.85      0.85      0.84       124

[[72  5]
 [14 33]]
