### Create a new notebook, random_forests, and work with titanic data to do the following:

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
import acquire as aq
import prepare

In [2]:
df = aq.get_titanic_data()

In [3]:
df = prepare.prep_titanic(df)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.25,S,Southampton,0,1,0,1
1,1,1,female,38.0,1,0,71.2833,C,Cherbourg,0,0,0,0
2,1,3,female,26.0,0,0,7.925,S,Southampton,1,0,0,1
3,1,1,female,35.0,1,0,53.1,S,Southampton,0,0,0,1
4,0,3,male,35.0,0,0,8.05,S,Southampton,1,1,0,1


In [4]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [5]:
df['bl_survival_pred'] = 0

In [6]:
train_val, test = train_test_split(
    df,
    train_size = 0.8,
    random_state = 1349,
    stratify = df['survived']
)

train, validate = train_test_split(
    train_val,
    train_size = 0.7,
    random_state = 1349,
    stratify = train_val['survived']
)

In [7]:
train.shape, validate.shape, test.shape

((498, 14), (214, 14), (179, 14))

In [8]:
x_cols = train.columns.to_list()
y_cols = 'survived'

In [9]:
x_cols.remove('survived')
x_cols.remove('bl_survival_pred')
x_cols.remove('sex')
x_cols.remove('embarked')
x_cols.remove('embark_town')

##### Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [13]:
rf = RandomForestClassifier(min_samples_leaf=1,
                            max_depth=10, 
                            random_state=123)

In [14]:
rf.fit(train[x_cols], train[y_cols])

In [16]:
y_pred = rf.predict(train[x_cols])

##### Evaluate your results using the model score, confusion matrix, and classification report.

In [17]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(train[x_cols], train[y_cols])))

Accuracy of random forest classifier on training set: 0.96


In [24]:
cm = confusion_matrix(train[y_cols], y_pred)
cm

array([[304,   3],
       [ 17, 174]])

In [19]:
print(classification_report(train[y_cols], y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       307
           1       0.98      0.91      0.95       191

    accuracy                           0.96       498
   macro avg       0.97      0.95      0.96       498
weighted avg       0.96      0.96      0.96       498



##### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [23]:
print(f' The accuracy score is {sklearn.metrics.accuracy_score(train[y_cols], y_pred)}')

 The accuracy score is 0.9598393574297188


In [25]:
TP = cm[0][0]
FN = cm[0][1]
FP = cm[1][0]
TN = cm[1][1]

In [26]:
print(f'The TP rate is {TP}')
print(f'The FP rate is {FP}') 
print(f'The TN rate is {TN}')
print(f'The FN rate is {FN}')

The TP rate is 304
The FP rate is 17
The TN rate is 174
The FN rate is 3


In [28]:
prec, recall, fscore, support = sklearn.metrics.precision_recall_fscore_support(train[y_cols], y_pred, average='binary')

In [29]:
print(f'The precision score is {prec}')
print(f'The recall score is {recall}') 
print(f'The fscore is {fscore}')
print(f'The support is {support}')

The precision score is 0.9830508474576272
The recall score is 0.9109947643979057
The fscore is 0.9456521739130435
The support is None


##### Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [30]:
rf2 = RandomForestClassifier(min_samples_leaf=4,
                            max_depth=6, 
                            random_state=123)

In [31]:
rf2.fit(train[x_cols], train[y_cols])

In [32]:
y_pred2 = rf2.predict(train[x_cols])

In [34]:
print('Accuracy of model 2 on training set: {:.2f}'
     .format(rf2.score(train[x_cols], train[y_cols])))

Accuracy of model 2 on training set: 0.87


In [35]:
cm2 = confusion_matrix(train[y_cols], y_pred2)
cm2

array([[291,  16],
       [ 47, 144]])

In [36]:
print(classification_report(train[y_cols], y_pred2))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90       307
           1       0.90      0.75      0.82       191

    accuracy                           0.87       498
   macro avg       0.88      0.85      0.86       498
weighted avg       0.88      0.87      0.87       498



In [37]:
print(f' The accuracy score is {sklearn.metrics.accuracy_score(train[y_cols], y_pred2)}')

 The accuracy score is 0.8734939759036144


In [38]:
TP2 = cm2[0][0]
FN2 = cm2[0][1]
FP2 = cm2[1][0]
TN2 = cm2[1][1]

In [39]:
print(f'The TP rate of model 2 is {TP2}')
print(f'The FP rate of model 2 is {FP2}') 
print(f'The TN rate of model 2 is {TN2}')
print(f'The FN rate of model 2 is {FN2}')

The TP rate of model 2 is 291
The FP rate of model 2 is 47
The TN rate of model 2 is 144
The FN rate of model 2 is 16


In [40]:
prec2, recall2, fscore2, support2 = sklearn.metrics.precision_recall_fscore_support(train[y_cols], y_pred2, average='binary')

In [41]:
print(f'The precision score of model 2 is {prec2}')
print(f'The recall score of model 2 is {recall2}') 
print(f'The fscore of model 2 is {fscore2}')
print(f'The support of model 2 is {support2}')

The precision score of model 2 is 0.9
The recall score of model 2 is 0.7539267015706806
The fscore of model 2 is 0.8205128205128205
The support of model 2 is None


##### What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Model 1 performed much better accross all evaluation metrics. Model 1 has a max depth of 10 compared to a max depth of 6 on model 2. Model 1 performed better because it is likey overfitting on the train set. We will confirm this in the vailation phase

##### After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [52]:
 print('Accuracy of random forest classifier model 1 on validate set: {:.4f}'
     .format(rf.score(validate[x_cols], validate[y_cols])))

Accuracy of random forest classifier model 1 on validate set: 0.7897


In [53]:
 print('Accuracy of random forest classifier model 2 on validate set: {:.4f}'
     .format(rf2.score(validate[x_cols], validate[y_cols])))

Accuracy of random forest classifier model 2 on validate set: 0.7850


##### model 1 vs model 2 validate set accuracy

model 1 train accuracy    - .96
model 1 validate accuracy - .78


model 2 train accuracy    - .87
model 2 validate accuracy - .78


Model 2 has accuracy scores with a much lower delta between train and validate. This is the best performing model.