# Part I
Create a decision tree model of the Titanic dataset that predicts survival from seaborn. You will need to compute some data wrangling before charging ahead. Make sure to complete the following wrangling tasks:

Recode string data
Remove missing data
Drop any variables that are redundant and will add to multicollinearity.

Once you have created a decision tree model, interpret the confusion matrix and classification report.

In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [170]:
Titanic = sns.load_dataset('titanic')

In [171]:
Titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [172]:
TitanicTrimmed = Titanic.drop(['sibsp','embarked','class','adult_male','embark_town'], axis= 1)

In [173]:
TitanicTrimmed.dropna(inplace=True)

In [174]:
TitanicTrimmed

Unnamed: 0,survived,pclass,sex,age,parch,fare,who,deck,alive,alone
1,1,1,female,38.0,0,71.2833,woman,C,yes,False
3,1,1,female,35.0,0,53.1000,woman,C,yes,False
6,0,1,male,54.0,0,51.8625,man,E,no,True
10,1,3,female,4.0,1,16.7000,child,G,yes,False
11,1,1,female,58.0,0,26.5500,woman,C,yes,True
...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,52.5542,woman,D,yes,False
872,0,1,male,33.0,0,5.0000,man,B,no,True
879,1,1,female,56.0,1,83.1583,woman,C,yes,False
887,1,1,female,19.0,0,30.0000,woman,B,yes,True


In [175]:
TitanicTrimmed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184 entries, 1 to 889
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  184 non-null    int64   
 1   pclass    184 non-null    int64   
 2   sex       184 non-null    object  
 3   age       184 non-null    float64 
 4   parch     184 non-null    int64   
 5   fare      184 non-null    float64 
 6   who       184 non-null    object  
 7   deck      184 non-null    category
 8   alive     184 non-null    object  
 9   alone     184 non-null    bool    
dtypes: bool(1), category(1), float64(2), int64(3), object(3)
memory usage: 13.6+ KB


In [176]:
TitanicTrimmed

Unnamed: 0,survived,pclass,sex,age,parch,fare,who,deck,alive,alone
1,1,1,female,38.0,0,71.2833,woman,C,yes,False
3,1,1,female,35.0,0,53.1000,woman,C,yes,False
6,0,1,male,54.0,0,51.8625,man,E,no,True
10,1,3,female,4.0,1,16.7000,child,G,yes,False
11,1,1,female,58.0,0,26.5500,woman,C,yes,True
...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,52.5542,woman,D,yes,False
872,0,1,male,33.0,0,5.0000,man,B,no,True
879,1,1,female,56.0,1,83.1583,woman,C,yes,False
887,1,1,female,19.0,0,30.0000,woman,B,yes,True


In [177]:
TitanicTrimmed.deck.count

<bound method Series.count of 1      C
3      C
6      E
10     G
11     C
      ..
871    D
872    B
879    C
887    B
889    C
Name: deck, Length: 184, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']>

In [178]:
def deck_recode (series):
    if series == "A":
        return 0
    if series == "B":
        return 1
    if series == "C":
        return 2
    if series == "D":
        return 3
    if series == "E":
        return 4
    if series == "F":
        return 5
    if series == "G":
        return 6
TitanicTrimmed['deckR'] = TitanicTrimmed['deck'].apply(deck_recode)

In [179]:
def who_recode (series):
    if series == "man":
        return 1
    if series == "woman":
        return 2
TitanicTrimmed['whoR'] = TitanicTrimmed['who'].apply(who_recode)

In [180]:
def alive_recode (series):
    if series == "no":
        return 0
    if series == "yes":
        return 1
TitanicTrimmed['aliveR'] = TitanicTrimmed['alive'].apply(alive_recode)

In [181]:
TitanicTrimmed

Unnamed: 0,survived,pclass,sex,age,parch,fare,who,deck,alive,alone,deckR,whoR,aliveR
1,1,1,female,38.0,0,71.2833,woman,C,yes,False,2,2.0,1
3,1,1,female,35.0,0,53.1000,woman,C,yes,False,2,2.0,1
6,0,1,male,54.0,0,51.8625,man,E,no,True,4,1.0,0
10,1,3,female,4.0,1,16.7000,child,G,yes,False,6,,1
11,1,1,female,58.0,0,26.5500,woman,C,yes,True,2,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,52.5542,woman,D,yes,False,3,2.0,1
872,0,1,male,33.0,0,5.0000,man,B,no,True,1,1.0,0
879,1,1,female,56.0,1,83.1583,woman,C,yes,False,2,2.0,1
887,1,1,female,19.0,0,30.0000,woman,B,yes,True,1,2.0,1


In [182]:
TitanicTrimmed = TitanicTrimmed.drop(['who','alive','sex','whoR', 'deck'], axis= 1)

In [183]:
TitanicTrimmed.age = TitanicTrimmed.age.astype(int)
TitanicTrimmed.fare = TitanicTrimmed.fare.astype(int)
TitanicTrimmed.deckR = TitanicTrimmed.deckR.astype(int)

In [184]:
TitanicTrimmed

Unnamed: 0,survived,pclass,age,parch,fare,alone,deckR,aliveR
1,1,1,38,0,71,False,2,1
3,1,1,35,0,53,False,2,1
6,0,1,54,0,51,True,4,0
10,1,3,4,1,16,False,6,1
11,1,1,58,0,26,True,2,1
...,...,...,...,...,...,...,...,...
871,1,1,47,1,52,False,3,1
872,0,1,33,0,5,True,1,0
879,1,1,56,1,83,False,2,1
887,1,1,19,0,30,True,1,1


In [185]:
TitanicTrimmed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184 entries, 1 to 889
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   survived  184 non-null    int64
 1   pclass    184 non-null    int64
 2   age       184 non-null    int32
 3   parch     184 non-null    int64
 4   fare      184 non-null    int32
 5   alone     184 non-null    bool 
 6   deckR     184 non-null    int32
 7   aliveR    184 non-null    int64
dtypes: bool(1), int32(3), int64(4)
memory usage: 9.5 KB


In [186]:
x = TitanicTrimmed.drop('survived', axis=1)

In [187]:
y = TitanicTrimmed['survived']

In [188]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=76)

In [189]:
decisionTree = DecisionTreeClassifier(random_state=76)
decisionTree.fit(x_train, y_train)

DecisionTreeClassifier(random_state=76)

In [190]:
treePredictions = decisionTree.predict(x_test)

In [191]:
print(confusion_matrix(y_test, treePredictions))

[[15  0]
 [ 0 41]]


In [192]:
print(classification_report(y_test, treePredictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        41

    accuracy                           1.00        56
   macro avg       1.00      1.00      1.00        56
weighted avg       1.00      1.00      1.00        56



# Part II

Now create a random forest model of the Titanic dataset that predicts survival. Interpret the confusion matrix and classification report. How did the predictive value change from the decision tree?

## Trimmed Titanic

In [193]:
forest = RandomForestClassifier(n_estimators=500, random_state=76)
forest.fit(x_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=76)

In [194]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[15  0]
 [ 0 41]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        41

    accuracy                           1.00        56
   macro avg       1.00      1.00      1.00        56
weighted avg       1.00      1.00      1.00        56



There was no difference in accuracy between the decision tree and the random forest.