# Decision Trees and Random Forests

In [1]:
#Load in Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load in Data Set
Titanic = sns.load_dataset('titanic')

In [3]:
Titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [10]:
#dropping Null Values 
Titanic.dropna(inplace=True)

In [11]:
#Only have 182 values
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     182 non-null    int64   
 1   pclass       182 non-null    int64   
 2   sex          182 non-null    object  
 3   age          182 non-null    float64 
 4   sibsp        182 non-null    int64   
 5   parch        182 non-null    int64   
 6   fare         182 non-null    float64 
 7   embarked     182 non-null    object  
 8   class        182 non-null    category
 9   who          182 non-null    object  
 10  adult_male   182 non-null    bool    
 11  deck         182 non-null    category
 12  embark_town  182 non-null    object  
 13  alive        182 non-null    object  
 14  alone        182 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 18.2+ KB


In [13]:
#recode sex
def sex_Recode (series): 
    if series == "male":
        return 0
    if series == "female":
        return 1
Titanic['sexR'] = Titanic['sex'].apply(sex_Recode)

In [7]:
Titanic.embarked.value_counts()
#S is South Hapmton
#C is Cherbourg
#Q is Queenstown

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [14]:
# recode embarked
def embarked_Recode (series): 
    if series == "S":
        return 0
    if series == "C":
        return 1
    if series == "Q":
        return 2
Titanic['embarkedR'] = Titanic['embarked'].apply(embarked_Recode)

In [8]:
Titanic.deck.value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64

In [17]:
# recode deck
def deck_Recode (series): 
    if series == "C":
        return 0
    if series == "B":
        return 1
    if series == "D":
        return 2
    if series == "E":
        return 3
    if series == "A":
        return 4
    if series == "F":
        return 5
    if series == "G":
        return 6
Titanic['deckR'] = Titanic['deck'].apply(deck_Recode)

In [19]:
Titanic.deckR.value_counts()

0    51
1    43
2    31
3    30
4    12
5    11
6     4
Name: deckR, dtype: int64

In [9]:
Titanic.sibsp.value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: sibsp, dtype: int64

In [18]:
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     182 non-null    int64   
 1   pclass       182 non-null    int64   
 2   sex          182 non-null    object  
 3   age          182 non-null    float64 
 4   sibsp        182 non-null    int64   
 5   parch        182 non-null    int64   
 6   fare         182 non-null    float64 
 7   embarked     182 non-null    object  
 8   class        182 non-null    category
 9   who          182 non-null    object  
 10  adult_male   182 non-null    bool    
 11  deck         182 non-null    category
 12  embark_town  182 non-null    object  
 13  alive        182 non-null    object  
 14  alone        182 non-null    bool    
 15  sexR         182 non-null    int64   
 16  embarkedR    182 non-null    int64   
 17  deckR        182 non-null    category
dtypes: bool(2), category(3), float

## Part 1
### Create a decision tree model of the Titanic dataset that predicts survival from seaborn.
### You will need to compute some data wrangling before charging ahead. Make sure to complete the following wrangling tasks:

#### Recode string data
#### Remove missing data
#### Drop any variables that are redundant and will add to multicollinearity.
#### Once you have created a decision tree model, interpret the confusion matrix and classification report.

In [6]:
#having trouble here, this is supposed to be where I subset my data
#drop class, who, adult_male, alive
#titanic1 = Titanic.drop(['class','who', 'adult_male', 'embark_town', 'alive'] axis=1)
#x = Titanic.drop('survived', axis=1)
#y = Titanic['survived']

SyntaxError: invalid syntax (<ipython-input-6-13287711d126>, line 3)

## Train test split

In [None]:
#x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=76)

### Initial Decision Tree

In [None]:
#decisionTree = DecisionTreeClassifier(random_state=76)
#decisionTree.fit(x_train, y_train)

### Acess the model

In [None]:
#treePredictions = decisionTree.predict(x_test)

In [None]:
#read the confusion matrix
#print(confusion_matrix(y_test, treePredictions))

In [None]:
# how does the model fit?
#print(classification_report(y_test, treePredictions))

# Part II
### Now create a random forest model of the Titanic dataset that predicts survival. Interpret the confusion matrix and classification report. How did the predictive value change from the decision tree?

In [None]:
# load in packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
#initial random forest model just copied from page 5
#forest = RandomForestClassifier(n_estimators=500, random_state=76)
#forest.fit(x_train, y_train)

In [None]:
#model fit evaluation again copied from page 5
#forestPredictions = forest.predict(x_test)
#print(confusion_matrix(y_test, forestPredictions))
#print(classification_report(y_test, forestPredictions))