# Module 9 - Decision Trees

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Path to the Titanic workbook, relative to the notebook's working directory
location = "datasets/titanic.xls"

# Read the spreadsheet into a DataFrame and preview its first rows
df = pd.read_excel(io=location)
df.head(n=5)

### Clean the data

In [None]:
# Count the missing entries in every column to see which ones need cleaning
df.isna().sum()

In [None]:
# Fill missing ages with the mean age of passengers sharing the same
# survival status, sex, and passenger class.
# transform('mean') returns a Series aligned to df's index, so each row
# receives the mean of its own group.
age_group_mean = df.groupby(['survived', 'sex', 'pclass'])['age'].transform('mean')

# Assign the result back instead of calling fillna(..., inplace=True) on
# df['age']: the in-place form is chained assignment, which emits a
# FutureWarning on recent pandas and does not update df under Copy-on-Write.
df['age'] = df['age'].fillna(age_group_mean)

# NOTE(review): grouping by 'survived' uses the prediction target to impute a
# feature, which leaks label information into the model inputs. Consider
# grouping by ['sex', 'pclass'] only -- confirm with the course material.

In [None]:
# Tally passengers per embarkation port; with only 2 missing values, the most
# common port will be used to fill them
port_counts = df['embarked'].value_counts()
port_counts

In [None]:
# Fill the missing embarkation ports with 'S'.
# The result is assigned back rather than using fillna(..., inplace=True) on
# df['embarked']: the in-place form is chained assignment, which emits a
# FutureWarning on recent pandas and does not update df under Copy-on-Write.
df['embarked'] = df['embarked'].fillna('S')

In [None]:
# Re-check the per-column missing counts after filling 'age' and 'embarked'
df.isna().sum()

In [None]:
# Columns left out of the modelling frame
unused_columns = ['name', 'ticket', 'fare', 'cabin', 'boat', 'body', 'home.dest']

# Build the modelling frame as a new DataFrame without those columns
modeldf = df.drop(columns=unused_columns)

In [None]:
# Display the column labels that remain in modeldf after the drop above,
# i.e. the target plus the candidate features
modeldf.columns

Create dummy variables for categorical values

In [None]:
# One-hot encode passenger class and embarkation port.
# get_dummies replaces each listed column with its indicator columns, so the
# original 'pclass' and 'embarked' columns do not remain in the result.
categorical_columns = ['pclass', 'embarked']
modeldf = pd.get_dummies(modeldf, columns=categorical_columns)
# modeldf.head()  # uncomment to preview the encoded frame

In [None]:
# Encode sex as a binary number: female -> 0, male -> 1
sex_codes = {'female': 0, 'male': 1}
modeldf['sex'] = modeldf['sex'].map(sex_codes)
# modeldf.head()  # uncomment to preview the recoded column

In [None]:
# Combine the two relative counts ('sibsp' and 'parch') into a single
# 'family_num' feature, then remove the two source columns
family_size = modeldf['sibsp'].add(modeldf['parch'])
modeldf = (
    modeldf
    .assign(family_num=family_size)
    .drop(columns=['sibsp', 'parch'])
)
# modeldf.head()  # uncomment to preview the new feature

In [None]:
# Flag passengers travelling alone: 0 when family_num is greater than 0,
# otherwise 1
has_family = modeldf['family_num'] > 0
modeldf['TravelAlone'] = np.where(has_family, 0, 1)
# modeldf.head()  # uncomment to preview the new flag

### Build a Decision Tree

In [None]:
# Target variable: the 'survived' column of the modelling frame
target_column = 'survived'
y = modeldf[target_column]

In [None]:
# Feature matrix: a new frame holding every column of modeldf except the
# 'survived' target
X = modeldf.drop(columns=['survived'])

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state fixes the shuffle so the split is the same on every run
split_parts = train_test_split(X, y, test_size=0.2, random_state=15)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Create the decision tree classifier.
# The class is referenced directly (DecisionTreeClassifier is imported at the
# top of the notebook) instead of via `tree.DecisionTreeClassifier()`: the old
# form rebound the imported `tree` module name to the estimator, so running
# this cell a second time raised AttributeError.
# The variable keeps the name `tree` because the following cells call
# tree.fit / tree.score / tree.predict; note it still shadows the sklearn
# `tree` module from this point on.
# random_state is fixed (same seed as the train/test split) so the fitted
# tree and the reported scores are reproducible across runs.
tree = DecisionTreeClassifier(random_state=15)

In [None]:
# Train the classifier on the training split.
# Setting arguments on the model when it is created can help prevent
# overfitting.
tree.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the fitted model on the data it was trained on
train_accuracy = tree.score(X_train, y_train)
train_accuracy

In [None]:
# Predict survival for the held-out test rows
y_predict = tree.predict(X=X_test)

In [None]:
# Accuracy of the fitted model on the held-out test data
test_accuracy = tree.score(X_test, y_test)
test_accuracy

In [None]:
# Tabulate correct and incorrect predictions: rows are the true outcomes,
# columns are the predicted outcomes
true_labels = ['True Not Survival', 'True Survival']
predicted_labels = ['Predicted Not Survival', 'Predicted Survival']
counts = confusion_matrix(y_test, y_predict)
pd.DataFrame(counts, index=true_labels, columns=predicted_labels)

In [None]:
# Per-class precision / recall / f1 summary for the test predictions.
# Author's observation from the precision column: the model is better at
# predicting passengers that do not survive (verify against the printed
# report for the current run).
report_text = classification_report(y_test, y_predict)
print(report_text)