## Who Survived the Titanic?

In [4]:
import pandas as pd 
# Load the passenger records; the first row of the CSV supplies the column names
tdf = pd.read_csv('titanic.csv')

# Helpful tools to learn about your data
#print(tdf.info())          # gives the structure of the data (rows/cols)
#print(tdf.head())          # shows the top five data entries
#print(tdf.describe())      # gives the statistics on the data in the DF

### Set up the data for the decision tree analysis

In [5]:
# Only keep the features we want to use and place the "target" at the end
feature_and_target_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']

# Build the modelling frame in a single chain so we never assign into a slice
# of the loaded frame (that pattern triggers pandas' SettingWithCopyWarning):
#   1. select the columns,
#   2. change Sex to a numeric value so we can use DecisionTreeClassifier()
#      <-- string okay for target, but not as an input,
#   3. drop rows with missing fields.
# NOTE: this cell rebinds `tdf`. Re-running it without re-running the load cell
# maps the already-numeric Sex values to NaN, and dropna() then removes every row.
tdf = (
    tdf[feature_and_target_cols]
    .assign(Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}))
    .dropna()
)
#print(tdf.info())
columns = list(tdf)

### Separate the independent variables (AKA Features) from the dependent labels (AKA Target)

In [2]:
# The first six columns are the model inputs; the seventh column is the label
feature_count = 6
X = tdf.iloc[:, :feature_count]   # features DataFrame
Y = tdf.iloc[:, feature_count]    # target Series

### Split the Training and Testing Data

In [None]:
# Seed NumPy's global random number generator so that the tree fitting
# further down gives the same result on every fresh run
import numpy as np
np.random.seed(seed=101)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; random_state pins the shuffle
split_parts = train_test_split(X, Y, test_size=0.1, random_state=1)
X_train, X_test, y_train, y_test = split_parts

### Generate and evaluate the model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed on each estimator instead of relying on NumPy's global RNG state,
# so the fitted trees do not depend on which other cells were run beforehand.
TREE_SEED = 101

# Model 1: split on entropy (i.e. information gain); min_samples_leaf is left at its default
model_ent = DecisionTreeClassifier(criterion='entropy', random_state=TREE_SEED).fit(X_train, y_train)
y_ent_pred = model_ent.predict(X_test)

# Model 2: default criterion (Gini impurity); require at least 4 samples in every leaf
model_gini = DecisionTreeClassifier(min_samples_leaf=4, random_state=TREE_SEED).fit(X_train, y_train)
y_gini_pred = model_gini.predict(X_test)

# NOTE: You should, when testing models, only vary 1 thing at a time.
# (The two models above differ in both criterion and min_samples_leaf.)

In [None]:
# Report the share of test rows each model classified correctly, as a percentage
from sklearn.metrics import accuracy_score

for label, predictions in (("Entropy", y_ent_pred), ("Gini", y_gini_pred)):
    pct = accuracy_score(y_test, predictions) * 100
    print(f"{label} accuracy is : {pct}%")

## Visualize the results

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual outcomes, columns are what the Gini tree predicted
gini_confusion = pd.DataFrame(
    data=confusion_matrix(y_test, y_gini_pred),
    index=['True Died', 'True Survived'],
    columns=['Predicted Died', 'Predicted Survived'],
)
gini_confusion

In [None]:
# Create a visualization of the tree -- requires the Graphviz `dot` executable on the PATH
from sklearn import tree
from subprocess import check_call

# Write the tree in DOT format; the `with` block closes the file even if the export raises
with open("dtree.dot", 'w') as dotfile:
    tree.export_graphviz(model_gini, out_file=dotfile, feature_names=columns[0:6])

# Convert the dot file to a png (check_call raises CalledProcessError if `dot` exits non-zero)
check_call(['dot', '-Tpng', 'dtree.dot', '-o', 'dtree.png'])

In [None]:
# Render the generated tree image inline
from IPython.display import Image

tree_png = Image(filename='dtree.png', width=1000, height=1000)
tree_png