## Predict survivability on titanic using decision trees

In [137]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score

### Read data and clean up Nan

In [177]:
data = pd.read_csv('./data/train.csv')
data = data.dropna()

### Extract the relevant colums that will be passed into the machine learning algorithm

In [179]:
training_data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
training_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
1,1,1,female,38.0,1,0,C
3,1,1,female,35.0,1,0,S
6,0,1,male,54.0,0,0,S
10,1,3,female,4.0,1,1,S
11,1,1,female,58.0,0,0,S


### Intuitive analysis of each feature in the decision tree

In [103]:
def compare(group, data):
    return data.groupby([group])['Survived'].sum()*100/data.groupby([group])['Survived'].count()

In [174]:
print(compare('Pclass', data))
print(compare('Sex', data))
print(compare("Embarked", data))
print(compare('SibSp', data))
print(compare('Parch', data))

Pclass
1    67.088608
2    80.000000
3    50.000000
Name: Survived, dtype: float64
Sex
female    93.181818
male      43.157895
Name: Survived, dtype: float64
Embarked
C    73.846154
Q    50.000000
S    63.793103
Name: Survived, dtype: float64
SibSp
0    62.727273
1    73.437500
2    83.333333
3    66.666667
Name: Survived, dtype: float64
Parch
0    65.573770
1    70.270270
2    73.913043
4     0.000000
Name: Survived, dtype: float64


### Convert any categorical data to numeric 

In [111]:
def cat_to_num(series):
    series = series.astype('category')
    return series.cat.codes   

In [112]:
training_data['Gender'] = data[['Sex']].apply(cat_to_num)
training_data['Port'] = data[['Embarked']].apply(cat_to_num)
training_data = training_data[['Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Port']]
train, test = train_test_split(training_data, test_size=0.2)
train.head()
#training_data.head()
#len(training_data)

Unnamed: 0,Survived,Pclass,Gender,Age,SibSp,Parch,Port
102,0,1,1,21.0,0,1,2
449,1,1,1,52.0,0,0,2
136,1,1,0,19.0,0,2,2
862,1,1,0,48.0,0,0,2
689,1,1,0,15.0,0,1,2


### Set up an instance of the classifier and feed training data

In [167]:
classifier = DecisionTreeClassifier(max_leaf_nodes=15)

In [168]:
classifier = classifier.fit(train[['Pclass','Gender', 'Age', 'SibSp', 'Parch', 'Port']], train['Survived'])

In [169]:
classifier

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=15, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [170]:
classifier.feature_importances_

array([ 0.01208825,  0.45069711,  0.48773006,  0.0225238 ,  0.02696078,  0.        ])

In [171]:
with open('./out/titanic.dot', 'w') as f:
    f = tree.export_graphviz(classifier, feature_names=['Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Port'], out_file=f)

### Use classifier to predict survivability in test group

In [172]:
predictions = classifier.predict(test[['Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Port']])

In [173]:
accuracy_score(test['Survived'], predictions)

0.81081081081081086