## Predict survivability on titanic using decision trees

In [192]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

### Read data and clean up Nan

In [177]:
data = pd.read_csv('./data/train.csv')
data = data.dropna()

### Extract the relevant colums that will be passed into the machine learning algorithm

In [179]:
training_data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
training_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
1,1,1,female,38.0,1,0,C
3,1,1,female,35.0,1,0,S
6,0,1,male,54.0,0,0,S
10,1,3,female,4.0,1,1,S
11,1,1,female,58.0,0,0,S


### Intuitive analysis of each feature in the decision tree

In [180]:
def compare(group, data):
    return data.groupby([group])['Survived'].sum()*100/data.groupby([group])['Survived'].count()

In [181]:
print(compare('Pclass', data))
print(compare('Sex', data))
print(compare("Embarked", data))
print(compare('SibSp', data))
print(compare('Parch', data))

Pclass
1    67.088608
2    80.000000
3    50.000000
Name: Survived, dtype: float64
Sex
female    93.181818
male      43.157895
Name: Survived, dtype: float64
Embarked
C    73.846154
Q    50.000000
S    63.793103
Name: Survived, dtype: float64
SibSp
0    62.727273
1    73.437500
2    83.333333
3    66.666667
Name: Survived, dtype: float64
Parch
0    65.573770
1    70.270270
2    73.913043
4     0.000000
Name: Survived, dtype: float64


### Convert any categorical data to numeric 

In [111]:
def cat_to_num(series):
    series = series.astype('category')
    return series.cat.codes   

In [184]:
training_data['Gender'] = data[['Sex']].apply(cat_to_num)
training_data['Port'] = data[['Embarked']].apply(cat_to_num)
training_data = training_data[['Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Port']]
train, test = train_test_split(training_data, test_size=0.2)
train.head()

Unnamed: 0,Survived,Pclass,Gender,Age,SibSp,Parch,Port
853,1,1,0,16.0,0,1,2
512,1,1,1,36.0,0,0,2
209,1,1,1,40.0,0,0,0
585,1,1,0,18.0,0,2,2
625,0,1,1,61.0,0,0,2


### Set up an instance of the classifier and feed training data

In [207]:
classifier = DecisionTreeClassifier(max_leaf_nodes=15)

### Wrap the classifier in a function for reusability

In [210]:
def check_classifier_accuracy(clf):
    clf = clf.fit(train[['Pclass','Gender', 'Age', 'SibSp', 'Parch', 'Port']], train['Survived'])
    predictions = clf.predict(test[['Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Port']])
    return accuracy_score(test['Survived'], predictions)

In [211]:
check_classifier_accuracy(classifier)

0.72972972972972971

### Explore feature importance and the tree thus formed

In [188]:
classifier.feature_importances_

array([ 0.01502172,  0.35335172,  0.63162656,  0.        ,  0.        ,  0.        ])

In [189]:
with open('./out/titanic.dot', 'w') as f:
    f = tree.export_graphviz(classifier, feature_names=['Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Port'], out_file=f)

## Use Random Forest Classifier on the same data set

In [213]:
rclf = RandomForestClassifier(n_estimators=1000)

In [214]:
check_classifier_accuracy(rclf)

0.78378378378378377