### Import Packages

In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

from sklearn.tree import export_graphviz
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only; this module was removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split

### 1. Get Titanic Data

In [2]:
# Load the Titanic passenger table; the first CSV column is used as the index.
TITANIC_CSV_URL = 'http://dato.com/files/titanic.csv'
df = pd.read_csv(TITANIC_CSV_URL, index_col=0)
df.tail()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


### 2. Preprocessing

In [3]:
# Columns fed to the model; the target is the "Survived" column.
feature_names = ["Pclass", "Age", "Sex"]

# Discard the PassengerId index so the feature rows get a plain 0..n-1 index.
dfX = df[feature_names].reset_index(drop=True)
# NOTE: dfy keeps the original index of df, unlike dfX.
dfy = df["Survived"]
dfX.tail()

Unnamed: 0,Pclass,Age,Sex
886,2,27.0,male
887,1,19.0,female
888,3,,female
889,1,26.0,male
890,3,32.0,male


In [4]:
# Replace missing ages with the mean age of all rows, truncated to a whole number.
# NOTE: the mean is taken over the full table, before the train/test split.
mean_age = int(dfX["Age"].mean())

# Assign the filled column back instead of calling
# `dfX.Age.fillna(..., inplace=True)`: that chained in-place form operates on
# an intermediate Series, warns in pandas 2.x, and leaves `dfX` unchanged when
# pandas Copy-on-Write is enabled.
dfX["Age"] = dfX["Age"].fillna(mean_age)
dfX.tail()

Unnamed: 0,Pclass,Age,Sex
886,2,27.0,male
887,1,19.0,female
888,3,29.0,female
889,1,26.0,male
890,3,32.0,male


In [5]:
# LabelEncoder: map the string categories in "Sex" to integer codes.
from sklearn.preprocessing import LabelEncoder

sex_encoder = LabelEncoder()
dfX["Sex"] = sex_encoder.fit_transform(dfX["Sex"])
dfX.tail()

Unnamed: 0,Pclass,Age,Sex
886,2,27.0,1
887,1,19.0,0
888,3,29.0,0
889,1,26.0,1
890,3,32.0,1


In [6]:
# One-hot encoding: replace the Pclass column with three indicator columns.
from sklearn.preprocessing import OneHotEncoder

# `Series.as_matrix()` was removed in pandas 1.0. Selecting with a list of
# column names yields a one-column frame, whose `.values` is the
# (n_samples, 1) array the encoder expects.
pclass_onehot = OneHotEncoder().fit_transform(dfX[["Pclass"]].values).toarray()

# The hard-coded names assume Pclass takes exactly three distinct values.
dfX2 = pd.DataFrame(pclass_onehot,
                    columns=['first_class', 'second_class', 'third_class'],
                    index=dfX.index)

# Append the indicator columns and drop the original categorical column.
dfX = pd.concat([dfX, dfX2], axis=1).drop(["Pclass"], axis=1)
dfX.tail()

Unnamed: 0,Age,Sex,first_class,second_class,third_class
886,27.0,1,0.0,1.0,0.0
887,19.0,0,1.0,0.0,0.0
888,29.0,0,0.0,0.0,1.0
889,26.0,1,1.0,0.0,0.0
890,32.0,1,0.0,0.0,1.0


### 3. Separate Training Data and Test Data

In [12]:
# Hold out half of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dfX,
    dfy,
    test_size=0.5,
    random_state=1,
)

In [13]:
# Fit an entropy-based decision tree on the training half.
#   max_depth=5        : the tree is not grown deeper than five levels.
#   min_samples_leaf=5 : a split is only considered if it leaves at least five
#                        training samples in each branch, so growth can stop
#                        before max_depth is reached.
#   random_state=1     : seeds the tie-breaking between equally good splits so
#                        re-running the notebook yields the same tree (same
#                        seed as the train/test split).
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=5,
    random_state=1,
).fit(X_train, y_train)

In [14]:
# Confusion matrix on the held-out half: true labels first, predictions second.
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[231,  32],
       [ 63, 120]])

In [15]:
# Text summary of precision, recall and F1 per class on the test half.
from sklearn.metrics import classification_report

test_report = classification_report(y_test, model.predict(X_test))
print(test_report)

             precision    recall  f1-score   support

          0       0.79      0.88      0.83       263
          1       0.79      0.66      0.72       183

avg / total       0.79      0.79      0.78       446



### 4. Predict

In [36]:
# Each sample is a single row in the feature order of dfX:
# [Age, Sex, first_class, second_class, third_class]

# age: 26, sex: female (0), Pclass: first class
predict_test1 = np.array([[26, 0, 1, 0, 0]])

# age: 53, sex: male (1), Pclass: third class
predict_test2 = np.array([[53, 1, 0, 0, 1]])

print(model.predict(predict_test1))  # survived
print(model.predict(predict_test2))  # did not survive

[1]
[0]
