In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [34]:
df = pd.read_csv('data.csv')
df['HasCabin'] = df["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = df.FamilySize.apply(lambda x: 1 if x == 1 else 0)
df['IsMale'] = df['Sex'].map( {'female': 0, 'male': 1} )
df['Embarked'] = df['Embarked'].fillna('S')
df['Embarked'] = df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} )
df['Age'] = df['Age'].fillna(df.Age.mean())
df['Age'] = df.Age.apply(lambda x: 0 if x < 16 else 1 if x < 32 else 2 if x < 48 else 3 if x < 64 else 4)
df['Fare'] = df.Fare.apply(lambda x: 0 if x < 7.91 else 1 if x < 14.45 else 2 if x < 31 else 3)
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabin,FamilySize,IsAlone,IsMale
0,1,0,3,"Braund, Mr. Owen Harris",male,1,1,0,A/5 21171,0,,0,0,2,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2,1,0,PC 17599,3,C85,1,1,2,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,1,0,0,STON/O2. 3101282,1,,0,0,1,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,2,1,0,113803,3,C123,0,1,2,0,0
4,5,0,3,"Allen, Mr. William Henry",male,2,0,0,373450,1,,0,0,1,1,1


In [36]:
y = df.Survived
X = df.drop(['PassengerId', 'Survived', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Sex'], axis = 1)
X.head(3)

Unnamed: 0,Pclass,Age,Fare,Embarked,HasCabin,FamilySize,IsAlone,IsMale
0,3,1,0,0,0,2,0,1
1,1,2,3,1,1,2,0,0
2,3,1,1,0,0,1,1,0


In [37]:
cv = KFold(n_splits=10)

In [50]:
for depth in range(1, 20):
    model = tree.DecisionTreeClassifier(max_depth=depth)
    train_scores, test_scores = [], []
    for train_i, test_i in cv.split(X):
        Xr, yr, Xt, yt = X.loc[train_i], y.loc[train_i], X.loc[test_i], y.loc[test_i]
        model.fit(Xr, yr)
        test_scores.append(model.score(Xt, yt))
        train_scores.append(model.score(Xr, yr))
    print('depth:', depth, 'test scores:', sum(test_scores)/len(test_scores), 'train scores:', sum(train_scores)/len(train_scores))

depth: 1 test scores: 0.786729088639 train scores: 0.786756112216
depth: 2 test scores: 0.768851435705 train scores: 0.78900174657
depth: 3 test scores: 0.815942571785 train scores: 0.818930046918
depth: 4 test scores: 0.827178526841 train scores: 0.834019196702
depth: 5 test scores: 0.822684144819 train scores: 0.842124868852
depth: 6 test scores: 0.817053682896 train scores: 0.848734437315
depth: 7 test scores: 0.813695380774 train scores: 0.858087459254
depth: 8 test scores: 0.813732833958 train scores: 0.870308311618
depth: 9 test scores: 0.814868913858 train scores: 0.881905878251
depth: 10 test scores: 0.812634207241 train scores: 0.885647149293
depth: 11 test scores: 0.814868913858 train scores: 0.888639823662
depth: 12 test scores: 0.803632958801 train scores: 0.890634991796
depth: 13 test scores: 0.810374531835 train scores: 0.891507965417
depth: 14 test scores: 0.805880149813 train scores: 0.891507965417
depth: 15 test scores: 0.807003745318 train scores: 0.891507965417
depth

In [51]:
final_model = tree.DecisionTreeClassifier(max_depth=4)

In [52]:
final_model.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [53]:
from sklearn.externals import joblib

In [54]:
joblib.dump(final_model, 'tree.pkl') 

['tree.pkl']

In [55]:
hydrated = joblib.load('tree.pkl') 

In [56]:
hydrated.score(X,y)

0.83389450056116721

In [57]:
X.head(3)

Unnamed: 0,Pclass,Age,Fare,Embarked,HasCabin,FamilySize,IsAlone,IsMale
0,3,1,0,0,0,2,0,1
1,1,2,3,1,1,2,0,0
2,3,1,1,0,0,1,1,0


In [58]:
hydrated.predict([[3,1,0,0,0,2,0,1]])

array([0])

In [59]:
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabin,FamilySize,IsAlone,IsMale
0,1,0,3,"Braund, Mr. Owen Harris",male,1,1,0,A/5 21171,0,,0,0,2,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2,1,0,PC 17599,3,C85,1,1,2,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,1,0,0,STON/O2. 3101282,1,,0,0,1,1,0


In [60]:
dir(hydrated)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_estimator_type',
 '_get_param_names',
 '_validate_X_predict',
 'apply',
 'class_weight',
 'classes_',
 'criterion',
 'decision_path',
 'feature_importances_',
 'fit',
 'get_params',
 'max_depth',
 'max_features',
 'max_features_',
 'max_leaf_nodes',
 'min_impurity_decrease',
 'min_impurity_split',
 'min_samples_leaf',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'n_classes_',
 'n_features_',
 'n_outputs_',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'presort',

In [61]:
hydrated.feature_importances_

array([ 0.15125957,  0.05311138,  0.02189177,  0.01404981,  0.06699221,
        0.10144614,  0.        ,  0.59124912])

In [62]:
X.columns

Index(['Pclass', 'Age', 'Fare', 'Embarked', 'HasCabin', 'FamilySize',
       'IsAlone', 'IsMale'],
      dtype='object')