In [1]:
#Importing libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import tree
import sklearn

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

from sklearn.externals.six import StringIO
import pydotplus

##from sklearn import cross_validation
from sklearn.model_selection import train_test_split, cross_val_score
#from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_curve, auc




In [2]:
train_original = pd.read_csv('train.csv')
test_original = pd.read_csv('test.csv')

In [3]:
# Exclude some features to reduce data dimension
train=train_original.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
test=test_original.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
total = [train,test]

train.shape, test.shape

((891, 8), (418, 7))

In [4]:
## Create function to replace NaN with the median value for each ticket class
def fill_missing_age(dataset):
    for i in range(1,4):
        median_age=dataset[dataset["Pclass"]==i]["Age"].median()
        dataset["Age"]=dataset["Age"].fillna(median_age)
        return dataset

train = fill_missing_age(train)

In [5]:
## Replace missing cases with C
train["Embarked"] = train["Embarked"].fillna('C')

In [6]:
test = fill_missing_age(test)

## Create function to replace NaN with the median fare with given conditions
def fill_missing_fare(dataset):
    median_fare=dataset[(dataset["Pclass"]==3) & (dataset["Embarked"]=="S")]["Fare"].median()
    dataset["Fare"]=dataset["Fare"].fillna(median_fare)
    return dataset

test = fill_missing_fare(test)

In [7]:
## discretise Age feature
for dataset in total:
    dataset.loc[dataset["Age"] <= 9, "Age"] = 0
    dataset.loc[(dataset["Age"] > 9) & (dataset["Age"] <= 19), "Age"] = 1
    dataset.loc[(dataset["Age"] > 19) & (dataset["Age"] <= 29), "Age"] = 2
    dataset.loc[(dataset["Age"] > 29) & (dataset["Age"] <= 39), "Age"] = 3
    dataset.loc[dataset["Age"] > 39, "Age"] = 4

In [8]:
## discretise Fare
pd.qcut(train["Fare"], 8).value_counts()

(-0.001, 7.75]       140
(9.841, 14.454]      113
(69.488, 512.329]    112
(24.479, 31.0]       112
(7.91, 9.841]        111
(31.0, 69.488]       110
(14.454, 24.479]     110
(7.75, 7.91]          83
Name: Fare, dtype: int64

In [9]:
for dataset in total:
    dataset.loc[dataset["Fare"] <= 7.75, "Fare"] = 0
    dataset.loc[(dataset["Fare"] > 7.75) & (dataset["Fare"] <= 7.91), "Fare"] = 1
    dataset.loc[(dataset["Fare"] > 7.91) & (dataset["Fare"] <= 9.841), "Fare"] = 2
    dataset.loc[(dataset["Fare"] > 9.841) & (dataset["Fare"] <= 14.454), "Fare"] = 3   
    dataset.loc[(dataset["Fare"] > 14.454) & (dataset["Fare"] <= 24.479), "Fare"] = 4
    dataset.loc[(dataset["Fare"] >24.479) & (dataset["Fare"] <= 31), "Fare"] = 5   
    dataset.loc[(dataset["Fare"] > 31) & (dataset["Fare"] <= 69.487), "Fare"] = 6
    dataset.loc[dataset["Fare"] > 69.487, "Fare"] = 7   

In [10]:
## Convert SibSp into binary feature
for dataset in total:
    dataset.loc[dataset["SibSp"]==0, "SibSp"]=0
    dataset.loc[dataset["SibSp"]!=0, "SibSp"]=1

## Convert Parch into binary feature
for dataset in total:
    dataset.loc[dataset["Parch"]==0, "Parch"]=0
    dataset.loc[dataset["Parch"]!=0, "Parch"]=1

In [11]:
# **Convert categorical features to numeric**
## Scikit learn estimators require numeric features
sex = {'female':0,'male':1}
embarked = {'C':0,'Q':1,'S':2}  

In [12]:
## Convert categorical features to numeric using mapping function
for dataset in total:
    dataset['Sex'] = dataset['Sex'].map(sex)
    dataset['Embarked'] = dataset['Embarked'].map(embarked)

In [13]:
## Seperate input features from target feature
x = train.drop("Survived", axis=1)
y = train["Survived"]
## Split the data into training and validation sets
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.25,random_state=1)

In [14]:
# Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=1)
## Run 10 fold cross validation
cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)
## Show cross validation score mean and std
print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

[0.79329609 0.7752809  0.83146067 0.80337079 0.84269663]
Accuracy: 0.8092 (+/- 0.0494)


In [15]:
## Fit the model with data
clf.fit(x_train, y_train)
## Accuracy
acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))
# We split the data into 75% training and 25% validation sets, and fitted the model. Based on this split, classifier accuracy of the model turned out to be 0.9027. Thus, we may conclude that 90.27% of validation set tuples were correctly classified by this model.
## Predict y given validation set
predictions = clf.predict(x_test)
## Take a look at the confusion matrix ([TN,FN],[FP,TP])
confusion_matrix(y_test,predictions)
## Precision
print("Precision: %0.4f" % precision_score(y_test, predictions))
# The precision is 0.8310. Thus, we may conclude that 83.10% of tuples that the classifier labeled as positive are actually positive by this model.
## Recall score
print("Recall: %0.4f" % recall_score(y_test, predictions))
# The recall is 0.6211. Thus, we may conclude that 62.11% of real positive tuples were classified by the decision tree classifier.
## Print classification report
print(classification_report(y_test, predictions))
## Get data to plot ROC Curve
fp, tp, th = roc_curve(y_test, predictions)
roc_auc = auc(fp, tp)

Accuracy: 0.9027
Precision: 0.8243
Recall: 0.6421
              precision    recall  f1-score   support

           0       0.77      0.90      0.83       128
           1       0.82      0.64      0.72        95

    accuracy                           0.79       223
   macro avg       0.80      0.77      0.78       223
weighted avg       0.79      0.79      0.78       223



In [16]:
clf = DecisionTreeClassifier(random_state=1)

cvs = cross_val_score(clf,x,y,cv=3)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

#acc_decision_tree = round(clf.score(x_train, y_train), 4)
#print("Accuracy: %0.4f" % (acc_decision_tree))




[0.77441077 0.77441077 0.79124579]
Accuracy: 0.7800 (+/- 0.0159)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [17]:

clf = DecisionTreeClassifier(random_state=1,criterion='entropy')

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))


[0.79329609 0.78651685 0.8258427  0.80898876 0.85393258]
Accuracy: 0.8137 (+/- 0.0485)
Accuracy: 0.9027


In [18]:
clf = DecisionTreeClassifier(random_state=1,max_depth=4)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))

clf_maxdepth4=clf

[0.75977654 0.7752809  0.82022472 0.79213483 0.83707865]
Accuracy: 0.7969 (+/- 0.0568)
Accuracy: 0.8473


In [19]:

clf = DecisionTreeClassifier(random_state=1,max_depth=7)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))



[0.79888268 0.80337079 0.85393258 0.80898876 0.81460674]
Accuracy: 0.8160 (+/- 0.0394)
Accuracy: 0.8713


In [20]:

clf = DecisionTreeClassifier(random_state=1,max_depth=10)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))



[0.80446927 0.7752809  0.8258427  0.80898876 0.84269663]
Accuracy: 0.8115 (+/- 0.0451)
Accuracy: 0.8997


In [21]:

clf = DecisionTreeClassifier(random_state=1,max_leaf_nodes=10)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))

clf_nodes_10=clf

[0.79888268 0.80337079 0.82022472 0.79213483 0.85955056]
Accuracy: 0.8148 (+/- 0.0484)
Accuracy: 0.8503


In [22]:

clf = DecisionTreeClassifier(random_state=1,max_leaf_nodes=16)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))




[0.77094972 0.78089888 0.82022472 0.79775281 0.83707865]
Accuracy: 0.8014 (+/- 0.0489)
Accuracy: 0.8563


In [23]:

clf = DecisionTreeClassifier(random_state=1,max_leaf_nodes=23)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))




[0.77653631 0.78089888 0.82022472 0.80337079 0.85955056]
Accuracy: 0.8081 (+/- 0.0604)
Accuracy: 0.8653


In [24]:

clf = DecisionTreeClassifier(random_state=1,max_leaf_nodes=33)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))


clf_nodes_33=clf

[0.7877095  0.7752809  0.80337079 0.82022472 0.85393258]
Accuracy: 0.8081 (+/- 0.0549)
Accuracy: 0.8743


In [25]:

clf = DecisionTreeClassifier(random_state=1,max_leaf_nodes=50)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))


[0.79888268 0.76966292 0.83707865 0.83707865 0.85955056]
Accuracy: 0.8205 (+/- 0.0640)
Accuracy: 0.8892


In [26]:

clf = DecisionTreeClassifier(random_state=1,max_leaf_nodes=100)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))




[0.79888268 0.7752809  0.84831461 0.81460674 0.84269663]
Accuracy: 0.8160 (+/- 0.0545)
Accuracy: 0.9012


In [27]:

clf = DecisionTreeClassifier(random_state=1,splitter='random',max_leaf_nodes=33)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))


[0.79888268 0.78089888 0.81460674 0.82022472 0.86516854]
Accuracy: 0.8160 (+/- 0.0563)
Accuracy: 0.8698


In [28]:

clf = DecisionTreeClassifier(random_state=1,min_samples_leaf=3)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))

clf_sample_3=clf

[0.78212291 0.78089888 0.83146067 0.8258427  0.8258427 ]
Accuracy: 0.8092 (+/- 0.0455)
Accuracy: 0.8668


In [29]:

clf = DecisionTreeClassifier(random_state=1,min_samples_leaf=2)

cvs = cross_val_score(clf,x,y,cv=5)
print(cvs)

print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std()*2))

clf.fit(x_train, y_train)

acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))



[0.81564246 0.78089888 0.84831461 0.82022472 0.79775281]
Accuracy: 0.8126 (+/- 0.0453)
Accuracy: 0.8802


In [30]:
# Constructing and evaluating the decision tree
import numpy as np
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import tree
import sklearn

# Basic statistics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Vizualization of Decision Tree Classifier
from sklearn.externals.six import StringIO
import pydotplus

In [31]:
def graph_export(clf, data, file):
    dot_data = StringIO()
    sklearn.tree.export_graphviz(
        decision_tree=clf,
        out_file=dot_data,
        #feature_names=['Embarked','Sex','Age','Parch','SibSp','Fare','Pclass'],
        feature_names=['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'],
        class_names=['0','1'],
        filled=True,
        rounded=True,
        impurity=True
    )
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(file)

In [None]:
#total=train.append(test)
graph_export(clf, train, "titanic_basic_model.pdf")

In [98]:
graph_export(clf_maxdepth4, train, "titanic_max_depth_4.pdf")

In [105]:
graph_export(clf_nodes_10, train, "titanic_notes_10.pdf")

In [104]:
graph_export(clf_nodes_33, train, "titanic_nodes_33.pdf")

In [110]:
graph_export(clf_sample_3, train, "titanic_min_sample_3.pdf")