In [1]:
import numpy as np 
import pandas as pd 
import matplotlib as mpl 
import matplotlib.pyplot as plt 

In [2]:
# get titanic and test csv files as a DataFrame 
train = pd.read_csv("./data/raw/titanic/train.csv")
print(train.shape)

(891, 12)


In [3]:
# checking for missing data 
NAs = pd.concat([train.isnull().sum()], axis=1, keys=["Train"])
NAs[NAs.sum(axis=1) > 0]

Unnamed: 0,Train
Age,177
Cabin,687
Embarked,2


Remove "Cabin", "Name" and "Ticket" columns as they require some pre processing 

In [4]:
# At this point we will drop the Cabin Feature since it is missing a lot of Data 
train.pop("Cabin")
train.pop("Name")
train.pop("Ticket")
train.shape

(891, 9)

In [5]:
# Filling missing age values with ,ost common value 
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])

In [6]:
# Pclass is a categorical feature so we convert its value to strings 
train["Pclass"] = train["Pclass"].apply("str")

# Basic one hot encoding of categorical features 
# Getting Dummies from all other categorical vars 
for col in train.dtypes[train.dtypes=="object"].index:
    for_dummy = train.pop(col)
    train = pd.concat([train, pd.get_dummies(for_dummy, prefix=col)], axis=1)
    
train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,"Pclass_0 3 1 1 2 3 3 1 4 3 5 3 6 1 7 3 8 3 9 2 10 3 11 1 12 3 13 3 14 3 15 2 16 3 17 2 18 3 19 3 20 2 21 2 22 3 23 1 24 3 25 3 26 3 27 1 28 3 29 3  .. 861 2 862 1 863 3 864 2 865 2 866 2 867 1 868 3 869 3 870 3 871 1 872 1 873 3 874 2 875 3 876 3 877 3 878 3 879 1 880 2 881 3 882 3 883 2 884 3 885 3 886 2 887 1 888 3 889 1 890 3 Name: Pclass, Length: 891, dtype: int64",Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,22.0,1,0,7.25,1,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,1,0,1,0,0
2,3,1,26.0,0,0,7.925,1,1,0,0,0,1
3,4,1,35.0,1,0,53.1,1,1,0,0,0,1
4,5,0,35.0,0,0,8.05,1,0,1,0,0,1


In [7]:
# prepare data for training models 
labe = train.pop("Survived")

In [8]:
# Split the data into training and testing sets 75% train and 25% for test 
from sklearn.model_selection import train_test_split 

(x_train, x_test, y_train, y_test) = train_test_split(train, labe, test_size=0.25)

In [12]:
x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

In [13]:
# Train the decision tree model 
from sklearn.tree import DecisionTreeClassifier 
dt = DecisionTreeClassifier() 

# DecisionTreeClassifier(class_weight=None, citerion="gini", max_depth=None, 
#                       max_features=None, max_leaf_nodes=None, 
#                       min_impurity_split=1e-7, min_samples_leaf=1, 
#                       min_samples_split=2, min_weight_fraction_leaf=0.0, 
#                       presort=False, random_state=None, splitter="best")


dt.fit(x_train, y_train) 

y_pred = dt.predict(x_test)

In [14]:
# evaluate the model
from sklearn.metrics import accuracy_score 
print("accuracy", accuracy_score(y_test, y_pred))

accuracy 0.7488789237668162


We will use ROC (Receptor Operator Curve) and AUC (Area undeer Curve) as the evaluation metric. Our output is a binary variable and it is a good way to evaluate binary variables 

In [17]:
# 
from sklearn.metrics import roc_curve, auc 

false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

0.7464487489911219


### Max Depth 

In [None]:
max_depths = np.linspace(1, 32, 32, endpoint=True)

train_results = []
test_results = [] 
for max_depth in max_depths:
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(x_train, y_train)
    
    train_pred = dt.predict(x_test)
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, train_pred)
    roc_acc = auc(false_positive_rate, true_posi)