In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV 
from sklearn import metrics
from sklearn.metrics import classification_report


In [2]:
url="https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
titanic = pd.read_csv(url)
titanic.columns = ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ti cket','Fare','Cabin','E mbarked']

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ti cket,Fare,Cabin,E mbarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic['Fare'].value_counts().loc[lambda x: x<30]

10.5000    24
7.9250     18
7.7750     16
26.5500    15
0.0000     15
           ..
8.4583      1
9.8375      1
8.3625      1
14.1083     1
17.4000     1
Name: Fare, Length: 243, dtype: int64

Data Description:
1. pclass: A proxy for socio-economic status (SES) -  1: upper, 2: middle, 3: lower
2. SibSp : Sibling spouse aboard
3. Parch : Parent Child Aboard

You use only Pclass, Sex, Age, SibSp (Siblings aboard), Parch (Parents/children aboard), and Fare to predict whether a passenger survived.


In [5]:
titanic.shape

(891, 12)

In [6]:
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ti cket          0
Fare             0
Cabin          687
E mbarked        2
dtype: int64

Age and Cabin which should be used for prediction contains lot of null values.
Its Better to avoid Cabin as 687 out of 891 is not available


In [7]:
titanic['Age'].value_counts().loc[lambda x: x>10]

24.0    30
22.0    27
18.0    26
19.0    25
30.0    25
28.0    25
21.0    24
25.0    23
36.0    22
29.0    20
32.0    18
27.0    18
35.0    18
26.0    18
16.0    17
31.0    17
20.0    15
33.0    15
23.0    15
34.0    15
39.0    14
17.0    13
42.0    13
40.0    13
45.0    12
38.0    11
Name: Age, dtype: int64

In [8]:
print(titanic['Age'].mean())
print(titanic['Age'].median())

29.69911764705882
28.0


In [9]:
titanic['Age'].fillna(titanic['Age'].median(),inplace=True)

In [10]:
titanic=titanic.drop('Cabin', 1).copy()

In [11]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ti cket,Fare,E mbarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [12]:
titanic['E mbarked'].value_counts()

S    644
C    168
Q     77
Name: E mbarked, dtype: int64

S has more values compared to other 2 parameters.So lets have the NA value as S

In [13]:
titanic['E mbarked'].fillna('S',inplace=True)

In [14]:
titanic.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ti cket        0
Fare           0
E mbarked      0
dtype: int64

In [15]:
titanic['Sex'] = titanic['Sex'].map(lambda i : 1 if i=='male' else 0)

In [16]:
X=titanic[['Pclass','Sex','Age','SibSp','Fare','Parch']]
X.isna().sum()

Pclass    0
Sex       0
Age       0
SibSp     0
Fare      0
Parch     0
dtype: int64

In [18]:
y=titanic['Survived']

In [19]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.30, random_state= 355)

In [20]:
#let's first visualize the tree on the data without doing any pre processing
clf = DecisionTreeClassifier()
clf.fit(x_train,y_train)

DecisionTreeClassifier()

In [21]:
clf.score(x_train,y_train)

0.9839486356340289

In [22]:
py_pred = clf.predict(x_test)
# accuracy of our classification tree
clf.score(x_test,y_test)

0.7350746268656716

In [23]:
# we are tuning three hyperparameters right now, we are passing the different values for both parameters
grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,32,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'splitter' : ['best', 'random']
    
}

In [24]:
grid_search = GridSearchCV(estimator=clf,
                     param_grid=grid_param,
                     cv=5,
                    n_jobs =-1)

In [25]:
grid_search.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 32),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'splitter': ['best', 'random']})

In [26]:
best_parameters = grid_search.best_params_
print(best_parameters)

{'criterion': 'gini', 'max_depth': 26, 'min_samples_leaf': 5, 'min_samples_split': 3, 'splitter': 'random'}


In [27]:
clf2 = DecisionTreeClassifier(criterion = 'gini', max_depth =26, min_samples_leaf= 5, min_samples_split= 3, splitter ='random')
clf2.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=26, min_samples_leaf=5, min_samples_split=3,
                       splitter='random')

In [28]:
clf2.score(x_train,y_train)

0.8154093097913323

In [29]:
py_pred = clf2.predict(x_test)
# accuracy of our classification tree
clf2.score(x_test,y_test)

0.8134328358208955