# Predicting Survival in the Titanic Data Set
We will be using a decision tree to make predictions about the Titanic data
set from Kaggle. This data set provides information on the Titanic
passengers and can be used to predict whether a passenger survived or
not.


You use only Pclass, Sex, Age, SibSp (Siblings aboard), Parch
(Parents/children aboard), and Fare to predict whether a passenger
survived.

In [42]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [43]:
Url="https://raw.githubusercontent.com/krishnaik06/Feature-Engineering-Live-sessions/master/titanic.csv"
titanic = pd.read_csv(Url, error_bad_lines=False)
titanic.columns = ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','E mbarked']
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,E mbarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
titanic.drop(columns=['PassengerId','Name','Ticket','Cabin','E mbarked'], inplace=True)

In [45]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [46]:
titanic.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
886,0,2,male,27.0,0,0,13.0
887,1,1,female,19.0,0,0,30.0
888,0,3,female,,1,2,23.45
889,1,1,male,26.0,0,0,30.0
890,0,3,male,32.0,0,0,7.75


In [47]:
titanic.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

In [48]:
titanic.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [49]:
titanic.Age.fillna(titanic.Age.mean(), inplace=True)

In [50]:
titanic.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [54]:
df = pd.get_dummies(titanic, drop_first=True)
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,0,3,22.0,1,0,7.25,1
1,1,1,38.0,1,0,71.2833,0
2,1,3,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,0,3,35.0,0,0,8.05,1


In [55]:
X = df.drop(columns=['Survived'])
Y = df.Survived

In [56]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,3,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,0
2,3,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,0
4,3,35.0,0,0,8.05,1


In [57]:
Y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [58]:
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.30,random_state=400)

In [59]:
from sklearn.tree import DecisionTreeClassifier

In [60]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)

In [61]:
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.30,random_state=400)

In [63]:
dcModel = DecisionTreeClassifier()
dcModel.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [64]:
dcModel.score(x_test,y_test)

0.7873134328358209

In [68]:
grids = {
    'criterion': ['gini', 'entropy'],
    'max_features': range(2,4), 
    'max_leaf_nodes': range(0,3), 
    'min_impurity_split':range(0,1), 
    'min_samples_split':range(2,4),
    'min_weight_fraction_leaf':range(0,1),
    'random_state':range(300,350,2),
    'splitter':['best', 'random']
}

In [69]:
g_search = GridSearchCV(estimator=dcModel, param_grid=grids, n_jobs=-1, cv=10)g

In [70]:
g_search.fit(x_train,y_train)



GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='d...obs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_fe

In [71]:
g_search.best_params_

{'criterion': 'gini',
 'max_features': 2,
 'max_leaf_nodes': 2,
 'min_impurity_split': 0,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0,
 'random_state': 300,
 'splitter': 'random'}

In [72]:
dcModelWithGridSearch = DecisionTreeClassifier(criterion='gini',
 max_features=2,
 max_leaf_nodes=2,
 min_impurity_split= 0,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0,
 random_state= 300,
 splitter='random')
dcModelWithGridSearch.fit(x_train,y_train)



DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=2,
                       min_impurity_decrease=0.0, min_impurity_split=0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0, presort='deprecated',
                       random_state=300, splitter='random')

In [73]:
dcModelWithGridSearch.score(x_test,y_test)

0.8470149253731343

In [83]:
df.Survived.unique()

array([0, 1], dtype=int64)

In [81]:
dcModelWithGridSearch.predict([[3,34,6,5,30.50,1]])

array([0], dtype=int64)