## Decision Tree

In [1]:
# import all packages visualization and other operations

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the dataset using pandas

df = pd.read_csv("titanic.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Use only Pclass, Sex, Age, SibSp (Siblings onboard), Parch (Parents/Children aboard), and Fare to predict a passenger survival

In [4]:
new_df = df.drop(["PassengerId","Name","Ticket","Cabin","Embarked"],axis=1)

In [5]:
new_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [6]:
new_df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

In [7]:
new_df["Age"].fillna(new_df['Age'].mean(),inplace=True)

##### The Sex column is in Categorical format so we must change that into a format which will make the machine understand the features for this we will be using something called as Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
LE = LabelEncoder()

In [10]:
new_df["Sex"]=LE.fit_transform(new_df["Sex"])

In [11]:
new_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


1 - Male | 0 - Female

##### Lets build the model and train it for prediction
###### We will use decision tree for the prediction

In [12]:
# We will first seperate the Independent and Dependent features

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = new_df.drop(columns="Survived")

In [15]:
y = new_df["Survived"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=110)

In [17]:
# Build the model 

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

In [18]:
dt.fit(X_train,y_train)

DecisionTreeClassifier()

In [19]:
prediction = dt.predict(X_test)

In [20]:
dt.score(X_test,y_test)

0.75

From the score we are seeing around 76% accuracy of the model which is being trained. We see further if we can perform Hyperparameter tuning and increase the accuracy.

In [21]:
grid_param = {
    "criterion":['gini',"entrophy"],
    "max_depth":range(2,50,1),
     "min_samples_split":range(2,50,1),
    "min_samples_leaf":range(4,50,1),
    "splitter":["random","best"]
}

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
gs = GridSearchCV( estimator=dt,
    param_grid=grid_param,cv=5,n_jobs=-1)

In [24]:
gs.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entrophy'],
                         'max_depth': range(2, 50),
                         'min_samples_leaf': range(4, 50),
                         'min_samples_split': range(2, 50),
                         'splitter': ['random', 'best']})

In [25]:
 gs.best_estimator_

DecisionTreeClassifier(max_depth=9, min_samples_leaf=6, min_samples_split=4)

In [45]:
DT = DecisionTreeClassifier(criterion='gini',max_depth=9,min_samples_split=4,min_samples_leaf=6)

In [46]:
DT.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=9, min_samples_leaf=6, min_samples_split=4)

In [47]:
DT.score(X_test,y_test)

0.8208955223880597

In [56]:
prediction = DT.predict([[3,1,35.0,0,0,8.0500]])


In [57]:
if prediction == 0:
    print("Survived")
else:
    print("Not Survived")

Survived
