# Titanic data

In [1]:
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the passenger table from the CSV located next to this notebook.
df = pd.read_csv(filepath_or_buffer='./titanic.csv')

# Preview the first two records.
df.head(n=2)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [6]:
# Count the missing values in each column.
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [7]:
# Remove the passenger name together with the sparsely populated columns
# (cabin, boat, body, home.dest); `df` is rebound to the reduced frame.
df = df.drop(
    columns=['name', 'home.dest', 'cabin', 'boat', 'body'],
)

In [8]:
# Preview the first two rows to confirm which columns remain.
df.head(2)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,embarked
0,1,1,female,29.0,0,0,24160,211.3375,S
1,1,1,male,0.9167,1,2,113781,151.55,S


In [9]:
# Relabel the `sex` column as `gender`; `df` is rebound to the renamed frame.
df = df.rename(columns={"sex": 'gender'})

In [10]:
# Preview the first five rows; the former `sex` column now appears as `gender`.
df.head()

Unnamed: 0,pclass,survived,gender,age,sibsp,parch,ticket,fare,embarked
0,1,1,female,29.0,0,0,24160,211.3375,S
1,1,1,male,0.9167,1,2,113781,151.55,S
2,1,0,female,2.0,1,2,113781,151.55,S
3,1,0,male,30.0,1,2,113781,151.55,S
4,1,0,female,25.0,1,2,113781,151.55,S


In [19]:
# Re-count the missing values in each column.
# NOTE(review): this cell's execution count (19) is later than the imputation
# cells that follow it in the file (12, 16, 18), so the all-zero output shown
# here was produced after those fills. Run top-to-bottom, this cell executes
# before them.
df.isnull().sum()

pclass      0
survived    0
gender      0
age         0
sibsp       0
parch       0
ticket      0
fare        0
embarked    0
dtype: int64

In [12]:
# Fill missing ages with the mean of the observed ages.
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(value=age_mean)

In [16]:
# Fill missing fares with the most frequent fare (first entry of the mode).
fare_mode = df['fare'].mode().iloc[0]
df['fare'] = df['fare'].fillna(value=fare_mode)

In [18]:
# Fill missing embarkation values with the most frequent one (first entry of the mode).
embarked_mode = df['embarked'].mode().iloc[0]
df['embarked'] = df['embarked'].fillna(value=embarked_mode)

In [20]:
from sklearn.preprocessing import LabelEncoder

# Replace each listed column with integer codes. A fresh encoder is fitted per
# column and then discarded, so the code-to-label mapping is not kept.
for column_name in ('gender', 'ticket', 'embarked'):
    df[column_name] = LabelEncoder().fit_transform(df[column_name])

In [21]:
# Pairwise correlation matrix across the columns of the encoded frame.
# NOTE(review): `ticket` and `embarked` hold label-encoder codes at this point,
# so their coefficients reflect the code ordering — interpret with care.
df.corr()

Unnamed: 0,pclass,survived,gender,age,sibsp,parch,ticket,fare,embarked
pclass,1.0,-0.312469,0.124617,-0.36637,0.060832,0.018322,0.309695,-0.55874,0.185479
survived,-0.312469,1.0,-0.528693,-0.050199,-0.027825,0.08266,-0.125869,0.244479,-0.175313
gender,0.124617,-0.528693,1.0,0.057398,-0.109609,-0.213125,0.024725,-0.185744,0.09796
age,-0.36637,-0.050199,0.057398,1.0,-0.190747,-0.130872,-0.08508,0.170618,-0.071181
sibsp,0.060832,-0.027825,-0.109609,-0.190747,1.0,0.373587,0.063921,0.160388,0.065567
parch,0.018322,0.08266,-0.213125,-0.130872,0.373587,1.0,0.053389,0.221668,0.044772
ticket,0.309695,-0.125869,0.024725,-0.08508,0.063921,0.053389,1.0,-0.014959,0.031453
fare,-0.55874,0.244479,-0.185744,0.170618,0.160388,0.221668,-0.014959,1.0,-0.238181
embarked,0.185479,-0.175313,0.09796,-0.071181,0.065567,0.044772,0.031453,-0.238181,1.0


In [22]:
# Feature matrix: the five predictor columns given to the classifier.
x = df.loc[:, ['pclass', 'gender', 'ticket', 'fare', 'embarked']]

# Target vector: the `survived` column.
y = df.loc[:, 'survived']

In [23]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training and the rest to the held-out split; the fixed
# seed makes the shuffle repeatable.
split_options = {'train_size': 0.8, 'random_state': 123456}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [32]:
from sklearn.tree import DecisionTreeClassifier

# create the model
# random_state pins the estimator's internal randomness so that re-running the
# notebook produces the same fitted tree and therefore the same metrics.
# NOTE(review): max_depth=2 appears to come from the grid-search result (this
# cell's execution count, 32, is later than the search cells 30/31) — confirm.
model = DecisionTreeClassifier(max_depth=2, random_state=123456)

# train the model on the training split only
model.fit(x_train, y_train)

In [30]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to evaluate: 1 through 9 inclusive.
parameters = {
    'max_depth': range(1,10)
}

# Cross-validated search over `parameters`, using `model` as the estimator.
# The search is fitted on the training split only: tuning on x_test/y_test
# would leak the held-out data into model selection and make the test metrics
# reported afterwards optimistic.
grid_search_dtc = GridSearchCV(model,parameters)
grid_search_dtc.fit(x_train,y_train)

In [31]:
# Show the parameter combination that scored best in the grid search.
grid_search_dtc.best_params_

{'max_depth': 2}

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the held-out split with the fitted tree.
y_pred = model.predict(x_test)

# Apply every metric to the same (truth, prediction) pair, in reporting order.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f"accuracy = {accuracy:.2f}, precision = {precision:.2f}, recall = {recall:.2f}, f1 = {f1:.2f}")

accuracy = 0.73, precision = 0.64, recall = 0.62, f1 = 0.63


In [34]:
# Predict on the training split; this overwrites `y_pred` and the metric
# variables with training-set values.
y_pred = model.predict(x_train)

# Compute the four metrics in reporting order against the training labels.
accuracy, precision, recall, f1 = [
    score_fn(y_train, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]
print(f"accuracy = {accuracy:.2f}, precision = {precision:.2f}, recall = {recall:.2f}, f1 = {f1:.2f}")

accuracy = 0.79, precision = 0.75, recall = 0.69, f1 = 0.72
