In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler

# Dataset

In [2]:
df = pd.read_csv('./database/csv_files/Csv_titanic.csv', usecols=['Survived','Pclass','Sex','Age','Fare'] )
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [3]:
df.shape

(891, 5)

# Null Value Imputation

In [4]:
df.isnull().sum().sum()

177

In [5]:
df.isnull().mean()*100

Survived     0.00000
Pclass       0.00000
Sex          0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [6]:
df.Sex.unique()

array(['male', 'female'], dtype=object)

In [7]:
male_age = df[df.loc[:,'Sex'] == 'male']
male_age = male_age.fillna(male_age.Age.median())

In [8]:
female_age = df[df.loc[:,'Sex'] == 'female']
female_age = female_age.fillna(female_age.Age.median())

In [9]:
df.update(male_age)
df.update(female_age)

In [10]:
df.isnull().sum().sum()

0

In [11]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0.0,3.0,male,22.0,7.2500
1,1.0,1.0,female,38.0,71.2833
2,1.0,3.0,female,26.0,7.9250
3,1.0,1.0,female,35.0,53.1000
4,0.0,3.0,male,35.0,8.0500
...,...,...,...,...,...
886,0.0,2.0,male,27.0,13.0000
887,1.0,1.0,female,19.0,30.0000
888,0.0,3.0,female,27.0,23.4500
889,1.0,1.0,male,26.0,30.0000


# Creating Dummy Cols

In [12]:
df2 = pd.get_dummies(df,drop_first=True)
df2

Unnamed: 0,Survived,Pclass,Age,Fare,Sex_male
0,0.0,3.0,22.0,7.2500,1
1,1.0,1.0,38.0,71.2833,0
2,1.0,3.0,26.0,7.9250,0
3,1.0,1.0,35.0,53.1000,0
4,0.0,3.0,35.0,8.0500,1
...,...,...,...,...,...
886,0.0,2.0,27.0,13.0000,1
887,1.0,1.0,19.0,30.0000,0
888,0.0,3.0,27.0,23.4500,0
889,1.0,1.0,26.0,30.0000,1


# Normalization

In [13]:
normalizer = MinMaxScaler()

In [14]:
df3 = pd.DataFrame(normalizer.fit_transform(df2), columns=df2.columns)
df3

Unnamed: 0,Survived,Pclass,Age,Fare,Sex_male
0,0.0,1.0,0.271174,0.014151,1.0
1,1.0,0.0,0.472229,0.139136,0.0
2,1.0,1.0,0.321438,0.015469,0.0
3,1.0,0.0,0.434531,0.103644,0.0
4,0.0,1.0,0.434531,0.015713,1.0
...,...,...,...,...,...
886,0.0,0.5,0.334004,0.025374,1.0
887,1.0,0.0,0.233476,0.058556,0.0
888,0.0,1.0,0.334004,0.045771,0.0
889,1.0,0.0,0.321438,0.058556,1.0


# train_test_split

In [15]:
x = df3.drop('Survived',axis = 1)
y = df3.Survived

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8)

# Descision Tree Classifier

In [17]:
model_tree = DecisionTreeClassifier()
model_tree.fit(x_train,y_train)
model_tree.score(x_test,y_test)

0.8100558659217877