# Prediction of Survival Using Decision Tree
This notebook is working on a dataset consisting of details of the passengers of Titanic ship. Decision Tree algorithm is used to see whether a person is survived or not in the accident based on other features of the person.

In [417]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler

In [418]:
# Load data
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preprocessing Data

In [419]:
# Filter data (we are using selected columns only)
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare']]
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [420]:
# Check the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   Fare      891 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [421]:
# Check the null count
null_count = df.isnull().sum()
null_count[null_count>0]

Age    177
dtype: int64

In [422]:
# For simplicity, removing the rows consisting of null values
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  714 non-null    int64  
 1   Pclass    714 non-null    int64  
 2   Sex       714 non-null    object 
 3   Age       714 non-null    float64
 4   Fare      714 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 33.5+ KB


In [423]:
# One hot encoding of "Sex" feature
oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
oh_output = pd.DataFrame(oh_encoder.fit_transform(df[['Sex']]), index=df.index)
oh_output.head()

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0


In [424]:
# Concatenated the one hot encoding columns to the data frame and removedc categorical features
df = pd.concat([df, oh_output], axis=1)
df.drop(columns=['Sex'], inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,0,1
0,0,3,22.0,7.25,0.0,1.0
1,1,1,38.0,71.2833,1.0,0.0
2,1,3,26.0,7.925,1.0,0.0
3,1,1,35.0,53.1,1.0,0.0
4,0,3,35.0,8.05,0.0,1.0


In [425]:
# Check the data types of column labels
df.columns

Index(['Survived', 'Pclass', 'Age', 'Fare', 0, 1], dtype='object')

In [426]:
# Make every column labels data type to string
df.columns = df.columns.astype('str')
df.columns

Index(['Survived', 'Pclass', 'Age', 'Fare', '0', '1'], dtype='object')

In [427]:
target = df['Survived']
input = df.drop(columns='Survived')
input.head()

Unnamed: 0,Pclass,Age,Fare,0,1
0,3,22.0,7.25,0.0,1.0
1,1,38.0,71.2833,1.0,0.0
2,3,26.0,7.925,1.0,0.0
3,1,35.0,53.1,1.0,0.0
4,3,35.0,8.05,0.0,1.0


In [428]:
target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [429]:
# Split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.20, random_state=42)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 571 entries, 328 to 130
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  571 non-null    int64  
 1   Age     571 non-null    float64
 2   Fare    571 non-null    float64
 3   0       571 non-null    float64
 4   1       571 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 26.8 KB


In [430]:
# Make the model and fit to the training dataset
dtc_model = DecisionTreeClassifier()
dtc_model.fit(X_train, y_train)

## Model Score


In [431]:
# Get the model score (accuracy)
dtc_model.score(X_test, y_test)

0.7272727272727273