In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import KBinsDiscretizer, Binarizer
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Location of the Titanic CSV and the columns this analysis keeps.
# NOTE(review): the path is an absolute local path; it must exist on the machine
# running the notebook.
csv_path = 'E:/Python Programs/titanic.csv'
selected_columns = ['Age', 'Fare', 'SibSp', 'Parch', 'Survived']

# Read the full file, then keep only the selected columns (in the listed order).
df = pd.read_csv(csv_path)[selected_columns]

In [3]:
# Discard every row that has a missing value in any of the selected columns.
df = df.dropna()

In [4]:
# Peek at the first five rows once rows with missing values are removed.
df.head(n=5)

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,34.5,7.8292,0,0,0
1,47.0,7.0,1,0,1
2,62.0,9.6875,0,0,0
3,27.0,8.6625,0,0,0
4,22.0,12.2875,1,1,1


In [5]:
# Combine the sibling/spouse count (SibSp) and the parent/child count (Parch)
# into a single Family column holding their row-wise sum.
siblings_spouses = df['SibSp']
parents_children = df['Parch']

df['Family'] = siblings_spouses + parents_children

In [6]:
# Check that the new Family column is present and equals SibSp + Parch.
df.head(n=5)

Unnamed: 0,Age,Fare,SibSp,Parch,Survived,Family
0,34.5,7.8292,0,0,0,0
1,47.0,7.0,1,0,1,1
2,62.0,9.6875,0,0,0,0
3,27.0,8.6625,0,0,0,0
4,22.0,12.2875,1,1,1,2


In [7]:
# SibSp and Parch are now summarised by Family, so remove the two source columns.
redundant_columns = ['SibSp', 'Parch']
df = df.drop(columns=redundant_columns)

In [8]:
# Check that only Age, Fare, Survived and Family remain.
df.head(n=5)

Unnamed: 0,Age,Fare,Survived,Family
0,34.5,7.8292,0,0
1,47.0,7.0,1,1
2,62.0,9.6875,0,0
3,27.0,8.6625,0,0
4,22.0,12.2875,1,2


In [10]:
# Separate the label from the predictors: y is the Survived column,
# X is every remaining column of df.
target_column = 'Survived'

y = df[target_column]
X = df.drop(columns=[target_column])

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each split as a sanity check.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((265, 3), (265,), (67, 3), (67,))

## Without Binarization

In [12]:
# Baseline: a decision tree trained on the untransformed features.
# random_state is fixed so the fitted tree, and therefore the accuracy shown
# below, is the same on every run; an unseeded tree may choose differently
# between equally good splits from one run to the next.
dt1 = DecisionTreeClassifier(random_state=42)

dt1.fit(X_train, y_train)

y_pred_dt1 = dt1.predict(X_test)

# Accuracy on the held-out test split.
accuracy_score(y_test, y_pred_dt1)

0.582089552238806

In [13]:
# Mean 10-fold cross-validated accuracy of the baseline tree on the raw features.
# The estimator is seeded so the score is reproducible between runs.
np.mean(
    cross_val_score(
        DecisionTreeClassifier(random_state=42),
        X,
        y,
        cv=10,
        scoring='accuracy',
    )
)

0.5964349376114083

##### Next we binarize the Family column. A Family value of 0 means that no family members are travelling with the person;
##### a Family value greater than 0 means that at least one family member is travelling with the person.
##### After binarization the column is therefore 0 if the person is travelling alone
##### and 1 if the person is not travelling alone.

## Applying Binarization

In [14]:
# Binarize only the Family column: values above the threshold (0) become 1,
# everything else becomes 0. All other columns are passed through unchanged.
# The Binarizer uses its default copy behaviour rather than copy=False:
# in-place binarization only saves a copy for NumPy/CSR input, and with a
# DataFrame it merely risks writing into (or failing on) a shared buffer.
trf = ColumnTransformer(
    [
        ('bin', Binarizer(threshold=0.0), ['Family']),
    ],
    remainder='passthrough',
)

In [15]:
# Fit the transformer on the training split only, then apply the fitted
# transformer to both the training and the test split.
trf.fit(X_train)

X_train_trf = trf.transform(X_train)
X_test_trf = trf.transform(X_test)

In [16]:
# The transformer output is a plain array: the binarized Family column comes
# first, followed by the passthrough columns (Age, Fare). Re-attach labels in
# that order to inspect the result.
transformed_columns = ['Family', 'Age', 'Fare']

pd.DataFrame(X_train_trf, columns=transformed_columns)

Unnamed: 0,Family,Age,Fare
0,1.0,9.0,15.2458
1,1.0,76.0,78.8500
2,1.0,19.0,15.7417
3,1.0,27.0,7.9250
4,1.0,22.0,61.9792
...,...,...,...
260,1.0,64.0,75.2500
261,0.0,27.0,7.8792
262,0.0,43.0,7.8958
263,1.0,58.0,512.3292


In [17]:
# Same model as the baseline, but trained on the features with Family binarized.
# random_state is fixed so the accuracy below is reproducible and any difference
# from the baseline comes from the transformation rather than from run-to-run
# randomness in the tree.
dt2 = DecisionTreeClassifier(random_state=42)

dt2.fit(X_train_trf, y_train)

y_pred_dt2 = dt2.predict(X_test_trf)

# Accuracy on the held-out test split.
accuracy_score(y_test, y_pred_dt2)

0.582089552238806

In [18]:
# Transform the full feature matrix before cross-validating.
# NOTE(review): fitting a transformer on all rows before CV would leak
# information for a transformer that learns from data; Binarizer is documented
# as stateless, so that should not apply here — confirm if the transformer changes.
X_trf = trf.fit_transform(X)

# Mean 10-fold cross-validated accuracy with Family binarized.
# The estimator is seeded so the score is reproducible between runs.
np.mean(
    cross_val_score(
        DecisionTreeClassifier(random_state=42),
        X_trf,
        y,
        cv=10,
        scoring='accuracy',
    )
)

0.5963458110516935