## Binarization
Binarization converts a continuous (numeric) value into a binary one: values greater than a chosen threshold become 1, and all other values become 0.

In [1]:
#@ Importing the required libraries
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import Binarizer
from sklearn.tree import DecisionTreeClassifier

In [2]:
#@ Loading the dataset
# Read the CSV, keep only the five columns used in this notebook, and discard
# every row that has a missing value in any of them. Written as one chained
# expression so `data` is bound exactly once.
data = (
    pd.read_csv('titanic.csv')
    .loc[:, ['Age', 'Fare', 'SibSp', 'Parch', 'Survived']]
    .dropna()
)
data.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,22.0,7.25,1,0,0
1,38.0,71.2833,1,0,1
2,26.0,7.925,0,0,1
3,35.0,53.1,1,0,1
4,35.0,8.05,0,0,0


In [3]:
#@ Here, we will combine SibSp and Parch and make 'Family' column,
#@ then drop SibSp and Parch because 'Family' replaces them.
# Both steps are one chained expression that returns a new frame instead of
# editing `data` in place. The intermediate `data.head()` was removed: only the
# last expression of a cell is displayed, so it never produced any output.
# NOTE: this cell consumes the SibSp/Parch columns, so re-run the loading cell
# before running this cell a second time.
data = (
    data
    .assign(Family = data['SibSp'] + data['Parch'])
    .drop(columns = ['SibSp', 'Parch'])
)
data.head()

Unnamed: 0,Age,Fare,Survived,Family
0,22.0,7.25,0,1
1,38.0,71.2833,1,1
2,26.0,7.925,1,0
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [4]:
#@ train/test/split
# 'Survived' is the target; every other column of `data` is a feature.
y = data['Survived']
X = data.drop(columns = 'Survived')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)
X_train.head()

Unnamed: 0,Age,Fare,Family
328,31.0,20.525,2
73,26.0,14.4542,1
253,30.0,16.1,1
719,33.0,7.775,0
666,25.0,13.0,0


In [5]:
#@ Without binarization (baseline on the raw features)
# random_state is fixed because a decision tree permutes the features at every
# split; without a seed the accuracy can change between runs, which would make
# small differences against the binarized model meaningless.
clf = DecisionTreeClassifier(random_state = 42)   # creating an object
clf.fit(X_train, y_train)                         # training
y_pred = clf.predict(X_test)                      # predicting
acc = accuracy_score(y_test, y_pred)
print(f'The accuracy of the model without using binarization is {acc}')

The accuracy of the model without using binarization is 0.6223776223776224


In [6]:
#@ cross validation: mean accuracy over 10 folds on the raw features
# The classifier is seeded so the reported mean is reproducible across runs.
np.mean(cross_val_score(DecisionTreeClassifier(random_state = 42), X, y, cv = 10, scoring = 'accuracy'))

0.6499217527386542

    Here, we will binarize the 'Family' column into two categories:
        0 - the passenger is travelling alone (Family == 0)
        1 - the passenger is travelling with family (Family > 0)

In [7]:
#@ Applying Binarization
# Binarizer maps values greater than `threshold` to 1 and all other values to 0,
# so Family > 0 (travelling with family) becomes 1 and Family == 0 (alone) becomes 0.
# The default copying behaviour is used (no `copy = False`): in-place
# binarization saves nothing here and in-place edits are risky in a notebook.
# remainder = 'passthrough' keeps the columns that are not listed, unchanged.
trf = ColumnTransformer([
    ('bin', Binarizer(threshold = 0.0), ['Family'])
], remainder = 'passthrough')

In [8]:
# Fit the transformer on the training split, then reuse the fitted transformer
# on the test split.
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

# Preview the transformed training data. The labels follow the transformer's
# output order: the binarized 'Family' column first, then the passthrough columns.
pd.DataFrame(
    data = X_train_trf,
    columns = ['Family', 'Age', 'Fare'],
)

Unnamed: 0,Family,Age,Fare
0,1.0,31.0,20.5250
1,1.0,26.0,14.4542
2,1.0,30.0,16.1000
3,0.0,33.0,7.7750
4,0.0,25.0,13.0000
...,...,...,...
566,1.0,46.0,61.1750
567,0.0,25.0,13.0000
568,0.0,41.0,134.5000
569,1.0,33.0,20.5250


In [9]:
#@ With binarization: train and score a decision tree on the transformed features
# random_state is fixed because a decision tree permutes the features at every
# split; without a seed this accuracy can change from run to run.
clf = DecisionTreeClassifier(random_state = 42)
clf.fit(X_train_trf, y_train)
y_pred2 = clf.predict(X_test_trf)
accuracy_score(y_test, y_pred2)

0.6153846153846154

In [10]:
#@ cross validation with binarization: mean accuracy over 10 folds
# Binarizer learns nothing from the data (it only applies a fixed threshold), so
# transforming the full X before it is split into folds does not leak
# information between folds.
# NOTE: this re-fits `trf` on the full X; it was previously fitted on X_train.
X_trf = trf.fit_transform(X)
# The classifier is seeded so the reported mean is reproducible across runs.
np.mean(cross_val_score(DecisionTreeClassifier(random_state = 42), X_trf, y, cv = 10, scoring = 'accuracy'))

0.6289906103286385