In [73]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score


from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer 

In [74]:
df= pd.read_csv('Titanic Train.csv',usecols=['Age','Fare','Survived'])

In [75]:
df.dropna(inplace=True)  ## drop the rows which have the missing values

In [76]:
df.shape

(714, 3)

In [77]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [78]:
df.isnull().sum()

Survived    0
Age         0
Fare        0
dtype: int64

In [79]:
X= df.iloc[:,1:]
y= df.iloc[:,0]

In [80]:
X_train,X_test, y_train, y_test  = train_test_split(X,y,test_size=0.2,random_state=42)

In [81]:
X_train.head()

Unnamed: 0,Age,Fare
328,31.0,20.525
73,26.0,14.4542
253,30.0,16.1
719,33.0,7.775
666,25.0,13.0


In [82]:
clf= DecisionTreeClassifier()

In [83]:
clf.fit(X_train,y_train)
y_pred= clf.predict(X_test)

In [84]:
accuracy_score(y_test,y_pred)  ## without appling any transformation

0.6363636363636364

In [85]:
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))

0.6330790297339592

In [86]:
## apply discritizer

kbin_age= KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='quantile')
kbin_fare= KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='quantile')

In [87]:
## made column transformer

trf= ColumnTransformer([
    ('first',kbin_age,[0]), ## 0 iscolumn of age in X train
    ('second',kbin_fare,[1])
])

In [88]:
X_train_trf= trf.fit_transform(X_train)
X_test_trf= trf.transform(X_test)

In [89]:
trf.named_transformers_

{'first': KBinsDiscretizer(encode='ordinal', n_bins=10),
 'second': KBinsDiscretizer(encode='ordinal', n_bins=10)}

In [90]:
trf.named_transformers_['first'].n_bins

10

In [91]:
# 10 bins is created

In [92]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42, 14.  , 19.  , 22.  , 25.  , 28.5 , 32.  , 36.  , 42.  ,
              50.  , 80.  ])                                                ],
      dtype=object)

In [93]:
output = pd.DataFrame({
    'age': X_train['Age'],
    'age_trf':X_train_trf[:,0],
    'fare':X_train['Fare'],
    'fare_trf':X_train_trf[:,1]
})

In [94]:
output['age_labels']= pd.cut(x=X_train['Age'],
                            bins=trf.named_transformers_['first'].bin_edges_[0].tolist())

output['fare_labels']= pd.cut(x=X_train['Fare'],
                            bins=trf.named_transformers_['second'].bin_edges_[0].tolist())


In [95]:
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
785,25.0,4.0,7.25,0.0,"(22.0, 25.0]","(0.0, 7.75]"
728,25.0,4.0,26.0,6.0,"(22.0, 25.0]","(15.75, 26.0]"
60,22.0,3.0,7.2292,0.0,"(19.0, 22.0]","(0.0, 7.75]"
857,51.0,9.0,26.55,6.0,"(50.0, 80.0]","(26.0, 29.125]"
655,24.0,3.0,73.5,8.0,"(22.0, 25.0]","(51.479, 82.171]"


In [96]:
## now we calculate the accuracy after transformation

In [97]:
clf = DecisionTreeClassifier()
clf.fit(X_train_trf,y_train)
y_pred2 = clf.predict(X_test_trf)

In [98]:
accuracy_score(y_test,y_pred2)

0.6223776223776224

In [99]:
## if we increase the value of bin then accuracy increasee

In [121]:
def discretize(bins, strategy):

    kbin_age = KBinsDiscretizer(n_bins=bins, encode='ordinal', strateg=strategy) 
    kbin_fare = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy)

    trf =ColumnTransformer([ 
        ('first', kbin_age,[0]), 
        ('second',kbin_fare, [1])
    ])


    X_trf = trf.fit_transform(X)
    print(np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10, scoring='accuracy')))

    plt.figure(figsize=(14,4))
    plt.subplot(121)
    plt.hist(X['Age'])
    plt.title("Before")

    plt.subplot(122)

    plt.hist(X_trf[:,0],color='red')

    plt.title("After")

    plt.show()      
    
    
    ### this is the sum up of all above code 


# Binarization

In [101]:
df1= pd.read_csv('Titanic Train.csv')[['Age','Fare','SibSp','Parch','Survived']]

In [102]:
df1.dropna(inplace=True)

In [103]:
df1.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,22.0,7.25,1,0,0
1,38.0,71.2833,1,0,1
2,26.0,7.925,0,0,1
3,35.0,53.1,1,0,1
4,35.0,8.05,0,0,0


In [104]:
df1['family']= df1['SibSp']+df1["Parch"]  ## merge 2 columns

In [105]:
df1.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived,family
0,22.0,7.25,1,0,0,1
1,38.0,71.2833,1,0,1,1
2,26.0,7.925,0,0,1,0
3,35.0,53.1,1,0,1,1
4,35.0,8.05,0,0,0,0


In [106]:
df1.drop(columns=['SibSp','Parch'],inplace=True)

In [107]:
df1.head()

Unnamed: 0,Age,Fare,Survived,family
0,22.0,7.25,0,1
1,38.0,71.2833,1,1
2,26.0,7.925,1,0
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [108]:
X= df1.drop(columns=['Survived'])
y= df1['Survived']

In [109]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=42)

In [110]:
X_train.head()

Unnamed: 0,Age,Fare,family
328,31.0,20.525,2
73,26.0,14.4542,1
253,30.0,16.1,1
719,33.0,7.775,0
666,25.0,13.0,0


In [111]:
## without binarization

clf= DecisionTreeClassifier()

clf.fit(X_train,y_train)
y_pred= clf.predict(X_test)

accuracy_score(y_test,y_pred)

0.6293706293706294

In [112]:
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))

0.6499217527386542

In [113]:
## Applying Binarization

from sklearn.preprocessing import Binarizer

In [114]:
trf= ColumnTransformer([
    ('bin',Binarizer(copy=False),['family'])   ## we apply false because we want change in the existing column 
],remainder='passthrough')

In [115]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf= trf.transform(X_test)

In [116]:
pd.DataFrame(X_train_trf,columns=['family','Age','Fare'])

Unnamed: 0,family,Age,Fare
0,1.0,31.0,20.5250
1,1.0,26.0,14.4542
2,1.0,30.0,16.1000
3,0.0,33.0,7.7750
4,0.0,25.0,13.0000
...,...,...,...
566,1.0,46.0,61.1750
567,0.0,25.0,13.0000
568,0.0,41.0,134.5000
569,1.0,33.0,20.5250


In [117]:
# in family column 0 means he traviling alone 

In [118]:
clf= DecisionTreeClassifier()

clf.fit(X_train_trf,y_train)
y_pred= clf.predict(X_test_trf)

accuracy_score(y_test,y_pred)

0.6153846153846154

In [119]:
X_trf = trf.fit_transform(X)
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))

0.6513302034428794