In [168]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split as split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score as score
from sklearn.model_selection import cross_val_score as cross

In [169]:
df = pd.read_csv('train.csv', usecols=['Age', 'Fare', 'Survived'])

In [170]:
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [171]:
train_x, test_x, train_y, test_y = split(df.drop(columns=['Survived']), df['Survived'], test_size = 0.2)

# without kbins

In [172]:
transformer = ColumnTransformer(transformers=[
    ('fill', SimpleImputer(), ['Age'])
], remainder = 'passthrough')

In [173]:
new_train_x = transformer.fit_transform(train_x)
new_test_x = transformer.transform(test_x)

In [174]:
t_train_x = pd.DataFrame(new_train_x, columns=['Age', 'Fare'])
t_test_x = pd.DataFrame(new_test_x, columns=['Age', 'Fare'])

In [175]:
decision = DecisionTreeClassifier()

In [176]:
decision.fit(t_train_x, train_y)

In [177]:
pred = decision.predict(t_test_x)

In [178]:
score(pred, test_y)

0.6256983240223464

In [179]:
np.mean(cross(DecisionTreeClassifier(),t_train_x, train_y ,cv=100,scoring='accuracy'))

0.65875

# With Kbins

In [180]:
transformer2 = ColumnTransformer( transformers=[
    ('fill', SimpleImputer(), ['Age']),
    ('discritiser1', KBinsDiscretizer(n_bins = 15, encode = 'ordinal', strategy = 'equal_width'), ['Age']),
    ('discritiser2', KBinsDiscretizer(n_bins = 15, encode = 'ordinal', strategy = 'equal_width'), ['Fare'])

], remainder = 'passthrough')

# quantile
# uniform
# equal_width

In [181]:
new2_train_x = transformer.fit_transform(train_x)
new2_test_x = transformer.transform(test_x)

In [182]:
t2_train_x = pd.DataFrame(new2_train_x, columns=['Age', 'Fare'])
t2_test_x = pd.DataFrame(new2_test_x, columns=['Age', 'Fare'])

In [183]:
decision2 = DecisionTreeClassifier()

In [184]:
decision2.fit(t_train_x, train_y)

In [185]:
pred2 = decision2.predict(t_test_x)

In [186]:
score(pred2, test_y)

0.6256983240223464

In [187]:
np.mean(cross(DecisionTreeClassifier(),t2_train_x, train_y ,cv=100,scoring='accuracy'))

0.6501785714285715

# Binarisarion

In [188]:
train = pd.read_csv('train.csv',usecols = ['Age', 'Fare', 'SibSp', 'Parch', 'Survived'])

In [191]:
train.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Family
0,0,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,1
2,1,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,1
4,0,35.0,0,0,8.05,0


In [190]:
train['Family'] = train['SibSp']+train['Parch']

In [200]:
train = train.drop(columns=['SibSp', 'Parch'])

In [194]:
from sklearn.preprocessing import Binarizer

In [201]:
train_x, test_x, train_y, test_y = split(train.drop(columns=['Survived']), train['Survived'], test_size = 0.2)

In [202]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 629 to 218
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     569 non-null    float64
 1   Fare    712 non-null    float64
 2   Family  712 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 22.2 KB


In [203]:
tranaformer = ColumnTransformer(transformers = [
    ('fill', SimpleImputer(), ['Age']),
    ('binarisation1',Binarizer(), ['Family']),
    ('binarisation2', Binarizer(), ['Fare']),
    ('binarisation3', Binarizer(), ['Age'])
], remainder = 'passthrough')

In [204]:
new_train_x = transformer.fit_transform(train_x)
new_test_x = transformer.transform(test_x)

In [205]:
t_train_x = pd.DataFrame(new_train_x, columns=['Age', 'Fare', 'Survived'])
t_test_x = pd.DataFrame(new_test_x, columns=['Age', 'Fare', 'Survived'])

In [206]:
decision = DecisionTreeClassifier()

In [207]:
decision.fit(t_train_x, train_y)

In [208]:
pred = decision.predict(t_test_x)

In [209]:
score(pred, test_y)

0.5921787709497207

In [213]:
np.mean(cross(decision, t_train_x, train_y, cv=100, scoring = 'accuracy'))

0.6537499999999998