In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

In [2]:
# Load the Titanic passenger table through seaborn's dataset loader, then show
# its column index so the available fields can be reviewed.
df = sns.load_dataset(name='titanic')
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [3]:
# Narrow the frame to the target ('survived') plus the seven attributes used as
# features; every other column listed above is left out from here on.
df = df.loc[:, [
    'survived',
    'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',
]]

# Quick visual check of three rows (no seed is passed, so the rows differ per run).
df.sample(n=3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
424,0,3,male,18.0,1,1,20.2125,S
873,0,3,male,47.0,0,0,9.0,S
241,1,3,female,,1,0,15.5,Q


In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = df['survived']
X = df.drop(columns='survived')

# Hold out 32% of the rows for testing; the fixed seed makes the split repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(
    X, y, test_size=0.32, random_state=23
)

In [6]:
# Sanity check: feature-matrix shapes and the matching number of labels per split.
xTrain.shape, xTest.shape, yTrain.shape[0], yTest.shape[0]

((605, 7), (286, 7), 605, 286)

In [7]:
# Column groups that get dedicated preprocessing: continuous features first,
# nominal (unordered categorical) features second. Columns not listed here are
# handled by the ColumnTransformer's `remainder` setting.
contCols, nominalCols = ['age', 'fare'], ['sex', 'embarked']

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
# Preprocessing for the continuous columns.
#   * 'age' has missing values, so they are imputed first (median).
#   * The distributions of these columns are not normal, hence the power
#     transform (PowerTransformer's default method is Yeo-Johnson).
#   * The final step is named 'scaler'; the hyper-parameter search swaps it out
#     by that name.

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer

contPipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('yeoJohnsonTransformation', PowerTransformer()),
    ('scaler', StandardScaler()),
])

In [10]:
# Preprocessing for the nominal categorical columns.
#   * 'embarked' has missing values -> fill them with the most frequent category.
#   * One-hot encode afterwards; categories unseen during fit are ignored at
#     transform time instead of raising.

from sklearn.preprocessing import OneHotEncoder

catPipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    (
        'oneHotEncoding',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    ),
])

In [11]:
# Route each column group through its own pipeline. The ColumnTransformer applies
# both pipelines to their respective columns and joins the results; columns that
# belong to neither group are forwarded unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('contDataPipeLine', contPipeline, contCols),
        ('catNominalDataPipeLine', catPipeline, nominalCols),
    ],
)

In [12]:
# Final pipeline: the preprocessing ColumnTransformer followed by the estimator,
# so a single fit/predict call runs imputation, transformation, encoding and the
# model together.

from sklearn.tree import DecisionTreeClassifier

# Seed the tree. scikit-learn randomly permutes the candidate features at every
# split, so an unseeded tree (and every score derived from it) can change from
# one run to the next. 23 matches the seed used for the train/test split.
dtc = DecisionTreeClassifier(random_state=23)

# The step names are part of the parameter paths used by the search grid
# ('preprocessor__...', 'dtcModel__...'), so they must not be renamed.
dtcPipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('dtcModel', dtc),
])

In [13]:
# Fit preprocessing and the decision tree on the training split only.
dtcPipeline.fit(X=xTrain, y=yTrain)

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [15]:
# Accuracy on the rows the pipeline was fitted on.
yTrainPred = dtcPipeline.predict(X=xTrain)
accuracy_score(y_true=yTrain, y_pred=yTrainPred)

0.9818181818181818

In [16]:
# Accuracy on the held-out rows.
yTestPred = dtcPipeline.predict(X=xTest)
accuracy_score(y_true=yTest, y_pred=yTestPred)

0.7867132867132867

In [17]:
# Training accuracy is far higher than test accuracy in the cells above, which
# points to overfitting; a likely cause is that the tree is grown without any
# depth limit (DecisionTreeClassifier was created with default arguments).
# NOTE: the triple-quoted block below is disabled plotting code kept as a string
# literal, not a comment -- as the last expression of the cell, its repr is what
# the cell displays.
'''
import matplotlib.pyplot as plt
plt.figure(figsize = (15,15))
sklearn.tree.plot_tree(dtc)
'''

'\nimport matplotlib.pyplot as plt\nplt.figure(figsize = (15,15))\nsklearn.tree.plot_tree(dtc)\n'

In [18]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [19]:
# Tune the complete pipeline (preprocessing + tree) with a randomized search;
# every sampled configuration is scored with k-fold cross-validation.

from sklearn.model_selection import RandomizedSearchCV

# Keys use scikit-learn's nested-parameter syntax: the step name in the pipeline,
# then two underscores, then the hyper-parameter name -- repeated once per level
# of nesting (pipeline -> column transformer -> inner pipeline -> step -> parameter).
hyperParametersGrid = {
    'dtcModel__criterion': ['gini', 'entropy', 'log_loss'],
    'dtcModel__max_depth': np.arange(2, 11),  # depths 2 through 10
    # Replaces the whole 'scaler' step of the continuous-data pipeline.
    'preprocessor__contDataPipeLine__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
    'preprocessor__catNominalDataPipeLine__oneHotEncoding__drop': [None, 'first'],
}

rsDtc = RandomizedSearchCV(
    dtcPipeline,
    hyperParametersGrid,
    n_iter=20,        # number of parameter combinations sampled from the grid
    cv=15,            # number of cross-validation folds per combination
    scoring='f1',     # selection metric (note: the later cells report accuracy)
    # Without a seed the sampled combinations -- and therefore best_params_ and
    # cv_results_ -- change on every run. 23 matches the train/test split seed.
    random_state=23,
)

In [20]:
# Run the search on the training split; the held-out split stays untouched.
rsDtc.fit(X=xTrain, y=yTrain)

In [21]:
# Parameter combination with the best mean cross-validated score (scoring='f1').
rsDtc.best_params_

{'preprocessor__contDataPipeLine__scaler': MinMaxScaler(),
 'preprocessor__catNominalDataPipeLine__oneHotEncoding__drop': 'first',
 'dtcModel__max_depth': 6,
 'dtcModel__criterion': 'entropy'}

In [22]:
# Predict the held-out rows with the tuned pipeline found by the search.
yTestRsPred = rsDtc.predict(X=xTest)

In [23]:
# Predict the training rows with the tuned pipeline, for a train/test comparison.
yTrainRsPred = rsDtc.predict(X=xTrain)

In [24]:
# Training accuracy of the tuned pipeline.
accuracy_score(y_true=yTrain, y_pred=yTrainRsPred)

0.8611570247933884

In [25]:
# Held-out accuracy of the tuned pipeline.
accuracy_score(y_true=yTest, y_pred=yTestRsPred)

0.8251748251748252

In [26]:
# Tabulate the per-candidate search results (timings, sampled parameters,
# per-fold scores, mean/std and rank) for inspection.
rsHyper_df = pd.DataFrame(data=rsDtc.cv_results_)
rsHyper_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessor__contDataPipeLine__scaler,param_preprocessor__catNominalDataPipeLine__oneHotEncoding__drop,param_dtcModel__max_depth,param_dtcModel__criterion,params,split0_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
0,0.017526,0.004724,0.004158,0.006895,MinMaxScaler(),first,3,entropy,{'preprocessor__contDataPipeLine__scaler': Min...,0.666667,...,0.6,0.740741,0.6,0.774194,0.8,0.857143,0.709677,0.722117,0.081832,11
1,0.010947,0.007018,0.006908,0.007532,StandardScaler(),first,2,log_loss,{'preprocessor__contDataPipeLine__scaler': Sta...,0.689655,...,0.645161,0.666667,0.5625,0.774194,0.666667,0.827586,0.545455,0.666628,0.069704,19
2,0.012338,0.006236,0.00548,0.007242,StandardScaler(),,10,log_loss,{'preprocessor__contDataPipeLine__scaler': Sta...,0.787879,...,0.6,0.571429,0.689655,0.866667,0.774194,0.769231,0.740741,0.705491,0.112752,18
3,0.014832,0.006981,0.007501,0.008113,RobustScaler(),,6,gini,{'preprocessor__contDataPipeLine__scaler': Rob...,0.787879,...,0.645161,0.689655,0.647059,0.8,0.814815,0.774194,0.774194,0.725877,0.069514,9
4,0.013572,0.004542,0.005783,0.004877,MinMaxScaler(),first,8,entropy,{'preprocessor__contDataPipeLine__scaler': Min...,0.764706,...,0.62069,0.6,0.685714,0.8,0.827586,0.740741,0.666667,0.707109,0.07324,16
5,0.011413,0.008428,0.006262,0.007669,RobustScaler(),,8,log_loss,{'preprocessor__contDataPipeLine__scaler': Rob...,0.764706,...,0.62069,0.571429,0.705882,0.8,0.785714,0.740741,0.666667,0.706122,0.076193,17
6,0.017414,0.00524,0.007215,0.007691,StandardScaler(),first,3,gini,{'preprocessor__contDataPipeLine__scaler': Sta...,0.666667,...,0.6,0.666667,0.62069,0.827586,0.8,0.857143,0.666667,0.720941,0.083841,13
7,0.018191,0.007636,0.006202,0.005442,MinMaxScaler(),first,9,entropy,{'preprocessor__contDataPipeLine__scaler': Min...,0.787879,...,0.709677,0.62069,0.705882,0.827586,0.8,0.769231,0.8,0.727341,0.101798,8
8,0.015412,0.005493,0.006189,0.006058,StandardScaler(),,6,entropy,{'preprocessor__contDataPipeLine__scaler': Sta...,0.8125,...,0.709677,0.689655,0.625,0.8,0.866667,0.774194,0.785714,0.753985,0.08945,3
9,0.014754,0.010339,0.007304,0.007161,RobustScaler(),,2,entropy,{'preprocessor__contDataPipeLine__scaler': Rob...,0.689655,...,0.645161,0.666667,0.5625,0.774194,0.666667,0.827586,0.545455,0.666628,0.069704,19


In [27]:
# List the result columns produced by the search (one split<k>_test_score per fold).
rsHyper_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_preprocessor__contDataPipeLine__scaler',
       'param_preprocessor__catNominalDataPipeLine__oneHotEncoding__drop',
       'param_dtcModel__max_depth', 'param_dtcModel__criterion', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'split5_test_score',
       'split6_test_score', 'split7_test_score', 'split8_test_score',
       'split9_test_score', 'split10_test_score', 'split11_test_score',
       'split12_test_score', 'split13_test_score', 'split14_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score'],
      dtype='object')

In [28]:
# Order the candidates by mean CV score, lowest first (ascending order).
rsHyper_df.sort_values(by=['mean_test_score'], ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessor__contDataPipeLine__scaler,param_preprocessor__catNominalDataPipeLine__oneHotEncoding__drop,param_dtcModel__max_depth,param_dtcModel__criterion,params,split0_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
9,0.014754,0.010339,0.007304,0.007161,RobustScaler(),,2,entropy,{'preprocessor__contDataPipeLine__scaler': Rob...,0.689655,...,0.645161,0.666667,0.5625,0.774194,0.666667,0.827586,0.545455,0.666628,0.069704,19
1,0.010947,0.007018,0.006908,0.007532,StandardScaler(),first,2,log_loss,{'preprocessor__contDataPipeLine__scaler': Sta...,0.689655,...,0.645161,0.666667,0.5625,0.774194,0.666667,0.827586,0.545455,0.666628,0.069704,19
2,0.012338,0.006236,0.00548,0.007242,StandardScaler(),,10,log_loss,{'preprocessor__contDataPipeLine__scaler': Sta...,0.787879,...,0.6,0.571429,0.689655,0.866667,0.774194,0.769231,0.740741,0.705491,0.112752,18
5,0.011413,0.008428,0.006262,0.007669,RobustScaler(),,8,log_loss,{'preprocessor__contDataPipeLine__scaler': Rob...,0.764706,...,0.62069,0.571429,0.705882,0.8,0.785714,0.740741,0.666667,0.706122,0.076193,17
4,0.013572,0.004542,0.005783,0.004877,MinMaxScaler(),first,8,entropy,{'preprocessor__contDataPipeLine__scaler': Min...,0.764706,...,0.62069,0.6,0.685714,0.8,0.827586,0.740741,0.666667,0.707109,0.07324,16
18,0.017171,0.005161,0.006698,0.002037,MinMaxScaler(),,9,gini,{'preprocessor__contDataPipeLine__scaler': Min...,0.774194,...,0.709677,0.645161,0.647059,0.8,0.8,0.666667,0.714286,0.719087,0.070637,15
6,0.017414,0.00524,0.007215,0.007691,StandardScaler(),first,3,gini,{'preprocessor__contDataPipeLine__scaler': Sta...,0.666667,...,0.6,0.666667,0.62069,0.827586,0.8,0.857143,0.666667,0.720941,0.083841,13
11,0.012214,0.006237,0.003125,0.006249,RobustScaler(),first,3,gini,{'preprocessor__contDataPipeLine__scaler': Rob...,0.666667,...,0.6,0.666667,0.62069,0.827586,0.8,0.857143,0.666667,0.720941,0.083841,13
17,0.016446,0.006807,0.006246,0.002748,RobustScaler(),,3,entropy,{'preprocessor__contDataPipeLine__scaler': Rob...,0.666667,...,0.6,0.740741,0.6,0.774194,0.8,0.857143,0.709677,0.722117,0.081832,11
0,0.017526,0.004724,0.004158,0.006895,MinMaxScaler(),first,3,entropy,{'preprocessor__contDataPipeLine__scaler': Min...,0.666667,...,0.6,0.740741,0.6,0.774194,0.8,0.857143,0.709677,0.722117,0.081832,11


In [29]:
# Mean cross-validated score of every sampled candidate, in sampling order.
rsHyper_df['mean_test_score']

0     0.722117
1     0.666628
2     0.705491
3     0.725877
4     0.707109
5     0.706122
6     0.720941
7     0.727341
8     0.753985
9     0.666628
10    0.754386
11    0.720941
12    0.747662
13    0.754819
14    0.753590
15    0.725371
16    0.745823
17    0.722117
18    0.719087
19    0.737849
Name: mean_test_score, dtype: float64