In [25]:
import numpy as np, pandas as pd, seaborn as sns, sklearn

In [26]:
# Load the 'titanic' example dataset through seaborn and list the available columns.
df = sns.load_dataset('titanic')
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [27]:
# Keep only the target ('survived') and the seven columns used as features,
# then preview three random rows.
df = df.loc[:, [
    'survived',
    'pclass',
    'sex',
    'age',
    'sibsp',
    'parch',
    'fare',
    'embarked',
]]
df.sample(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
574,0,3,male,16.0,0,0,8.05,S
244,0,3,male,30.0,0,0,7.225,C
449,1,1,male,52.0,0,0,30.5,S


In [28]:
# Count missing values per column.
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [29]:
from sklearn.model_selection import train_test_split 

# Separate the target from the features, then hold out 32% of the rows for testing.
y = df['survived']
X = df.drop(columns='survived')

xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size=0.32,
    random_state=23,
)

In [30]:
# Sanity check: feature-matrix shapes and target lengths for both splits.
xTrain.shape, xTest.shape, yTrain.shape[0], yTest.shape[0]

((605, 7), (286, 7), 605, 286)

In [31]:
# Column groups routed to the two preprocessing branches.
contCols = [
    'age',
    'fare',
]  # continuous features
nominalCols = [
    'sex',
    'embarked',
]  # nominal (unordered) categorical features

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [33]:
# pipeline for handling cont data

# we know there are missing values in age
# we also know the distributions of these cols are not normal

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer

# Continuous branch: fill gaps with the median, apply the power transform, then scale.
contPipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('yeoJohnsonTransformation', PowerTransformer()),
    # The grid search addresses this step by name, so it must stay 'scaler'.
    ('scaler', StandardScaler()),
])

In [34]:
# Pipeline for handling categorical cols

# we knew there are missing values in embarked
# we will be using one hot encoding to transform these nominal categorical data

from sklearn.preprocessing import OneHotEncoder

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
catPipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    # Dense output; handle_unknown='ignore' keeps categories unseen during fit from raising.
    ('oneHotEncoding', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

In [35]:
# A ColumnTransformer applies the two pipelines side by side, each on its own columns.

preprocessor = ColumnTransformer(
    [
        ('contDataPipeLine', contPipeline, contCols),
        ('catNominalDataPipeLine', catPipeline, nominalCols),
    ],
    # Columns not listed in either group are forwarded unchanged.
    remainder='passthrough',
)

In [36]:
# Final pipeline: the preprocessor followed by the estimator.

from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated fits are reproducible. Without a fixed random_state,
# ties between equally good splits are broken randomly, so accuracy and
# cross-validation scores drift from run to run. The seed matches the one
# used for train_test_split.
dtc = DecisionTreeClassifier(random_state = 23)

# Step names are part of the interface: the grid search refers to them as
# 'preprocessor__...' and 'dtcModel__...'.
dtcPipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('dtcModel', dtc)
])

In [37]:
# Fit the preprocessing steps and the decision tree on the training split.
dtcPipeline.fit(xTrain, yTrain)

In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [15]:
# Accuracy of the untuned pipeline on the data it was trained on.
yTrainPred = dtcPipeline.predict(X=xTrain)
accuracy_score(y_true=yTrain, y_pred=yTrainPred)

0.9818181818181818

In [16]:
# Accuracy of the untuned pipeline on the held-out split.
yTestPred = dtcPipeline.predict(X=xTest)
accuracy_score(y_true=yTest, y_pred=yTestPred)

0.7832167832167832

In [17]:
# The gap between train and test accuracy suggests overfitting, which might be
# the result of letting the tree grow to its maximum depth.
# The string literal below is disabled plotting code kept for reference; it is
# evaluated as a plain string and draws nothing.
'''
import matplotlib.pyplot as plt
plt.figure(figsize = (15,15))
sklearn.tree.plot_tree(dtc)
'''

'\nimport matplotlib.pyplot as plt\nplt.figure(figsize = (15,15))\nsklearn.tree.plot_tree(dtc)\n'

In [39]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [54]:
# The final pipeline can be tuned with k-fold cross-validation via a grid search.

from sklearn.model_selection import GridSearchCV


# Hyperparameters to tune. Each key is the step name in the pipeline, two
# underscores ('__'), then the parameter name; nested steps are chained the same way.
# Whole steps can be swapped too, e.g. to see which scaler works best.
# NOTE(review): scikit-learn's docs describe 'entropy' and 'log_loss' as the same
# criterion, so the grid may be evaluating duplicates — confirm.
hyperParametersGrid = {
    'dtcModel__criterion': ['gini', 'entropy', 'log_loss'],
    'dtcModel__max_depth': np.arange(2, 11),
    'preprocessor__contDataPipeLine__scaler': [
        StandardScaler(),
        RobustScaler(),
        MinMaxScaler(),
    ],
    'preprocessor__catNominalDataPipeLine__oneHotEncoding__drop': [None, 'first'],
}


# 15-fold cross-validation, with candidates ranked by F1 score.
gsDtc = GridSearchCV(
    estimator=dtcPipeline,
    param_grid=hyperParametersGrid,
    scoring='f1',
    cv=15,
)

In [55]:
# To tune a nested parameter such as preprocessor__catNominalDataPipeLine__oneHotEncoding__drop, join each step name and the parameter name with __ (double underscore).

In [57]:
# Run the grid search on the training split only; the test split stays untouched.
gsDtc.fit(xTrain, yTrain)

In [58]:
# To list out the best parameters

In [59]:
# Parameter combination selected by the grid search.
gsDtc.best_params_

{'dtcModel__criterion': 'gini',
 'dtcModel__max_depth': 5,
 'preprocessor__catNominalDataPipeLine__oneHotEncoding__drop': 'first',
 'preprocessor__contDataPipeLine__scaler': RobustScaler()}

In [60]:
# Predict the held-out split with the tuned search object.
yTestGsPred = gsDtc.predict(X=xTest)

In [61]:
# Predict the training split with the tuned search object.
yTrainGsPred = gsDtc.predict(X=xTrain)

In [62]:
# Training accuracy after tuning.
accuracy_score(y_true=yTrain, y_pred=yTrainGsPred)

0.859504132231405

In [63]:
# Held-out accuracy after tuning.
accuracy_score(y_true=yTest, y_pred=yTestGsPred)

0.8251748251748252

In [65]:
# Put the per-candidate grid-search results into a DataFrame for inspection.
gsHyper_df = pd.DataFrame(data=gsDtc.cv_results_)
gsHyper_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dtcModel__criterion,param_dtcModel__max_depth,param_preprocessor__catNominalDataPipeLine__oneHotEncoding__drop,param_preprocessor__contDataPipeLine__scaler,params,split0_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
0,0.034793,0.005606,0.013875,0.004249,gini,2,,StandardScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
1,0.036510,0.007767,0.013067,0.005194,gini,2,,RobustScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
2,0.032529,0.004467,0.015750,0.001934,gini,2,,MinMaxScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
3,0.033881,0.007181,0.012434,0.006276,gini,2,first,StandardScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
4,0.033503,0.006443,0.016834,0.003059,gini,2,first,RobustScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,0.036102,0.007810,0.015037,0.007011,log_loss,10,,RobustScaler(),"{'dtcModel__criterion': 'log_loss', 'dtcModel_...",0.787879,...,0.645161,0.571429,0.689655,0.866667,0.774194,0.769231,0.740741,0.716784,0.112268,119
158,0.033991,0.005126,0.016841,0.005373,log_loss,10,,MinMaxScaler(),"{'dtcModel__criterion': 'log_loss', 'dtcModel_...",0.787879,...,0.600000,0.642857,0.705882,0.866667,0.800000,0.769231,0.769231,0.713853,0.112519,126
159,0.036230,0.005218,0.011099,0.006730,log_loss,10,first,StandardScaler(),"{'dtcModel__criterion': 'log_loss', 'dtcModel_...",0.787879,...,0.645161,0.571429,0.727273,0.866667,0.785714,0.769231,0.769231,0.722840,0.106024,91
160,0.037072,0.006609,0.013761,0.005168,log_loss,10,first,RobustScaler(),"{'dtcModel__criterion': 'log_loss', 'dtcModel_...",0.787879,...,0.600000,0.642857,0.714286,0.827586,0.785714,0.769231,0.769231,0.713129,0.111835,129


In [66]:
# List the columns available in the grid-search results table.
gsHyper_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_dtcModel__criterion', 'param_dtcModel__max_depth',
       'param_preprocessor__catNominalDataPipeLine__oneHotEncoding__drop',
       'param_preprocessor__contDataPipeLine__scaler', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'split5_test_score',
       'split6_test_score', 'split7_test_score', 'split8_test_score',
       'split9_test_score', 'split10_test_score', 'split11_test_score',
       'split12_test_score', 'split13_test_score', 'split14_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score'],
      dtype='object')

In [67]:
# Order candidates by mean CV score (ascending, so the best ones are at the bottom).
gsHyper_df.sort_values('mean_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dtcModel__criterion,param_dtcModel__max_depth,param_preprocessor__catNominalDataPipeLine__oneHotEncoding__drop,param_preprocessor__contDataPipeLine__scaler,params,split0_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
0,0.034793,0.005606,0.013875,0.004249,gini,2,,StandardScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
1,0.036510,0.007767,0.013067,0.005194,gini,2,,RobustScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
2,0.032529,0.004467,0.015750,0.001934,gini,2,,MinMaxScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
3,0.033881,0.007181,0.012434,0.006276,gini,2,first,StandardScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
4,0.033503,0.006443,0.016834,0.003059,gini,2,first,RobustScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.689655,...,0.645161,0.666667,0.562500,0.727273,0.666667,0.827586,0.545455,0.662230,0.065489,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.035232,0.006808,0.015116,0.002022,entropy,4,first,RobustScaler(),"{'dtcModel__criterion': 'entropy', 'dtcModel__...",0.733333,...,0.689655,0.727273,0.687500,0.827586,0.857143,0.774194,0.758621,0.753590,0.062761,5
136,0.036116,0.006712,0.014774,0.002859,log_loss,6,first,RobustScaler(),"{'dtcModel__criterion': 'log_loss', 'dtcModel_...",0.812500,...,0.709677,0.666667,0.666667,0.800000,0.896552,0.774194,0.785714,0.754819,0.090080,4
21,0.032825,0.002829,0.013589,0.005689,gini,5,first,StandardScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.764706,...,0.709677,0.689655,0.666667,0.800000,0.827586,0.774194,0.758621,0.755104,0.058494,2
23,0.034301,0.005580,0.014840,0.004741,gini,5,first,MinMaxScaler(),"{'dtcModel__criterion': 'gini', 'dtcModel__max...",0.764706,...,0.709677,0.689655,0.666667,0.800000,0.827586,0.774194,0.758621,0.755104,0.058494,2
