In [1]:
import pandas as pd
import numpy as np
from plotnine import *
from model_diagnostics import model_diagnostics, skf_preds, model_diagnostics_skf, summarise_continuous_feature

#pd.set_option("display.max_rows", 20)

In [2]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

data_full = fetch_openml(
    "titanic", version=1, as_frame=True
)

data = pd.concat([data_full['data'], data_full['target']], axis = 1)

drop_cols = ['boat', 'body', 'home.dest']

data.drop(columns = drop_cols, inplace = True)

# change data types to match the csv data types in kaggle
data = data.astype({'pclass': 'int32', 'sex': 'object', 'sibsp': 'int32', 'parch': 'int32', 'fare': 'float32', 'embarked': 'object', 'survived': 'int32'})
data = data.sample(frac = 1)
data.head()



Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
684,3,"Bourke, Mrs. John (Catherine)",female,32.0,1,1,364849,15.5,,Q,0
9,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,0
525,2,"Pain, Dr. Alfred",male,23.0,0,0,244278,10.5,,S,0
1172,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.550003,,S,0
133,1,"Goldenberg, Mr. Samuel L",male,49.0,1,0,17453,89.104202,C92,C,1


# Feature Engineering

## Construct deck feature

In [3]:
(
    data
    .assign(deck = data['cabin'].str[0].fillna('M'),
            cabin_no = data['cabin'].str.split('(\d+)', expand = True)[1].fillna(0).astype(int))
    .head(1)
)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived,deck,cabin_no
684,3,"Bourke, Mrs. John (Catherine)",female,32.0,1,1,364849,15.5,,Q,0,M,0


In [4]:
data['deck'] = data['cabin'].str[0]
#data.loc[data['cabin'].isnull(), 'Deck'] = 'M'
data['deck'] = data['deck'].fillna('M')
data['cabin_no'] = data['cabin'].str.split('(\d+)', expand = True)[1].fillna(0).astype(int)

In [5]:
survival_deck = (
    data
    .groupby('deck')
    .agg(n = ('sibsp', 'count'),
         pct_survived = ('survived', 'mean'))
    .reset_index()
)

survival_deck

Unnamed: 0,deck,n,pct_survived
0,A,22,0.5
1,B,65,0.723077
2,C,94,0.606383
3,D,46,0.695652
4,E,41,0.731707
5,F,21,0.619048
6,G,5,0.6
7,M,1014,0.302761
8,T,1,0.0


In [6]:
#df_all['deck'] = df_all['deck'].replace(['A', 'B', 'C'], 'ABC')
data['deck'] = data['deck'].replace(['A', 'T'], 'A')
data['deck'] = data['deck'].replace(['F', 'G'], 'FG')
data['deck'].value_counts()

M     1014
C       94
B       65
D       46
E       41
FG      26
A       23
Name: deck, dtype: int64

## Extract Title feature

In [7]:
data['title'] = data['name'].str.split(",", expand = True)[1].str.split('.', expand = True)[0].str.strip()
data['title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Ms                2
Mlle              2
Major             2
the Countess      1
Dona              1
Sir               1
Jonkheer          1
Mme               1
Don               1
Capt              1
Lady              1
Name: title, dtype: int64

In [8]:
data.query('name.str.contains("Mme")', engine = "python")

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived,deck,cabin_no,title
12,1,"Aubart, Mme. Leontine Pauline",female,24.0,0,0,PC 17477,69.300003,B35,C,1,B,35,Mme


In [9]:
data['title'] = data['title'].replace(['Ms'], 'Miss')

noble_list = ['Dr', 'Rev', 'Mlle', 'Major', 'Col', 'the Countess', 'Capt', 'Sir', 'Lady', 'Mme', 'Don', 'Jonkheer']
#noble_list = ['Mlle', 'the Countess', 'Sir', 'Lady', 'Mme', 'Don', 'Jonkheer']
#military_clergy_list = ['Rev', 'Major', 'Col', 'Capt']

data['title'] = data['title'].replace(noble_list, 'Noble')
#data['title'] = data['title'].replace(military_clergy_list, 'Military_Clergy')
#data['title'] = data['title'].replace('Dr', 'Mr')

data['title'].value_counts()

Mr        757
Miss      262
Mrs       197
Master     61
Noble      31
Dona        1
Name: title, dtype: int64

In [10]:
survival_title = (
    data
    .groupby('title')
    .agg(n = ('name', 'count'),
         avg_survival = ('survived', 'mean'))
    .reset_index()
)

survival_title

Unnamed: 0,title,n,avg_survival
0,Dona,1,1.0
1,Master,61,0.508197
2,Miss,262,0.675573
3,Mr,757,0.162483
4,Mrs,197,0.786802
5,Noble,31,0.419355


In [11]:
data['married'] = 0
data.loc[data['title'] == 'Mrs', 'married'] = 1

## One hot encode categoricals

In [12]:
cat_columns = ['sex', 'pclass', 'embarked', 'title', 'deck']
pd.concat([data, pd.get_dummies(data, columns = cat_columns, dummy_na = True, drop_first = True)], axis = 1).head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,...,title_Mrs,title_Noble,title_nan,deck_B,deck_C,deck_D,deck_E,deck_FG,deck_M,deck_nan
684,3,"Bourke, Mrs. John (Catherine)",female,32.0,1,1,364849,15.5,,Q,...,1,0,0,0,0,0,0,0,1,0
9,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,...,0,0,0,0,0,0,0,0,1,0
525,2,"Pain, Dr. Alfred",male,23.0,0,0,244278,10.5,,S,...,0,1,0,0,0,0,0,0,1,0
1172,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.550003,,S,...,0,0,0,0,0,0,0,0,1,0
133,1,"Goldenberg, Mr. Samuel L",male,49.0,1,0,17453,89.104202,C92,C,...,0,0,0,0,1,0,0,0,0,0


# Test whether you can get feature names out of sklearn pipeline dummies

In [26]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer, SimpleImputer
import xgboost as xgb
from sklearn import set_config

# Initial setup for classification, setting up train/test splits etc
set_config(transform_output="pandas")

target = 'survived'
numeric_cols = ['sibsp', 'parch', 'fare', 'age']
categorical_cols = ['pclass', 'sex', 'embarked', 'deck', 'title']

X, y = data[numeric_cols + categorical_cols].copy(), np.asarray(data[target], dtype = 'int8')

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, random_state = 20230507)

skf = StratifiedKFold(n_splits=10, shuffle = True, random_state = 20230301)

# Pipeline steps for preprocessing data (encode categoricals, impute nulls, discretize / scale numerics) 

ct_encode = ColumnTransformer([
#    ('scaler', StandardScaler(), numeric_cols), #if just this line, the pipeline will only return the four numeric columns, scaled
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('pass_through_numerics', 'passthrough', numeric_cols)
])

# option 1, discretize the data
ct_disc = ColumnTransformer(
    transformers = [('disc_age', KBinsDiscretizer(n_bins = 10, encode = 'ordinal'), ['pass_through_numerics__age']),
                    ('disc_fare', KBinsDiscretizer(n_bins = 15, encode = 'ordinal'), ['pass_through_numerics__fare']),
                    ],
    remainder = 'passthrough'
)

# option 2, use a standard scaler
ct_scale = ColumnTransformer(
    transformers = [('scale', StandardScaler(), ['pass_through_numerics__age', 'pass_through_numerics__fare'])],
    remainder = 'passthrough'
)

xgb_model = xgb.XGBClassifier(eval_metric = 'logloss')

pipe = Pipeline([
    ('encode_cats', ct_encode),
    ('scale', ct_scale),
    ('imputer', IterativeImputer()),
#    ('disc', ct_disc),
    ('clf', xgb_model),
])

pipe.fit(X_train_full, y_train_full)



In [31]:
from sklearn.metrics import classification_report

y_pred = pipe.predict(X_test)
y_pred_proba = pipe.predict_proba(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       181
           1       0.81      0.74      0.77       147

    accuracy                           0.80       328
   macro avg       0.81      0.80      0.80       328
weighted avg       0.81      0.80      0.80       328



In [35]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_pred_proba[:,1])

0.8611079791032433

In [15]:
pipe.feature_names_in_

array(['sibsp', 'parch', 'fare', 'age', 'pclass', 'sex', 'embarked',
       'deck', 'title'], dtype=object)

In [16]:
pipe.named_steps['encode_cats'].get_feature_names_out()

array(['encoder__pclass_1', 'encoder__pclass_2', 'encoder__pclass_3',
       'encoder__sex_female', 'encoder__sex_male', 'encoder__embarked_C',
       'encoder__embarked_Q', 'encoder__embarked_S',
       'encoder__embarked_nan', 'encoder__deck_A', 'encoder__deck_B',
       'encoder__deck_C', 'encoder__deck_D', 'encoder__deck_E',
       'encoder__deck_FG', 'encoder__deck_M', 'encoder__title_Dona',
       'encoder__title_Master', 'encoder__title_Miss',
       'encoder__title_Mr', 'encoder__title_Mrs', 'encoder__title_Noble',
       'pass_through_numerics__sibsp', 'pass_through_numerics__parch',
       'pass_through_numerics__fare', 'pass_through_numerics__age'],
      dtype=object)

In [17]:
pipe.named_steps['imputer'].feature_names_in_

array(['encoder__pclass_1', 'encoder__pclass_2', 'encoder__pclass_3',
       'encoder__sex_female', 'encoder__sex_male', 'encoder__embarked_C',
       'encoder__embarked_Q', 'encoder__embarked_S',
       'encoder__embarked_nan', 'encoder__deck_A', 'encoder__deck_B',
       'encoder__deck_C', 'encoder__deck_D', 'encoder__deck_E',
       'encoder__deck_FG', 'encoder__deck_M', 'encoder__title_Dona',
       'encoder__title_Master', 'encoder__title_Miss',
       'encoder__title_Mr', 'encoder__title_Mrs', 'encoder__title_Noble',
       'pass_through_numerics__sibsp', 'pass_through_numerics__parch',
       'pass_through_numerics__fare', 'pass_through_numerics__age'],
      dtype=object)

In [18]:
pipe.named_steps['imputer'].get_feature_names_out()

array(['encoder__pclass_1', 'encoder__pclass_2', 'encoder__pclass_3',
       'encoder__sex_female', 'encoder__sex_male', 'encoder__embarked_C',
       'encoder__embarked_Q', 'encoder__embarked_S',
       'encoder__embarked_nan', 'encoder__deck_A', 'encoder__deck_B',
       'encoder__deck_C', 'encoder__deck_D', 'encoder__deck_E',
       'encoder__deck_FG', 'encoder__deck_M', 'encoder__title_Dona',
       'encoder__title_Master', 'encoder__title_Miss',
       'encoder__title_Mr', 'encoder__title_Mrs', 'encoder__title_Noble',
       'pass_through_numerics__sibsp', 'pass_through_numerics__parch',
       'pass_through_numerics__fare', 'pass_through_numerics__age'],
      dtype=object)

In [19]:
pipe.named_steps['disc'].get_feature_names_out()

array(['disc_age__pass_through_numerics__age',
       'disc_fare__pass_through_numerics__fare',
       'remainder__encoder__pclass_1', 'remainder__encoder__pclass_2',
       'remainder__encoder__pclass_3', 'remainder__encoder__sex_female',
       'remainder__encoder__sex_male', 'remainder__encoder__embarked_C',
       'remainder__encoder__embarked_Q', 'remainder__encoder__embarked_S',
       'remainder__encoder__embarked_nan', 'remainder__encoder__deck_A',
       'remainder__encoder__deck_B', 'remainder__encoder__deck_C',
       'remainder__encoder__deck_D', 'remainder__encoder__deck_E',
       'remainder__encoder__deck_FG', 'remainder__encoder__deck_M',
       'remainder__encoder__title_Dona',
       'remainder__encoder__title_Master',
       'remainder__encoder__title_Miss', 'remainder__encoder__title_Mr',
       'remainder__encoder__title_Mrs', 'remainder__encoder__title_Noble',
       'remainder__pass_through_numerics__sibsp',
       'remainder__pass_through_numerics__parch'], dtyp

# Appendix: Annoying times working with sklearn OneHotEncoder

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# enc = OneHotEncoder()
# X = X[['Sex', 'Pclass', 'Embarked']].copy()
# enc.fit_transform(X).toarray()

In [None]:
# enc.get_feature_names_out()

In [None]:
# cat_features = ['Pclass', 'Sex', 'Embarked']
# encoded_features = []
# dfs = [X]

# for df in dfs:
#     for feature in cat_features:
#         encoded_feat = OneHotEncoder().fit_transform(df[feature].values.reshape(-1, 1)).toarray()
#         n = df[feature].nunique()
#         cols = ['{}_{}'.format(feature, n) for n in range(1, n + 1)]
#         encoded_df = pd.XFrame(encoded_feat, columns=cols)
#         encoded_df.index = df.index
#         encoded_features.append(encoded_df)


In [None]:
# cols 

In [None]:
# encoded_feat