In [5]:
import pandas as pd
import os
import numpy as np
import datetime
from plotnine import *
import matplotlib.pyplot as plt
from model_diagnostics import model_diagnostics, skf_preds, model_diagnostics_skf, summarise_continuous_feature

#pd.set_option("display.max_rows", 20)

In [6]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

data_full = fetch_openml(
    "titanic", version=1, as_frame=True
)

data = pd.concat([data_full['data'], data_full['target']], axis = 1)

drop_cols = ['boat', 'body', 'home.dest']

data.drop(columns = drop_cols, inplace = True)

# change data types to match the csv data types in kaggle
data = data.astype({'pclass': 'int32', 'sex': 'object', 'sibsp': 'int32', 'parch': 'int32', 'fare': 'float32', 'embarked': 'object', 'survived': 'int32'})
data.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.337494,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.550003,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.550003,C22 C26,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.550003,C22 C26,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.550003,C22 C26,S,0


# Feature Engineering

## Construct deck feature

In [7]:
(
    data
    .assign(deck = data['cabin'].str[0].fillna('M'),
            cabin_no = data['cabin'].str.split('(\d+)', expand = True)[1].fillna(0).astype(int))
    .head(1)
)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived,deck,cabin_no
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.337494,B5,S,1,B,5


In [8]:
data['deck'] = data['cabin'].str[0]
#data.loc[data['cabin'].isnull(), 'Deck'] = 'M'
data['deck'] = data['deck'].fillna('M')
data['cabin_no'] = data['cabin'].str.split('(\d+)', expand = True)[1].fillna(0).astype(int)

In [17]:
survival_deck = (
    data
    .groupby('deck')
    .agg(n = ('sibsp', 'count'),
         pct_survived = ('survived', 'mean'))
    .reset_index()
)

survival_deck

Unnamed: 0,deck,n,pct_survived
0,A,23,0.478261
1,B,65,0.723077
2,C,94,0.606383
3,D,46,0.695652
4,E,41,0.731707
5,FG,26,0.615385
6,M,1014,0.302761


In [9]:
#df_all['deck'] = df_all['deck'].replace(['A', 'B', 'C'], 'ABC')
data['deck'] = data['deck'].replace(['A', 'T'], 'A')
data['deck'] = data['deck'].replace(['F', 'G'], 'FG')
data['deck'].value_counts()

M     1014
C       94
B       65
D       46
E       41
FG      26
A       23
Name: deck, dtype: int64

## Extract Title feature

In [10]:
data['title'] = data['name'].str.split(",", expand = True)[1].str.split('.', expand = True)[0].str.strip()
data['title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Mlle              2
Ms                2
Major             2
Capt              1
Sir               1
Dona              1
Jonkheer          1
the Countess      1
Don               1
Mme               1
Lady              1
Name: title, dtype: int64

In [11]:
data.query('name.str.contains("Mme")', engine = "python")

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived,deck,cabin_no,title
12,1,"Aubart, Mme. Leontine Pauline",female,24.0,0,0,PC 17477,69.300003,B35,C,1,B,35,Mme


In [12]:
data['title'] = data['title'].replace(['Ms'], 'Miss')

noble_list = ['Dr', 'Rev', 'Mlle', 'Major', 'Col', 'the Countess', 'Capt', 'Sir', 'Lady', 'Mme', 'Don', 'Jonkheer']
#noble_list = ['Mlle', 'the Countess', 'Sir', 'Lady', 'Mme', 'Don', 'Jonkheer']
#military_clergy_list = ['Rev', 'Major', 'Col', 'Capt']

data['title'] = data['title'].replace(noble_list, 'Noble')
#data['title'] = data['title'].replace(military_clergy_list, 'Military_Clergy')
#data['title'] = data['title'].replace('Dr', 'Mr')

data['title'].value_counts()

Mr        757
Miss      262
Mrs       197
Master     61
Noble      31
Dona        1
Name: title, dtype: int64

In [18]:
survival_title = (
    data
    .groupby('title')
    .agg(n = ('name', 'count'),
         avg_survival = ('survived', 'mean'))
    .reset_index()
)

survival_title

Unnamed: 0,title,n,avg_survival
0,Dona,1,1.0
1,Master,61,0.508197
2,Miss,262,0.675573
3,Mr,757,0.162483
4,Mrs,197,0.786802
5,Noble,31,0.419355


In [19]:
data['married'] = 0
data.loc[data['title'] == 'Mrs', 'married'] = 1

## One hot encode categoricals

In [15]:
cat_columns = ['sex', 'pclass', 'embarked', 'title', 'deck']
pd.concat([data, pd.get_dummies(data, columns = cat_columns, dummy_na = True, drop_first = True)], axis = 1).head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,...,title_Mrs,title_Noble,title_nan,deck_B,deck_C,deck_D,deck_E,deck_FG,deck_M,deck_nan
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.337494,B5,S,...,0,0,0,1,0,0,0,0,0,0
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.550003,C22 C26,S,...,0,0,0,0,1,0,0,0,0,0
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.550003,C22 C26,S,...,0,0,0,0,1,0,0,0,0,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.550003,C22 C26,S,...,0,0,0,0,1,0,0,0,0,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.550003,C22 C26,S,...,1,0,0,0,1,0,0,0,0,0


# Test whether you can get feature names out of sklearn pipeline dummies

In [20]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer, SimpleImputer
import xgboost as xgb
from sklearn import set_config

set_config(transform_output="pandas")

target = 'survived'
numeric_cols = ['sibsp', 'parch', 'fare', 'age']
categorical_cols = ['pclass', 'sex', 'embarked']

X, y = data[numeric_cols + categorical_cols].copy(), np.asarray(data[target], dtype = 'int8')



TypeError: set_config() got an unexpected keyword argument 'transform_output'

In [None]:

skf = StratifiedKFold(n_splits=10, shuffle = True, random_state = 20230301)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 20230118)


ct_encode = ColumnTransformer([
#    ('scaler', StandardScaler(), numeric_cols), #if just this line, the pipeline will only return the four numeric columns, scaled
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols), #this line adds dummies for class (3 values), sex (2 values) and embarked (4 values)
    ('pass_through_numerics', 'passthrough', numeric_cols)
])

ct_disc = ColumnTransformer(
    transformers = [('disc_age', KBinsDiscretizer(n_bins = 10, encode = 'ordinal'), [-1]),
                    ('disc_fare', KBinsDiscretizer(n_bins = 15, encode = 'ordinal'), [-2]),
                    #('disc_cabno', StandardScaler(), [-3])
                    ],
    remainder = 'passthrough'
)

ct_scale = ColumnTransformer(
    transformers = [('scale', StandardScaler(), [-1, -2])],
    remainder = 'passthrough'
)

xgb_model = xgb.XGBClassifier(eval_metric = 'logloss')

pipe = Pipeline([
    ('encode_cats', ct_encode),
    ('imputer', IterativeImputer()),
    ('disc', ct_disc),
    ('clf', xgb_model),
])

pipe.fit(X_train, y_train)

In [None]:
pipe.feature_names_in_

In [None]:
pipe.named_steps['encode_cats'].get_feature_names_out()

In [None]:
pipe.named_steps['imputer'].feature_names_in_

In [None]:
pipe.named_steps['imputer'].get_feature_names_out()

In [None]:
pipe.named_steps['disc'].get_feature_names_out()

In [None]:
pipe.named_steps['clf'].feature_names_in

# Appendix: Annoying times working with sklearn OneHotEncoder

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# enc = OneHotEncoder()
# X = X[['Sex', 'Pclass', 'Embarked']].copy()
# enc.fit_transform(X).toarray()

In [None]:
# enc.get_feature_names_out()

In [None]:
# cat_features = ['Pclass', 'Sex', 'Embarked']
# encoded_features = []
# dfs = [X]

# for df in dfs:
#     for feature in cat_features:
#         encoded_feat = OneHotEncoder().fit_transform(df[feature].values.reshape(-1, 1)).toarray()
#         n = df[feature].nunique()
#         cols = ['{}_{}'.format(feature, n) for n in range(1, n + 1)]
#         encoded_df = pd.XFrame(encoded_feat, columns=cols)
#         encoded_df.index = df.index
#         encoded_features.append(encoded_df)


In [None]:
# cols 

In [None]:
# encoded_feat