In [18]:
import pandas as pd
import numpy as np
from plotnine import *

# Read in data

In [19]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Fetch the OpenML "titanic" dataset (version 1) as pandas objects.
data_full = fetch_openml("titanic", version=1, as_frame=True)

# Columns removed from the table before any analysis.
drop_cols = ['boat', 'body', 'home.dest']

# Target dtypes, chosen to match the csv data types in kaggle.
kaggle_dtypes = {
    'pclass': 'int32',
    'sex': 'object',
    'sibsp': 'int32',
    'parch': 'int32',
    'fare': 'float32',
    'embarked': 'object',
    'survived': 'int32',
}

# Features and target side by side, minus the dropped columns, with the dtypes above.
data = (
    pd.concat([data_full['data'], data_full['target']], axis=1)
    .drop(columns=drop_cols)
    .astype(kaggle_dtypes)
)
data.head()



Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.337494,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.550003,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.550003,C22 C26,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.550003,C22 C26,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.550003,C22 C26,S,0


In [20]:
# Column dtypes and non-null counts; the counts reveal which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int32  
 1   name      1309 non-null   object 
 2   sex       1309 non-null   object 
 3   age       1046 non-null   float64
 4   sibsp     1309 non-null   int32  
 5   parch     1309 non-null   int32  
 6   ticket    1309 non-null   object 
 7   fare      1308 non-null   float32
 8   cabin     295 non-null    object 
 9   embarked  1307 non-null   object 
 10  survived  1309 non-null   int32  
dtypes: float32(1), float64(1), int32(4), object(5)
memory usage: 87.1+ KB


In [47]:
# Feature columns treated as numeric.
numeric_cols = [
    'age',
    'sibsp',
    'parch',
    'fare',
]

# Feature columns treated as categorical (pclass is stored as int32 but used as a label).
categorical_cols = [
    'pclass',
    'sex',
    'cabin',
    'embarked',
]

In [39]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,survived
count,1309.0,1046.0,1309.0,1309.0,1308.0,1309.0
mean,2.294882,29.881135,0.498854,0.385027,33.295479,0.381971
std,0.837836,14.4135,1.041658,0.86556,51.758671,0.486055
min,1.0,0.1667,0.0,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.8958,0.0
50%,3.0,28.0,0.0,0.0,14.4542,0.0
75%,3.0,39.0,1.0,0.0,31.275,1.0
max,3.0,80.0,8.0,9.0,512.329224,1.0


# Initial data exploration

Target distribution: the survival rate is the mean of `survived` (the same value appears in the `describe()` output above).

In [28]:
# survived is coded 0/1, so its mean is the fraction of passengers who survived.
data['survived'].mean()

0.3819709702062643

Fraction of null values in a column (the non-null counts are also available from `data.info()`).

In [38]:
def pct_null(df: pd.DataFrame, column: str) -> float:
    """Return the fraction (0.0-1.0) of rows in ``df`` whose ``column`` is null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : str
        Label of the column whose missing values are counted.

    Returns
    -------
    float
        Number of null entries divided by the number of rows. ``nan`` when
        ``df`` has no rows, since the fraction is undefined there.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    null_mask = df[column].isna()
    # An empty frame has no defined null fraction; report NaN instead of dividing by zero.
    if null_mask.empty:
        return float("nan")
    # The mean of a boolean mask is the share of True values, i.e. the null fraction.
    return float(null_mask.mean())

# Fraction of rows with a missing fare.
pct_null(data, 'fare')

0.0007639419404125286

Compute the Pearson correlation of each numeric feature with the target (`survived`).

In [42]:
# Print one "<feature>: <correlation>" line per numeric feature, measured against survived.
target_series = data['survived']
for col in numeric_cols:
    corr_with_target = data[col].corr(target_series)
    print(f"{col}: {corr_with_target}")

age: -0.055512520192146454

parch: 0.08265957038609854
fare: 0.24426546684798714


In [45]:
from scipy.stats import chi2_contingency

# Chi-squared test of independence between passenger class and survival.
class_by_survival = pd.crosstab(data['pclass'], data['survived'])
test_result = chi2_contingency(class_by_survival)

# Only the first two fields of the result (statistic and p-value) are kept.
chi2, p_val = test_result[0], test_result[1]

print(chi2)
print(p_val)

127.85915643930326
1.7208259588256175e-28


## Check for duplicates

In [21]:
# Number of distinct passenger names (a missing name, if any, counts as one value).
data['name'].nunique(dropna=False)

1307

In [22]:
# Rows whose name already appeared earlier in the table (first occurrences are not flagged).
repeated_name_mask = data['name'].duplicated()
data.loc[repeated_name_mask]

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
726,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,0
925,3,"Kelly, Mr. James",male,44.0,0,0,363592,8.05,,S,0


In [23]:
# Every row that shares its name with another row: keep=False flags all occurrences,
# not only the later ones, so the records can be compared side by side.
duplicates = data[data.duplicated(subset=['name'], keep=False)]

# End the cell on the frame itself so the notebook renders it as a table;
# print() falls back to plain text and wraps the columns across several blocks.
duplicates

     pclass                  name     sex   age  sibsp  parch  ticket    fare  \
725       3  Connolly, Miss. Kate  female  22.0      0      0  370373  7.7500   
726       3  Connolly, Miss. Kate  female  30.0      0      0  330972  7.6292   
924       3      Kelly, Mr. James    male  34.5      0      0  330911  7.8292   
925       3      Kelly, Mr. James    male  44.0      0      0  363592  8.0500   

    cabin embarked  survived  
725   NaN        Q         1  
726   NaN        Q         0  
924   NaN        Q         0  
925   NaN        S         0  


# Model training

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer, SimpleImputer
import xgboost as xgb
from sklearn import set_config

# Initial setup for classification: configure sklearn output, select the feature
# and target columns, hold out a test set and build the cross-validation splitter.

# Make sklearn transformers return pandas DataFrames instead of NumPy arrays.
set_config(transform_output="pandas")

numeric_cols = ['age', 'sibsp', 'parch', 'fare']
categorical_cols = ['pclass', 'sex', 'cabin', 'embarked']
target = ['survived']

# Copy so that X is independent of `data`.
X = data[numeric_cols + categorical_cols].copy()
# `target` is a list, so data[target] is a one-column DataFrame and the array built
# from it has shape (n, 1); ravel() flattens it into a 1-D label vector.
y = np.asarray(data[target], dtype = 'int8').ravel()

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): this split is not stratified although the CV below is — consider
# passing stratify=y if the class balance should match across train and test.
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X, y, test_size = 0.2, random_state = 20251207)

# Shuffled 10-fold stratified cross-validation with a fixed seed.
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 20251210)