In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the raw passenger table; the path is relative to the notebook's working directory.
df = pd.read_csv('Titanic Dataset.csv')

# Target vector, and a feature frame without the target and the
# 'name' / 'ticket' / 'home.dest' columns.
y = df['survived']
X = df.drop(columns=['survived', 'name', 'ticket', 'home.dest'])

# Feature groups used by the preprocessing cells to decide how each column is
# imputed and encoded.
cat_columns = ['sex', 'cabin', 'embarked', 'boat']
num_columns = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'body']

In [None]:
# Share of missing values per feature. Despite the name, the values are
# fractions in [0, 1], not percentages.
nan_percentage = X.isna().mean()

# Features where more than 1% of the rows are missing.
nan_columns = nan_percentage.loc[nan_percentage.gt(0.01)].index

nan_percentage

pclass      0.000000
sex         0.000000
age         0.200917
sibsp       0.000000
parch       0.000000
fare        0.000764
cabin       0.774637
embarked    0.001528
boat        0.628724
body        0.907563
dtype: float64

In [None]:
# For every feature with a noticeable share of missing values, tabulate how
# often it is missing (1) versus present (0) within each level of 'sex' and 'boat'.
for target_col in nan_columns:
    # 1 marks a missing entry in target_col, 0 a present one.
    is_missing = X[target_col].isnull().astype(int)
    for group_col in ('sex', 'boat'):
        if group_col == target_col:
            # A column is not cross-tabulated against its own missingness.
            continue
        # normalize='index' turns each row into proportions that sum to 1.
        table = pd.crosstab(X[group_col], is_missing, normalize='index')
        print(table, end='\n\n')

age            0         1
sex                       
female  0.832618  0.167382
male    0.780546  0.219454

age             0         1
boat                       
1        0.800000  0.200000
10       0.931034  0.068966
11       0.960000  0.040000
12       1.000000  0.000000
13       0.794872  0.205128
13 15    1.000000  0.000000
13 15 B  1.000000  0.000000
14       0.909091  0.090909
15       0.918919  0.081081
15 16    0.000000  1.000000
16       0.434783  0.565217
2        1.000000  0.000000
3        0.923077  0.076923
4        0.967742  0.032258
5        0.888889  0.111111
5 7      0.500000  0.500000
5 9      1.000000  0.000000
6        0.850000  0.150000
7        0.869565  0.130435
8        0.956522  0.043478
8 10     1.000000  0.000000
9        0.920000  0.080000
A        0.818182  0.181818
B        0.777778  0.222222
C        0.736842  0.263158
C D      1.000000  0.000000
D        0.650000  0.350000

cabin          0         1
sex                       
female  0.302575  0.6974

In [None]:
def baseline_preprocessing(
    X: pd.DataFrame,
    max_nan_fraction: float = 0.1,
    numeric_columns=None,
    categorical_columns=None,
) -> pd.DataFrame:
    """Drop sparse columns and impute the remaining missing values.

    Columns whose fraction of missing values exceeds ``max_nan_fraction`` are
    removed. In the surviving columns, missing numeric values are replaced by
    the column mean and missing categorical values by the column mode. All
    statistics are computed from the frame passed in, so call this on training
    rows only if the statistics must not see held-out data.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame. It is copied; the caller's frame is not modified.
    max_nan_fraction : float, default 0.1
        Largest tolerated fraction of missing values (0-1) for a column to be
        kept.
    numeric_columns, categorical_columns : collection of str, optional
        Column names to treat as numeric / categorical. When omitted, the
        notebook-level ``num_columns`` / ``cat_columns`` lists are used.

    Returns
    -------
    pd.DataFrame
        A new frame with the sparse columns removed and the listed columns
        imputed. Columns in neither list are passed through unchanged.
    """
    # Fall back to the notebook-level column lists when none are supplied.
    if numeric_columns is None:
        numeric_columns = num_columns
    if categorical_columns is None:
        categorical_columns = cat_columns

    X = X.copy()
    nan_fraction = X.isna().mean()
    X = X.drop(columns=nan_fraction[nan_fraction > max_nan_fraction].index)

    # Everything still in X passed the threshold and is eligible for imputation.
    num_to_fill = [c for c in X.columns if c in numeric_columns]
    cat_to_fill = [c for c in X.columns if c in categorical_columns]

    if num_to_fill:
        X[num_to_fill] = X[num_to_fill].fillna(X[num_to_fill].mean())

    if cat_to_fill:
        modes = X[cat_to_fill].mode()
        # mode() has no rows when there is nothing to count; indexing row 0
        # unconditionally would raise IndexError.
        if not modes.empty:
            X[cat_to_fill] = X[cat_to_fill].fillna(modes.iloc[0])
    return X

# Impute / drop columns first, keeping the cleaned frame under its own name.
X_baseline = baseline_preprocessing(X)

# Only columns that survived preprocessing can be handed to the transformers.
baseline_num = X_baseline.columns.intersection(num_columns)
baseline_cat = X_baseline.columns.intersection(cat_columns)

# Standardise the numeric features and one-hot encode the categorical ones.
baseline_transformer = ColumnTransformer(transformers=[
    ('num', StandardScaler(), baseline_num),
    ('cat', OneHotEncoder(handle_unknown='ignore'), baseline_cat),
])
X_prep = baseline_transformer.fit_transform(X_baseline)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import statsmodels.api as sm

# Cross-validated ROC AUC of a logistic regression on the baseline features.
# Per-fold scores are collected and averaged over however many folds KFold
# produced, rather than dividing by a hard-coded fold count.
# NOTE(review): X_prep was produced by fit_transform on all rows before this
# split, so the imputation and scaling statistics include each held-out fold.
# NOTE(review): no constant column is added to the design matrix; presumably
# the one-hot columns take the place of an intercept - confirm.
fold_scores = []
kf = KFold(shuffle=True, random_state=42)

for train_idx, test_idx in kf.split(X, y):
    X_train, y_train = X_prep[train_idx], y.iloc[train_idx]
    X_test, y_test = X_prep[test_idx], y.iloc[test_idx]

    model = sm.Logit(y_train, X_train).fit(disp=0)
    y_pred = model.predict(X_test)
    fold_scores.append(roc_auc_score(y_test, y_pred))

print(f"Baseline ROC AUC: {sum(fold_scores) / len(fold_scores):.3f}")

Baseline ROC AUC: 0.827




In [None]:
def advanced_preprocessing(
    X: pd.DataFrame,
    names=None,
    max_nan_fraction: float = 0.1,
    numeric_columns=None,
    categorical_columns=None,
) -> pd.DataFrame:
    """Preprocess features with a cabin-missing flag and title-based age imputation.

    Steps, in order:

    1. ``cabin`` is replaced by a 0/1 indicator that is 1 where the cabin
       value was missing.
    2. A ``title`` column is extracted from the passenger names (the first
       run of letters that follows a space and ends with a period).
    3. Missing ``age`` values are filled with the median age of passengers
       sharing the same title.
    4. Columns whose missing fraction still exceeds ``max_nan_fraction`` are
       dropped; in the rest, numeric gaps get the column mean and categorical
       gaps the column mode.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame containing at least ``cabin`` and ``age``. It is copied;
        the caller's frame is not modified.
    names : pd.Series, optional
        Passenger names, aligned with ``X`` by index. When omitted, the
        notebook-level ``df['name']`` is used.
    max_nan_fraction : float, default 0.1
        Largest tolerated fraction of missing values (0-1) for a column to be
        kept.
    numeric_columns, categorical_columns : collection of str, optional
        Column names to treat as numeric / categorical. When omitted, the
        notebook-level ``num_columns`` / ``cat_columns`` lists are used.

    Returns
    -------
    pd.DataFrame
        A new frame. The helper ``title`` column is kept in the result unless
        its own missing fraction exceeds the threshold; it belongs to neither
        column list, so it is not imputed.
    """
    # Fall back to notebook-level state when the caller supplies nothing.
    if names is None:
        names = df['name']
    if numeric_columns is None:
        numeric_columns = num_columns
    if categorical_columns is None:
        categorical_columns = cat_columns

    X = X.copy()
    # Keep only whether a cabin was recorded, not its value.
    X['cabin'] = X['cabin'].isna().astype(int)

    X['title'] = names.str.extract(r' ([A-Za-z]+)\.', expand=False)
    title_age_medians = X.groupby('title')['age'].median()
    # Ages that stay missing here (no title, or a title with no known age)
    # are handled by the mean imputation further down.
    X['age'] = X['age'].fillna(X['title'].map(title_age_medians))

    # Missing fractions are measured after the age imputation above.
    nan_fraction = X.isna().mean()
    X = X.drop(columns=nan_fraction[nan_fraction > max_nan_fraction].index)

    num_to_fill = [c for c in X.columns if c in numeric_columns]
    cat_to_fill = [c for c in X.columns if c in categorical_columns]

    if num_to_fill:
        X[num_to_fill] = X[num_to_fill].fillna(X[num_to_fill].mean())

    if cat_to_fill:
        modes = X[cat_to_fill].mode()
        # mode() has no rows when there is nothing to count; indexing row 0
        # unconditionally would raise IndexError.
        if not modes.empty:
            X[cat_to_fill] = X[cat_to_fill].fillna(modes.iloc[0])
    return X

# Run the advanced cleaning, keeping the cleaned frame under its own name.
X_advanced = advanced_preprocessing(X)

# Only columns that survived preprocessing and appear in the notebook's column
# lists are handed to the transformers.
advanced_num = X_advanced.columns.intersection(num_columns)
advanced_cat = X_advanced.columns.intersection(cat_columns)

# Standardise the numeric features and one-hot encode the categorical ones.
advanced_transformer = ColumnTransformer(transformers=[
    ('num', StandardScaler(), advanced_num),
    ('cat', OneHotEncoder(handle_unknown='ignore'), advanced_cat),
])
X_prep_adv = advanced_transformer.fit_transform(X_advanced)

In [None]:
# Cross-validated ROC AUC of a logistic regression on the advanced features.
# Per-fold scores are collected and averaged over however many folds KFold
# produced, rather than dividing by a hard-coded fold count.
# NOTE(review): X_prep_adv was produced by fit_transform on all rows before
# this split, so the imputation and scaling statistics include each held-out fold.
fold_scores = []
kf = KFold(shuffle=True, random_state=42)

for train_idx, test_idx in kf.split(X, y):
    X_train, y_train = X_prep_adv[train_idx], y.iloc[train_idx]
    X_test, y_test = X_prep_adv[test_idx], y.iloc[test_idx]

    model = sm.Logit(y_train, X_train).fit(disp=0)
    y_pred = model.predict(X_test)
    fold_scores.append(roc_auc_score(y_test, y_pred))

print(f"Advanced ROC AUC: {sum(fold_scores) / len(fold_scores):.3f}")

Advanced ROC AUC: 0.848


