# Survivability on Titanic (with Advanced Feature Engineering)

In [1]:
import pandas as pd
import numpy as np

import src.utils as utils

In [2]:
df_train = pd.read_csv('input/train.csv')
X_train = df_train.drop('Survived', axis=1)
y_train = df_train['Survived'].values
df_test = pd.read_csv('input/test.csv')
X_test = df_test.copy()

In [3]:
utils.skim_data(df_train)

Total duplicate rows: 0
DF shape: (891, 12)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Survived,int64,0.0,0.0,61.616,2,0.22,"[0, 1]"
2,Pclass,int64,0.0,0.0,0.0,3,0.34,"[3, 1, 2]"
3,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
4,Sex,object,0.0,-,-,2,0.22,"[male, female]"
5,Age,float64,19.865,0.0,0.0,88,9.88,"[22.0, 38.0, 26.0, 35.0, 54.0]"
6,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
7,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
8,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
9,Fare,float64,0.0,0.0,1.684,248,27.83,"[7.25, 71.2833, 7.925, 53.1, 8.05]"


In [4]:
utils.skim_data(X_test)

Total duplicate rows: 0
DF shape: (418, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,418,100.0,"[892, 893, 894, 895, 896]"
1,Pclass,int64,0.0,0.0,0.0,3,0.72,"[3, 2, 1]"
2,Name,object,0.0,-,-,418,100.0,"[Kelly, Mr. James, Wilkes, Mrs. James (Ellen N..."
3,Sex,object,0.0,-,-,2,0.48,"[male, female]"
4,Age,float64,20.574,0.0,0.0,79,18.9,"[34.5, 47.0, 62.0, 27.0, 22.0]"
5,SibSp,int64,0.0,0.0,67.703,7,1.67,"[0, 1, 2, 3, 4]"
6,Parch,int64,0.0,0.0,77.512,8,1.91,"[0, 1, 3, 2, 4]"
7,Ticket,object,0.0,-,-,363,86.84,"[330911, 363272, 240276, 315154, 3101298]"
8,Fare,float64,0.239,0.0,0.478,169,40.43,"[7.8292, 7.0, 9.6875, 8.6625, 12.2875]"
9,Cabin,object,78.23,-,-,76,18.18,"[B45, E31, B57 B59 B63 B66, B36, A21]"


## Data Preprocessing

### Age

In [5]:
df_age = df_train.groupby(['Sex', 'Pclass'])['Age'].median().reset_index(drop=False)
df_age

Unnamed: 0,Sex,Pclass,Age
0,female,1,35.0
1,female,2,28.0
2,female,3,21.5
3,male,1,40.0
4,male,2,30.0
5,male,3,25.0


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class AgeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.age_agg_ = X.groupby(['Sex', 'Pclass'])['Age'].median()
        return self

    def transform(self, X):
        X_processed = X.copy()
        mapped_ages = X_processed.apply(lambda row: self.age_agg_[row['Sex'], row['Pclass']], axis=1)
        X_processed['Age'] = X_processed['Age'].fillna(mapped_ages)
        return X_processed

age_transformer = AgeTransformer()
X_train = age_transformer.fit_transform(X_train)
X_test = age_transformer.transform(X_test)

In [7]:
utils.skim_data(X_train)

Total duplicate rows: 0
DF shape: (891, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Pclass,int64,0.0,0.0,0.0,3,0.34,"[3, 1, 2]"
2,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
3,Sex,object,0.0,-,-,2,0.22,"[male, female]"
4,Age,float64,0.0,0.0,0.0,89,9.99,"[22.0, 38.0, 26.0, 35.0, 25.0]"
5,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
6,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
7,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
8,Fare,float64,0.0,0.0,1.684,248,27.83,"[7.25, 71.2833, 7.925, 53.1, 8.05]"
9,Cabin,object,77.104,-,-,147,16.5,"[C85, C123, E46, G6, C103]"


In [8]:
utils.skim_data(X_test)

Total duplicate rows: 0
DF shape: (418, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,418,100.0,"[892, 893, 894, 895, 896]"
1,Pclass,int64,0.0,0.0,0.0,3,0.72,"[3, 2, 1]"
2,Name,object,0.0,-,-,418,100.0,"[Kelly, Mr. James, Wilkes, Mrs. James (Ellen N..."
3,Sex,object,0.0,-,-,2,0.48,"[male, female]"
4,Age,float64,0.0,0.0,0.0,80,19.14,"[34.5, 47.0, 62.0, 27.0, 22.0]"
5,SibSp,int64,0.0,0.0,67.703,7,1.67,"[0, 1, 2, 3, 4]"
6,Parch,int64,0.0,0.0,77.512,8,1.91,"[0, 1, 3, 2, 4]"
7,Ticket,object,0.0,-,-,363,86.84,"[330911, 363272, 240276, 315154, 3101298]"
8,Fare,float64,0.239,0.0,0.478,169,40.43,"[7.8292, 7.0, 9.6875, 8.6625, 12.2875]"
9,Cabin,object,78.23,-,-,76,18.18,"[B45, E31, B57 B59 B63 B66, B36, A21]"


### Cabin

In [9]:
def transform_cabin(X_train, X_test):
    X_train_proc = X_train.copy()
    X_test_proc = X_test.copy()

    # label missing values with 'M'
    X_train_proc['Deck'] = X_train_proc['Cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'M')
    X_test_proc['Deck'] = X_test_proc['Cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'M')

    # replace T value to A
    idx = X_train_proc[X_train_proc['Deck'] == 'T'].index
    X_train_proc.loc[idx, 'Deck'] = 'A'
    idx = X_test_proc[X_test_proc['Deck'] == 'T'].index
    X_test_proc.loc[idx, 'Deck'] = 'A'

    # group deck based on similarity
    X_train_proc['Deck'] = X_train_proc['Deck'].replace(['A', 'B', 'C'], 'ABC')
    X_train_proc['Deck'] = X_train_proc['Deck'].replace(['D', 'E'], 'DE')
    X_train_proc['Deck'] = X_train_proc['Deck'].replace(['F', 'G'], 'FG')

    X_test_proc['Deck'] = X_test_proc['Deck'].replace(['A', 'B', 'C'], 'ABC')
    X_test_proc['Deck'] = X_test_proc['Deck'].replace(['D', 'E'], 'DE')
    X_test_proc['Deck'] = X_test_proc['Deck'].replace(['F', 'G'], 'FG')

    # drop unused cabin
    X_train_proc.drop(['Cabin'], inplace=True, axis=1)
    X_test_proc.drop(['Cabin'], inplace=True, axis=1)

    return X_train_proc, X_test_proc

X_train, X_test = transform_cabin(X_train, X_test)
utils.skim_data(X_train)

Total duplicate rows: 0
DF shape: (891, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Pclass,int64,0.0,0.0,0.0,3,0.34,"[3, 1, 2]"
2,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
3,Sex,object,0.0,-,-,2,0.22,"[male, female]"
4,Age,float64,0.0,0.0,0.0,89,9.99,"[22.0, 38.0, 26.0, 35.0, 25.0]"
5,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
6,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
7,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
8,Fare,float64,0.0,0.0,1.684,248,27.83,"[7.25, 71.2833, 7.925, 53.1, 8.05]"
9,Embarked,object,0.224,-,-,3,0.34,"[S, C, Q]"


### Fare

In [10]:
class FareTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.fare_agg_ = X.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].median()
        self.global_med_ = X['Fare'].median()
        return self

    def transform(self, X):
        X_processed = X.copy()
        mapped_fare = X_processed.apply(
            lambda row: self.fare_agg_.get(
                (row['Pclass'], row['Parch'], row['SibSp']),
                self.global_med_
            ),
            axis=1
        )
        X_processed['Fare'] = X_processed['Fare'].fillna(mapped_fare)
        return X_processed

fare_transformer = FareTransformer()
X_train = fare_transformer.fit_transform(X_train)
X_test = fare_transformer.transform(X_test)

utils.skim_data(X_test)

Total duplicate rows: 0
DF shape: (418, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,418,100.0,"[892, 893, 894, 895, 896]"
1,Pclass,int64,0.0,0.0,0.0,3,0.72,"[3, 2, 1]"
2,Name,object,0.0,-,-,418,100.0,"[Kelly, Mr. James, Wilkes, Mrs. James (Ellen N..."
3,Sex,object,0.0,-,-,2,0.48,"[male, female]"
4,Age,float64,0.0,0.0,0.0,80,19.14,"[34.5, 47.0, 62.0, 27.0, 22.0]"
5,SibSp,int64,0.0,0.0,67.703,7,1.67,"[0, 1, 2, 3, 4]"
6,Parch,int64,0.0,0.0,77.512,8,1.91,"[0, 1, 3, 2, 4]"
7,Ticket,object,0.0,-,-,363,86.84,"[330911, 363272, 240276, 315154, 3101298]"
8,Fare,float64,0.0,0.0,0.478,169,40.43,"[7.8292, 7.0, 9.6875, 8.6625, 12.2875]"
9,Embarked,object,0.0,-,-,3,0.72,"[Q, S, C]"


### Embarked

In [11]:
utils.skim_data(df_train)

Total duplicate rows: 0
DF shape: (891, 12)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Survived,int64,0.0,0.0,61.616,2,0.22,"[0, 1]"
2,Pclass,int64,0.0,0.0,0.0,3,0.34,"[3, 1, 2]"
3,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
4,Sex,object,0.0,-,-,2,0.22,"[male, female]"
5,Age,float64,19.865,0.0,0.0,88,9.88,"[22.0, 38.0, 26.0, 35.0, 54.0]"
6,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
7,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
8,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
9,Fare,float64,0.0,0.0,1.684,248,27.83,"[7.25, 71.2833, 7.925, 53.1, 8.05]"


In [12]:
utils.skim_data(X_test)

Total duplicate rows: 0
DF shape: (418, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,418,100.0,"[892, 893, 894, 895, 896]"
1,Pclass,int64,0.0,0.0,0.0,3,0.72,"[3, 2, 1]"
2,Name,object,0.0,-,-,418,100.0,"[Kelly, Mr. James, Wilkes, Mrs. James (Ellen N..."
3,Sex,object,0.0,-,-,2,0.48,"[male, female]"
4,Age,float64,0.0,0.0,0.0,80,19.14,"[34.5, 47.0, 62.0, 27.0, 22.0]"
5,SibSp,int64,0.0,0.0,67.703,7,1.67,"[0, 1, 2, 3, 4]"
6,Parch,int64,0.0,0.0,77.512,8,1.91,"[0, 1, 3, 2, 4]"
7,Ticket,object,0.0,-,-,363,86.84,"[330911, 363272, 240276, 315154, 3101298]"
8,Fare,float64,0.0,0.0,0.478,169,40.43,"[7.8292, 7.0, 9.6875, 8.6625, 12.2875]"
9,Embarked,object,0.0,-,-,3,0.72,"[Q, S, C]"


In [13]:
X_train[X_train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,,ABC
829,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,,ABC


In [14]:
from sklearn.impute import SimpleImputer

embarked_imputer = SimpleImputer(strategy='most_frequent')
embarked_imputer.set_output(transform='pandas')
X_train['Embarked'] = embarked_imputer.fit_transform(X_train[['Embarked']])
X_test['Embarked'] = embarked_imputer.transform(df_test[['Embarked']])

utils.skim_data(X_train)

Total duplicate rows: 0
DF shape: (891, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Pclass,int64,0.0,0.0,0.0,3,0.34,"[3, 1, 2]"
2,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
3,Sex,object,0.0,-,-,2,0.22,"[male, female]"
4,Age,float64,0.0,0.0,0.0,89,9.99,"[22.0, 38.0, 26.0, 35.0, 25.0]"
5,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
6,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
7,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
8,Fare,float64,0.0,0.0,1.684,248,27.83,"[7.25, 71.2833, 7.925, 53.1, 8.05]"
9,Embarked,object,0.0,-,-,3,0.34,"[S, C, Q]"


## Feature Engineering

### Fare

In [15]:
_, bins = pd.qcut(X_train['Fare'], 13, retbins=True, labels=False, duplicates='drop')
bins[0] = -np.inf
bins[-1] = np.inf

X_train['Fare'] = pd.cut(X_train['Fare'], bins=bins, labels=False)
X_test['Fare'] = pd.cut(X_test['Fare'], bins=bins, labels=False)

In [16]:
utils.skim_data(X_train)

Total duplicate rows: 0
DF shape: (891, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Pclass,int64,0.0,0.0,0.0,3,0.34,"[3, 1, 2]"
2,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
3,Sex,object,0.0,-,-,2,0.22,"[male, female]"
4,Age,float64,0.0,0.0,0.0,89,9.99,"[22.0, 38.0, 26.0, 35.0, 25.0]"
5,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
6,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
7,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
8,Fare,int64,0.0,0.0,7.856,13,1.46,"[1, 11, 3, 10, 4]"
9,Embarked,object,0.0,-,-,3,0.34,"[S, C, Q]"


### Age

In [17]:
_, age_bins = pd.qcut(X_train['Age'], 10, retbins=True, labels=False, duplicates='drop')
age_bins[0] = -np.inf
age_bins[-1] = np.inf

X_train['Age'] = pd.cut(X_train['Age'], bins=age_bins, labels=False)
X_test['Age'] = pd.cut(X_test['Age'], bins=age_bins, labels=False)

utils.skim_data(X_train)

Total duplicate rows: 0
DF shape: (891, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Pclass,int64,0.0,0.0,0.0,3,0.34,"[3, 1, 2]"
2,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
3,Sex,object,0.0,-,-,2,0.22,"[male, female]"
4,Age,int64,0.0,0.0,11.223,10,1.12,"[2, 7, 4, 3, 9]"
5,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
6,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
7,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
8,Fare,int64,0.0,0.0,7.856,13,1.46,"[1, 11, 3, 10, 4]"
9,Embarked,object,0.0,-,-,3,0.34,"[S, C, Q]"


In [18]:
utils.skim_data(X_test)

Total duplicate rows: 0
DF shape: (418, 11)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,418,100.0,"[892, 893, 894, 895, 896]"
1,Pclass,int64,0.0,0.0,0.0,3,0.72,"[3, 2, 1]"
2,Name,object,0.0,-,-,418,100.0,"[Kelly, Mr. James, Wilkes, Mrs. James (Ellen N..."
3,Sex,object,0.0,-,-,2,0.48,"[male, female]"
4,Age,int64,0.0,0.0,8.134,10,2.39,"[7, 8, 9, 5, 2]"
5,SibSp,int64,0.0,0.0,67.703,7,1.67,"[0, 1, 2, 3, 4]"
6,Parch,int64,0.0,0.0,77.512,8,1.91,"[0, 1, 3, 2, 4]"
7,Ticket,object,0.0,-,-,363,86.84,"[330911, 363272, 240276, 315154, 3101298]"
8,Fare,int64,0.0,0.0,6.938,13,3.11,"[2, 0, 4, 5, 1]"
9,Embarked,object,0.0,-,-,3,0.72,"[Q, S, C]"


### Family Size

In [19]:
X_train['FamilySize'] = X_train['SibSp'] + X_train['Parch'] + 1

In [20]:
X_test['FamilySize'] = X_test['SibSp'] + X_test['Parch'] + 1

### Title

In [21]:
X_train['Title'] = X_train['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

In [22]:
X_test['Title'] = X_test['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

### IsMarried

In [23]:
X_train['IsMarried'] = 0
X_train.loc[X_train['Title'] == 'Mrs', 'IsMarried'] = 1

In [24]:
X_test['IsMarried'] = 0
X_test.loc[X_test['Title'] == 'Mrs', 'IsMarried'] = 1

In [25]:
X_train['Title'] = X_train['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
X_train['Title'] = X_train['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')

In [26]:
X_test['Title'] = X_test['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
X_test['Title'] = X_test['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')

In [27]:
utils.skim_data(X_train)

Total duplicate rows: 0
DF shape: (891, 14)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Pclass,int64,0.0,0.0,0.0,3,0.34,"[3, 1, 2]"
2,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
3,Sex,object,0.0,-,-,2,0.22,"[male, female]"
4,Age,int64,0.0,0.0,11.223,10,1.12,"[2, 7, 4, 3, 9]"
5,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
6,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
7,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
8,Fare,int64,0.0,0.0,7.856,13,1.46,"[1, 11, 3, 10, 4]"
9,Embarked,object,0.0,-,-,3,0.34,"[S, C, Q]"


In [28]:
utils.skim_data(X_test)

Total duplicate rows: 0
DF shape: (418, 14)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,418,100.0,"[892, 893, 894, 895, 896]"
1,Pclass,int64,0.0,0.0,0.0,3,0.72,"[3, 2, 1]"
2,Name,object,0.0,-,-,418,100.0,"[Kelly, Mr. James, Wilkes, Mrs. James (Ellen N..."
3,Sex,object,0.0,-,-,2,0.48,"[male, female]"
4,Age,int64,0.0,0.0,8.134,10,2.39,"[7, 8, 9, 5, 2]"
5,SibSp,int64,0.0,0.0,67.703,7,1.67,"[0, 1, 2, 3, 4]"
6,Parch,int64,0.0,0.0,77.512,8,1.91,"[0, 1, 3, 2, 4]"
7,Ticket,object,0.0,-,-,363,86.84,"[330911, 363272, 240276, 315154, 3101298]"
8,Fare,int64,0.0,0.0,6.938,13,3.11,"[2, 0, 4, 5, 1]"
9,Embarked,object,0.0,-,-,3,0.72,"[Q, S, C]"


## Feature Transformation

In [29]:
from sklearn.preprocessing import LabelEncoder

non_numeric_features = ['Sex', 'Age', 'Fare', 'Embarked', 'Deck', 'Title']
label_enc = LabelEncoder()

for feature in non_numeric_features:
    X_train[feature] = label_enc.fit_transform(X_train[feature])
    X_test[feature] = label_enc.transform(X_test[feature])

In [30]:
utils.skim_data(X_train)

Total duplicate rows: 0
DF shape: (891, 14)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Pclass,int64,0.0,0.0,0.0,3,0.34,"[3, 1, 2]"
2,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
3,Sex,int64,0.0,0.0,35.241,2,0.22,"[1, 0]"
4,Age,int64,0.0,0.0,11.223,10,1.12,"[2, 7, 4, 3, 9]"
5,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
6,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
7,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
8,Fare,int64,0.0,0.0,7.856,13,1.46,"[1, 11, 3, 10, 4]"
9,Embarked,int64,0.0,0.0,18.855,3,0.34,"[2, 0, 1]"


In [31]:
from sklearn.preprocessing import OneHotEncoder

cat_features = ['Pclass', 'Sex', 'Deck', 'Embarked', 'Title']
ohe_enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe_enc.set_output(transform='pandas')
ohe_enc.fit(X_test[cat_features])

X_train_encoded = ohe_enc.transform(X_train[cat_features])
X_test_encoded = ohe_enc.transform(X_test[cat_features])

X_train = X_train.drop(columns=cat_features)
X_train = pd.concat([X_train, X_train_encoded], axis=1)

X_test = X_test.drop(columns=cat_features)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

utils.skim_data(X_train)

Total duplicate rows: 0
DF shape: (891, 25)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,PassengerId,int64,0.0,0.0,0.0,891,100.0,"[1, 2, 3, 4, 5]"
1,Name,object,0.0,-,-,891,100.0,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B..."
2,Age,int64,0.0,0.0,11.223,10,1.12,"[2, 7, 4, 3, 9]"
3,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
4,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
5,Ticket,object,0.0,-,-,681,76.43,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803..."
6,Fare,int64,0.0,0.0,7.856,13,1.46,"[1, 11, 3, 10, 4]"
7,FamilySize,int64,0.0,0.0,0.0,9,1.01,"[2, 1, 5, 3, 7]"
8,IsMarried,int64,0.0,0.0,85.971,2,0.22,"[0, 1]"
9,Pclass_1,float64,0.0,0.0,75.758,2,0.22,"[0.0, 1.0]"


In [32]:
useless_features = ['PassengerId', 'Name', 'Ticket']

X_train = X_train.drop(columns=useless_features)
X_test = X_test.drop(columns=useless_features)

utils.skim_data(X_train)

Total duplicate rows: 330
DF shape: (891, 22)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,Age,int64,0.0,0.0,11.223,10,1.12,"[2, 7, 4, 3, 9]"
1,SibSp,int64,0.0,0.0,68.238,7,0.79,"[1, 0, 3, 4, 2]"
2,Parch,int64,0.0,0.0,76.094,7,0.79,"[0, 1, 2, 5, 3]"
3,Fare,int64,0.0,0.0,7.856,13,1.46,"[1, 11, 3, 10, 4]"
4,FamilySize,int64,0.0,0.0,0.0,9,1.01,"[2, 1, 5, 3, 7]"
5,IsMarried,int64,0.0,0.0,85.971,2,0.22,"[0, 1]"
6,Pclass_1,float64,0.0,0.0,75.758,2,0.22,"[0.0, 1.0]"
7,Pclass_2,float64,0.0,0.0,79.349,2,0.22,"[0.0, 1.0]"
8,Pclass_3,float64,0.0,0.0,44.893,2,0.22,"[1.0, 0.0]"
9,Sex_0,float64,0.0,0.0,64.759,2,0.22,"[0.0, 1.0]"


## Training Model

In [33]:
utils.skim_data(X_test)

Total duplicate rows: 105
DF shape: (418, 22)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,Age,int64,0.0,0.0,8.134,10,2.39,"[7, 8, 9, 5, 2]"
1,SibSp,int64,0.0,0.0,67.703,7,1.67,"[0, 1, 2, 3, 4]"
2,Parch,int64,0.0,0.0,77.512,8,1.91,"[0, 1, 3, 2, 4]"
3,Fare,int64,0.0,0.0,6.938,13,3.11,"[2, 0, 4, 5, 1]"
4,FamilySize,int64,0.0,0.0,0.0,9,2.15,"[1, 2, 3, 5, 4]"
5,IsMarried,int64,0.0,0.0,82.775,2,0.48,"[0, 1]"
6,Pclass_1,float64,0.0,0.0,74.402,2,0.48,"[0.0, 1.0]"
7,Pclass_2,float64,0.0,0.0,77.751,2,0.48,"[0.0, 1.0]"
8,Pclass_3,float64,0.0,0.0,47.847,2,0.48,"[1.0, 0.0]"
9,Sex_0,float64,0.0,0.0,63.636,2,0.48,"[0.0, 1.0]"


### Dummy (Benchmark)

In [34]:
from sklearn.dummy import DummyClassifier

def train_dummy_model(X_train, y_train, X_test):
    model = DummyClassifier(strategy='most_frequent', random_state=29)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    utils.create_submission(df_test['PassengerId'], y_pred)

train_dummy_model(X_train, y_train, X_test)


Submission file 'input/submission_1765525499.825681.csv' created successfully!


Kaggle result: 0.622

### RandomForest (Base)

In [35]:
from sklearn.ensemble import RandomForestClassifier

def train_base_model(X_train, y_train, X_test):
    model = RandomForestClassifier(random_state=29, n_jobs=-1, verbose=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    utils.create_submission(df_test['PassengerId'], y_pred)

train_base_model(X_train, y_train, X_test)


Submission file 'input/submission_1765525508.204384.csv' created successfully!


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


Kaggle result: 0.73923

### RandomForest (Best)

In [36]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

def train_best_rf_model(X_train, y_train, X_test):
    model = RandomForestClassifier(
        random_state=29,
    )
    param_grid = {
        'n_estimators': [100, 200, 500, 1000],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': [None, 5, 7, 11, 13],
        'min_samples_split': [2, 4, 6, 8],
        'min_samples_leaf': [1, 3, 5, 7],
        'max_features': ['sqrt', 'log2', None]
    }
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=20,
        cv=StratifiedKFold(shuffle=True, random_state=29),
        n_jobs=4,
        verbose=1
    )
    random_search.fit(X_train, y_train)
    y_pred = random_search.predict(X_test)
    utils.create_submission(df_test['PassengerId'], y_pred)

train_best_rf_model(X_train, y_train, X_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits

Submission file 'input/submission_1765525533.014714.csv' created successfully!


Kaggle result: 0.77751

### Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

def train_best_logreg_model(X_train, y_train, X_test):
    model = LogisticRegression(random_state=29)
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [np.inf, 1.0, 0.1, 0.01],
        'class_weight': ['balanced'],
        'solver': ['liblinear', 'lbfgs', 'sag', 'saga']
    }
    grid_search = GridSearchCV(
        model,
        param_grid=param_grid,
        n_jobs=4,
        verbose=1,
        return_train_score=True
    )
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    utils.create_submission(df_test['PassengerId'], y_pred)

train_best_logreg_model(X_train, y_train, X_test)

Fitting 5 folds for each of 32 candidates, totalling 160 fits




KeyboardInterrupt: 