Basic notebook that applies a simple scikit-learn prediction method to the Kaggle Titanic dataset

In [1]:
import os

In [2]:
# --- Kaggle competition and local file layout ---
DATASET_COMP = 'titanic'            # competition name passed to the Kaggle API
DATA_DIR = './data'                 # folder holding the archive and the extracted CSVs
DATA_ZIP = f'{DATASET_COMP}.zip'    # evaluates to 'titanic.zip'
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

# --- Modelling settings ---
VALIDATION_SPLIT = 0.2              # used as test_size when splitting off the validation set
TARGET_VAR = 'Survived'             # column to predict

In [3]:
# Download and unpack the dataset from Kaggle (skipped when the files are already present).
# Create the data folder first: os.listdir raises FileNotFoundError on a missing directory,
# which is exactly the state of a fresh checkout.
os.makedirs(DATA_DIR, exist_ok=True)

if not os.listdir(DATA_DIR):
    print('data folder is empty. Downloading data...')
    # Imported lazily so the kaggle package is only required when a download is needed
    from kaggle.api.kaggle_api_extended import KaggleApi
    api = KaggleApi()
    api.authenticate()

    api.competition_download_files(DATASET_COMP, DATA_DIR, quiet = False)

# Extract only when the archive is present and the training CSV has not been unpacked yet
if not os.path.exists(os.path.join(DATA_DIR,TRAIN_CSV)) and os.path.exists(os.path.join(DATA_DIR,DATA_ZIP)):
    print('data set downloaded, unzipping...')
    import zipfile
    with zipfile.ZipFile(os.path.join(DATA_DIR, DATA_ZIP), 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)


In [4]:
# load data into pandas dataframe
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw competition CSVs
train_df = pd.read_csv(os.path.join(DATA_DIR, TRAIN_CSV))
test_df = pd.read_csv(os.path.join(DATA_DIR, TEST_CSV))

# Hold out a validation set from train_df; the target column is separated from the features
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns = TARGET_VAR),
    train_df[TARGET_VAR],
    test_size = VALIDATION_SPLIT,
    random_state = 42,
)

In [5]:
# Quick look at the training and validation splits
for split_name, split in (('X_train', X_train), ('X_val', X_val),
                          ('y_train', y_train), ('y_val', y_val)):
    print(f'{split_name} shape: ', split.shape)

X_train.head()


X_train shape:  (712, 11)
X_val shape:  (179, 11)
y_train shape:  (712,)
y_val shape:  (179,)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [6]:
# Report how many distinct values each object-dtype (categorical) column holds
for col, col_dtype in X_train.dtypes.items():
    if col_dtype != 'object':
        continue
    print(col, ': ', X_train[col].nunique())

Name :  712
Sex :  2
Ticket :  558
Cabin :  117
Embarked :  3


### Preprocessing Pipeline
We will use a combination of Pipelines and column transformers to set up the data:
1. Transform the "Cabin" feature into a binary "has_cabin" feature
2. Set up a column transformer for numerical values that imputes with the median and applies StandardScaler
3. Set up a column transformer for categorical features that ignores unknown categories
4. Set up a column transformer to drop unwanted columns

In [7]:
from sklearn.pipeline import Pipeline
# Start from an empty pipeline; preprocessing steps are appended to it in later cells
preprocess_pipeline = Pipeline([])

1. **Transform the "Cabin" feature into a binary "has_cabin" feature**

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class DummyTransformer(BaseEstimator, TransformerMixin):
    """
    Pass-through base class for custom transformers, used to minimize code duplication.

    from: https://towardsdatascience.com/customizing-sklearn-pipelines-transformermixin-a54341d8d624

    Parameters
    ----------
    cols : optional
        Stored on the instance as ``self.cols``; this base class does not use it.
    """
    def __init__(self, cols = None):
        self.cols = cols
        self.dummies = None

    def fit(self, X = None, y = None):
        """Nothing to learn; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X = None, y = None):
        """Return X unchanged. A transformer must hand back data, not itself."""
        return X
    

In [9]:
class ImputeCabinTransformer(DummyTransformer):
    """Replace a feature with a binary indicator of whether its value is present.

    Parameters
    ----------
    feature_name : str
        Name of the column to replace (e.g. 'Cabin').
    new_name : str
        Name of the 0/1 indicator column that is appended in its place.
    """
    def __init__(self, feature_name, new_name):
        self.feature_name = feature_name
        self.new_name = new_name

    def fit(self, X, y=None):
        """Record the output column names in the order transform() emits them."""
        # transform() drops feature_name and appends new_name at the end,
        # so the recorded order mirrors that instead of an in-place rename.
        self.columns = [col for col in X.columns if col != self.feature_name] + [self.new_name]
        return self

    def transform(self, X):
        """Return a copy of X without feature_name and with the 0/1 new_name column appended."""
        X_ = X.drop(labels = self.feature_name, axis = 1)
        # astype gives an explicit integer dtype; replacing bools with ints via
        # `replace` relies on downcasting that recent pandas versions deprecate.
        X_[self.new_name] = X[self.feature_name].notna().astype('int64')

        return X_


In [10]:
CabinImputer = ImputeCabinTransformer(feature_name = 'Cabin', new_name = 'hasCabin')

# Register the step idempotently: a plain append would add a second 'CabinImputer'
# entry every time this cell is re-run, and Pipeline step names must be unique.
step_names = [name for name, _ in preprocess_pipeline.steps]
if 'CabinImputer' in step_names:
    preprocess_pipeline.steps[step_names.index('CabinImputer')] = ('CabinImputer', CabinImputer)
else:
    preprocess_pipeline.steps.append(('CabinImputer', CabinImputer))

In [11]:
# Render the pipeline as built so far to check the registered steps
display(preprocess_pipeline)

2. **Set up a column transformer for numerical values that imputes with the median and applies StandardScaler**

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [13]:
# Numerical features: fill missing values with the median, then standardize
numerical_cols = ['Age', 'Fare']
numerical_pipeline = Pipeline(
    steps=[
        ('MedianImuter', SimpleImputer(strategy='median')),
        ('StandardScaler', StandardScaler()),
    ]
)

3. **Set up a column transformer for categorical features that ignores unknown categories**

In [15]:
# Categorical features and, in the same order, the values each one may take
categorical_cols = ['Pclass','Sex','Embarked']
categorical_vals = [[1,2,3],['male','female'],['C','Q','S']]
categorical_pipeline = Pipeline(steps=[
    # sparse_output must be the boolean False; the string 'false' is not a valid value
    # and is rejected by scikit-learn's parameter validation.
    ('OHE',OneHotEncoder(handle_unknown='ignore',sparse_output=False,dtype='int16',
                         categories=categorical_vals))
])

In [None]:
# Columns excluded from the model input
drop_cols = ['PassengerId','Name','Ticket']

# ColumnTransformer cannot be built without its `transformers` list, so each column
# group is routed to the pipeline defined for it above.
# NOTE(review): the wiring and remainder='passthrough' follow the plan in the
# "Preprocessing" markdown cell; confirm this is the intended final layout.
ColumnTransform = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols),
        ('drop', 'drop', drop_cols),
    ],
    remainder='passthrough',  # keep every column not listed above
)