Basic notebook for applying a simple scikit-learn prediction method to the Kaggle Titanic dataset.

In [1]:
import os

In [2]:
# --- Kaggle competition and on-disk layout ---
DATASET_COMP = 'titanic'      # competition name handed to the Kaggle API
DATA_DIR = './data'           # folder holding the archive and the CSV files
DATA_ZIP = 'titanic.zip'      # archive looked for inside DATA_DIR
TRAIN_CSV, TEST_CSV = 'train.csv', 'test.csv'

# --- Modelling settings ---
TARGET_VAR = 'Survived'       # column split off as the prediction target
VALIDATION_SPLIT = 0.2        # fraction of the training data held out for validation

In [3]:
# Load and prepare the dataset from Kaggle.
# Create the data folder up front: os.listdir() raises FileNotFoundError on a
# missing directory, which would break the very first run of this notebook.
os.makedirs(DATA_DIR, exist_ok=True)

if not os.listdir(DATA_DIR):
    print('data folder is empty. Downloading data...')
    # Imported here so the kaggle package is only imported when a download
    # is actually needed.
    from kaggle.api.kaggle_api_extended import KaggleApi
    api = KaggleApi()
    api.authenticate()

    api.competition_download_files(DATASET_COMP, DATA_DIR, quiet = False)

# Extract the archive only when it is present and the training CSV has not
# been unpacked yet.
train_csv_path = os.path.join(DATA_DIR, TRAIN_CSV)
zip_path = os.path.join(DATA_DIR, DATA_ZIP)
if not os.path.exists(train_csv_path) and os.path.exists(zip_path):
    print('data set downloaded, unzipping...')
    import zipfile
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)


In [4]:
# load data into pandas dataframe
import pandas as pd
from sklearn.model_selection import train_test_split

train_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in (TRAIN_CSV, TEST_CSV)
)

# Hold out part of train_df for validation. The target column is removed from
# the features and passed separately; the seed is fixed for a repeatable split.
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns = TARGET_VAR),
    train_df[TARGET_VAR],
    test_size = VALIDATION_SPLIT,
    random_state = 42,
)

In [5]:
# Quick sanity check of the split: report the shape of each piece, then
# preview the first rows of the training features.
for split_name, split_data in (
    ('X_train', X_train),
    ('X_val', X_val),
    ('y_train', y_train),
    ('y_val', y_val),
):
    print(split_name + ' shape: ', split_data.shape)

X_train.head()


X_train shape:  (712, 11)
X_val shape:  (179, 11)
y_train shape:  (712,)
y_val shape:  (179,)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [6]:
# Show how many distinct values each object-dtype column holds.
object_cols = [
    col_name
    for col_name, col_dtype in X_train.dtypes.items()
    if col_dtype == 'object'
]
for col in object_cols:
    print(col, ': ', X_train[col].nunique())

Name :  712
Sex :  2
Ticket :  558
Cabin :  117
Embarked :  3


### Preprocessing Pipeline
We will use sklearn's preprocessing capabilities by creating a pipeline. The steps will be:
1. Drop unwanted features
2. Impute Missing Numerical Values
3. Scale Numerical Values
4. One-Hot Encode categorical values
5. Transform "Cabin" feature to a binary "has_cabin" feature


In [12]:
from sklearn.pipeline import Pipeline
# Begin with a pipeline that has no steps; the cells below append to
# `preprocess.steps` one stage at a time.
preprocess = Pipeline([])

1. **Drop unwanted features**

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class DummyTransformer(BaseEstimator, TransformerMixin):
    """
    Dummy base transformer to minimize code duplication.

    Subclasses override ``fit`` and/or ``transform``. The defaults defined
    here learn nothing and hand the input back unchanged.
    from: https://towardsdatascience.com/customizing-sklearn-pipelines-transformermixin-a54341d8d624

    Parameters
    ----------
    cols : optional
        Stored as-is on ``self.cols``; not used by this base class.
    """
    def __init__(self, cols = None):
        self.cols = cols
        self.dummies = None

    def fit(self, X = None, y = None):
        """Nothing to learn; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X = None, y = None):
        """Pass ``X`` through untouched.

        Returning the data (rather than the transformer) is what lets
        ``fit_transform`` and any following pipeline step receive ``X``.
        """
        return X
    

In [9]:
class DropFeatures(DummyTransformer):
    """Transformer that drops feature columns from a dataframe.

    Parameters
    ----------
    features : str or list of str
        Column label(s) removed by ``transform``. Stored unmodified on
        ``self.features``.
    """
    def __init__(self, features):
        self.features = features

    def transform(self, X, y = None):
        """Return ``X`` without the configured columns.

        ``y`` is accepted (and ignored) so the signature stays compatible with
        ``DummyTransformer.transform``. The result comes from ``X.drop``, so
        with pandas' default settings a label missing from ``X`` raises
        ``KeyError``.
        """
        return X.drop(columns=self.features)

In [10]:
# Columns removed before modelling.
drop_features = ['Ticket','Name','PassengerId']
DropFeaturesStep = DropFeatures(drop_features)

# Register the step so that re-running this cell is safe: a second plain
# append would leave two steps named 'DropFeatures' in the pipeline. If the
# step already exists it is replaced in place, keeping its position.
_drop_step = ('DropFeatures', DropFeaturesStep)
_step_names = [step_name for step_name, _ in preprocess.steps]
if 'DropFeatures' in _step_names:
    preprocess.steps[_step_names.index('DropFeatures')] = _drop_step
else:
    preprocess.steps.append(_drop_step)

2. **Impute Missing Numerical Values**
