## Info from Kaggle

pclass: A proxy for socio-economic status (SES)
1. Upper
2. Middle
3. Lower

age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

sibsp: The dataset defines family relations in this way...

Sibling = brother, sister, stepbrother, stepsister

Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...

Parent = mother, father

Child = daughter, son, stepdaughter, stepson

Some children travelled only with a nanny, therefore parch=0 for them.

In [1]:
import datetime
import pickle

import numpy as np
import pandas as pd

from pandas.api.types import is_numeric_dtype
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

In [2]:
# Seed shared by every stochastic estimator so runs are reproducible.
random_state = 0

# Input/output locations, all relative to data_folder.
data_folder = 'data'
train_data_path = f'{data_folder}/train.csv'
test_data_path = f'{data_folder}/test.csv'
na_fill_values_path = f'{data_folder}/na_values.pickle'
# Use a full, sortable timestamp (date, time, microseconds). The microsecond
# field alone says nothing about when the run happened and can collide
# between runs.
prediction_path = f'{data_folder}/prediction_{datetime.datetime.now():%Y%m%d_%H%M%S_%f}.csv'

# Target column and the identifier column required in the prediction file.
label_col = 'Survived'
test_id_col = 'PassengerId'

# Columns always removed before modelling.
drop_labels = ['PassengerId', 'Name']
# Columns removed for now; kept in a separate list so they can be toggled.
drop_labels_experimental = ['Ticket', 'Cabin']

drop_labels += drop_labels_experimental

In [3]:
# Load the raw train and test splits from the CSV paths set in the configuration cell.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [4]:
# Preview the first rows of the raw training frame.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
def unique_categorical_values(df, drop_labels):
    """Print each non-numeric column of ``df`` together with its distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    drop_labels : collection of str
        Column names to skip (columns that will be dropped before modelling).

    Returns
    -------
    None
        The column names and their unique values are written to stdout.
    """
    # Inspect the frame that was passed in, not a notebook-level global, so the
    # helper also works for the test set or any other frame.
    for col in df:
        if col not in drop_labels and not is_numeric_dtype(df[col].dtype):
            print(col)
            print(df[col].unique())

# Show the distinct values of the non-numeric columns that are not in drop_labels.
unique_categorical_values(train_data, drop_labels)
# Summary statistics of the training frame (last expression, so it is rendered as the cell output).
train_data.describe()

Sex
['male' 'female']
Embarked
['S' 'C' 'Q' nan]


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
def _na_fill_value(col):
    """Return the fill value for one column: mean if numeric, otherwise the mode."""
    if is_numeric_dtype(col.dtype):
        return col.mean()
    modes = col.mode()
    # mode() is empty when every entry is missing; fall back to NaN, which is
    # also what mean() yields for an all-missing numeric column.
    return modes.iloc[0] if not modes.empty else float('nan')


def get_na_fill_values(path, df):
    """Return a ``{column name: fill value}`` mapping for replacing missing values.

    If a pickle file exists at ``path`` it is loaded and returned as-is; ``df``
    is not consulted in that case, so delete the file to force a recomputation
    after the data changes. Otherwise the mapping is computed from ``df``
    (mean for numeric columns, most frequent value for all others), written to
    ``path`` and returned. An empty or corrupt cache file is treated like a
    missing one and rebuilt.

    Parameters
    ----------
    path : str
        Location of the pickle cache file.
    df : pandas.DataFrame
        Frame the fill values are derived from when no usable cache exists.

    Returns
    -------
    dict
        Mapping from column name to the value used to fill missing entries.
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # `path` at files written by this notebook.
        with open(path, 'rb') as f:
            na_fill_values = pickle.load(f)
        print('read na fill value file')
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        na_fill_values = {col_name: _na_fill_value(df[col_name]) for col_name in df}
        with open(path, "wb") as f:
            pickle.dump(na_fill_values, f)
        print('wrote na fill value file')
    return na_fill_values

# Fill values are taken from the pickle cache at na_fill_values_path when it exists,
# otherwise they are computed from the training frame.
na_fill_values = get_na_fill_values(na_fill_values_path, train_data)
# Display the mapping so the chosen fill values can be checked by eye.
na_fill_values

read na fill value file


{'PassengerId': 446.0,
 'Survived': 0.3838383838383838,
 'Pclass': 2.308641975308642,
 'Name': 'Abbing, Mr. Anthony',
 'Sex': 'male',
 'Age': 29.69911764705882,
 'SibSp': 0.5230078563411896,
 'Parch': 0.38159371492704824,
 'Ticket': '1601',
 'Fare': 32.2042079685746,
 'Cabin': 'B96 B98',
 'Embarked': 'S'}

In [7]:
def get_na_cols(df):
    """Return the column labels of ``df`` that hold at least one missing value."""
    has_missing = df.isna().any()
    return df.columns[has_missing]

def fill_na_cols(df, na_fill_values):
    """Fill the missing values of ``df`` column by column and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; it is modified in place.
    na_fill_values : dict
        Mapping from column name to the value that replaces missing entries.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing values filled.

    Raises
    ------
    KeyError
        If a column with missing values has no entry in ``na_fill_values``.
    """
    for col_name in get_na_cols(df):
        value = na_fill_values[col_name]
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df[col_name]: that chained in-place form is deprecated and leaves
        # df unchanged when pandas Copy-on-Write is active.
        df[col_name] = df[col_name].fillna(value)
    return df

def preprocess(df, na_fill_values, drop_labels):
    """Fill missing values, drop unused columns and one-hot encode the rest.

    The input frame is left untouched; all work happens on a copy, so calling
    this again on the same raw frame gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to prepare.
    na_fill_values : dict
        Mapping from column name to the value that replaces missing entries.
    drop_labels : list of str
        Columns to remove; names not present in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Frame in which every remaining non-numeric column is replaced by
        indicator columns. Indicator columns are derived from the values
        present in ``df``, so frames with different category values produce
        different columns.

    Raises
    ------
    AssertionError
        If missing values remain after filling.
    """
    # Copy first so the in-place fill does not alter the caller's frame.
    df = fill_na_cols(df.copy(), na_fill_values)
    remaining_na_cols = get_na_cols(df)
    assert len(remaining_na_cols) == 0, (
        f'columns still contain missing values: {list(remaining_na_cols)}'
    )

    df = df.drop(drop_labels, axis=1, errors='ignore')

    categorical_cols = [
        col_name for col_name in df.columns
        if not is_numeric_dtype(df[col_name].dtype)
    ]

    # Report which columns are encoded and which columns were kept.
    print(categorical_cols)
    print(df.columns)
    return pd.get_dummies(df, columns=categorical_cols)

In [8]:
# NOTE: this rebinds train_data to the preprocessed frame, so the raw training
# data is no longer reachable under this name; re-run the loading cell to get it back.
train_data = preprocess(train_data, na_fill_values, drop_labels)
train_data.head()

['Sex', 'Embarked']
Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


In [9]:
def get_cv_scores(clfs, X, y):
    """Print 5-fold cross-validation scores for each classifier in ``clfs``.

    For every classifier, three lines are written to stdout: the classifier
    itself, the array of per-fold scores, and the mean score with four
    decimals. Nothing is returned.
    """
    for estimator in clfs:
        fold_scores = cross_val_score(estimator, X, y, cv=5)
        print(estimator, fold_scores, f'{fold_scores.mean():.4f}', sep='\n')
    
# Features are every column except the label; the label column is the target.
X = train_data.drop(label_col, axis=1)
y = train_data[label_col]
# Baseline that predicts from the class prior only, as a reference for the real models.
dummy_clf = DummyClassifier(strategy='prior')
# Both real models share random_state so their scores are reproducible.
rf_clf = RandomForestClassifier(max_depth=10, random_state=random_state)
mlp_clf = MLPClassifier(random_state=random_state, max_iter=300)

# Compare all candidates with the same cross-validation procedure.
clfs = [dummy_clf, rf_clf, mlp_clf]
get_cv_scores(clfs, X ,y)

DummyClassifier()
[0.61452514 0.61797753 0.61797753 0.61797753 0.61235955]
0.6162
RandomForestClassifier(max_depth=10, random_state=0)
[0.79329609 0.80898876 0.86516854 0.79775281 0.87078652]
0.8272
MLPClassifier(max_iter=300, random_state=0)
[0.79888268 0.82022472 0.79213483 0.78651685 0.83707865]
0.8070


In [10]:
# Fit the random forest on the full training set for the final predictions.
clf = rf_clf.fit(X, y)
# Keep the passenger ids now: preprocessing drops the PassengerId column (it is in drop_labels).
test_ids = test_data[test_id_col]
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
test_data = preprocess(test_data, na_fill_values, drop_labels)
# Missing values should be filled
assert len(get_na_cols(test_data)) == 0, 'test set still contains missing values'
# Columns of train and test set must be the same, in the same order.
# Index.equals is used because an element-wise `==` raises ValueError when the
# two column sets differ in length, instead of failing the assertion.
assert X.columns.equals(test_data.columns), (
    f'train/test feature columns differ: {list(X.columns)} vs {list(test_data.columns)}'
)
test_data.head()

['Sex', 'Embarked']
Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,0,1,0,1,0
1,3,47.0,1,0,7.0,1,0,0,0,1
2,2,62.0,0,0,9.6875,0,1,0,1,0
3,3,27.0,0,0,8.6625,0,1,0,0,1
4,3,22.0,1,1,12.2875,1,0,0,0,1


In [12]:
y_pred = clf.predict(test_data)
assert len(test_ids) == len(y_pred)
# Pair ids and predictions by position. A column-wise concat would align the
# two on index labels and insert NaN rows if test_ids did not carry the
# default 0..n-1 index.
predictions = pd.DataFrame({test_id_col: test_ids.to_numpy(), label_col: y_pred})
# Write the file without the index column.
predictions.to_csv(prediction_path, index=False)