In [1]:
import pandas as pd
import os
import numpy as np
import datetime
from plotnine import *
import matplotlib.pyplot as plt
from model_diagnostics import model_diagnostics, skf_preds, model_diagnostics_skf, summarise_continuous_feature

#pd.set_option("display.max_rows", 20)

In [2]:
# Locate train.csv in the current working directory. os.path.join uses the
# platform's path separator instead of a hard-coded "/".
filename = os.path.join(os.getcwd(), "train.csv")
data = pd.read_csv(filename)

In [3]:
# Show the first five rows of the loaded frame.
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Feature Engineering

## Construct deck feature

In [4]:
# Preview the cabin-derived features on one row without modifying `data`:
#   Deck    - first character of Cabin; 'M' when Cabin is missing
#   CabinNo - first run of digits in Cabin; 0 when there is none
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence that newer Python versions warn about. str.extract also
# avoids the KeyError that split(...)[1] raises when no cabin has digits.
(
    data
    .assign(
        Deck = data['Cabin'].str[0].fillna('M'),
        CabinNo = (
            data['Cabin']
            .str.extract(r'(\d+)', expand = False)
            .fillna(0)
            .astype(int)
        ),
    )
    .head(1)
)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,CabinNo
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,M,0


In [5]:
# Deck: first character of the cabin code; 'M' marks a missing cabin.
data['Deck'] = data['Cabin'].str[0].fillna('M')

# CabinNo: first run of digits in the cabin code; 0 when there is none.
# The regex is a raw string so '\d' is not read as an (invalid) string escape.
data['CabinNo'] = (
    data['Cabin']
    .str.extract(r'(\d+)', expand = False)
    .fillna(0)
    .astype(int)
)

In [6]:
# Per-deck summary: row count (non-null SibSp values) and mean of Survived.
survival_deck = data.groupby('Deck').agg(
    n = pd.NamedAgg(column = 'SibSp', aggfunc = 'count'),
    pct_survived = pd.NamedAgg(column = 'Survived', aggfunc = 'mean'),
).reset_index()

survival_deck

Unnamed: 0,Deck,n,pct_survived
0,A,15,0.466667
1,B,47,0.744681
2,C,59,0.59322
3,D,33,0.757576
4,E,32,0.75
5,F,13,0.615385
6,G,4,0.5
7,M,687,0.299854
8,T,1,0.0


In [7]:
# Merge small deck levels: 'T' is folded into 'A', and 'F' and 'G' are combined
# into 'FG'. (A commented-out variant also merged A, B and C into 'ABC'.)
data['Deck'] = data['Deck'].replace({'T': 'A', 'F': 'FG', 'G': 'FG'})
data['Deck'].value_counts()

M     687
C      59
B      47
D      33
E      32
FG     17
A      16
Name: Deck, dtype: int64

## Extract Title feature

In [8]:
# Names look like "Surname, Title. Given names". Take the second
# comma-separated piece, keep what precedes its first period, and trim spaces.
data['Title'] = (
    data['Name']
    .str.split(",", expand = True)[1]
    .str.split('.', expand = True)[0]
    .str.strip()
)
data['Title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: Title, dtype: int64

In [9]:
# Per-title summary before any titles are merged: row count and mean of Survived.
survival_title = data.groupby('Title').agg(
    n = pd.NamedAgg(column = 'Name', aggfunc = 'count'),
    avg_survival = pd.NamedAgg(column = 'Survived', aggfunc = 'mean'),
).reset_index()

survival_title

Unnamed: 0,Title,n,avg_survival
0,Capt,1,0.0
1,Col,2,0.5
2,Don,1,0.0
3,Dr,7,0.428571
4,Jonkheer,1,0.0
5,Lady,1,1.0
6,Major,2,0.5
7,Master,40,0.575
8,Miss,182,0.697802
9,Mlle,2,1.0


In [10]:
# Show the passengers whose Name contains "Mme".
data[data['Name'].str.contains("Mme")]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,CabinNo,Title
369,370,1,1,"Aubart, Mme. Leontine Pauline",female,24.0,0,0,PC 17477,69.3,B35,C,B,35,Mme


In [11]:
# Rare titles that are pooled into a single 'Noble' level.
# An alternative split was left disabled in the original cell: a separate
# 'Military_Clergy' group for Rev/Major/Col/Capt, and 'Dr' folded into 'Mr'.
# NOTE(review): 'Mlle' and 'Mme' end up as 'Noble', so a later check for
# Title == 'Mrs' will not pick up 'Mme' - confirm this grouping is intended.
noble_list = ['Dr', 'Rev', 'Mlle', 'Major', 'Col', 'the Countess', 'Capt', 'Sir', 'Lady', 'Mme', 'Don', 'Jonkheer']

# One pass: 'Ms' becomes 'Miss', and every title in noble_list becomes 'Noble'.
data['Title'] = data['Title'].replace(
    {'Ms': 'Miss', **dict.fromkeys(noble_list, 'Noble')}
)

data['Title'].value_counts()

Mr        517
Miss      183
Mrs       125
Master     40
Noble      26
Name: Title, dtype: int64

In [12]:
# Recompute the per-title summary now that rare titles have been merged;
# this overwrites the earlier `survival_title`.
survival_title = data.groupby('Title').agg(
    n = pd.NamedAgg(column = 'Name', aggfunc = 'count'),
    avg_survival = pd.NamedAgg(column = 'Survived', aggfunc = 'mean'),
).reset_index()

survival_title

Unnamed: 0,Title,n,avg_survival
0,Master,40,0.575
1,Miss,183,0.699454
2,Mr,517,0.156673
3,Mrs,125,0.792
4,Noble,26,0.423077


In [13]:
# Married flag: 1 where Title is 'Mrs', 0 everywhere else.
data['Married'] = (data['Title'] == 'Mrs').astype('int64')

## One hot encode categoricals

In [14]:
# Categorical columns to one-hot encode.
cat_columns = ['Sex', 'Pclass', 'Embarked', 'Title', 'Deck']

# Encode only the categorical columns and append the dummies to `data`.
# Passing the whole frame to get_dummies returns every non-encoded column as
# well, so concatenating that result with `data` duplicated PassengerId,
# Survived, Name, etc. `columns=` is still required so the integer Pclass is
# encoded too. dummy_na=True adds a *_nan indicator for each feature, and
# drop_first=True drops the first level of each one.
pd.concat(
    [
        data,
        pd.get_dummies(data[cat_columns], columns = cat_columns, dummy_na = True, drop_first = True),
    ],
    axis = 1,
).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Title_Mrs,Title_Noble,Title_nan,Deck_B,Deck_C,Deck_D,Deck_E,Deck_FG,Deck_M,Deck_nan
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,0,0,0,0,0,0,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,1,0,0,0,1,0,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,0,0,0,0,0,0,0,0,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,1,0,0,0,1,0,0,0,0,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,0,0,0,0,0,0,0,0,1,0


# Annoying times working with sklearn OneHotEncoder

In [15]:
# from sklearn.preprocessing import OneHotEncoder

# enc = OneHotEncoder()
# X = data[['Sex', 'Pclass', 'Embarked']].copy()
# enc.fit_transform(X).toarray()

In [16]:
# enc.get_feature_names_out()

In [17]:
# Disabled experiment: one-hot encode each categorical feature with sklearn's
# OneHotEncoder and collect one encoded frame per feature in `encoded_features`.
# NOTE(review): the generated column names are positions 1..n rather than the
# category values, and the comprehension reuses `n` as its loop variable.
# cat_features = ['Pclass', 'Sex', 'Embarked']
# encoded_features = []
# dfs = [data]

# for df in dfs:
#     for feature in cat_features:
#         encoded_feat = OneHotEncoder().fit_transform(df[feature].values.reshape(-1, 1)).toarray()
#         n = df[feature].nunique()
#         cols = ['{}_{}'.format(feature, n) for n in range(1, n + 1)]
#         encoded_df = pd.DataFrame(encoded_feat, columns=cols)
#         encoded_df.index = df.index
#         encoded_features.append(encoded_df)


In [18]:
# cols 

In [19]:
# encoded_feat