# Library

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os

# Config

In [2]:
DATA_DIR = '../data/'
RAW_DIR = DATA_DIR + '01_raw/'

DATASET_DIR = 'titanic/'

INPUT_DIR = RAW_DIR + DATASET_DIR

In [22]:
OUTPUT_DIR = DATA_DIR + '05_model_input/' + DATASET_DIR
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    
VERSION = 1

In [4]:
ID_COL = 'PassengerId'
Y_COL = 'Survived'

# Load data

In [5]:
# train
train_fname = INPUT_DIR + 'train.csv'
base_train_df = pd.read_csv(train_fname)
base_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# test
test_fname = INPUT_DIR + 'test.csv'
base_test_df = pd.read_csv(test_fname)
base_test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Preprocessing

## Define method

In [70]:
def len_str(x):
    return len(str(x))

def left_str(x, num_chars):
    calib_num_chars = num_chars - 1
    return str(x)[:calib_num_chars]

def target_mean_encoding(df, col):
    tmp_df = df.copy()
    train = tmp_df[tmp_df['flg'] == 'train']
    test = tmp_df[tmp_df['flg'] == 'test']
    
    me_col = 'me_' + col
    train[me_col] = train[Y_COL]
    me_df = train.groupby(col)[me_col].mean().reset_index()
    
    train = train.drop(me_col, axis=1).merge(me_df, on=col, how='left')
    test = test.merge(me_df, on=col, how='left')
    return pd.concat([train, test])

def impute_age(x):
    if np.isnan(x['Age']):
        return x['age_to_impute']
    return x['Age']

## Minimum processing

In [56]:
train_df = base_train_df.copy().assign(flg='train')
test_df = base_test_df.copy().assign(flg='test')
test_df[Y_COL] = None

prepro_df = pd.concat([train_df, test_df])
prepro_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train


## Name

In [57]:
# 欠損値補間
prepro_df['fix_Name'] = prepro_df['Name'].fillna('Null')

# 文字数
prepro_df['len_Name'] = prepro_df['fix_Name'].map(len_str)

## Ticket

In [58]:
# 欠損値補間
prepro_df['fix_Ticket'] = prepro_df['Ticket'].fillna('-1')

# 文字数
prepro_df['len_Ticket'] = prepro_df['fix_Ticket'].map(len_str)

# 最初の2文字
prepro_df['first_chars_Ticket'] = prepro_df['fix_Ticket'].map(lambda x: left_str(x, 2))

## Cabin

In [59]:
# 欠損値補間
prepro_df['fix_Cabin'] = prepro_df['Cabin'].fillna('-1')

# 文字数
prepro_df['len_Cabin'] = prepro_df['fix_Cabin'].map(len_str)

# 最初の1文字
prepro_df['first_chars_Cabin'] = prepro_df['fix_Cabin'].map(lambda x: left_str(x, 1))

## Target mean encoding

In [60]:
prepro_df.head(1).T

Unnamed: 0,0
PassengerId,1
Survived,0
Pclass,3
Name,"Braund, Mr. Owen Harris"
Sex,male
Age,22.0
SibSp,1
Parch,0
Ticket,A/5 21171
Fare,7.25


In [61]:
prepro_df.describe(include='O')

Unnamed: 0,Survived,Name,Sex,Ticket,Cabin,Embarked,flg,fix_Name,fix_Ticket,first_chars_Ticket,fix_Cabin,first_chars_Cabin
count,891,1309,1309,1309,295,1307,1309,1309,1309,1309,1309,1309.0
unique,2,1307,2,929,186,3,2,1307,929,16,187,1.0
top,0,"Connolly, Miss. Kate",male,CA. 2343,C23 C25 C27,S,train,"Connolly, Miss. Kate",CA. 2343,3,-1,
freq,549,2,843,11,6,914,891,2,11,429,1014,1309.0


In [62]:
me_cols = ['Sex', 'Embarked', 'first_chars_Ticket']
for col in me_cols:
    prepro_df = target_mean_encoding(prepro_df, col)
prepro_df.head(1).T

Unnamed: 0,0
PassengerId,1
Survived,0
Pclass,3
Name,"Braund, Mr. Owen Harris"
Sex,male
Age,22.0
SibSp,1
Parch,0
Ticket,A/5 21171
Fare,7.25


## Add on features refference from others kernel
https://www.kaggle.com/code/konstantinmasich/titanic-0-82-0-83/notebook#Engineering-features

### Imputing Age

In [77]:
prepro_df['Title'] = prepro_df['Name']
# Cleaning name and extracting Title
prepro_df['Title'] = prepro_df['Name'].str.extract('([A-Za-z]+)\.', expand=True)

# Replacing rare titles with more common ones
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
prepro_df = prepro_df.replace({'Title': mapping})

# Imputing Age by Title
age_to_impute_df = prepro_df.groupby('Title')['Age'].median().reset_index()
age_to_impute_df = age_to_impute_df.rename(columns={'Age': 'age_to_impute'})
prepro_df = prepro_df.merge(age_to_impute_df, on='Title', how='left')
prepro_df['fix_Age'] = prepro_df.apply(lambda x: impute_age(x), axis=1)
prepro_df = prepro_df.drop(['age_to_impute', 'Title'], axis=1) # remove needless column

prepro_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin,me_Sex,me_Embarked,me_first_chars_Ticket,fix_Age
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,"Braund, Mr. Owen Harris",23,A/5 21171,9,A,-1,2,,0.188908,0.336957,0.068966,22.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51,PC 17599,8,P,C85,3,,0.742038,0.553571,0.646154,38.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,"Heikkinen, Miss. Laina",22,STON/O2. 3101282,16,S,-1,2,,0.742038,0.336957,0.323077,26.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44,113803,6,1,C123,4,,0.742038,0.336957,0.630137,35.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,"Allen, Mr. William Henry",24,373450,6,3,-1,2,,0.188908,0.336957,0.239203,35.0


### Adding family_size

In [78]:
prepro_df['Family_Size'] = prepro_df['Parch'] + prepro_df['SibSp']
prepro_df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin,me_Sex,me_Embarked,me_first_chars_Ticket,fix_Age,Family_Size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,"Braund, Mr. Owen Harris",23,A/5 21171,9,A,-1,2,,0.188908,0.336957,0.068966,22.0,1


### Adding Family_Survival
This feature is from S.Xu's kernel, he groups families and people with the same tickets togerher and researches the info. I've cleaned the code a bit but it still does the same, I left it as is. For comments see the original kernel.

In [80]:
prepro_df['Last_Name'] = prepro_df['Name'].apply(lambda x: str.split(x, ",")[0])
prepro_df['fix_Fare'] = prepro_df['Fare'].fillna(prepro_df['Fare'].mean())

DEFAULT_SURVIVAL_VALUE = 0.5
prepro_df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

for grp, grp_df in prepro_df[['Survived','Name', 'Last_Name', 'fix_Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'fix_Age', 'Cabin']].groupby(['Last_Name', 'fix_Fare']):
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                prepro_df.loc[prepro_df['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                prepro_df.loc[prepro_df['PassengerId'] == passID, 'Family_Survival'] = 0


print("Number of passengers with family survival information:", 
      prepro_df.loc[prepro_df['Family_Survival']!=0.5].shape[0])

Number of passengers with family survival information: 420


In [82]:
for _, grp_df in prepro_df.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    prepro_df.loc[prepro_df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    prepro_df.loc[prepro_df['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(prepro_df[prepro_df['Family_Survival']!=0.5].shape[0]))

Number of passenger with family/group survival information: 546


### Making FARE BINS

In [83]:
prepro_df['fix_Fare'].fillna(prepro_df['Fare'].median(), inplace = True)

# Making Bins
prepro_df['FareBin'] = pd.qcut(prepro_df['fix_Fare'], 5)

# label = LabelEncoder()
# prepro_df['FareBin_Code'] = label.fit_transform(prepro_df['FareBin'])
prepro_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin,me_Sex,me_Embarked,me_first_chars_Ticket,fix_Age,Family_Size,Last_Name,fix_Fare,Family_Survival,FareBin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,"Braund, Mr. Owen Harris",23,A/5 21171,9,A,-1,2,,0.188908,0.336957,0.068966,22.0,1,Braund,7.25,0.5,"(-0.001, 7.854]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51,PC 17599,8,P,C85,3,,0.742038,0.553571,0.646154,38.0,1,Cumings,71.2833,0.5,"(41.579, 512.329]"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,"Heikkinen, Miss. Laina",22,STON/O2. 3101282,16,S,-1,2,,0.742038,0.336957,0.323077,26.0,0,Heikkinen,7.925,0.5,"(7.854, 10.5]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44,113803,6,1,C123,4,,0.742038,0.336957,0.630137,35.0,1,Futrelle,53.1,0.0,"(41.579, 512.329]"
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,"Allen, Mr. William Henry",24,373450,6,3,-1,2,,0.188908,0.336957,0.239203,35.0,0,Allen,8.05,0.5,"(7.854, 10.5]"


### Making AGE BINS

In [86]:
prepro_df['AgeBin'] = pd.qcut(prepro_df['fix_Age'], 4)

# label = LabelEncoder()
# prepro_df['AgeBin_Code'] = label.fit_transform(prepro_df['AgeBin'])
prepro_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin,me_Sex,me_Embarked,me_first_chars_Ticket,fix_Age,Family_Size,Last_Name,fix_Fare,Family_Survival,FareBin,AgeBin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,"Braund, Mr. Owen Harris",23,A/5 21171,9,A,-1,2,,0.188908,0.336957,0.068966,22.0,1,Braund,7.25,0.5,"(-0.001, 7.854]","(0.169, 22.0]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51,PC 17599,8,P,C85,3,,0.742038,0.553571,0.646154,38.0,1,Cumings,71.2833,0.5,"(41.579, 512.329]","(36.0, 80.0]"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,"Heikkinen, Miss. Laina",22,STON/O2. 3101282,16,S,-1,2,,0.742038,0.336957,0.323077,26.0,0,Heikkinen,7.925,0.5,"(7.854, 10.5]","(22.0, 30.0]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44,113803,6,1,C123,4,,0.742038,0.336957,0.630137,35.0,1,Futrelle,53.1,0.0,"(41.579, 512.329]","(30.0, 36.0]"
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,"Allen, Mr. William Henry",24,373450,6,3,-1,2,,0.188908,0.336957,0.239203,35.0,0,Allen,8.05,0.5,"(7.854, 10.5]","(30.0, 36.0]"


# Output

In [87]:
prepro_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin,me_Sex,me_Embarked,me_first_chars_Ticket,fix_Age,Family_Size,Last_Name,fix_Fare,Family_Survival,FareBin,AgeBin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,"Braund, Mr. Owen Harris",23,A/5 21171,9,A,-1,2,,0.188908,0.336957,0.068966,22.0,1,Braund,7.25,0.5,"(-0.001, 7.854]","(0.169, 22.0]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51,PC 17599,8,P,C85,3,,0.742038,0.553571,0.646154,38.0,1,Cumings,71.2833,0.5,"(41.579, 512.329]","(36.0, 80.0]"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,"Heikkinen, Miss. Laina",22,STON/O2. 3101282,16,S,-1,2,,0.742038,0.336957,0.323077,26.0,0,Heikkinen,7.925,0.5,"(7.854, 10.5]","(22.0, 30.0]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44,113803,6,1,C123,4,,0.742038,0.336957,0.630137,35.0,1,Futrelle,53.1,0.0,"(41.579, 512.329]","(30.0, 36.0]"
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,"Allen, Mr. William Henry",24,373450,6,3,-1,2,,0.188908,0.336957,0.239203,35.0,0,Allen,8.05,0.5,"(7.854, 10.5]","(30.0, 36.0]"


In [88]:
prepro_df.to_pickle(OUTPUT_DIR + 'train_test_v' + str(VERSION) + '.pkl')