In [1]:
import pandas as pd
import numpy as np
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from IPython.display import display_html
from itertools import chain,cycle

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

## Read data

In [2]:
# Load the train/test splits and the submission file that provides the test labels.
df_train, df_test, test_y = (
   pd.read_csv(csv_path)
   for csv_path in ('./data/train.csv', './data/test.csv', './data/gender_submission.csv')
)

# Put 'Survived' first in the test frame; the Series is aligned to df_test on the row index.
# NOTE(review): presumably gender_submission.csv is a sample submission rather than ground
# truth - confirm before treating test-set scores as real.
df_test.insert(loc=0, column='Survived', value=test_y['Survived'])

# Stack both splits (columns sorted alphabetically) under a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test], sort=True, ignore_index=True)

# Human-readable labels printed by the missing-value report.
df_all.name = 'All Set'
df_test.name = 'Test Set'
df_train.name = 'Train Set'

In [3]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first five test rows ('Survived' was inserted as the first column).
df_test.head(n=5)

Unnamed: 0,Survived,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,0,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,0,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,1,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Preview the first five rows of the combined frame (columns are sorted alphabetically).
df_all.head(n=5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0,373450


## Explore Data

In [6]:
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
df_train.info()

# Show a handful of random rows to get a feel for the raw values.
display(df_train.sample(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
698,699,0,1,"Thayer, Mr. John Borland",male,49.0,1,1,17421,110.8833,C68,C
484,485,1,1,"Bishop, Mr. Dickinson H",male,25.0,1,0,11967,91.0792,B49,C
416,417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34.0,1,1,28220,32.5,,S
358,359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q
781,782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17.0,1,0,17474,57.0,B20,S


## Fix missing data

In [7]:
def display_nan(df_data):
   """Print one line per column of ``df_data`` that contains missing values.

   Columns are reported in frame order as ``<column> missing <count> values``;
   columns without any missing value are skipped. Returns None.
   """
   missing_per_column = df_data.isnull().sum()
   incomplete = missing_per_column[missing_per_column > 0]
   for column_name, n_missing in incomplete.items():
      print('{} missing {} values'.format(column_name, n_missing))

In [8]:
def verifyNaN():
   """Print the missing-value report for the train set, then for the test set.

   Each report is headed by the frame's ``name`` label; the two reports are
   separated by a printed newline string.
   """
   for position, frame in enumerate((df_train, df_test)):
      if position > 0:
         print('\n')
      print(frame.name)
      display_nan(frame)

In [9]:
# Report which columns contain missing values in each split before any imputation.
verifyNaN()

Train Set
Age missing 177 values
Cabin missing 687 values
Embarked missing 2 values


Test Set
Age missing 86 values
Fare missing 1 values
Cabin missing 327 values


In [10]:
# Long-format table of absolute pairwise correlations between the numeric columns of the
# combined frame: one row per (F1, F2) pair, strongest correlations first.
df_train_corr = (
   df_all
   .corr(numeric_only=True)
   .abs()
   .unstack()
   .sort_values(kind='quicksort', ascending=False)
   .reset_index()
   .rename(columns={"level_0": "F1", "level_1": "F2", 0: 'Correlation'})
)

In [11]:
# Keep only the pairs whose first feature is Age.
# Apart from itself, Age correlates most strongly with Pclass.
display(df_train_corr.loc[df_train_corr['F1'].eq('Age')])

Unnamed: 0,F1,F2,Correlation
0,Age,Age,1.0
9,Age,Pclass,0.408106
16,Age,SibSp,0.243699
22,Age,Fare,0.17874
26,Age,Parch,0.150917
34,Age,Survived,0.053695
39,Age,PassengerId,0.028814


In [12]:
# Median age for every (Sex, Pclass) combination in the combined frame.
df_all.groupby(['Sex', 'Pclass'])['Age'].median()

Sex     Pclass
female  1         36.0
        2         28.0
        3         22.0
male    1         42.0
        2         29.5
        3         25.0
Name: Age, dtype: float64

In [13]:
# Replace missing Age with the median age of the passenger's (Sex, Pclass) group.
# The medians are learned from the training set only and reused for the test set, so both
# splits are imputed with the same statistics and nothing is estimated from test rows.
age_median_by_group = df_train.groupby(['Sex', 'Pclass'])['Age'].median().rename('AgeMedian')
# Fallback for a (Sex, Pclass) group that has no known age in the training set.
age_median_overall = df_train['Age'].median()

# join(on=...) looks up each row's group median while keeping the row index, so the
# resulting Series lines up with the Age column it fills.
df_train['Age'] = (
   df_train['Age']
   .fillna(df_train.join(age_median_by_group, on=['Sex', 'Pclass'])['AgeMedian'])
   .fillna(age_median_overall)
)
df_test['Age'] = (
   df_test['Age']
   .fillna(df_test.join(age_median_by_group, on=['Sex', 'Pclass'])['AgeMedian'])
   .fillna(age_median_overall)
)

In [14]:
# Show the test rows that have no Fare value.
display(df_test.loc[df_test['Fare'].isna()])

Unnamed: 0,Survived,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,0,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [15]:
# Fill each missing test Fare with the median fare of that passenger's Pclass.
# The medians come from the training set, and the class is looked up per row rather than
# hard-coded. The result is assigned back to the column because
# `df[col].fillna(..., inplace=True)` acts on a temporary object and does not update the
# frame when pandas Copy-on-Write is active.
fare_median_by_class = df_train.groupby('Pclass')['Fare'].median()
df_test['Fare'] = df_test['Fare'].fillna(df_test['Pclass'].map(fare_median_by_class))

In [16]:
# Mark passengers without a recorded cabin with the placeholder 'M' (missing).
# The filled column is assigned back instead of using `df[col].fillna(..., inplace=True)`,
# which acts on a temporary object and does not update the frame under pandas Copy-on-Write.
df_train['Cabin'] = df_train['Cabin'].fillna('M')
df_test['Cabin'] = df_test['Cabin'].fillna('M')

In [17]:
# Reduce every cabin string to its first character (the deck letter, or 'M' for missing).
# Each split is derived from its OWN Cabin column: the test column was previously
# overwritten with values taken from the training frame.
# NOTE(review): the two splits may not contain the same set of deck letters, so any
# later encoding should not assume identical categories in train and test.
df_train['Cabin'] = df_train['Cabin'].str[0]
df_test['Cabin'] = df_test['Cabin'].str[0]

In [18]:
# Fill missing Embarked values in the training set with its most frequent port.
# The value that is printed is the value that is used, instead of a hard-coded 'S'.
top_embarked = df_train['Embarked'].describe()['top']
print("Top value of Embarked:", top_embarked)

# Assign back rather than `df[col].fillna(..., inplace=True)`, which acts on a temporary
# object and does not update the frame under pandas Copy-on-Write.
df_train['Embarked'] = df_train['Embarked'].fillna(top_embarked)

Top value of Embarked: S


In [19]:
# Re-run the report after the fills above; no column should be listed any more.
verifyNaN()

Train Set


Test Set


In [20]:
# Spot-check five random training rows after the missing-value fixes.
df_train.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
517,518,0,3,"Ryan, Mr. Patrick",male,25.0,0,0,371110,24.15,M,Q
649,650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23.0,0,0,CA. 2314,7.55,M,S
521,522,0,3,"Vovk, Mr. Janko",male,22.0,0,0,349252,7.8958,M,S
641,642,1,1,"Sagesser, Mlle. Emma",female,24.0,0,0,PC 17477,69.3,B,C
448,449,1,3,"Baclini, Miss. Marie Catherine",female,5.0,2,1,2666,19.2583,M,C


## Values Transformation

In [21]:
# Convert the string-valued categorical columns to integer codes.
# Fare is deliberately not in this list: it is already numeric, and label-encoding it
# would replace each fare by a rank code computed separately per split.
str_features = ['Sex', 'Cabin', 'Embarked']
for feature in str_features:
   # Fit once on the training column and reuse the mapping for the test column so that a
   # given code means the same category in both splits. transform() raises ValueError if
   # the test column contains a value that was never seen in training.
   encoder = LabelEncoder().fit(df_train[feature])
   df_train[feature] = encoder.transform(df_train[feature])
   df_test[feature] = encoder.transform(df_test[feature])
   

In [22]:
# One-hot encode the categorical columns of the training frame.
# Every feature contributes one 0/1 column per distinct value, named <feature>_1 ... <feature>_k.
cat_features = ['Embarked', 'Cabin', 'Pclass', 'Sex']
encoded_features = []

for feature in cat_features:
   dummy_matrix = OneHotEncoder().fit_transform(df_train[[feature]].to_numpy()).toarray()
   n_levels = df_train[feature].nunique()
   dummy_names = ['{}_{}'.format(feature, level) for level in range(1, n_levels + 1)]
   # Reuse the training index so the dummy columns line up row-for-row in the concat below.
   encoded_features.append(pd.DataFrame(dummy_matrix, columns=dummy_names, index=df_train.index))

# Append all dummy blocks to the right of the existing columns.
df_train = pd.concat([df_train] + encoded_features, axis=1)

In [23]:
# One-hot encode the categorical columns of the test frame.
# The encoder is fitted on the TRAINING column so the test frame always receives the same
# dummy columns, in the same order and with the same meaning, as the training frame, even
# when a category does not occur in the test split.
# NOTE(review): assumes the integer codes in df_train[feature] and df_test[feature] refer
# to the same categories - confirm against the label-encoding step.
cat_features = ['Embarked', 'Cabin', 'Pclass', 'Sex']
encoded_features = []

for feature in cat_features:
   # handle_unknown='ignore': a test value never seen in training becomes an all-zero row
   # instead of raising.
   encoder = OneHotEncoder(handle_unknown='ignore').fit(df_train[feature].values.reshape(-1, 1))
   encoded = encoder.transform(df_test[feature].values.reshape(-1, 1)).toarray()
   # One column per category learned from train, numbered from 1.
   cols = ['{}_{}'.format(feature, i) for i in range(1, len(encoder.categories_[0]) + 1)]
   encoded_df = pd.DataFrame(encoded, columns=cols, index=df_test.index)
   encoded_features.append(encoded_df)

df_test = pd.concat([df_test, *encoded_features], axis=1)

In [24]:
# Remove the raw columns that were one-hot encoded (Embarked, Cabin, Pclass, Sex) together
# with Name, PassengerId and Ticket. The test columns are shown before and after the drop.
drop_columns = ['Embarked', 'Cabin', 'Name', 'PassengerId', 'Pclass', 'Sex', 'Ticket']
display(df_test.columns)
df_train = df_train.drop(columns=drop_columns)
df_test = df_test.drop(columns=drop_columns)
display(df_test.columns)

# The target column must still be present after the drop.
display(df_test['Survived'])

Index(['Survived', 'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Embarked_1',
       'Embarked_2', 'Embarked_3', 'Cabin_1', 'Cabin_2', 'Cabin_3', 'Cabin_4',
       'Cabin_5', 'Cabin_6', 'Cabin_7', 'Cabin_8', 'Cabin_9', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2'],
      dtype='object')

Index(['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_1', 'Embarked_2',
       'Embarked_3', 'Cabin_1', 'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5',
       'Cabin_6', 'Cabin_7', 'Cabin_8', 'Cabin_9', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Sex_1', 'Sex_2'],
      dtype='object')

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

## Save Training and Test Data

In [25]:
# Columns of the prepared training frame.
df_train.columns


Index(['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_1', 'Embarked_2',
       'Embarked_3', 'Cabin_1', 'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5',
       'Cabin_6', 'Cabin_7', 'Cabin_8', 'Cabin_9', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Sex_1', 'Sex_2'],
      dtype='object')

In [26]:
# Columns of the prepared test frame.
df_test.columns

Index(['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_1', 'Embarked_2',
       'Embarked_3', 'Cabin_1', 'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5',
       'Cabin_6', 'Cabin_7', 'Cabin_8', 'Cabin_9', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Sex_1', 'Sex_2'],
      dtype='object')

In [27]:
# NOTE(review): columns_to_use is not referenced by the cells visible here, and several of
# its names no longer exist after the drop step; kept only in case other code reads it.
columns_to_use = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Features are every column except the target.
X_titanic_train = df_train.drop(columns='Survived')
y_titanic_train = df_train['Survived']

# Select the test features from the test frame's own columns (the selection mask used to be
# built from df_train.columns). The features are then aligned to the training layout: same
# columns, same order, and a column that exists only in train is added as all zeros.
X_titanic_test = (
   df_test
   .drop(columns='Survived')
   .reindex(columns=X_titanic_train.columns, fill_value=0)
)
y_titanic_test = df_test['Survived']

In [28]:
# Persist the four splits as a single list: [X_train, y_train, X_test, y_test].
with open('data/titanic.pkl', mode='wb') as pickle_file:
   pickle.dump(
      [X_titanic_train, y_titanic_train, X_titanic_test, y_titanic_test],
      pickle_file,
   )