## 1 - Set Up Environment

In [1]:
import numpy as np
import pandas as pd
import pickle

<hr>

## 2 - Load Features

In [2]:
# Restore the mapped feature frame saved by the previous step.
# Note: unpickling executes arbitrary code, so only load files you trust.
with open('../Assets/Version 4-2-mapped.pickle', 'rb') as handle:
    df = pickle.load(handle)

<hr>

## 3 - One-Hot Encoding Nominal Categorical Variables

First, we list all the nominal categorical variables that need to be encoded:
*   Sex
*   Cabin
*   Embarked

For each variable, we follow this process:<br>
1 - convert it to dummy (indicator) columns<br>
2 - rename the columns<br>
3 - remove the first column, which becomes the benchmark (prevents multicollinearity)

In [3]:
def make_dummy(x, data=None):
    """Return one indicator (dummy) column per distinct value of a column.

    Parameters
    ----------
    x : str
        Name of the categorical column to encode.
    data : pandas.DataFrame, optional
        Frame that holds column ``x``. When omitted, the notebook-level
        ``df`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One 0/1 column per category found in ``data[x]``. No category is
        dropped here (``drop_first=False``); the benchmark column has to be
        removed by the caller.
    """
    source = df if data is None else data

    # pandas >= 2.0 returns boolean columns by default; pin the dtype so the
    # indicators are 0/1 integers regardless of the installed pandas version.
    dummy = pd.get_dummies(source[x], drop_first=False, dtype='uint8')

    return dummy

> ### 3.1 - Sex

In [4]:
# Expand the Sex codes into one indicator column per category.
Sex_dummy = make_dummy('Sex')

# Labels are assigned by position.
# NOTE(review): assumes the dummy columns come out in ascending code order
# (0 -> male, 1 -> female) — confirm against the mapping step.
col_name = [
    'male',
    'female',
]
Sex_dummy.columns = col_name

# 'male' is the benchmark category, so its column is removed.
Sex_dummy = Sex_dummy.drop(columns=['male'])
Sex_dummy.head(2)

Unnamed: 0,female
0,0
1,1


> ### 3.2 - Cabin

In [5]:
# Expand the Cabin codes into one indicator column per category.
Cabin_dummy = make_dummy('Cabin')

# Labels are assigned by position.
# NOTE(review): assumes the dummy columns come out in ascending code order,
# with the first code meaning "unknown cabin" — confirm against the mapping step.
col_name = [
    'UnKnown',
    'Cabin A',
    'Cabin B',
    'Cabin C',
    'Cabin D',
    'Cabin E',
    'Cabin F',
    'Cabin G,T',
]
Cabin_dummy.columns = col_name

# 'UnKnown' is the benchmark category, so its column is removed.
Cabin_dummy = Cabin_dummy.drop(columns=['UnKnown'])
Cabin_dummy.head(2)

Unnamed: 0,Cabin A,Cabin B,Cabin C,Cabin D,Cabin E,Cabin F,"Cabin G,T"
0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0


> ### 3.3 - Embarked

In [6]:
# Expand the Embarked codes into one indicator column per category.
Embarked_dummy = make_dummy('Embarked')

# Labels are assigned by position.
# NOTE(review): assumes the dummy columns come out in ascending code order
# (Southampton, Cherbourg, Queenstown) — confirm against the mapping step.
col_name = [
    'S:Southampton',
    'C:Cherbourg',
    'Q:Queenstown',
]
Embarked_dummy.columns = col_name

# 'S:Southampton' is the benchmark category, so its column is removed.
Embarked_dummy = Embarked_dummy.drop(columns=['S:Southampton'])
Embarked_dummy.head(2)

Unnamed: 0,C:Cherbourg,Q:Queenstown
0,0,0
1,1,0


<hr>

## 4 - Summary of Benchmark

Let's review the benchmark of each dummy variable. The benchmarks are the categories that we removed to prevent multicollinearity.

*   <b>Sex</b>: <i> male (0) </i>
*   <b>Cabin</b>: <i> UnKnown (0) </i>
*   <b>Embarked</b>: <i> S:Southampton (0) </i>

<hr>

## 5 - Encoded Dataset

In [7]:
# Show the frame as loaded, before the dummy columns replace the coded ones.
df

Unnamed: 0,Age,Parch,Fare,Pclass,Sex,Cabin,Embarked,Survived
0,22.0,0,7.2500,3,0,0,0,0
1,38.0,0,71.2833,1,1,3,1,1
2,26.0,0,7.9250,3,1,0,0,1
3,35.0,0,53.1000,1,1,3,0,1
4,35.0,0,8.0500,3,0,0,0,0
...,...,...,...,...,...,...,...,...
1304,28.0,0,8.0500,3,0,0,0,0
1305,39.0,0,108.9000,1,1,3,1,1
1306,38.0,0,7.2500,3,0,0,0,0
1307,28.0,0,8.0500,3,0,0,0,0


In [7]:
# Features carried over unchanged. They are selected by name rather than by
# position so the cell does not silently pick the wrong columns if the
# column order of the input frame ever changes.
# NOTE(review): Pclass is present in the loaded frame but is neither kept
# here nor encoded above — confirm that dropping it is intended.
numeric = df.loc[:, ['Age', 'Parch', 'Fare']]

target = df.loc[:, 'Survived']

# Place the three encoded blocks side by side (benchmark columns already removed).
dummy = pd.concat([Sex_dummy, Cabin_dummy, Embarked_dummy], axis=1)

# NOTE: this rebinds `df` to the encoded frame. The coded Sex/Cabin/Embarked
# columns are gone afterwards, so the encoding cells cannot be re-run
# without reloading the data first.
df = pd.concat([numeric, dummy, target], axis=1)

In [9]:
# Show the final frame: unchanged features, dummy columns, then the target.
df

Unnamed: 0,Age,Parch,Fare,female,Cabin A,Cabin B,Cabin C,Cabin D,Cabin E,Cabin F,"Cabin G,T",C:Cherbourg,Q:Queenstown,Survived
0,22.0,0,7.2500,0,0,0,0,0,0,0,0,0,0,0
1,38.0,0,71.2833,1,0,0,1,0,0,0,0,1,0,1
2,26.0,0,7.9250,1,0,0,0,0,0,0,0,0,0,1
3,35.0,0,53.1000,1,0,0,1,0,0,0,0,0,0,1
4,35.0,0,8.0500,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,28.0,0,8.0500,0,0,0,0,0,0,0,0,0,0,0
1305,39.0,0,108.9000,1,0,0,1,0,0,0,0,1,0,1
1306,38.0,0,7.2500,0,0,0,0,0,0,0,0,0,0,0
1307,28.0,0,8.0500,0,0,0,0,0,0,0,0,0,0,0


<hr>

## Check Point

In [11]:
# Persist the encoded frame so the next step can resume from this checkpoint.
with open('../Assets/Version 4-3-encoded.pickle', 'wb') as handle:
    pickle.dump(df, handle)