#### Data Science : LAB-5
#### NAME : RAJNI GUPTA
#### PRN : 20190802012
#### AIM: PERFORMING FEATURE ENCODING ON TITANIC DATASET


In [129]:
# Importing Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


### Pre-processing Data to remove null Values

In [114]:
from sklearn.model_selection import train_test_split

# Load only the target ('Survived') and the three categorical columns
# that the encoding examples below work on
data = pd.read_csv(
    'Titanic_Dataset.csv',
    usecols=['Sex', 'Embarked', 'Cabin', 'Survived'],
)

# Missing cabins get the placeholder 'Missing', then every cabin is reduced
# to its first letter for this demonstration (so the placeholder shows up
# as 'M'). Only 'Cabin' is filled here; missing 'Embarked' values stay NaN.
data['Cabin'] = data['Cabin'].fillna('Missing').str[0]
data.head()


Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,M,S
1,1,female,C,C
2,1,female,M,S
3,1,female,C,S
4,0,male,M,S


### One Hot Encoding

In [115]:
# One-hot encode: every categorical column is expanded into one indicator
# column per category (e.g. Sex -> Sex_female / Sex_male), while 'Survived'
# passes through as is. The result is only displayed, not assigned, so
# `data` itself is left unchanged for the cells below.
pd.get_dummies(data)


Unnamed: 0,Survived,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_M,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
1,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0
2,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1
3,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
887,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1
888,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
889,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0


### Integer Encoding / Label Encoding


In [116]:
# Builds the label-encoding dictionary: category -> integer code
def find_category_mappings(data, variable):
    """Return a ``{category: code}`` dictionary for column `variable`.

    Codes are consecutive integers starting at 0, handed out in the order
    in which each distinct value first appears in ``data[variable]``.
    A missing value (NaN) in the column is treated as one more category.
    """
    categories = data[variable].unique()
    return dict(zip(categories, range(len(categories))))

# Preview the mapping for one column. The column is named explicitly because
# `variable` is not bound until the encoding loop further down, so running
# the notebook top to bottom would otherwise stop here with a NameError.
find_category_mappings(data, 'Embarked')


{'S': 0, 'C': 1, 'Q': 2, nan: 3}

In [117]:
# Overwrites a column, in place, with its dictionary-mapped values
def integer_encode(data, variable, ordinal_mapping):
    """Replace ``data[variable]`` with its values looked up in `ordinal_mapping`.

    The dataframe is modified in place and nothing is returned. Values that
    have no entry in `ordinal_mapping` become NaN.
    """
    encoded_column = data[variable].map(ordinal_mapping)
    data[variable] = encoded_column
    
# integer_encode has only been defined at this point, not called, so this
# still shows the original string categories.
data.head()


Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,M,S
1,1,female,C,C
2,1,female,M,S
3,1,female,C,S
4,0,male,M,S


In [118]:
# Label-encode each categorical column in place: build the
# category -> integer mapping, then overwrite the column with the codes.
# NOTE: `variable` stays bound to 'Embarked' once the loop finishes; later
# cells call find_category_mappings(data, variable, ...) and rely on that
# leftover value, so do not rename the loop variable.
for variable in ['Sex','Cabin','Embarked']:
    mappings = find_category_mappings(data,variable)
    integer_encode(data, variable, mappings)
    
data.head()


Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,0,0,0
1,1,1,1,1
2,1,1,0,0
3,1,1,1,0
4,0,0,0,0


### Count or frequency encoding


In [119]:
# One frequency table per column: {current value -> number of rows with it}
count_map_sex, count_map_cabin, count_map_embark = (
    data[column].value_counts().to_dict()
    for column in ('Sex', 'Cabin', 'Embarked')
)


In [120]:
# Replace every value by how often it occurs in its column. All three
# lookups are evaluated first and then written back in one assignment.
data['Sex'], data['Cabin'], data['Embarked'] = (
    data['Sex'].map(count_map_sex),
    data['Cabin'].map(count_map_cabin),
    data['Embarked'].map(count_map_embark),
)
data.head()


Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,577,687,644
1,1,314,59,168
2,1,314,687,644
3,1,314,59,644
4,0,577,687,644


### Ordered Integer Encoding


In [121]:
target = 'Survived'


def find_category_mappings(data, variable, target):
    """Return a ``{category: rank}`` dictionary ordered by the target mean.

    The categories of `variable` are sorted by the mean of `target` within
    each category, ascending, and numbered from 0: the category with the
    lowest target mean gets 0, the next one 1, and so on.
    """
    # mean of the target for every category of the variable
    target_mean = data.groupby([variable])[target].mean()
    # categories from lowest to highest target mean
    ranked_categories = target_mean.sort_values().index
    return dict(zip(ranked_categories, range(len(ranked_categories))))

# `variable` is the loop variable left over from the label-encoding loop
# (its last value, 'Embarked'), so this previews the ordered mapping for the
# count-encoded 'Embarked' column only. Nothing is applied to `data` here.
find_category_mappings(data, variable, target)


{644: 0, 77: 1, 168: 2, 2: 3}

In [122]:
def integer_encode(data,variable, ordinal_mapping):
    """Swap the values of ``data[variable]`` for their `ordinal_mapping` codes.

    Works in place on `data` and returns nothing; a value without an entry
    in `ordinal_mapping` ends up as NaN.
    """
    source_column = data[variable]
    data[variable] = source_column.map(ordinal_mapping)
# integer_encode is only redefined above, not called, so `data` still holds
# the count-encoded values from the previous section.
data.head()


Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,577,687,644
1,1,314,59,168
2,1,314,687,644
3,1,314,59,644
4,0,577,687,644


### Encoding using “Weight of Evidence”


In [123]:
# Probability of survived = 1 for each 'Cabin' value (the mean of the 0/1
# 'Survived' column per group), captured directly as a one-column dataframe
# indexed by 'Cabin'
prob_df = pd.DataFrame(data.groupby(['Cabin'])['Survived'].mean())


In [124]:
# now the probability of survived = 0
# (probability of non-events or p(0)), i.e. the complement of the
# per-cabin survival probability stored in the 'Survived' column
prob_df['died'] = 1-prob_df['Survived']
prob_df


Unnamed: 0_level_0,Survived,died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,1.0
4,0.5,0.5
13,0.615385,0.384615
15,0.466667,0.533333
32,0.75,0.25
33,0.757576,0.242424
47,0.744681,0.255319
59,0.59322,0.40678
687,0.299854,0.700146


In [125]:
# Natural log of the survived/died odds for each 'Cabin' group.
# NOTE: a group whose survival probability is 0 yields log(0) = -inf (and a
# probability of 1 would divide by zero and yield +inf), so such groups need
# smoothing or clipping before the ratio can be used as a model feature.
prob_df['ratio'] = np.log( prob_df['Survived'] / prob_df['died'] )
prob_df

Unnamed: 0_level_0,Survived,died,ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,1.0,-inf
4,0.5,0.5,0.0
13,0.615385,0.384615,0.470004
15,0.466667,0.533333,-0.133531
32,0.75,0.25,1.098612
33,0.757576,0.242424,1.139434
47,0.744681,0.255319,1.070441
59,0.59322,0.40678,0.377294
687,0.299854,0.700146,-0.847991


In [126]:
# Encoding using WoE
def find_category_mappings(data, variable, target, eps=0.0):
    """Map every category of `variable` to the log-odds of `target`.

    For each category, p is the mean of `target` within that category
    (`target` is expected to be a 0/1 indicator so that the mean is a
    probability), and the category is mapped to ``ln(p / (1 - p))``.
    This per-category log-odds differs from the classical Weight of
    Evidence, ``ln(share of events / share of non-events)``, only by a
    constant that is the same for all categories.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    variable : str
        Name of the categorical column to encode.
    target : str
        Name of the binary target column.
    eps : float, default 0.0
        When positive, p is clipped to ``[eps, 1 - eps]`` before the log is
        taken, so every category gets a finite value. With the default of
        0.0 nothing is clipped: a category whose target mean is exactly 0
        maps to ``-inf`` and one whose target mean is exactly 1 maps to
        ``+inf``.

    Returns
    -------
    dict
        ``{category: log-odds}``.
    """
    event_rate = data.groupby([variable])[target].mean()
    if eps > 0:
        # keep p strictly inside (0, 1) so neither log(0) nor x / 0 occurs
        event_rate = event_rate.clip(lower=eps, upper=1 - eps)
    non_event_rate = 1 - event_rate
    return np.log(event_rate / non_event_rate).to_dict()

# `variable` and `target` are module-level names set by earlier cells
# (`variable` is the leftover loop value 'Embarked', `target` is 'Survived'),
# so this previews the mapping for the count-encoded 'Embarked' column.
# NOTE: the group whose members all survived comes out as inf.
find_category_mappings(data, variable, target)


{2: inf,
 77: -0.44895022004790314,
 168: 0.21511137961694568,
 644: -0.6768866596881652}

In [127]:
def integer_encode(data,variable, ordinal_mapping):
    """Overwrite ``data[variable]`` with the values `ordinal_mapping` assigns.

    The lookup is done per value; the frame is changed in place and the
    function returns nothing. Values missing from `ordinal_mapping` turn
    into NaN.
    """
    mapped_values = data[variable].map(ordinal_mapping)
    data[variable] = mapped_values
    

In [128]:
# Apply the log-odds ("WoE") encoding in place to each categorical column,
# using 'Survived' as the target.
# NOTE: a mapping may contain inf / -inf for groups whose members all
# survived or all died (the 'Embarked' preview above shows one inf entry),
# and those values are written into the column unchanged.
for variable in ['Sex','Cabin','Embarked']:
    mappings = find_category_mappings(data, variable, 'Survived')
    integer_encode(data,variable, mappings)
    
data.head()


Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,-1.45712,-0.847991,-0.676887
1,1,1.056589,0.377294,0.215111
2,1,1.056589,-0.847991,-0.676887
3,1,1.056589,0.377294,-0.676887
4,0,-1.45712,-0.847991,-0.676887


### Observations : 
#### If there are only a few categories and the data is nominal categorical data, then one-hot encoding works just fine. 
#### If the relationship between any categorical column as independent variable and dependent variable (Target Variable) is important, then Ordered Integer Encoding can be applied. 
#### For ordinal categorical data, simply Label Encoding can be used.
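#### Target-based encodings (Ordered Integer Encoding and Weight of Evidence) are also prone to cause over-fitting, because their mappings are derived from the target variable.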

### Conclusion :
#### Feature Encoding Techniques are successfully applied on nominal & ordinal categorical variables on the given dataset.