In [2]:
import numpy as np
import pandas as pd

# Load the Social Network Ads data and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')
df.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


## One Hot Encoding

In [4]:
# One-hot encode every string column (only Gender here); numeric columns pass through.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool (True/False),
# which would no longer match the output shown below.
pd.get_dummies(df, dtype=int).head()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,15624510,19,19000,0,0,1
1,15810944,35,20000,0,0,1
2,15668575,26,43000,0,1,0
3,15603246,27,57000,0,1,0
4,15804002,19,76000,0,0,1


In [4]:
# Name the columns to encode explicitly so the numeric 0/1 target is expanded too.
# dtype=int keeps the indicators as 0/1 on every pandas version
# (pandas >= 2.0 would otherwise return bool columns).
pd.get_dummies(df, columns=['Gender', 'Purchased'], dtype=int).head()

Unnamed: 0,User ID,Age,EstimatedSalary,Gender_Female,Gender_Male,Purchased_0,Purchased_1
0,15624510,19,19000,0,1,1,0
1,15810944,35,20000,0,1,1,0
2,15668575,26,43000,1,0,1,0
3,15603246,27,57000,1,0,1,0
4,15804002,19,76000,0,1,1,0


In [5]:
# One-hot encoding with custom column prefixes: `prefix` is matched to `columns`
# by position, giving Gen_* and Pur_* instead of Gender_* and Purchased_*.
# dtype=int keeps the indicators as 0/1 (pandas >= 2.0 defaults to bool).
pd.get_dummies(
    df,
    columns=['Gender', 'Purchased'],
    prefix=['Gen', 'Pur'],
    dtype=int,
).head()

Unnamed: 0,User ID,Age,EstimatedSalary,Gen_Female,Gen_Male,Pur_0,Pur_1
0,15624510,19,19000,0,1,1,0
1,15810944,35,20000,0,1,1,0
2,15668575,26,43000,1,0,1,0
3,15603246,27,57000,1,0,1,0
4,15804002,19,76000,0,1,1,0


In [6]:
# Obtain k-1 indicator columns per feature: drop_first removes the first level of
# each encoded column, since it is implied when all remaining indicators are 0.
# dtype=int keeps the indicators as 0/1 (pandas >= 2.0 defaults to bool).
pd.get_dummies(
    df,
    columns=['Gender', 'Purchased'],
    drop_first=True,
    dtype=int,
).head()

Unnamed: 0,User ID,Age,EstimatedSalary,Gender_Male,Purchased_1
0,15624510,19,19000,1,0
1,15810944,35,20000,1,0
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,1,0


In [7]:
# Switch to the second dataset. This rebinds `df`, so the Social_Network_Ads
# frame loaded earlier is no longer reachable from the cells below.
df = pd.read_csv(filepath_or_buffer='Data.csv')
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,,61000.0,No
4,Germany,40.0,,Yes


In [9]:
# Obtain k-1 indicator columns for every string column (Country and Purchased here).
# Age and Salary are numeric and pass through unchanged, missing values included.
# dtype=int keeps the indicators as 0/1 (pandas >= 2.0 defaults to bool).
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain,Purchased_Yes
0,44.0,72000.0,0,0,0
1,27.0,48000.0,0,1,1
2,30.0,54000.0,1,0,0
3,,61000.0,0,1,0
4,40.0,,1,0,1


## Ordinal Encoding

In [11]:
# Distinct Country labels, in order of first appearance.
cats = df.Country.unique()
cats

array(['France', 'Spain', 'Germany'], dtype=object)

In [43]:
# Rank the countries by mean Age, lowest first, so the ordinal codes built from
# this list follow a data-driven order rather than order of appearance.
cats = (
    df.groupby(['Country'])['Age']
    .mean()
    .sort_values()
    .index
    .tolist()
)
cats

['Spain', 'Germany', 'France']

In [41]:
# Map each category to its position in `cats` (0 for the first entry, 1 for the next, ...).
# enumerate replaces the hand-maintained counter and keeps the loop variables
# out of the notebook namespace.
ord_dict = {cat: rank for rank, cat in enumerate(cats)}
ord_dict

{'Spain': 0, 'Germany': 1, 'France': 2}

In [44]:
# Attach the ordinal code for each row's country; labels missing from ord_dict become NaN.
df['Country_ord'] = df.Country.map(ord_dict)

In [45]:
# Preview the frame with the Country_ord column added.
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased,Country_ord
0,France,44.0,72000.0,No,2
1,Spain,27.0,48000.0,Yes,0
2,Germany,30.0,54000.0,No,1
3,Spain,,61000.0,No,0
4,Germany,40.0,,Yes,1


## Frequency Encoding

In [19]:
# Number of rows per country, most frequent first.
df['Country'].value_counts()

France     4
Germany    3
Spain      3
Name: Country, dtype: int64

In [20]:
# Lookup table: country -> number of rows it appears in.
# Countries with equal counts (Germany and Spain both have 3 here) become
# indistinguishable once encoded this way.
count_dict = df['Country'].value_counts().to_dict()
count_dict

{'France': 4, 'Germany': 3, 'Spain': 3}

In [46]:
# Replace each country label with its frequency from count_dict.
df['Country_count'] = df.Country.map(count_dict)

In [47]:
# Preview the frame with the Country_count column added.
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased,Country_ord,Country_count
0,France,44.0,72000.0,No,2,4
1,Spain,27.0,48000.0,Yes,0,3
2,Germany,30.0,54000.0,No,1,3
3,Spain,,61000.0,No,0,3
4,Germany,40.0,,Yes,1,3


## Probability Encoding

In [48]:
# Binarise the target: 'Yes' -> 1, anything else -> 0.
# The guard makes re-running this cell a no-op. Without it, a second run would
# compare the already-converted 0/1 values to 'Yes' and silently zero every label.
if not pd.api.types.is_numeric_dtype(df['Purchased']):
    df['Purchased'] = df['Purchased'].eq('Yes').astype('int64')

In [49]:
# Preview the frame now that Purchased holds 0/1 instead of No/Yes.
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased,Country_ord,Country_count
0,France,44.0,72000.0,0,2,4
1,Spain,27.0,48000.0,1,0,3
2,Germany,30.0,54000.0,0,1,3
3,Spain,,61000.0,0,0,3
4,Germany,40.0,,1,1,3


In [50]:
# Share of purchasers per country: the mean of the 0/1 target within each group.
df.groupby('Country')['Purchased'].mean()

Country
France     0.750000
Germany    0.333333
Spain      0.333333
Name: Purchased, dtype: float64

In [51]:
# Lookup table: country -> observed purchase rate.
ordered_labels = (
    df.groupby('Country')['Purchased']
    .mean()
    .to_dict()
)
ordered_labels

{'France': 0.75, 'Germany': 0.3333333333333333, 'Spain': 0.3333333333333333}

In [52]:
# Encode each row's country as that country's purchase rate, then preview.
df['Purchase_Prob'] = df.Country.map(ordered_labels)
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased,Country_ord,Country_count,Purchase_Prob
0,France,44.0,72000.0,0,2,4,0.75
1,Spain,27.0,48000.0,1,0,3,0.333333
2,Germany,30.0,54000.0,0,1,3,0.333333
3,Spain,,61000.0,0,0,3,0.333333
4,Germany,40.0,,1,1,3,0.333333


## Probability Ratio Encoding

In [53]:
# Per-country purchase rate as a one-column frame (indexed by Country) so that
# further columns can be added next to it.
prob_df = df.groupby('Country')['Purchased'].mean().to_frame()
prob_df

Unnamed: 0_level_0,Purchased
Country,Unnamed: 1_level_1
France,0.75
Germany,0.333333
Spain,0.333333


In [54]:
# Rebuild the per-country purchase rate, then add its complement:
# the probability of target = 0 (people who did not purchase).
prob_df = df.groupby('Country')['Purchased'].mean().to_frame()
prob_df['Not Purchased'] = 1 - prob_df['Purchased']
prob_df

Unnamed: 0_level_0,Purchased,Not Purchased
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
France,0.75,0.25
Germany,0.333333,0.666667
Spain,0.333333,0.666667


In [55]:
# Odds of purchasing per country: P(purchased) / P(not purchased).
# NOTE: a country where every row purchased has 'Not Purchased' == 0, which
# makes this ratio inf; no such country exists in the rates shown above.
prob_df['ratio'] = prob_df['Purchased'].div(prob_df['Not Purchased'])
prob_df

Unnamed: 0_level_0,Purchased,Not Purchased,ratio
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,0.75,0.25,3.0
Germany,0.333333,0.666667,0.5
Spain,0.333333,0.666667,0.5


In [56]:
# Inspect the country -> odds mapping before storing it.
prob_df.ratio.to_dict()

{'France': 3.0, 'Germany': 0.49999999999999994, 'Spain': 0.49999999999999994}

In [57]:
# Lookup table: country -> purchase odds.
prob_labels = prob_df.ratio.to_dict()

In [58]:
# Encode each row's country as that country's purchase odds, then preview.
df['Country_prob'] = df['Country'].map(prob_labels)
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased,Country_ord,Country_count,Purchase_Prob,Country_prob
0,France,44.0,72000.0,0,2,4,0.75,3.0
1,Spain,27.0,48000.0,1,0,3,0.333333,0.5
2,Germany,30.0,54000.0,0,1,3,0.333333,0.5
3,Spain,,61000.0,0,0,3,0.333333,0.5
4,Germany,40.0,,1,1,3,0.333333,0.5


## Weight Of Evidence Encoding

In [60]:
# Start the WoE table from the per-country purchase rate, kept as a one-column
# frame indexed by Country.
prob_df = df.groupby('Country')['Purchased'].mean().to_frame()
prob_df

Unnamed: 0_level_0,Purchased
Country,Unnamed: 1_level_1
France,0.75
Germany,0.333333
Spain,0.333333


In [61]:
# Rebuild the per-country purchase rate, then add its complement:
# the probability of target = 0 (people who did not purchase).
prob_df = df.groupby('Country')['Purchased'].mean().to_frame()
prob_df['Not Purchased'] = 1 - prob_df['Purchased']
prob_df

Unnamed: 0_level_0,Purchased,Not Purchased
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
France,0.75,0.25
Germany,0.333333,0.666667
Spain,0.333333,0.666667


In [62]:
# log(0) is undefined, so swap any probability that is exactly zero for a tiny
# positive stand-in before taking the log of the ratio.
prob_df['Purchased'] = prob_df['Purchased'].mask(
    prob_df['Purchased'].eq(0), 0.00001
)
prob_df['Not Purchased'] = prob_df['Not Purchased'].mask(
    prob_df['Not Purchased'].eq(0), 0.00001
)
prob_df

Unnamed: 0_level_0,Purchased,Not Purchased
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
France,0.75,0.25
Germany,0.333333,0.666667
Spain,0.333333,0.666667


In [64]:
# WoE as used here: the natural log of P(purchased) / P(not purchased) within
# each country, i.e. the per-country log-odds.
# NOTE(review): some references define WoE from each class's share of the overall
# events/non-events instead; confirm which definition is intended.
prob_df['WoE'] = np.log(prob_df['Purchased'].div(prob_df['Not Purchased']))
prob_df

Unnamed: 0_level_0,Purchased,Not Purchased,WoE
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,0.75,0.25,1.098612
Germany,0.333333,0.666667,-0.693147
Spain,0.333333,0.666667,-0.693147


In [65]:
# Lookup table: country -> WoE value.
woe_labels = prob_df.WoE.to_dict()

In [67]:
# Encode each row's country as that country's WoE value, then preview.
df['Country_WOE'] = df['Country'].map(woe_labels)
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased,Country_ord,Country_count,Purchase_Prob,Country_prob,Country_WOE
0,France,44.0,72000.0,0,2,4,0.75,3.0,1.098612
1,Spain,27.0,48000.0,1,0,3,0.333333,0.5,-0.693147
2,Germany,30.0,54000.0,0,1,3,0.333333,0.5,-0.693147
3,Spain,,61000.0,0,0,3,0.333333,0.5,-0.693147
4,Germany,40.0,,1,1,3,0.333333,0.5,-0.693147
