# Feature Engineering - Encoding

This is done only for categorical variables, which are of two types:
- Nominal (male, female)
    * One Hot Encoding
    * One Hot Encoding with Many Categorical Variables
    * Mean Encoding
- Ordinal (can be ranked - MBA, BE, BCOM etc.)
    - Label Encoding
    - Target Guided Ordinal Encoding

## Import Data

In [2]:
def _gd_direct_link(share_url):
    """Convert a Google Drive '/file/d/<id>/view' share link into a direct-download URL."""
    # The file id is the second-to-last '/'-separated segment of the share link.
    file_id = share_url.split('/')[-2]
    return 'https://drive.google.com/uc?export=download&id=' + file_id


# Share links and the direct-download URLs that pd.read_csv can consume.
gdurltitanic = 'https://drive.google.com/file/d/15tUbXaTIktuLzNjTq6euUHqBpGNuz7u6/view?usp=sharing'
gdtitanic = _gd_direct_link(gdurltitanic)

gdurlhousing = 'https://drive.google.com/file/d/11bQwRc8TIPWbLXwnElIOf_NygTCwomwa/view?usp=sharing'
gdhousing = _gd_direct_link(gdurlhousing)

gdmercedes = _gd_direct_link(
    'https://drive.google.com/file/d/1cgfq8R82cLL1xJYbh9jA_qbHihThzYHg/view?usp=sharing'
)

# 1. One Hot Encoding

In [3]:
import pandas as pd

In [4]:
# Load only the 'Sex' column from the Titanic CSV and preview the first rows.
df = pd.read_csv(gdtitanic, usecols=['Sex'])
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [6]:
# One indicator column per category (k columns for k categories).
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 would otherwise
# return True/False columns.
pd.get_dummies(df, drop_first=False, dtype=int).head()

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [5]:
# drop_first=True drops the first category's column, leaving k-1 indicators;
# the dropped category is implied when all remaining indicators are 0.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 would otherwise
# return True/False columns.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [6]:
# Reload the Titanic CSV keeping only 'Embarked'; this replaces the previous df.
df = pd.read_csv(gdtitanic, usecols=['Embarked'])

In [7]:
# Distinct values of the column; unique() reports NaN as a value when present.
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [8]:
# Drop rows with a missing 'Embarked' value before encoding.
# Rebinding instead of inplace=True keeps the step chainable and explicit.
df = df.dropna()

In [9]:
# One indicator column per embarkation value.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 would otherwise
# return True/False columns.
pd.get_dummies(df, drop_first=False, dtype=int).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


# 2. Handling Multiple Categorical Variables - Mercedes Dataset

In [3]:
# Load the seven columns X0..X6 from the Mercedes CSV; this replaces the previous df.
df = pd.read_csv(gdmercedes,usecols=['X0','X1','X2','X3','X4','X5','X6'])

In [4]:
# Preview the first five rows.
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [10]:
# Print each column's cardinality (number of distinct values, NaN counted as
# a value) to see how many dummy columns plain one-hot encoding would create.
for column_name in df.columns:
    print(column_name, df[column_name].nunique(dropna=False))

X0 47
X1 27
X2 44
X3 7
X4 4
X5 29
X6 12


In [11]:
# The ten most frequent X1 categories with their row counts, largest first.
df['X1'].value_counts().sort_values(ascending=False).head(10)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

##### Check out KDD Cup Orange Challenge
Pick only the top 10 most frequent values and one-hot encode just those; all other values get 0 in every indicator column.

In [15]:
# Labels of the ten most frequent X1 categories, as a plain Python list.
lst_10 = df['X1'].value_counts().sort_values(ascending=False).head(10).index.tolist()
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [16]:
import numpy as np

# Add one 0/1 indicator column per top-10 category, named after the category
# itself. Rows whose X1 value is outside the top 10 get 0 in every indicator.
for top_category in lst_10:
    is_category = df['X1'] == top_category
    df[top_category] = np.where(is_category, 1, 0)

In [17]:
# Display the frame, now including the ten indicator columns added above.
df

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,ak,s,as,c,d,aa,d,0,1,0,0,0,0,0,0,0,0
4205,j,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,1
4206,ak,v,r,a,d,aa,g,0,0,0,0,1,0,0,0,0,0
4207,al,r,e,f,d,aa,l,0,0,0,0,0,1,0,0,0,0


In [18]:
# Add the source column to the selection list so it can be shown next to its
# indicator columns. The guard makes the cell idempotent: without it every
# re-run appends another 'X1' and df[lst_10] returns duplicate X1 columns.
if 'X1' not in lst_10:
    lst_10.append('X1')

In [19]:
# Show the indicator columns alongside the original X1 value for comparison.
df[lst_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


# 3. Label Encoding/ Ordinal Number Encoding
For variables that can be ranked

In [2]:
import datetime

In [3]:
# Current local date and time; the value (and every output derived from it)
# changes each time the notebook is run.
today_date = datetime.datetime.today()

In [4]:
# Display the captured timestamp.
today_date

datetime.datetime(2020, 9, 12, 5, 33, 16, 176107)

In [5]:
# Subtracting a one-day timedelta gives the same time on the previous day.
today_date - datetime.timedelta(days=1)

datetime.datetime(2020, 9, 11, 5, 33, 16, 176107)

In [6]:
# List comprehension: today plus the 14 preceding days (15 timestamps),
# most recent first.
days = [
    today_date - datetime.timedelta(days=offset)
    for offset in range(15)
]

In [12]:
import pandas as pd

# Single-column frame holding the generated timestamps under the name 'Day'.
data = pd.DataFrame({'Day': days})

In [13]:
# Preview the first five timestamps.
data.head()

Unnamed: 0,Day
0,2020-09-12 05:33:16.176107
1,2020-09-11 05:33:16.176107
2,2020-09-10 05:33:16.176107
3,2020-09-09 05:33:16.176107
4,2020-09-08 05:33:16.176107


In [20]:
# Derive the weekday name from each timestamp via the .dt accessor.
data['weekday'] = data['Day'].dt.day_name()
data.head()

Unnamed: 0,Day,weekday
0,2020-09-12 05:33:16.176107,Saturday
1,2020-09-11 05:33:16.176107,Friday
2,2020-09-10 05:33:16.176107,Thursday
3,2020-09-09 05:33:16.176107,Wednesday
4,2020-09-08 05:33:16.176107,Tuesday


In [21]:
# Ordinal rank for each weekday name: Monday=1 through Sunday=7.
dictionary = {
    day_name: rank
    for rank, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}

In [22]:
# Display the weekday -> rank mapping.
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [25]:
# Series.map looks up each weekday name in `dictionary` and stores the
# resulting rank in a new column; the text column is left unchanged.
data['weekday_ordinal']=data['weekday'].map(dictionary)
data

Unnamed: 0,Day,weekday,weekday_ordinal
0,2020-09-12 05:33:16.176107,Saturday,6
1,2020-09-11 05:33:16.176107,Friday,5
2,2020-09-10 05:33:16.176107,Thursday,4
3,2020-09-09 05:33:16.176107,Wednesday,3
4,2020-09-08 05:33:16.176107,Tuesday,2
5,2020-09-07 05:33:16.176107,Monday,1
6,2020-09-06 05:33:16.176107,Sunday,7
7,2020-09-05 05:33:16.176107,Saturday,6
8,2020-09-04 05:33:16.176107,Friday,5
9,2020-09-03 05:33:16.176107,Thursday,4


# 4. Count or Frequency Encoding
Advantages:
- We are not increasing the number of features

Disadvantages:
- If two categories have the same frequency, they receive the same encoded value and can no longer be told apart (e.g. two countries that both occur 29 times)

In [43]:
import pandas as pd
# The file has no header row, so header=None makes pandas number the columns 0..14.
# NOTE(review): the value_counts output further down shows labels with a leading
# space (' United-States'); the raw text fields appear to keep the space after
# each comma — confirm before matching on label text.
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [44]:
# (rows, columns) of the loaded frame.
df.shape

(32561, 15)

In [45]:
# Positional labels of the eight columns kept for this example; they hold
# text (categorical) values in the preview above.
columns = [1,3,5,6,7,8,9,13]

In [46]:
# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so the later assignment to df['Country'] does not raise
# SettingWithCopyWarning or write into a slice of the original frame.
df = df[columns].copy()

In [47]:
# Give the eight selected columns readable names (same order as `columns`).
df.columns=['Employment','Degree','Status','Designation','Family','Race','Sex','Country']

In [48]:
# Preview the renamed columns.
df.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [49]:
# Report how many distinct labels each column has (NaN counted as a label).
for column_name in df.columns:
    n_labels = df[column_name].nunique(dropna=False)
    print(column_name, ":", n_labels, 'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [53]:
# Map each Country label to the number of rows it occurs in.
country_map = df['Country'].value_counts().to_dict()
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' Greece': 29,
 ' France': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

### Now we want to map the frequency in place of the actual feature

In [54]:
# Overwrite each Country label with its row count from country_map.
# Not idempotent: country_map is keyed by the original text labels, so running
# this cell a second time looks up the already-numeric values and yields NaN.
df['Country']=df['Country'].map(country_map)
df

Unnamed: 0,Employment,Degree,Status,Designation,Family,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,29170
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,29170
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,29170
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,29170


# 5. Target Guided Ordinal Encoding
- For Ordinal Categories
- Order the labels according to the target
- Replace each label by its integer rank in that ordering (the ordering is by the mean of the target, i.e. the probability of the target being 1 in a classification problem)

In [55]:
import pandas as pd
# Load only the feature to encode ('Cabin') and the target ('Survived').
df=pd.read_csv(gdtitanic,usecols=['Cabin','Survived'])

In [57]:
# Preview; empty Cabin cells are missing values.
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [58]:
# Treat a missing cabin as its own category, 'Missing'.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not
# update df when pandas Copy-on-Write is enabled.
df['Cabin'] = df['Cabin'].fillna('Missing')

In [59]:
# Confirm the missing cabins now read 'Missing'.
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [61]:
# Reduce each cabin value to its first character, which collapses the many
# distinct cabin strings into a few groups ('Missing' becomes 'M').
df['Cabin']=df['Cabin'].astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [62]:
# Distinct cabin letters after the reduction.
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [64]:
# Mean of Survived per cabin letter; Survived is 0/1 in the preview, so this
# is the survival rate of each group.
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [67]:
# Cabin letters ordered from lowest to highest mean Survived.
df.groupby(['Cabin'])['Survived'].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [69]:
# Keep that ordering (ascending mean Survived) for building the rank mapping.
ordinal_labels = df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [70]:
# Pair every cabin letter with its 0-based position in the sorted ordering:
# position 0 is the letter with the lowest mean Survived.
ordinal_labels2 = dict(zip(ordinal_labels, range(len(ordinal_labels))))
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [71]:
# Store the target-guided rank of each row's cabin letter in a new column.
df['Cabin_ordinal_labels']=df['Cabin'].map(ordinal_labels2)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


# 6. Mean Encoding
Advantages:
- Captures info within the label
- Creates a monotonic relationship between the label and the target

Disadvantages:
- Prone to overfitting

In [75]:
# Mean of Survived per cabin letter, as a {letter: mean} dict.
# NOTE(review): the means are computed on the same rows they are later mapped
# onto; with a train/test split they should presumably come from the training
# rows only — confirm for real use.
mean_ordinal=df.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [77]:
# Replace each cabin letter by its group's mean Survived, in a new column.
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


# 7. Probability Ratio Encoding

In [78]:
import pandas as pd

In [79]:
# Reload Cabin and Survived from the CSV, discarding the columns added earlier.
df = pd.read_csv(gdtitanic, usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [82]:
# Replace NaN cabins with the explicit category 'Missing'.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not
# update df when pandas Copy-on-Write is enabled.
df['Cabin'] = df['Cabin'].fillna('Missing')
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [83]:
# Distinct raw cabin strings, before they are reduced to a single letter.
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [85]:
# Keep only the first character of each cabin value ('Missing' becomes 'M').
df['Cabin']=df['Cabin'].astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [91]:
# Mean of Survived per cabin letter, i.e. each group's survival rate (a Series).
prob_df=df.groupby(['Cabin'])['Survived'].mean()
prob_df

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [96]:
# Convert the per-cabin survival rates into a DataFrame so that further
# columns can be added next to 'Survived'.
prob_df=pd.DataFrame(prob_df)
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [97]:
# Complement of the survival rate: the share of each group that did not survive.
prob_df['Died']=1-prob_df['Survived']
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [98]:
# Ratio of the survival rate to the death rate for each cabin letter.
# A group with Died == 0 would divide by zero here; every Died value in the
# table shown above is non-zero.
prob_df['Probabity_ratio']=prob_df['Survived']/prob_df['Died']
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probabity_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [100]:
# {cabin letter: probability ratio} lookup used for the encoding.
probability_encoded=prob_df['Probabity_ratio'].to_dict()

In [101]:
# Store each row's cabin-letter probability ratio in a new column.
df['Cabin_encoded']=df['Cabin'].map(probability_encoded)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
