## Handle Categorical Features

#### Ordinal Number Encoding

In [1]:
import datetime

In [2]:
# Capture the current local date and time as a naive datetime (no timezone).
today_date = datetime.datetime.now()

In [3]:
today_date

datetime.datetime(2021, 5, 20, 14, 14, 51, 642059)

In [4]:
# The same moment exactly one day earlier.
today_date - datetime.timedelta(days=1)

datetime.datetime(2021, 5, 19, 14, 14, 51, 642059)

In [5]:
# List comprehension: today followed by the 14 days before it, newest first.
days = [today_date - datetime.timedelta(days=offset) for offset in range(15)]

In [6]:
import pandas as pd

In [7]:
# Single-column frame holding the generated timestamps.
data = pd.DataFrame(days, columns=['Day'])

In [8]:
data.head()

Unnamed: 0,Day
0,2021-05-20 14:14:51.642059
1,2021-05-19 14:14:51.642059
2,2021-05-18 14:14:51.642059
3,2021-05-17 14:14:51.642059
4,2021-05-16 14:14:51.642059


In [9]:
# Derive the weekday name of every timestamp through the .dt accessor.
data.loc[:, 'weekday'] = data['Day'].dt.day_name()

In [10]:
data.head()

Unnamed: 0,Day,weekday
0,2021-05-20 14:14:51.642059,Thursday
1,2021-05-19 14:14:51.642059,Wednesday
2,2021-05-18 14:14:51.642059,Tuesday
3,2021-05-17 14:14:51.642059,Monday
4,2021-05-16 14:14:51.642059,Sunday


In [11]:
# Ordinal codes for the days of the week: Monday=1 through Sunday=7.
dictionary = {
    day_name: position
    for position, day_name in enumerate(
        ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'),
        start=1,
    )
}

In [12]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [13]:
# Translate each weekday name into its ordinal code using the lookup dict.
data.loc[:, 'weekday_ordinal'] = data['weekday'].map(dictionary)

In [14]:
data.head()

Unnamed: 0,Day,weekday,weekday_ordinal
0,2021-05-20 14:14:51.642059,Thursday,4
1,2021-05-19 14:14:51.642059,Wednesday,3
2,2021-05-18 14:14:51.642059,Tuesday,2
3,2021-05-17 14:14:51.642059,Monday,1
4,2021-05-16 14:14:51.642059,Sunday,7


### Count or Frequency Encoding

In [15]:
# Load the UCI Adult census data. The file has no header row, so pandas numbers
# the columns 0..14. Fields are separated by ", " (comma + space);
# skipinitialspace=True drops that blank so categorical values load as
# 'State-gov' rather than ' State-gov'.
train_set = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    index_col=None,
    skipinitialspace=True,
)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [16]:
# Collect the labels of the columns stored with object dtype (the text fields).
columns = [label for label, dtype in train_set.dtypes.items() if dtype == 'O']

In [17]:
columns

[1, 3, 5, 6, 7, 8, 9, 13, 14]

In [18]:
# Keep only the object-dtype columns listed in `columns`.
train_set = train_set.loc[:, columns]

In [19]:
# Give the nine remaining columns readable names (assigned by position).
train_set.columns = [
    'Employment',
    'Degree',
    'Status',
    'Designation',
    'family_job',
    'Race',
    'Sex',
    'Country',
    'unknown',
]

In [20]:
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country,unknown
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [21]:
# Report the cardinality (number of distinct values) of every column.
for feature in train_set.columns:
    print(feature, ':', train_set[feature].nunique(dropna=False))

Employment : 9
Degree : 16
Status : 7
Designation : 15
family_job : 6
Race : 5
Sex : 2
Country : 42
unknown : 2


In [22]:
# Frequency table for Country as a plain dict: {label -> number of rows}.
country_counts = train_set['Country'].value_counts()
country_map = country_counts.to_dict()

In [23]:
# Frequency-encode Country: replace each label with its row count from country_map.
# NOTE(review): this overwrites the original labels, so the cell is not safe to
# re-run. A second run would look up the counts (not the labels) in country_map,
# presumably yielding NaN; rebuild train_set before running it again.
train_set['Country'] = train_set['Country'].map(country_map)

In [24]:
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country,unknown
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95,<=50K


#### Advantages
1. Easy to use
2. Does not increase the feature space

#### Disadvantages
1. Categories that occur with the same frequency receive the same encoded value, so the model cannot tell them apart.

## Target Guided Ordinal Encoding

1. Order the labels by the mean of the target within each label (i.e. the proportion of rows where the target is 1)
2. Replace each label with its integer rank in that ordering

In [29]:
import pandas as pd
# Read only the two columns used below: the target and the deck label.
df = pd.read_csv(
    'titanic.csv',
    usecols=['survived', 'deck'],
)

In [30]:
df.head()

Unnamed: 0,survived,deck
0,0,
1,1,C
2,1,
3,1,C
4,0,


In [31]:
# Give missing deck values an explicit 'Missing' category.
# The result is assigned back instead of calling fillna(inplace=True) on
# df['deck']: the in-place call operates on an intermediate Series, which
# pandas flags as chained assignment and which leaves df unchanged when
# Copy-on-Write is enabled.
df['deck'] = df['deck'].fillna('Missing')

In [33]:
df.head(10)

Unnamed: 0,survived,deck
0,0,Missing
1,1,C
2,1,Missing
3,1,C
4,0,Missing
5,0,Missing
6,0,E
7,0,Missing
8,1,Missing
9,1,Missing


In [36]:
# Reduce every deck label to its first character ('Missing' becomes 'M').
df['deck'] = df['deck'].astype(str).str.get(0)

In [37]:
df.head()

Unnamed: 0,survived,deck
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [38]:
df.deck.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F'], dtype=object)

In [39]:
# Mean of the survived column within each deck group.
df.groupby('deck')['survived'].mean()

deck
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299419
Name: survived, dtype: float64

In [41]:
# The same per-deck means, ordered from lowest to highest.
df.groupby('deck')['survived'].mean().sort_values(ascending=True)

deck
M    0.299419
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: survived, dtype: float64

In [43]:
# Deck labels ranked by ascending mean of the survived column.
ordinal_labels = (
    df.groupby('deck')['survived']
    .mean()
    .sort_values()
    .index
)
ordinal_labels

Index(['M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='deck')

In [47]:
# Pair each deck label with its position in the ranking (0 = lowest mean).
ordinal_labels2 = dict(zip(ordinal_labels, range(len(ordinal_labels))))

In [48]:
ordinal_labels2

{'M': 0, 'A': 1, 'G': 2, 'C': 3, 'F': 4, 'B': 5, 'E': 6, 'D': 7}

In [49]:
# Encode each deck as its rank in the target-ordered label list.
df.loc[:, 'deck_ordinal_label'] = df['deck'].map(ordinal_labels2)

In [50]:
df.head()

Unnamed: 0,survived,deck,deck_ordinal_label
0,0,M,0
1,1,C,3
2,1,M,0
3,1,C,3
4,0,M,0


## Mean Encoding

In [53]:
# Lookup table: deck label -> mean of the survived column for that deck.
mean_ordinal = (
    df.groupby('deck')['survived']
    .mean()
    .to_dict()
)

In [54]:
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29941860465116277}

In [55]:
# Mean encoding: replace each deck with the mean of survived for that deck.
df.loc[:, 'mean_ordinal_encode'] = df['deck'].map(mean_ordinal)
df.head()

Unnamed: 0,survived,deck,deck_ordinal_label,mean_ordinal_encode
0,0,M,0,0.299419
1,1,C,3,0.59322
2,1,M,0,0.299419
3,1,C,3,0.59322
4,0,M,0,0.299419
