### Ordinal Number Encoding

In [1]:
import datetime

In [4]:
today_date = datetime.datetime.today()

In [5]:
today_date

datetime.datetime(2023, 7, 19, 13, 38, 37, 492344)

In [6]:
today_date - datetime.timedelta(3)

datetime.datetime(2023, 7, 16, 13, 38, 37, 492344)

#### List Comprehension

In [7]:
days = [today_date-datetime.timedelta(x) for x in range(0, 15)]

In [8]:
import pandas as pd
data = pd.DataFrame(days)
data.columns = ["Day"]

In [9]:
data.head()

Unnamed: 0,Day
0,2023-07-19 13:38:37.492344
1,2023-07-18 13:38:37.492344
2,2023-07-17 13:38:37.492344
3,2023-07-16 13:38:37.492344
4,2023-07-15 13:38:37.492344


In [13]:
data['weekday']=data['Day'].dt.day_name()
data.head()

Unnamed: 0,Day,weekday
0,2023-07-19 13:38:37.492344,Wednesday
1,2023-07-18 13:38:37.492344,Tuesday
2,2023-07-17 13:38:37.492344,Monday
3,2023-07-16 13:38:37.492344,Sunday
4,2023-07-15 13:38:37.492344,Saturday


In [14]:
dictionary = {'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5, 'Saturday':6, 'Sunday': 7}

In [15]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [17]:
data['weekday_ordinal'] = data['weekday'].map(dictionary)
data

Unnamed: 0,Day,weekday,weekday_ordinal
0,2023-07-19 13:38:37.492344,Wednesday,3
1,2023-07-18 13:38:37.492344,Tuesday,2
2,2023-07-17 13:38:37.492344,Monday,1
3,2023-07-16 13:38:37.492344,Sunday,7
4,2023-07-15 13:38:37.492344,Saturday,6
5,2023-07-14 13:38:37.492344,Friday,5
6,2023-07-13 13:38:37.492344,Thursday,4
7,2023-07-12 13:38:37.492344,Wednesday,3
8,2023-07-11 13:38:37.492344,Tuesday,2
9,2023-07-10 13:38:37.492344,Monday,1


### Count or Frequency Encoding

In [18]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)
train_set.head() 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [19]:
columns = [1, 3, 5, 6, 7, 8, 9, 13]

In [20]:
train_set = train_set[columns]

In [21]:
train_set.columns = ['Employment', 'Degree', 'Status', 'Designations', 'Family Job', 'Race', 'Gender', 'Country']
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designations,Family Job,Race,Gender,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [22]:
for feature in train_set.columns[:]:
    print(feature, ":", len(train_set[feature].unique()), 'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designations : 15 labels
Family Job : 6 labels
Race : 5 labels
Gender : 2 labels
Country : 42 labels


In [24]:
country_map = train_set['Country'].value_counts().to_dict()

In [26]:
train_set['Country'] = train_set['Country'].map(country_map)
train_set.head(15)

Unnamed: 0,Employment,Degree,Status,Designations,Family Job,Race,Gender,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,


#### Advantages
1. Easy To Use
2. Not increasing feature space

#### Disadvantages
1. It will provide same weight if the frequencies are same

### Target Guided Ordinal Encoding
1. Ordering the labels according to the target
2. Replace the labels by the joint probability of being 1 or 0

In [29]:
import pandas as pd
df = pd.read_csv('titanic.csv', usecols = ['Cabin', 'Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [30]:
df['Cabin'].fillna('Missing', inplace = True)

In [31]:
df['Cabin'] = df['Cabin'].astype(str).str[0]

In [32]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [33]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [34]:
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [35]:
df.groupby(['Cabin'])['Survived'].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [37]:
ordinal_labels = df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [38]:
enumerate(ordinal_labels, 0)

<enumerate at 0x24ffa5d3f40>

In [39]:
ordinal_labels2 = {k:i for i,k in enumerate(ordinal_labels, 0)}
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [40]:
df['Cabin_ordinal_labels'] = df['Cabin'].map(ordinal_labels2)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### Mean Encoding

In [41]:
mean_ordinal = df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [42]:
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [43]:
df['mean_ordinal_encode'] = df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
