#### Process of converting categorical data into numerical data that can be easily processed by Machine Learning

### Types of Categorical Variables: 

    1. Nominal 

    2. Ordinal (Ordered) 

### Methods of Encoding 

<style>
    table {
        border-collapse: collapse;
        width: 100%;
        max-width: 800px;
        margin: 0 auto;
    }

    th, td {
        padding: 10px;
        text-align: left;
        vertical-align: top;
        border: 1px solid #ddd;
    }

    th {
        background-color: #f2f2f2;
    }

    .number-column {
        width: 30px;
        text-align: right;
        padding-right: 10px;
    }

    @media (max-width: 767px) {
        table {
            display: block;
        }
        th, td {
            display: block;
            width: 100%;
        }
        .number-column {
            width: 100%;
            text-align: left;
            padding-right: 0;
        }
    }
</style>

<h1>List of Encoding Techniques</h1>
<table>
    <tr>
        <th class="number-column">1.</th>
        <td style="text-align:left;color:green;">One Hot Encoding</td>
        <th class="number-column">9.</th>
        <td style="text-align:left;color:green;">Frequency Encoding</td>
    </tr>
    <tr>
        <th class="number-column">2.</th>
        <td style="text-align:left;color:green;">Target Guided Ordinal Encoding</td>
        <th class="number-column">10.</th>
        <td style="text-align:left;">Backward Difference Encoding</td>
    </tr>
    <tr>
        <th class="number-column">3.</th>
        <td style="text-align:left;color:green;">Label Encoding</td>
        <th class="number-column">11.</th>
        <td style="text-align:left;color:green;">Mean Encoding</td>
    </tr>
    <tr>
        <th class="number-column">4.</th>
        <td style="text-align:left;color:green;">Probability Ratio Encoding</td>
        <th class="number-column">12.</th>
        <td style="text-align:left;">Weight of Evidence Encoding</td>
    </tr>
    <tr>
        <th class="number-column">5.</th>
        <td style="text-align:left;color:green;">Ordinal Encoding</td>
        <th class="number-column">13.</th>
        <td style="text-align:left;">Leave One Out Encoding</td>
    </tr>
    <tr>
        <th class="number-column">6.</th>
        <td style="text-align:left;">Helmert Encoding</td>
        <th class="number-column">14.</th>
        <td style="text-align:left;">James-Stein Encoding</td>
    </tr>
    <tr>
        <th class="number-column">7.</th>
        <td style="text-align:left;">Hashing Encoding</td>
        <th class="number-column">15.</th>
        <td style="text-align:left;">M-estimator Encoding (updated)</td>
    </tr>
    <tr>
        <th class="number-column">8.</th>
        <td style="text-align:left;">Binary Encoding</td>
        <th class="number-column">16.</th>
        <td style="text-align:left;">Thermometer Encoder (updated)</td>
    </tr>
</table>


In [1]:
# https://www.kaggle.com/code/aditya1702/mercedes-benz-data-exploration/input
import pandas as pd
import numpy as np
data = pd.read_csv('Dataset/benz.csv',usecols=['X1','X2','X3','X4','X5','X6'])
data.head(3)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,n,f,d,t,a
1,b,ai,a,d,b,g
2,v,as,f,d,a,j


In [None]:
# checking number of unique categories in each column
for col in data.columns:
    print(col,":", len(data[col].unique()),' labels')

## 1. One Hot Encoding

In [None]:
# using pandas
df = pd.get_dummies(data,drop_first=True)
df.shape

In [None]:
df.head(2)

In [None]:
# using sklearn
from sklearn.preprocessing import OneHotEncoder
ohc = OneHotEncoder()


## Note:
in case of larger number of categories like 500 the number of feature will increase and will result in Curese of Dimensionality and may impact accuracy level

 ### From KDD cup orange challenge problem technique
    took 10 most frequent categoreis from each features and left the remaning feature and performed one hot encoding on selected 10 categories

In [None]:
# taking the top 10 most frequent categories for the variable X2
data.X2.value_counts().sort_values(ascending=False).head(20)

In [None]:
top_10 = [x for x in data.X2.value_counts().sort_values(ascending=False).head(10).index]
top_10

In [None]:
# making the 10 binary variables 
for label in top_10:
    data[label] = np.where(data['X2']==label,1,0)
data[['X2']+top_10].head(10)

## 2. Target Guided Ordinal Encoding
    1. Ordering the labels according to the target
    2. Replace the labels by the joint probablity of being 1 or 0  in classification probablem

In [18]:
import pandas as pd

# Create sample data
data = {'color': ['red', 'blue', 'green', 'red', 'green', 'blue'],
        'price': [100, 150, 120, 80, 90, 140]}
df = pd.DataFrame(data)

# Calculate mean price for each color
means = df.groupby('color')['price'].mean()

# Create mapping dictionary
mapping = {}
for i, val in enumerate(sorted(df['color'].unique(),reverse = False)):
    mapping[val] = i+1

# Apply mapping to the 'color' column
df['color_encoded'] = df['color'].map(mapping)

# Print the results
print(df)

   color  price  color_encoded
0    red    100              3
1   blue    150              1
2  green    120              2
3    red     80              3
4  green     90              2
5   blue    140              1


In [19]:
mapping = {}
for i, val in enumerate(sorted(df['color'].unique(),reverse = False)):
    mapping[val] = i+1
mapping

{'blue': 1, 'green': 2, 'red': 3}

## 3. Label Encoding

In [9]:
import pandas as pd
import numpy as np
data = pd.read_csv('Dataset/benz.csv',usecols=['X1','X2','X3','X4','X5','X6'])
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,n,f,d,t,a
1,b,ai,a,d,b,g
2,v,as,f,d,a,j
3,l,n,f,d,z,l
4,s,as,c,d,y,i


In [11]:
da=data
da

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,n,f,d,t,a
1,b,ai,a,d,b,g
2,v,as,f,d,a,j
3,l,n,f,d,z,l
4,s,as,c,d,y,i
...,...,...,...,...,...,...
4204,h,as,f,d,aa,j
4205,aa,ai,d,d,aa,j
4206,v,as,f,d,aa,d
4207,v,as,a,d,aa,c


In [16]:
from sklearn.preprocessing import LabelEncoder
df= data
df['X1_label'] = LabelEncoder().fit_transform(data.X1)
df.head(5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_label
0,v,n,f,d,t,a,23
1,b,ai,a,d,b,g,3
2,v,as,f,d,a,j,23
3,l,n,f,d,z,l,13
4,s,as,c,d,y,i,20


## 4. Mean Encoding

In [33]:
import pandas as pd

# Create sample data
data = {'color': ['red', 'blue', 'green', 'red', 'green', 'blue','red', 'blue', 'green','green', 'red', 'green', 'blue','red'],
        'price': [100, 150, 120, 80, 90, 140, 80, 90, 140,90, 140, 80, 90, 140]}
df = pd.DataFrame(data)


In [34]:
mean_encode = df.groupby('color')['price'].mean()
mean_encode
df.loc[:, 'mean'] = df['color'].map(mean_encode)
df

Unnamed: 0,color,price,mean
0,red,100,108.0
1,blue,150,117.5
2,green,120,104.0
3,red,80,108.0
4,green,90,104.0
5,blue,140,117.5
6,red,80,108.0
7,blue,90,117.5
8,green,140,104.0
9,green,90,104.0


## 5. Count or Frequency Encoding

In [1]:
import pandas as pd
import numpy as np
df= pd.read_csv('Dataset/benz.csv',usecols = ['X1','X2'])
df.head()

Unnamed: 0,X1,X2
0,v,n
1,b,ai
2,v,as
3,l,n
4,s,as


In [3]:
df.shape

(4209, 2)

In [2]:
# if use OHE
pd.get_dummies(df).shape

(4209, 72)

In [None]:
### from 2 features to it inc it upto 72, so not effective

In [5]:
# total labels
df.nunique()

X1    27
X2    45
dtype: int64

In [6]:
df_freq_map = df.X2.value_counts().to_dict()
df_freq_map

{'as': 1658,
 'ae': 478,
 'ai': 462,
 'm': 348,
 'ak': 260,
 'r': 155,
 'n': 113,
 's': 100,
 'f': 85,
 'e': 84,
 'ay': 78,
 'aq': 72,
 'a': 44,
 'b': 38,
 'k': 25,
 't': 25,
 'ag': 23,
 'ac': 20,
 'ao': 19,
 'i': 15,
 'z': 12,
 'ap': 11,
 'p': 10,
 'aw': 9,
 'h': 6,
 'd': 6,
 'g': 5,
 'q': 5,
 'au': 5,
 'al': 4,
 'ad': 4,
 'af': 4,
 'ab': 4,
 'ah': 3,
 'am': 3,
 'w': 3,
 'at': 3,
 'j': 2,
 'x': 2,
 'av': 1,
 'ax': 1,
 'y': 1,
 'aj': 1,
 'an': 1,
 'u': 1}

In [7]:
df.X2 = df.X2.map(df_freq_map)
df.head()

Unnamed: 0,X1,X2
0,v,113
1,b,462
2,v,1658
3,l,113
4,s,1658


## 6. Ordinal Encoding

In [5]:
# Create a sample dataset
data = ['small', 'medium', 'large', 'small', 'large', 'medium']

# Create a dictionary to map categories to numbers
mapping = {'small': 0, 'medium': 1, 'large': 2}

# Use the dictionary to encode the data
encoded_data = [mapping[item] for item in data]

print(encoded_data)



[0, 1, 2, 0, 2, 1]


# another example

In [2]:
import pandas as pd
import datetime

In [3]:
# create a variable with dates, and from that extract the weekday
# I create a list of dates with 20 days difference from today
# and then transform it into a datafame

df_base = datetime.datetime.today()
df_date_list = [df_base - datetime.timedelta(days=x) for x in range(0, 20)]
df = pd.DataFrame(df_date_list)
df.columns = ['day']
df

Unnamed: 0,day
0,2023-04-21 17:15:37.919696
1,2023-04-20 17:15:37.919696
2,2023-04-19 17:15:37.919696
3,2023-04-18 17:15:37.919696
4,2023-04-17 17:15:37.919696
5,2023-04-16 17:15:37.919696
6,2023-04-15 17:15:37.919696
7,2023-04-14 17:15:37.919696
8,2023-04-13 17:15:37.919696
9,2023-04-12 17:15:37.919696


In [22]:
# extract the week day name

df['day_of_week'] = df['day'].dt.day_name()
df.head()

Unnamed: 0,day,day_of_week
0,2023-04-21 17:15:37.919696,Friday
1,2023-04-20 17:15:37.919696,Thursday
2,2023-04-19 17:15:37.919696,Wednesday
3,2023-04-18 17:15:37.919696,Tuesday
4,2023-04-17 17:15:37.919696,Monday


In [23]:
# Engineer categorical variable by ordinal number replacement

weekday_map = {'Monday':1,
               'Tuesday':2,
               'Wednesday':3,
               'Thursday':4,
               'Friday':5,
               'Saturday':6,
               'Sunday':7
}

df['day_ordinal'] = df.day_of_week.map(weekday_map)
df.head(20)

Unnamed: 0,day,day_of_week,day_ordinal
0,2023-04-21 17:15:37.919696,Friday,5
1,2023-04-20 17:15:37.919696,Thursday,4
2,2023-04-19 17:15:37.919696,Wednesday,3
3,2023-04-18 17:15:37.919696,Tuesday,2
4,2023-04-17 17:15:37.919696,Monday,1
5,2023-04-16 17:15:37.919696,Sunday,7
6,2023-04-15 17:15:37.919696,Saturday,6
7,2023-04-14 17:15:37.919696,Friday,5
8,2023-04-13 17:15:37.919696,Thursday,4
9,2023-04-12 17:15:37.919696,Wednesday,3


## 7. Probablity Ratio Encoding

In [44]:
import pandas as pd

In [45]:
df = pd.read_csv('Dataset/titanic_train.csv', usecols = ['Cabin', 'Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [46]:
# replacing NaN
df['Cabin'].fillna('Missing', inplace =  True)
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [47]:
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [48]:
df['Cabin'] = df['Cabin'].astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [49]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [50]:
# prob ratio encoding
prob_df = df.groupby(['Cabin'])['Survived'].mean()
prob_df 

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [51]:
prob_df = pd.DataFrame(prob_df)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [52]:
prob_df['died'] = 1 - prob_df['Survived']
prob_df

Unnamed: 0_level_0,Survived,died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [53]:
## probablity ratio

In [54]:
prob_df['Probablity ratio'] = prob_df['Survived']/prob_df['died']
prob_df.head()

Unnamed: 0_level_0,Survived,died,Probablity ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [55]:
probablity_encoded=prob_df['Probablity ratio'].to_dict()
probablity_encoded

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [56]:
df['Cabin_Encoded'] = df['Cabin'].map(probablity_encoded)

In [57]:
df.head()

Unnamed: 0,Survived,Cabin,Cabin_Encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274


### example 2 

In [36]:
import pandas as pd

# create a sample dataframe
data = {'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Houston', 'New York']}
df = pd.DataFrame(data)

# calculate the probability of each city
prob_city = df['city'].value_counts(normalize=True)

# set the reference city
ref_city = 'Houston'

# calculate the probability ratio of each city with respect to the reference city
prob_ratio = prob_city / prob_city[ref_city]

# replace the original categorical variable with the corresponding probability ratios
df['city_encoded'] = df['city'].map(prob_ratio)

print(df)


          city  city_encoded
0     New York           1.0
1  Los Angeles           0.5
2      Chicago           0.5
3      Houston           1.0
4      Houston           1.0
5     New York           1.0
