#### Process of converting categorical data into numerical data that can be easily processed by Machine Learning

### Types of Categorical Variables: 

    1. Nominal 

    2. Ordinal (Ordered) 

### Methods of Encoding 

<style>
    table {
        border-collapse: collapse;
        width: 100%;
        max-width: 800px;
        margin: 0 auto;
    }

    th, td {
        padding: 10px;
        text-align: left;
        vertical-align: top;
        border: 1px solid #ddd;
    }

    th {
        background-color: #f2f2f2;
    }

    .number-column {
        width: 30px;
        text-align: right;
        padding-right: 10px;
    }

    @media (max-width: 767px) {
        table {
            display: block;
        }
        th, td {
            display: block;
            width: 100%;
        }
        .number-column {
            width: 100%;
            text-align: left;
            padding-right: 0;
        }
    }
</style>

<h1>List of Encoding Techniques</h1>
<table>
    <tr>
        <th class="number-column">1.</th>
        <td style="text-align:left;color:green;">One Hot Encoding</td>
        <th class="number-column">9.</th>
        <td style="text-align:left;color:green;">Frequency Encoding</td>
    </tr>
    <tr>
        <th class="number-column">2.</th>
        <td style="text-align:left;color:green;">Target Guided Ordinal Encoding</td>
        <th class="number-column">10.</th>
        <td style="text-align:left;">Backward Difference Encoding</td>
    </tr>
    <tr>
        <th class="number-column">3.</th>
        <td style="text-align:left;color:green;">Label Encoding</td>
        <th class="number-column">11.</th>
        <td style="text-align:left;color:green;">Mean Encoding</td>
    </tr>
    <tr>
        <th class="number-column">4.</th>
        <td style="text-align:left;">Probability Ratio Encoding</td>
        <th class="number-column">12.</th>
        <td style="text-align:left;">Weight of Evidence Encoding</td>
    </tr>
    <tr>
        <th class="number-column">5.</th>
        <td style="text-align:left;">Ordinal Encoding</td>
        <th class="number-column">13.</th>
        <td style="text-align:left;">Leave One Out Encoding</td>
    </tr>
    <tr>
        <th class="number-column">6.</th>
        <td style="text-align:left;">Helmert Encoding</td>
        <th class="number-column">14.</th>
        <td style="text-align:left;">James-Stein Encoding</td>
    </tr>
    <tr>
        <th class="number-column">7.</th>
        <td style="text-align:left;">Hashing Encoding</td>
        <th class="number-column">15.</th>
        <td style="text-align:left;">M-estimator Encoding (updated)</td>
    </tr>
    <tr>
        <th class="number-column">8.</th>
        <td style="text-align:left;">Binary Encoding</td>
        <th class="number-column">16.</th>
        <td style="text-align:left;">Thermometer Encoder (updated)</td>
    </tr>
</table>


In [1]:
# https://www.kaggle.com/code/aditya1702/mercedes-benz-data-exploration/input
import pandas as pd
import numpy as np
data = pd.read_csv('Dataset/benz.csv',usecols=['X1','X2','X3','X4','X5','X6'])
data.head(3)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,n,f,d,t,a
1,b,ai,a,d,b,g
2,v,as,f,d,a,j


In [None]:
# checking number of unique categories in each column
for col in data.columns:
    print(col,":", len(data[col].unique()),' labels')

## 1. One Hot Encoding

In [None]:
# using pandas
df = pd.get_dummies(data,drop_first=True)
df.shape

In [None]:
df.head(2)

In [None]:
# using sklearn
from sklearn.preprocessing import OneHotEncoder
ohc = OneHotEncoder()


## Note:
in case of larger number of categories like 500 the number of feature will increase and will result in Curese of Dimensionality and may impact accuracy level

 ### From KDD cup orange challenge problem technique
    took 10 most frequent categoreis from each features and left the remaning feature and performed one hot encoding on selected 10 categories

In [None]:
# taking the top 10 most frequent categories for the variable X2
data.X2.value_counts().sort_values(ascending=False).head(20)

In [None]:
top_10 = [x for x in data.X2.value_counts().sort_values(ascending=False).head(10).index]
top_10

In [None]:
# making the 10 binary variables 
for label in top_10:
    data[label] = np.where(data['X2']==label,1,0)
data[['X2']+top_10].head(10)

## 2. Target Guided Ordinal Encoding

In [18]:
import pandas as pd

# Create sample data
data = {'color': ['red', 'blue', 'green', 'red', 'green', 'blue'],
        'price': [100, 150, 120, 80, 90, 140]}
df = pd.DataFrame(data)

# Calculate mean price for each color
means = df.groupby('color')['price'].mean()

# Create mapping dictionary
mapping = {}
for i, val in enumerate(sorted(df['color'].unique(),reverse = False)):
    mapping[val] = i+1

# Apply mapping to the 'color' column
df['color_encoded'] = df['color'].map(mapping)

# Print the results
print(df)

   color  price  color_encoded
0    red    100              3
1   blue    150              1
2  green    120              2
3    red     80              3
4  green     90              2
5   blue    140              1


In [19]:
mapping = {}
for i, val in enumerate(sorted(df['color'].unique(),reverse = False)):
    mapping[val] = i+1
mapping

{'blue': 1, 'green': 2, 'red': 3}

## 3. Label Encoding

In [9]:
import pandas as pd
import numpy as np
data = pd.read_csv('Dataset/benz.csv',usecols=['X1','X2','X3','X4','X5','X6'])
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,n,f,d,t,a
1,b,ai,a,d,b,g
2,v,as,f,d,a,j
3,l,n,f,d,z,l
4,s,as,c,d,y,i


In [11]:
da=data
da

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,n,f,d,t,a
1,b,ai,a,d,b,g
2,v,as,f,d,a,j
3,l,n,f,d,z,l
4,s,as,c,d,y,i
...,...,...,...,...,...,...
4204,h,as,f,d,aa,j
4205,aa,ai,d,d,aa,j
4206,v,as,f,d,aa,d
4207,v,as,a,d,aa,c


In [16]:
from sklearn.preprocessing import LabelEncoder
df= data
df['X1_label'] = LabelEncoder().fit_transform(data.X1)
df.head(5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_label
0,v,n,f,d,t,a,23
1,b,ai,a,d,b,g,3
2,v,as,f,d,a,j,23
3,l,n,f,d,z,l,13
4,s,as,c,d,y,i,20


## 4. Mean Encoding

In [33]:
import pandas as pd

# Create sample data
data = {'color': ['red', 'blue', 'green', 'red', 'green', 'blue','red', 'blue', 'green','green', 'red', 'green', 'blue','red'],
        'price': [100, 150, 120, 80, 90, 140, 80, 90, 140,90, 140, 80, 90, 140]}
df = pd.DataFrame(data)


In [34]:
mean_encode = df.groupby('color')['price'].mean()
mean_encode
df.loc[:, 'mean'] = df['color'].map(mean_encode)
df

Unnamed: 0,color,price,mean
0,red,100,108.0
1,blue,150,117.5
2,green,120,104.0
3,red,80,108.0
4,green,90,104.0
5,blue,140,117.5
6,red,80,108.0
7,blue,90,117.5
8,green,140,104.0
9,green,90,104.0


## 5. Count or Frequency Encoding

In [1]:
import pandas as pd
import numpy as np
df= pd.read_csv('Dataset/benz.csv',usecols = ['X1','X2'])
df.head()

Unnamed: 0,X1,X2
0,v,n
1,b,ai
2,v,as
3,l,n
4,s,as


In [3]:
df.shape

(4209, 2)

In [2]:
# if use OHE
pd.get_dummies(df).shape

(4209, 72)

In [None]:
### from 2 features to it inc it upto 72, so not effective

In [5]:
# total labels
df.nunique()

X1    27
X2    45
dtype: int64

In [6]:
df_freq_map = df.X2.value_counts().to_dict()
df_freq_map

{'as': 1658,
 'ae': 478,
 'ai': 462,
 'm': 348,
 'ak': 260,
 'r': 155,
 'n': 113,
 's': 100,
 'f': 85,
 'e': 84,
 'ay': 78,
 'aq': 72,
 'a': 44,
 'b': 38,
 'k': 25,
 't': 25,
 'ag': 23,
 'ac': 20,
 'ao': 19,
 'i': 15,
 'z': 12,
 'ap': 11,
 'p': 10,
 'aw': 9,
 'h': 6,
 'd': 6,
 'g': 5,
 'q': 5,
 'au': 5,
 'al': 4,
 'ad': 4,
 'af': 4,
 'ab': 4,
 'ah': 3,
 'am': 3,
 'w': 3,
 'at': 3,
 'j': 2,
 'x': 2,
 'av': 1,
 'ax': 1,
 'y': 1,
 'aj': 1,
 'an': 1,
 'u': 1}

In [7]:
df.X2 = df.X2.map(df_freq_map)
df.head()

Unnamed: 0,X1,X2
0,v,113
1,b,462
2,v,1658
3,l,113
4,s,1658
