-----------------------
#### Understanding Frequency Encoding

Frequency encoding involves replacing categorical values with their respective frequencies in the dataset. 

Instead of assigning arbitrary integer labels or creating one-hot vectors, this approach represents each category by how often it occurs, so the encoded value captures the frequency distribution of the categories. 

The rationale is that how common a category is can itself be a useful signal for the model. Note that categories which occur equally often receive the same encoded value and can no longer be told apart.

-------------------------------

In [1]:
import pandas as pd

In [2]:
# Toy dataset: nine observations of a single categorical feature, 'Category'
data = dict(Category=['A', 'B', 'A', 'C', 'B', 'A', 'A', 'C', 'C'])

df = pd.DataFrame(data=data)

In [3]:
# Display the frame to confirm the 'Category' column holds the nine values defined above
df

Unnamed: 0,Category
0,A
1,B
2,A
3,C
4,B
5,A
6,A
7,C
8,C


In [4]:
# Absolute number of rows per category
df['Category'].value_counts()

A    4
C    3
B    2
Name: Category, dtype: int64

In [5]:
# normalize=True turns the counts into proportions of the total (e.g. A: 4 of 9 rows = 0.444444)
df['Category'].value_counts(normalize=True)

A    0.444444
C    0.333333
B    0.222222
Name: Category, dtype: float64

In [6]:
# Build the lookup table {category -> share of rows holding that category}
category_shares = df['Category'].value_counts(normalize=True)
frequency_map = category_shares.to_dict()

In [7]:
# Map the frequencies to the original DataFrame:
# each row's category is looked up in frequency_map and the resulting
# proportion is stored in a new column next to the original labels.
df['Category_FrequencyEncoded'] = df['Category'].map(frequency_map)

In [8]:
# Each row now carries its category's share: A -> 0.444444, B -> 0.222222, C -> 0.333333
df

Unnamed: 0,Category,Category_FrequencyEncoded
0,A,0.444444
1,B,0.222222
2,A,0.444444
3,C,0.333333
4,B,0.222222
5,A,0.444444
6,A,0.444444
7,C,0.333333
8,C,0.333333


#### Example - 02

In [9]:
# Toy dataset with three categorical features, nine observations each
data = dict(
    Category_A=['A', 'B', 'A', 'C', 'B', 'A', 'A', 'C', 'C'],
    Category_B=['X', 'Y', 'X', 'Z', 'Y', 'Z', 'X', 'Y', 'Z'],
    Category_C=['High', 'Low', 'Medium', 'High', 'High', 'Low', 'Medium', 'Medium', 'Low'],
)

In [10]:
# Rebinds `df` to a new frame built from the three-column dict;
# the frame from the first example is no longer reachable through `df`.
df = pd.DataFrame(data)

In [11]:
# Inspect the raw frame before any encoding is applied
df

Unnamed: 0,Category_A,Category_B,Category_C
0,A,X,High
1,B,Y,Low
2,A,X,Medium
3,C,Z,High
4,B,Y,High
5,A,Z,Low
6,A,X,Medium
7,C,Y,Medium
8,C,Z,Low


In [12]:
# Frequency encoding for each categorical column.
# select_dtypes hands back a separate frame, so the *_FrequencyEncoded columns
# added to `df` inside the loop are not iterated over.
# NOTE(review): include='object' targets object-dtype string columns; confirm the
# selection on pandas versions that infer a dedicated string dtype.
for name, values in df.select_dtypes(include='object').items():
    # Share of rows occupied by each distinct value of this column
    shares = values.value_counts(normalize=True).to_dict()
    df[f'{name}_FrequencyEncoded'] = values.map(shares)

In [13]:
# Category_B and Category_C encode to 0.333333 on every row because each of their
# values occurs exactly 3 times out of 9 -- categories with equal frequency become
# indistinguishable after frequency encoding.
df

Unnamed: 0,Category_A,Category_B,Category_C,Category_A_FrequencyEncoded,Category_B_FrequencyEncoded,Category_C_FrequencyEncoded
0,A,X,High,0.444444,0.333333,0.333333
1,B,Y,Low,0.222222,0.333333,0.333333
2,A,X,Medium,0.444444,0.333333,0.333333
3,C,Z,High,0.333333,0.333333,0.333333
4,B,Y,High,0.222222,0.333333,0.333333
5,A,Z,Low,0.444444,0.333333,0.333333
6,A,X,Medium,0.444444,0.333333,0.333333
7,C,Y,Medium,0.333333,0.333333,0.333333
8,C,Z,Low,0.333333,0.333333,0.333333


#### Example - 03
- Apply frequency encoding to one categorical column and one-hot encoding (OHE) to the other categorical column.

In [14]:
# Toy dataset: one column to frequency-encode, one to one-hot encode, one numeric
data = dict(
    Category_Freq=['A', 'B', 'A', 'C', 'B', 'A', 'A', 'C', 'C'],
    Category_OHE=['X', 'Y', 'X', 'Z', 'Y', 'Z', 'X', 'Y', 'Z'],
    Numeric_Value=[10, 20, 15, 25, 18, 22, 30, 17, 28],
)

In [15]:
# Rebinds `df` again, this time to the frame with two categorical columns and one numeric column
df = pd.DataFrame(data)

In [16]:
# Inspect the raw frame before any encoding is applied
df

Unnamed: 0,Category_Freq,Category_OHE,Numeric_Value
0,A,X,10
1,B,Y,20
2,A,X,15
3,C,Z,25
4,B,Y,18
5,A,Z,22
6,A,X,30
7,C,Y,17
8,C,Z,28


In [17]:
# Frequency encoding for 'Category_Freq':
# compute each category's share of the rows, then look it up row by row.
source_col = df['Category_Freq']
freq_encoding_map = source_col.value_counts(normalize=True).to_dict()
df['Category_Freq_Encoded'] = source_col.map(freq_encoding_map)

In [18]:
# 'Category_Freq_Encoded' now holds the share of rows for each row's 'Category_Freq' value
df

Unnamed: 0,Category_Freq,Category_OHE,Numeric_Value,Category_Freq_Encoded
0,A,X,10,0.444444
1,B,Y,20,0.222222
2,A,X,15,0.444444
3,C,Z,25,0.333333
4,B,Y,18,0.222222
5,A,Z,22,0.444444
6,A,X,30,0.444444
7,C,Y,17,0.333333
8,C,Z,28,0.333333


In [19]:
# One-hot encoding for 'Category_OHE'
# dtype=int pins the indicator columns to 0/1 integers. Without it the result
# dtype depends on the pandas version (bool from 2.0 onwards), so the frame
# would display True/False instead of 0/1.
df_ohe = pd.get_dummies(df['Category_OHE'], prefix='Category_OHE', dtype=int)


In [20]:
# Concatenate the original DataFrame and the one-hot encoded DataFrame.
# Any indicator columns left in `df` by a previous run of this cell are dropped
# first (errors='ignore' makes this a no-op on the first run), so re-running
# the cell replaces them instead of appending duplicate column names.
df = pd.concat([df.drop(columns=df_ohe.columns, errors='ignore'), df_ohe], axis=1)

In [21]:
# Final frame: original columns, the frequency-encoded column, and one indicator column per 'Category_OHE' value
df

Unnamed: 0,Category_Freq,Category_OHE,Numeric_Value,Category_Freq_Encoded,Category_OHE_X,Category_OHE_Y,Category_OHE_Z
0,A,X,10,0.444444,1,0,0
1,B,Y,20,0.222222,0,1,0
2,A,X,15,0.444444,1,0,0
3,C,Z,25,0.333333,0,0,1
4,B,Y,18,0.222222,0,1,0
5,A,Z,22,0.444444,0,0,1
6,A,X,30,0.444444,1,0,0
7,C,Y,17,0.333333,0,1,0
8,C,Z,28,0.333333,0,0,1
