# Target or Mean Encoding

In [1]:
import pandas as pd
from pandas.core.frame import DataFrame
import category_encoders as ce

### 1. Create a Dataframe.

In [2]:
input_data = [["alfa-romero", 13495],["alfa-romero", 16500], ["alfa-romero", 16500], ["alfa-romero", 13950],
       ["audi", 13950],["audi", 17450], ["audi", 15250], ["audi", 17710], ["audi", 18920],["audi", 23875], 
        ["bmw", 16430], ["bmw", 15690]]
df = pd.DataFrame(input_data, columns=["make", "price"])
display(df)

Unnamed: 0,make,price
0,alfa-romero,13495
1,alfa-romero,16500
2,alfa-romero,16500
3,alfa-romero,13950
4,audi,13950
5,audi,17450
6,audi,15250
7,audi,17710
8,audi,18920
9,audi,23875


### 2. Target Encoding Using Mean Technique

In [3]:
input_data = [["alfa-romero", 13495],["alfa-romero", 16500], ["alfa-romero", 16500], ["alfa-romero", 13950],
       ["audi", 13950],["audi", 17450], ["audi", 15250], ["audi", 17710], ["audi", 18920],["audi", 23875], 
        ["bmw", 16430], ["bmw", 15690]]
df = pd.DataFrame(input_data, columns=["make", "price"])


# Creating TargetEncoder object
encoder = ce.TargetEncoder(cols=['make'])

# Fit and transform the data
df_encoded = encoder.fit_transform(df['make'], df['price'])

# Adding encoded column to the DataFrame
df['encoded_column'] = df_encoded

print(df)

           make  price  encoded_column
0   alfa-romero  13495    16385.971501
1   alfa-romero  16500    16385.971501
2   alfa-romero  16500    16385.971501
3   alfa-romero  13950    16385.971501
4          audi  13950    16883.844755
5          audi  17450    16883.844755
6          audi  15250    16883.844755
7          audi  17710    16883.844755
8          audi  18920    16883.844755
9          audi  23875    16883.844755
10          bmw  16430    16560.586879
11          bmw  15690    16560.586879


In [4]:
def target_encoding(data:DataFrame, encode_column:str, target_column:str, smoothing:bool = False, smoothing_factor:float = None)-> DataFrame:
    
    grouped_df = data.groupby(encode_column)
    category_group_mean_dict = grouped_df[target_column].mean().to_dict()
    print(category_group_mean_dict)
    
    if smoothing:
        if smoothing_factor:
            if smoothing_factor >= 0 and smoothing_factor <=1:
                
                weight_dict = grouped_df.size().to_dict()
                overall_mean = data[target_column].mean()
                encode_dict = {key: (weight_dict[key]*value) + ((1-weight_dict[key])*overall_mean) 
                               for key,value in category_group_mean_dict.items()}
                print(encode_dict)
                
                data[encode_column] = data[encode_column].map(encode_dict)
                
            else:
                raise ValueError(f'The expected smoothing_factor value between 0 to 1 but {smoothing_factor} received.')
        else:
            raise ValueError('The smoothing_factor is mendetory when smoothing is True.')
    else:
        data[encode_column] = data[encode_column].map(category_group_mean_dict)
        
    return data


In [5]:
encode_df = target_encoding(df, encode_column = "make", target_column = "price", smoothing = True, smoothing_factor= 0.2)
encode_df

{'alfa-romero': 15111.25, 'audi': 17859.166666666668, 'bmw': 16060.0}
{'alfa-romero': 10515.0, 'audi': 23938.333333333343, 'bmw': 15476.666666666668}


Unnamed: 0,make,price,encoded_column
0,10515.0,13495,16385.971501
1,10515.0,16500,16385.971501
2,10515.0,16500,16385.971501
3,10515.0,13950,16385.971501
4,23938.333333,13950,16883.844755
5,23938.333333,17450,16883.844755
6,23938.333333,15250,16883.844755
7,23938.333333,17710,16883.844755
8,23938.333333,18920,16883.844755
9,23938.333333,23875,16883.844755
