# Encoding Continuous Values

A common transformation for machine learning. 

encoding a Z Score

In [1]:
import os
import pandas as pd
from scipy.stats import zscore

df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA','?'])

df['mpg'] = zscore(df['mpg'])
display(df[0:5])

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,-0.706439,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,-1.090751,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,-0.706439,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,-0.962647,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,-0.834543,8,302.0,140.0,3449,10.5,70,1,ford torino


#Encoding Categorical Values as Dummie

In [2]:
import pandas as pd

df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])

display(df[0:5])

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b
1,2,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a


In [3]:
areas = list(df['area'].unique())
print(f'Number of areas: {len(areas)}')
print(f'Areas: {areas}')

Number of areas: 4
Areas: ['c', 'd', 'a', 'b']


Number of areas: 4
Areas: ['c', 'd', 'a', 'b']

There are four unique values in the areas column. To encode these to dummy variables we would use four columns, each of which would represent one of the areas. For each row, one column would have a value of one, the rest zeros. This is why this type of encoding is sometimes called one-hot encoding. The following code shows how you might encode the values "a" through "d". The value A becomes [1,0,0,0] and the value B becomes [0,1,0,0].

In [4]:
dummies = pd.get_dummies(['a','b','c','d'],prefix='area')
print(dummies)

   area_a  area_b  area_c  area_d
0       1       0       0       0
1       0       1       0       0
2       0       0       1       0
3       0       0       0       1


merge these dummies back into the data frame.

In [6]:
dummies = pd.get_dummies(df['area'],prefix='area')
print(dummies[0:10]) # Just show the first 10

   area_a  area_b  area_c  area_d
0       0       0       1       0
1       0       0       1       0
2       0       0       1       0
3       0       0       1       0
4       0       0       0       1
5       0       0       1       0
6       0       0       0       1
7       1       0       0       0
8       0       0       1       0
9       1       0       0       0


In [0]:
df = pd.concat([df,dummies],axis=1)


In [9]:
display(df[0:10][['id','job','area','income','area_a',
                  'area_b','area_c','area_d']])

Unnamed: 0,id,job,area,income,area_a,area_a.1,area_b,area_b.1,area_c,area_c.1,area_d,area_d.1
0,1,vv,c,50876.0,0,0,0,0,1,1,0,0
1,2,kd,c,60369.0,0,0,0,0,1,1,0,0
2,3,pe,c,55126.0,0,0,0,0,1,1,0,0
3,4,11,c,51690.0,0,0,0,0,1,1,0,0
4,5,kl,d,28347.0,0,0,0,0,0,0,1,1
5,6,e2,c,70854.0,0,0,0,0,1,1,0,0
6,7,kl,d,38726.0,0,0,0,0,0,0,1,1
7,8,nb,a,55162.0,1,1,0,0,0,0,0,0
8,9,al,c,67311.0,0,0,0,0,1,1,0,0
9,10,pe,a,63344.0,1,1,0,0,0,0,0,0


Drop the area col. 

In [10]:
df.drop('area', axis=1, inplace=True)
display(df[0:10][['id','job','income','area_a',
                  'area_b','area_c','area_d']])

Unnamed: 0,id,job,income,area_a,area_a.1,area_b,area_b.1,area_c,area_c.1,area_d,area_d.1
0,1,vv,50876.0,0,0,0,0,1,1,0,0
1,2,kd,60369.0,0,0,0,0,1,1,0,0
2,3,pe,55126.0,0,0,0,0,1,1,0,0
3,4,11,51690.0,0,0,0,0,1,1,0,0
4,5,kl,28347.0,0,0,0,0,0,0,1,1
5,6,e2,70854.0,0,0,0,0,1,1,0,0
6,7,kl,38726.0,0,0,0,0,0,0,1,1
7,8,nb,55162.0,1,1,0,0,0,0,0,0
8,9,al,67311.0,0,0,0,0,1,1,0,0
9,10,pe,63344.0,1,1,0,0,0,0,0,0


# Target Encoding for Categoricals

can increase the models predictive power but increases the risk of overfitting.

Generally only used with regression

In [11]:
# Create a small sample dataset
import pandas as pd
import numpy as np

np.random.seed(43)
df = pd.DataFrame({
    'cont_9': np.random.rand(10)*100,
    'cat_0': ['dog'] * 5 + ['cat'] * 5,
    'cat_1': ['wolf'] * 9 + ['tiger'] * 1,
    'y': [1, 0, 1, 1, 1, 1, 0, 0, 0, 0]
})

display(df)

Unnamed: 0,cont_9,cat_0,cat_1,y
0,11.505457,dog,wolf,1
1,60.906654,dog,wolf,0
2,13.339096,dog,wolf,1
3,24.058962,dog,wolf,1
4,32.713906,dog,wolf,1
5,85.913749,cat,wolf,1
6,66.609021,cat,wolf,0
7,54.116221,cat,wolf,0
8,2.901382,cat,wolf,0
9,73.37483,cat,tiger,0


Rather than creating dummy variables for dog and cat, we would like to change it to a number. We could just use 0 for cat, 1 for dog. However, we can encode more information than just that. The simple 0 or 1 would also only work for one animal. Consider what the mean target value is for cat and dog.

In [12]:
means0 = df.groupby('cat_0')['y'].mean().to_dict()
means0

{'cat': 0.2, 'dog': 0.8}

The danger is that we are now using the target value for training. This will potentially overfit. The possibility of overfitting is even greater if there are a small number of a particular category. To prevent this from happening, we use a weighting factor. The stronger the weight the more than categories with a small number of values will tend towards the overall average of y, which is calculated as follows

In [13]:
df['y'].mean()

0.5

In [0]:
# Source: https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
def calc_smooth_mean(df1, df2, cat_name, target, weight):
    # Compute the global mean
    mean = df[target].mean()

    # Compute the number of values and the mean of each group
    agg = df.groupby(cat_name)[target].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + weight * mean) / (counts + weight)

    # Replace each value by the according smoothed mean
    if df2 is None:
        return df1[cat_name].map(smooth)
    else:
        return df1[cat_name].map(smooth),df2[cat_name].map(smooth.to_dict())

In [0]:
WEIGHT = 5
df['cat_0_enc'] = calc_smooth_mean(df1=df, df2=None, cat_name='cat_0', target='y', weight=WEIGHT)
df['cat_1_enc'] = calc_smooth_mean(df1=df, df2=None, cat_name='cat_1', target='y', weight=WEIGHT)

In [16]:
display(df)


Unnamed: 0,cont_9,cat_0,cat_1,y,cat_0_enc,cat_1_enc
0,11.505457,dog,wolf,1,0.65,0.535714
1,60.906654,dog,wolf,0,0.65,0.535714
2,13.339096,dog,wolf,1,0.65,0.535714
3,24.058962,dog,wolf,1,0.65,0.535714
4,32.713906,dog,wolf,1,0.65,0.535714
5,85.913749,cat,wolf,1,0.35,0.535714
6,66.609021,cat,wolf,0,0.35,0.535714
7,54.116221,cat,wolf,0,0.35,0.535714
8,2.901382,cat,wolf,0,0.35,0.535714
9,73.37483,cat,tiger,0,0.35,0.416667
