### Encoding Continuous Values

In [21]:
import os
import pandas as pd
from scipy.stats import zscore

# Fetch the Auto MPG CSV; the tokens 'NA' and '?' are parsed as missing values.
auto_mpg_url = "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv"
missing_markers = ['NA', '?']
df = pd.read_csv(auto_mpg_url, na_values=missing_markers)

In [22]:
# Peek at the first five rows of the freshly loaded frame.
df.iloc[:5]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


One very common normalization technique in machine learning is the Z-Score:

$z = \frac{x - \mu}{\sigma} $

To calculate the Z-Score, you also need to calculate the mean ($\mu$) and the standard deviation ($\sigma$). The mean is calculated as follows:

$\mu = \bar{x} = \frac{x_1+x_2+\cdots +x_n}{n}$

The standard deviation is calculated as follows:

$\sigma = \sqrt{\frac{1}{n} \sum_{i=1}^n (x_i - \mu)^2}, {\rm \ \ where\ \ } \mu = \frac{1}{n} \sum_{i=1}^n x_i$


In [23]:
# Replace the raw mpg values with their Z-Scores. scipy's zscore uses the
# population standard deviation (ddof=0) by default. Then preview five rows.
mpg_z = zscore(df['mpg'])
df['mpg'] = mpg_z
display(df.iloc[:5])

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,-0.706439,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,-1.090751,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,-0.706439,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,-0.962647,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,-0.834543,8,302.0,140.0,3449,10.5,70,1,ford torino


### Encoding Categorical Values as Dummies

In [24]:
import pandas as pd

# Load the simple demo dataset ('NA' and '?' are parsed as missing values)
# and preview its first five rows.
simple_dataset_url = "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv"
df = pd.read_csv(simple_dataset_url, na_values=['NA', '?'])

display(df.iloc[:5])

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b
1,2,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a


In [25]:
# Collect the distinct values of the 'area' column into a plain Python list.
areas = [*df['area'].unique()]

In [26]:
# Show the list of distinct area codes.
areas

['c', 'd', 'a', 'b']

In [27]:
# Demonstrate one-hot encoding on the four area codes: each value gets its
# own indicator column whose name carries the 'area_' prefix.
# dtype=int pins numeric 0/1 indicators; without it the result dtype depends
# on the pandas version (pandas >= 2.0 defaults to boolean True/False).
dummies = pd.get_dummies(['a', 'b', 'c', 'd'], prefix='area', dtype=int)

In [28]:
# Inspect the indicator columns produced for the four area codes.
dummies

Unnamed: 0,area_a,area_b,area_c,area_d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1


In [29]:
# One-hot encode the dataset's 'area' column into 'area_<value>' columns.
# dtype=int keeps the indicators numeric (0/1) on every pandas version;
# pandas >= 2.0 would otherwise return boolean columns.
dummies = pd.get_dummies(df['area'], prefix='area', dtype=int)

In [30]:
# Append the indicator columns to the right of the existing columns;
# rows are matched on the index that both frames share.
df = pd.concat((df, dummies), axis='columns')

In [31]:
# Preview the first ten rows, now including the one-hot area columns.
df.iloc[:10]

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product,area_a,area_b,area_c,area_d
0,1,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b,0,0,1,0
1,2,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c,0,0,1,0
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b,0,0,1,0
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b,0,0,1,0
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a,0,0,0,1
5,6,e2,c,70854.0,40.4,1,14.893343,87,20.340593,43,0.866142,0.673228,0.473581,d,0,0,1,0
6,7,kl,d,38726.0,30.975,3,3.822477,33,9.480399,39,0.976378,0.874016,0.092151,f,0,0,0,1
7,8,nb,a,55162.0,26.966667,2,4.312097,17,29.219896,44,1.0,0.724409,0.162833,b,1,0,0,0
8,9,al,c,67311.0,32.383333,0,25.093772,169,10.927357,45,0.952756,0.681102,0.096333,c,0,0,1,0
9,10,pe,a,63344.0,38.233333,1,2.816034,3,21.915695,42,0.897638,0.724409,0.173986,c,1,0,0,0


In [32]:
# List every column name as a plain Python list.
df.columns.tolist()

['id',
 'job',
 'area',
 'income',
 'aspect',
 'subscriptions',
 'dist_healthy',
 'save_rate',
 'dist_unhealthy',
 'age',
 'pop_dense',
 'retail_dense',
 'crime',
 'product',
 'area_a',
 'area_b',
 'area_c',
 'area_d']

In [35]:
# Preview the first ten rows, restricted to a few original columns plus
# the one-hot columns derived from 'area', so the encoding can be compared
# against the original category side by side.
preview_columns = [
    'id', 'job', 'area', 'income',
    'area_a', 'area_b', 'area_c', 'area_d',
]
df[preview_columns].head(10)

Unnamed: 0,id,job,area,income,area_a,area_b,area_c,area_d
0,1,vv,c,50876.0,0,0,1,0
1,2,kd,c,60369.0,0,0,1,0
2,3,pe,c,55126.0,0,0,1,0
3,4,11,c,51690.0,0,0,1,0
4,5,kl,d,28347.0,0,0,0,1
5,6,e2,c,70854.0,0,0,1,0
6,7,kl,d,38726.0,0,0,0,1
7,8,nb,a,55162.0,1,0,0,0
8,9,al,c,67311.0,0,0,1,0
9,10,pe,a,63344.0,1,0,0,0


In [37]:
# Remove the original categorical column now that its one-hot columns exist.
# The result is rebound instead of using inplace=True: in-place mutation has
# no performance benefit, prevents chaining, and silently changes a frame
# that earlier cells have already displayed.
df = df.drop(columns=['area'])

In [38]:
# Preview the first ten rows after the 'area' column has been removed.
df.iloc[:10]

Unnamed: 0,id,job,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product,area_a,area_b,area_c,area_d
0,1,vv,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b,0,0,1,0
1,2,kd,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c,0,0,1,0
2,3,pe,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b,0,0,1,0
3,4,11,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b,0,0,1,0
4,5,kl,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a,0,0,0,1
5,6,e2,70854.0,40.4,1,14.893343,87,20.340593,43,0.866142,0.673228,0.473581,d,0,0,1,0
6,7,kl,38726.0,30.975,3,3.822477,33,9.480399,39,0.976378,0.874016,0.092151,f,0,0,0,1
7,8,nb,55162.0,26.966667,2,4.312097,17,29.219896,44,1.0,0.724409,0.162833,b,1,0,0,0
8,9,al,67311.0,32.383333,0,25.093772,169,10.927357,45,0.952756,0.681102,0.096333,c,0,0,1,0
9,10,pe,63344.0,38.233333,1,2.816034,3,21.915695,42,0.897638,0.724409,0.173986,c,1,0,0,0


### Target Encoding for Categoricals

In [41]:
# creating a small sample dataset

import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the random 'cont_9' feature is reproducible.
np.random.seed(43)

# Ten-row toy frame: one continuous feature, two categorical features,
# and a binary target 'y'.
cont_9_values = 100 * np.random.rand(10)
cat_0_values = ['dog'] * 5 + ['cat'] * 5
cat_1_values = ['wolf'] * 9 + ['tiger']
targets = [1, 0, 1, 1, 1, 1, 0, 0, 0, 0]

df = pd.DataFrame({
    'cont_9': cont_9_values,
    'cat_0': cat_0_values,
    'cat_1': cat_1_values,
    'y': targets,
})

# Render the full ten-row sample frame.
display(df)

Unnamed: 0,cont_9,cat_0,cat_1,y
0,11.505457,dog,wolf,1
1,60.906654,dog,wolf,0
2,13.339096,dog,wolf,1
3,24.058962,dog,wolf,1
4,32.713906,dog,wolf,1
5,85.913749,cat,wolf,1
6,66.609021,cat,wolf,0
7,54.116221,cat,wolf,0
8,2.901382,cat,wolf,0
9,73.37483,cat,tiger,0
