## Helmert Coding using `category_encoders`

### Importing modules

In [15]:
import pandas as pd
import numpy as np

from statsmodels.formula.api import ols

### Loading the dataset
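Dataset source (as cited by the author): https://www.kaggle.com/toramky/automobile-dataset — note that the cell below reads `datasets/auto-mpg.csv`; confirm that this link corresponds to that file.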

In [2]:
# Read the auto-mpg CSV; literal '?' entries are parsed as missing values.
car_data = pd.read_csv(
    'datasets/auto-mpg.csv',
    na_values='?',
)

# Preview ten randomly chosen rows (unseeded, so the rows differ on each run).
car_data.sample(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
61,21.0,4,122.0,86.0,2226,16.5,72,1,ford pinto runabout
46,22.0,4,140.0,72.0,2408,19.0,71,1,chevrolet vega (sw)
353,33.0,4,105.0,74.0,2190,14.2,81,2,volkswagen jetta
26,10.0,8,307.0,200.0,4376,15.0,70,1,chevy c20
208,13.0,8,318.0,150.0,3940,13.2,76,1,plymouth volare premier v8
148,26.0,4,116.0,75.0,2246,14.0,74,2,fiat 124 tc
27,11.0,8,318.0,210.0,4382,13.5,70,1,dodge d200
234,24.5,4,151.0,88.0,2740,16.0,77,1,pontiac sunbird coupe
31,25.0,4,113.0,95.0,2228,14.0,71,3,toyota corona
41,14.0,8,318.0,150.0,4096,13.0,71,1,plymouth fury iii


In [3]:
# Keep only the response (mpg) and the categorical predictor (cylinders).
# `.copy()` makes the subset an independent frame, so the in-place operations
# applied to `car_data` in later cells act on its own data rather than on a
# selection derived from the full table.
car_data = car_data[['mpg', 'cylinders']].copy()

In [4]:
# Drop rows with a missing mpg or cylinders value. Rebinding the result
# (instead of `inplace=True`) avoids mutating a frame that was itself derived
# by column selection, and keeps the cell safe to re-run.
car_data = car_data.dropna()

In [5]:
# (rows, columns) of the two-column frame after dropping missing values.
car_data.shape

(398, 2)

In [6]:
# Spot-check ten random rows of the reduced frame (unseeded sample).
car_data.sample(10)

Unnamed: 0,mpg,cylinders
318,29.8,4
212,16.5,8
33,19.0,6
160,17.0,6
126,21.0,6
163,18.0,6
104,12.0,8
343,39.1,4
386,25.0,6
233,29.0,4


### Listing the distinct cylinder counts (the categorical levels)

In [7]:
# Distinct cylinder counts in order of first appearance; each value is one
# level of the categorical predictor that gets Helmert-coded below.
car_data['cylinders'].unique()

array([8, 4, 6, 3, 5])

In [9]:
# Order the rows by cylinder count and renumber the index from 0.
# Written as one chained, rebinding expression rather than two `inplace=True`
# calls: the result is the same, but the cell no longer mutates a frame that
# earlier cells have already displayed.
car_data = (
    car_data
    .sort_values(by=['cylinders'])
    .reset_index(drop=True)
)

car_data.head(10)

Unnamed: 0,mpg,cylinders
0,18.0,3
1,19.0,3
2,23.7,3
3,21.5,3
4,32.3,4
5,35.1,4
6,39.0,4
7,39.1,4
8,30.0,4
9,25.8,4


In [16]:
# Column means over all rows. The mpg entry is the grand mean weighted by how
# many cars each cylinder level has -- not the same quantity as the mean of
# the per-level means computed further down.
car_data.mean()

mpg          23.514573
cylinders     5.454774
dtype: float64

In [37]:
# Average mpg for every cylinder count; the result is indexed by `cylinders`.
cylinder_groups = car_data.groupby(by=['cylinders'])
car_data_grouped = cylinder_groups.mean()

car_data_grouped

Unnamed: 0_level_0,mpg
cylinders,Unnamed: 1_level_1
3,20.55
4,29.286765
5,27.366667
6,19.985714
8,14.963107


In [38]:
# Unweighted mean of the per-cylinder group means: every level counts once,
# regardless of how many cars it contains. This is the value the intercept of
# the Helmert-coded OLS fit below reproduces.
car_data_grouped['mpg'].mean()

22.43045049087596

In [46]:
# First Helmert contrast by hand: level 4 against the preceding level 3.
# The gap between the two group means is divided by 2, the number of levels
# involved; the result matches the [H.4] coefficient of the OLS fit below.
mpg_level_3 = car_data_grouped.loc[3, 'mpg']
mpg_level_4 = car_data_grouped.loc[4, 'mpg']
coefficient_cylinder_4 = (mpg_level_4 - mpg_level_3) / 2

coefficient_cylinder_4

4.368382352941174

In [47]:
# Second Helmert contrast by hand: level 5 against the mean of the preceding
# levels (3 and 4). The difference is divided by 3, the number of levels
# involved; the result matches the [H.5] coefficient of the OLS fit below.
mpg_level_3 = car_data_grouped.loc[3, 'mpg']
mpg_level_4 = car_data_grouped.loc[4, 'mpg']
mpg_level_5 = car_data_grouped.loc[5, 'mpg']

mean_34 = (mpg_level_3 + mpg_level_4) / 2
coefficient_cylinder_5 = (mpg_level_5 - mean_34) / 3

coefficient_cylinder_5

0.8160947712418304

In [48]:
# Regress mpg on cylinders, treating cylinders as categorical with Helmert
# contrasts, using the statsmodels formula interface.
helmert_formula = "mpg ~ C(cylinders, Helmert)"
mod = ols(formula=helmert_formula, data=car_data)

res = mod.fit()

# Full regression report; the coefficient table is the part compared with the
# hand-computed values above.
res.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.637
Model:,OLS,Adj. R-squared:,0.634
Method:,Least Squares,F-statistic:,172.6
Date:,"Mon, 01 Jul 2019",Prob (F-statistic):,3.68e-85
Time:,10:14:18,Log-Likelihood:,-1180.8
No. Observations:,398,AIC:,2372.0
Df Residuals:,393,BIC:,2392.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,22.4305,0.739,30.353,0.000,20.978,23.883
"C(cylinders, Helmert)[H.4]",4.3684,1.194,3.657,0.000,2.020,6.717
"C(cylinders, Helmert)[H.5]",0.8161,0.994,0.821,0.412,-1.138,2.770
"C(cylinders, Helmert)[H.6]",-1.4372,0.329,-4.371,0.000,-2.084,-0.791
"C(cylinders, Helmert)[H.8]",-1.8668,0.206,-9.079,0.000,-2.271,-1.463

0,1,2,3
Omnibus:,48.011,Durbin-Watson:,1.464
Prob(Omnibus):,0.0,Jarque-Bera (JB):,71.51
Skew:,0.793,Prob(JB):,2.96e-16
Kurtosis:,4.341,Cond. No.,12.8


### Importing `category_encoders`

In [49]:
import category_encoders as ce

### Creating a HelmertEncoder on the 'cylinders' column

In [62]:
# Build a Helmert encoder that targets only the listed column(s).
helmert_columns = ['cylinders']
ce_helmert = ce.HelmertEncoder(cols=helmert_columns)

# Echo the encoder to show the settings it was created with.
ce_helmert

HelmertEncoder(cols=['cylinders'], drop_invariant=False,
               handle_missing='indicator', handle_unknown='indicator',
               mapping=None, return_df=True, verbose=0)

### Encoding the dataset

In [51]:
# Fit the encoder on car_data and encode it in one step. As the output shows,
# `mpg` is carried through unchanged and `cylinders` is replaced by the
# contrast columns cylinders_0 ... cylinders_3.
car_he = ce_helmert.fit_transform(car_data)
# Ten random encoded rows (unseeded sample).
car_he.sample(10)

Unnamed: 0,mpg,cylinders_0,cylinders_1,cylinders_2,cylinders_3
208,25.4,0.0,2.0,-1.0,-1.0
331,13.0,0.0,0.0,0.0,4.0
111,23.8,1.0,-1.0,-1.0,-1.0
241,18.0,0.0,0.0,3.0,-1.0
62,27.0,1.0,-1.0,-1.0,-1.0
143,34.1,1.0,-1.0,-1.0,-1.0
232,20.0,0.0,0.0,3.0,-1.0
127,24.0,1.0,-1.0,-1.0,-1.0
150,31.8,1.0,-1.0,-1.0,-1.0
243,25.4,0.0,0.0,3.0,-1.0


In [52]:
# Place the original cylinder value next to its encoded columns (aligned on
# the shared row index) so the level -> contrast-row mapping can be read off.
cylinders_with_encoding = pd.concat(
    [car_data['cylinders'], car_he],
    axis=1,
)
cylinders_with_encoding.sample(10)

Unnamed: 0,cylinders,mpg,cylinders_0,cylinders_1,cylinders_2,cylinders_3
79,4,24.0,1.0,-1.0,-1.0,-1.0
278,6,16.0,0.0,0.0,3.0,-1.0
337,8,18.2,0.0,0.0,0.0,4.0
252,6,20.2,0.0,0.0,3.0,-1.0
13,4,23.6,1.0,-1.0,-1.0,-1.0
281,6,20.0,0.0,0.0,3.0,-1.0
31,4,29.9,1.0,-1.0,-1.0,-1.0
374,8,16.0,0.0,0.0,0.0,4.0
30,4,38.0,1.0,-1.0,-1.0,-1.0
376,8,13.0,0.0,0.0,0.0,4.0


In [54]:
# Features: every Helmert contrast column, i.e. everything except the target.
# `columns=` already names the axis, so the redundant `axis=1` (which pandas
# ignores when `columns` is given) is dropped.
X = car_he.drop(columns=['mpg'])

# Target: the mpg column.
y = car_he['mpg']

# Ten random feature rows (unseeded sample).
X.sample(10)

Unnamed: 0,cylinders_0,cylinders_1,cylinders_2,cylinders_3
47,1.0,-1.0,-1.0,-1.0
370,0.0,0.0,0.0,4.0
77,1.0,-1.0,-1.0,-1.0
292,0.0,0.0,3.0,-1.0
205,1.0,-1.0,-1.0,-1.0
163,1.0,-1.0,-1.0,-1.0
254,0.0,0.0,3.0,-1.0
382,0.0,0.0,0.0,4.0
60,1.0,-1.0,-1.0,-1.0
227,0.0,0.0,3.0,-1.0


In [55]:
# First ten target values; the row order follows the cylinder-sorted frame.
y.head(10)

0    18.0
1    19.0
2    23.7
3    21.5
4    32.3
5    35.1
6    39.0
7    39.1
8    30.0
9    25.8
Name: mpg, dtype: float64

In [60]:
from sklearn.linear_model import LinearRegression

# Least-squares fit on the Helmert-encoded columns; fit() returns the
# estimator itself, so construction and fitting are chained.
linear_model = LinearRegression().fit(X, y)

# score() is evaluated on the same data the model was fitted on.
training_score = linear_model.score(X, y)
print("Training_score : ", training_score)

Training_score :  0.6372420899156167


In [61]:
# Fitted slopes, one per encoded column (cylinders_0 ... cylinders_3); compare
# them with the [H.4] ... [H.8] coefficients in the OLS summary above.
linear_model.coef_

array([ 4.36838235,  0.81609477, -1.43719071, -1.86683592])