In [593]:
import pandas as pd

In [717]:
# Load the student health records from the relative path datasets/student_health.csv.
student_health = pd.read_csv('datasets/student_health.csv')

# Preview the first five rows of the loaded frame.
student_health.head()

Unnamed: 0,Grade,Gender,Height_cm,Weight_kg
0,First,Male,105,21
1,First,Female,126,25
2,First,Male,126,25
3,First,Male,112,20
4,First,Female,133,32


In [718]:
# (number of rows, number of columns) of the loaded frame.
student_health.shape

(413, 4)

In [719]:
# Average the numeric measurements within each grade.
# numeric_only=True skips the string `Gender` column explicitly: pandas >= 2.0
# raises a TypeError here instead of silently dropping non-numeric columns.
grade_mean = student_health.groupby(by='Grade').mean(numeric_only=True)

grade_mean

Unnamed: 0_level_0,Height_cm,Weight_kg
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1
First,122.357143,26.183673
Second,126.168317,33.148515
Third,134.775701,36.070093


In [720]:
# Difference in mean weight between Second and First grade.
# A single .loc[row, column] lookup replaces the chained .loc[row][column] form.
grade_mean.loc['Second', 'Weight_kg'] - grade_mean.loc['First', 'Weight_kg']

6.96484138209739

In [721]:
# Difference in mean weight between Third and First grade.
# A single .loc[row, column] lookup replaces the chained .loc[row][column] form.
grade_mean.loc['Third', 'Weight_kg'] - grade_mean.loc['First', 'Weight_kg']

9.88641998855617

In [722]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

import numpy as np

#### OLS Regression against Grade
The coefficients and t-statistics confirm that grade has a bearing on student weight.

In [723]:
# Regress weight on the categorical Grade column using the formula interface.
mod = ols(formula="Weight_kg ~ Grade", data=student_health)
res = mod.fit()

# Display the full regression report.
res.summary()

0,1,2,3
Dep. Variable:,Weight_kg,R-squared:,0.465
Model:,OLS,Adj. R-squared:,0.463
Method:,Least Squares,F-statistic:,178.3
Date:,"Sun, 30 Jun 2019",Prob (F-statistic):,1.89e-56
Time:,18:21:25,Log-Likelihood:,-1186.2
No. Observations:,413,AIC:,2378.0
Df Residuals:,410,BIC:,2391.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,26.1837,0.434,60.382,0.000,25.331,27.036
Grade[T.Second],6.9648,0.609,11.443,0.000,5.768,8.161
Grade[T.Third],9.8864,0.524,18.882,0.000,8.857,10.916

0,1,2,3
Omnibus:,23.848,Durbin-Watson:,2.854
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.558
Skew:,0.582,Prob(JB):,2.82e-06
Kurtosis:,2.641,Cond. No.,4.51


#### OLS with Treatment (Dummy) coding
It generates the same output as the previous model, showing that OLS avoids the dummy variable trap and that the default coding is dummy coding (called Treatment coding in R).

In [724]:
# Same regression, but with Treatment (dummy) coding requested explicitly
# for the Grade column inside the formula.
mod = ols(formula="Weight_kg ~ C(Grade, Treatment)", data=student_health)
res = mod.fit()

# Display the full regression report.
res.summary()

0,1,2,3
Dep. Variable:,Weight_kg,R-squared:,0.465
Model:,OLS,Adj. R-squared:,0.463
Method:,Least Squares,F-statistic:,178.3
Date:,"Sun, 30 Jun 2019",Prob (F-statistic):,1.89e-56
Time:,18:21:25,Log-Likelihood:,-1186.2
No. Observations:,413,AIC:,2378.0
Df Residuals:,410,BIC:,2391.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,26.1837,0.434,60.382,0.000,25.331,27.036
"C(Grade, Treatment)[T.Second]",6.9648,0.609,11.443,0.000,5.768,8.161
"C(Grade, Treatment)[T.Third]",9.8864,0.524,18.882,0.000,8.857,10.916

0,1,2,3
Omnibus:,23.848,Durbin-Watson:,2.854
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.558
Skew:,0.582,Prob(JB):,2.82e-06
Kurtosis:,2.641,Cond. No.,4.51


In [725]:
# Keep only the columns used by the regressions below (Grade and Weight_kg).
# Rebinding the name returns a new frame instead of mutating, via
# `inplace=True`, the object that earlier cells already used.
student_health = student_health.drop(columns=['Gender', 'Height_cm'])

student_health.head()

Unnamed: 0,Grade,Weight_kg
0,First,21
1,First,25
2,First,25
3,First,20
4,First,32


### The `Grade` column will be dummy encoded. The dummy encoder does not encode string values, so it is necessary to label encode the `Grade` column first

Label encoding the `Grade` Column

- 0 - First
- 1 - Second
- 2 - Third

In [726]:
from sklearn.preprocessing import LabelEncoder

# Use the imported class directly: the `preprocessing` module name itself is
# never imported, so `preprocessing.LabelEncoder()` fails with a NameError.
grade_encoder = LabelEncoder()

# Replace the string grades with integer codes; the learned label order is
# available afterwards as `grade_encoder.classes_`.
student_health['Grade'] = grade_encoder.fit_transform(student_health.Grade)

In [727]:
# Show five randomly chosen rows (no seed is set, so the rows differ per run).
student_health.sample(5)

Unnamed: 0,Grade,Weight_kg
125,1,39
328,2,35
258,2,39
358,2,38
52,0,32


In [728]:
# Labels learned by the encoder; a label's position in this array is its integer code.
grade_encoder.classes_

array(['First', 'Second', 'Third'], dtype=object)

In [729]:
# Summary statistics for the encoded Grade column and for Weight_kg.
student_health.describe()

Unnamed: 0,Grade,Weight_kg
count,413.0,413.0
mean,1.280872,33.009685
std,0.82353,5.855866
min,0.0,20.0
25%,1.0,31.0
50%,2.0,32.0
75%,2.0,38.0
max,2.0,46.0


### Installing and Importing the Patsy library

In [730]:
! pip install patsy

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [731]:
from patsy.contrasts import Treatment

https://www.statsmodels.org/devel/contrasts.html#treatment-dummy-coding

In [732]:
# Distinct integer codes present in the encoded Grade column.
student_health.Grade.unique()

array([0, 1, 2])

### There are three classes

In [733]:
# The three encoded grade levels (0, 1, 2) handed to the contrast coders below.
levels = list(range(3))

### Contrast matrix without intercept

In [783]:
# Treatment coding with level 0 as the reference: its row is all zeros.
contrast_without_intercept_0 = (
    Treatment(reference=0)
    .code_without_intercept(levels)
)

print(contrast_without_intercept_0.matrix)

[[0. 0.]
 [1. 0.]
 [0. 1.]]


In [784]:
# Treatment coding with level 1 as the reference: its row is all zeros.
contrast_without_intercept_1 = (
    Treatment(reference=1)
    .code_without_intercept(levels)
)

print(contrast_without_intercept_1.matrix)

[[1. 0.]
 [0. 0.]
 [0. 1.]]


In [785]:
# Treatment coding with level 2 as the reference: its row is all zeros.
contrast_without_intercept_2 = (
    Treatment(reference=2)
    .code_without_intercept(levels)
)

print(contrast_without_intercept_2.matrix)

[[1. 0.]
 [0. 1.]
 [0. 0.]]


### Contrast matrix with intercept

In [735]:
# Coding that includes the intercept, built with reference level 0;
# the printed matrix has one indicator column per level.
contrast_with_intercept = (
    Treatment(reference=0)
    .code_with_intercept(levels)
)

print(contrast_with_intercept.matrix)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [786]:
# Same coding built with reference level 1. This rebinds
# `contrast_with_intercept`, and it is this object the regression section uses.
contrast_with_intercept = (
    Treatment(reference=1)
    .code_with_intercept(levels)
)

print(contrast_with_intercept.matrix)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


## 1) Taking Contrast without intercept for Regression

In [787]:
# Work on an independent copy: a plain assignment would only alias
# `student_health`, so any in-place edit would change both names.
health_data_dummy = student_health.copy()

### Creating Contrast matrix for the `Grade` column of the dataset

In [788]:
# Use each student's integer grade code to pick the matching row of the
# contrast matrix, giving one dummy-coded row per student.
grade_codes = health_data_dummy['Grade'].values
health_data_contrast = contrast_without_intercept_0.matrix[grade_codes]

# Show a slice of the resulting rows.
health_data_contrast[90:105]

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

### Assigning name to the columns

In [789]:
# Wrap the dummy-coded rows in a named DataFrame.
# * dtype=int replaces `np.int`, an alias removed in NumPy 1.24.
# * Reusing the source index keeps the later column-wise concat aligned
#   row for row.
student_health_contrast_df = pd.DataFrame(health_data_contrast,
                                          index=health_data_dummy.index,
                                          columns=['grade_2', 'grade_3'],
                                          dtype=int)

student_health_contrast_df.sample(5)

Unnamed: 0,grade_2,grade_3
252,0,1
15,0,0
77,0,0
199,0,1
338,0,1


### Concatenating the encoded columns with the dataset

In [790]:
# Append the dummy columns to the right of the existing columns.
frames_to_join = [health_data_dummy, student_health_contrast_df]
health_data_dummy = pd.concat(frames_to_join, axis='columns')

health_data_dummy.sample(10)

Unnamed: 0,Grade,Weight_kg,grade_2,grade_3
49,0,25,0,0
135,1,33,1,0
154,1,31,1,0
260,2,35,0,1
294,2,39,0,1
282,2,31,0,1
272,2,39,0,1
151,1,35,1,0
209,2,39,0,1
293,2,31,0,1


### Deleting the column `Grade`

In [791]:
# The integer Grade column is now redundant with the dummy columns.
# Rebinding the name avoids the `inplace=True` mutation.
health_data_dummy = health_data_dummy.drop(columns=['Grade'])

In [792]:
# Check five random rows after Grade has been removed.
health_data_dummy.sample(5)

Unnamed: 0,Weight_kg,grade_2,grade_3
38,29,0,0
47,33,0,0
89,25,0,0
337,35,0,1
149,39,1,0


In [793]:
# Target: the weight column. Features: every remaining (dummy) column.
y = health_data_dummy['Weight_kg']
X = health_data_dummy.drop(columns=['Weight_kg'])

In [794]:
# Five random rows of the feature matrix.
X.sample(5)

Unnamed: 0,grade_2,grade_3
40,0,0
217,0,1
95,0,0
240,0,1
280,0,1


In [795]:
# Five random values of the target series.
y.sample(5)

287    35
148    31
262    31
371    39
308    39
Name: Weight_kg, dtype: int64

In [796]:
# The array interface of statsmodels does not add an intercept on its own,
# so prepend a constant column to the features before fitting.
X_with_constant = sm.add_constant(X)

mod = sm.OLS(endog=y, exog=X_with_constant)
res = mod.fit()

# Display the full regression report.
res.summary()

0,1,2,3
Dep. Variable:,Weight_kg,R-squared:,0.465
Model:,OLS,Adj. R-squared:,0.463
Method:,Least Squares,F-statistic:,178.3
Date:,"Mon, 01 Jul 2019",Prob (F-statistic):,1.89e-56
Time:,08:01:01,Log-Likelihood:,-1186.2
No. Observations:,413,AIC:,2378.0
Df Residuals:,410,BIC:,2391.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,26.1837,0.434,60.382,0.000,25.331,27.036
grade_2,6.9648,0.609,11.443,0.000,5.768,8.161
grade_3,9.8864,0.524,18.882,0.000,8.857,10.916

0,1,2,3
Omnibus:,23.848,Durbin-Watson:,2.854
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.558
Skew:,0.582,Prob(JB):,2.82e-06
Kurtosis:,2.641,Cond. No.,4.51


### Importing sklearn linear regression model
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

### Creating the model

In [797]:
from sklearn.linear_model import LinearRegression

# The model estimates its own intercept; the features here are the two
# dummy columns without a constant column.
linear_model_dummy = LinearRegression(fit_intercept=True)

### Training the model

In [798]:
# Fit on the dummy-coded features, then score on the same data.
linear_model_dummy.fit(X, y)

training_r2 = linear_model_dummy.score(X, y)
print("Training_score : " , training_r2)

Training_score :  0.46521549611032575


In [799]:
# Fitted coefficient for each feature column of X (grade_2, grade_3).
linear_model_dummy.coef_

array([6.96484138, 9.88641999])

In [800]:
# Fitted intercept of the dummy-coded model.
linear_model_dummy.intercept_

26.18367346938775

## 2) Taking Contrast matrix with intercept for regression

In [801]:
# Work on an independent copy: a plain assignment would only alias
# `student_health`, so any in-place edit would change both names.
health_data_ohe = student_health.copy()

In [802]:
# Coding matrix that includes the intercept: one indicator column per grade level.
contrast_with_intercept.matrix

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

### Creating Contrast matrix for the `Grade` column of the dataset

In [803]:
# Use each student's integer grade code to pick the matching row of the
# contrast matrix, giving one indicator row per student.
grade_codes = health_data_ohe['Grade'].values
health_data_contrast = contrast_with_intercept.matrix[grade_codes]

# Show a slice of the resulting rows.
health_data_contrast[90:105]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

### Assigning name to the columns

In [804]:
# Wrap the indicator rows in a named DataFrame.
# * dtype=int replaces `np.int`, an alias removed in NumPy 1.24.
# * Reusing the source index keeps the later column-wise concat aligned
#   row for row.
health_data_contrast = pd.DataFrame(health_data_contrast,
                                    index=health_data_ohe.index,
                                    columns=['grade_1', 'grade_2', 'grade_3'],
                                    dtype=int)

health_data_contrast.sample(5)

Unnamed: 0,grade_1,grade_2,grade_3
210,0,0,1
299,0,0,1
286,0,0,1
247,0,0,1
82,1,0,0


### Concatenating the encoded columns with the dataset

In [805]:
# Append the three indicator columns to the right of the existing columns.
frames_to_join = [health_data_ohe, health_data_contrast]
health_data_ohe = pd.concat(frames_to_join, axis='columns')

health_data_ohe.sample(10)

Unnamed: 0,Grade,Weight_kg,grade_1,grade_2,grade_3
400,2,35,0,0,1
306,2,46,0,0,1
70,0,29,1,0,0
314,2,35,0,0,1
301,2,35,0,0,1
387,2,46,0,0,1
228,2,31,0,0,1
125,1,39,0,1,0
333,2,46,0,0,1
54,0,29,1,0,0


### Dropping column `'Grade'`

In [806]:
# The integer Grade column is now redundant with the indicator columns.
# Rebinding the name avoids the `inplace=True` mutation.
health_data_ohe = health_data_ohe.drop(columns=['Grade'])

In [807]:
# Target: the weight column. Features: the three grade indicator columns.
y = health_data_ohe['Weight_kg']
X = health_data_ohe.drop(columns=['Weight_kg'])

X.head()

Unnamed: 0,grade_1,grade_2,grade_3
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


### Creating the model

In [808]:
# No separate intercept is fitted: X already holds one indicator column per grade.
linear_model_ohe = LinearRegression(fit_intercept=False)

### Training the model

In [809]:
# Fit on the one-hot features, then score on the same data.
linear_model_ohe.fit(X, y)

training_r2 = linear_model_ohe.score(X, y)
print("Training_score : " , training_r2)

Training_score :  0.46521549611032575


### Inspecting the fitted coefficients and intercept

In [810]:
# Fitted coefficient for each indicator column (grade_1, grade_2, grade_3).
linear_model_ohe.coef_

array([26.18367347, 33.14851485, 36.07009346])

In [811]:
# Intercept of the model that was created with fit_intercept=False.
linear_model_ohe.intercept_

0.0