In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn import linear_model
# Seeds Python's built-in `random` module only.
# NOTE(review): no visible cell draws from `random`, and this call does not
# seed NumPy's generator — confirm whether `np.random.seed` or an estimator
# `random_state` was intended.
random.seed(123)

In [2]:
# Load the dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('categ_multi.csv')

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(8, 4)

In [4]:
# Preview the first five rows.
df.head()

Unnamed: 0,income,age,sex,overtime
0,53,39,F,Y
1,67,44,M,N
2,55,32,M,Y
3,76,42,M,Y
4,98,23,F,N


In [5]:
# Split the response from the features.
# The guard makes the cell safe to re-run: once 'income' has been removed from
# `df`, a second execution leaves `target` and `df` as they are instead of
# raising KeyError.
if 'income' in df.columns:
    target = df['income']
    df = df.drop(columns='income')  # rebind rather than mutate in place

# Columns that will be dummy-encoded.
catVars = ['sex', 'overtime']

# Despite its name, `df_cat` holds the columns that are NOT dummy-encoded;
# `drop` already returns a new frame, so no extra copy is needed.
df_cat = df.drop(columns=catVars)

In [6]:
def encode_cat_var_fit_model(df, df_cat, catVars, target, clf, drop_one_col):
    """Dummy-encode the categorical columns, fit ``clf`` and return its coefficients.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``catVars``.
    df_cat : pandas.DataFrame
        Columns that are used as-is (not dummy-encoded). They are concatenated
        column-wise, to the left of the dummy columns.
    catVars : list of str
        Names of the columns of ``df`` passed to ``pandas.get_dummies``.
    target : array-like
        Response handed to ``clf.fit`` as the second argument.
    clf : estimator
        Object with a ``fit(X, y)`` method whose return value exposes a
        one-dimensional ``coef_`` attribute.
    drop_one_col : int
        When equal to ``1`` the first level of every encoded variable is
        dropped (``drop_first=True``); any other value keeps all levels.

    Returns
    -------
    dict
        Maps each column name of the design matrix to its coefficient,
        paired positionally with ``coef_``.
    """
    # Only the exact value 1 (or True) switches the first-level drop on.
    drop_first_level = bool(drop_one_col == 1)
    dummies = pd.get_dummies(df[catVars], drop_first=drop_first_level)

    design = pd.concat([df_cat, dummies], axis=1)
    fitted = clf.fit(design, target)

    # Wrapping in a Series keeps the coefficient values as plain Python scalars.
    return dict(zip(design.columns, pd.Series(fitted.coef_)))

In [7]:
# Ordinary least-squares estimator with default settings.
linear_reg = LinearRegression()

In [8]:
# Fit the linear model on the full set of dummy columns (no level dropped).
coeff_not_dropping_one_col = encode_cat_var_fit_model(
    df, df_cat, catVars, target, linear_reg, drop_one_col=0
)

print('Coefficients for Linear Regression without dropping columns : \n')
for feature in coeff_not_dropping_one_col:
    # Round only for display; the dict keeps the full-precision values.
    print(feature, ':', round(coeff_not_dropping_one_col[feature], 3), '\n')

Coefficients for Linear Regression without dropping columns : 

age : -0.017 

sex_F : -1.58 

sex_M : 1.58 

overtime_N : 5.461 

overtime_Y : -5.461 



In [9]:
# Refit the linear model with the first level of each categorical variable
# dropped (drop_one_col=1).
coeff_dropping_one_col = encode_cat_var_fit_model(df, df_cat, catVars, target, linear_reg, drop_one_col=1)
# The label states that a column is dropped, matching drop_one_col=1 above.
print('Coefficients for Linear Regression after dropping one column : \n')
for key, value in coeff_dropping_one_col.items():
    # Round only for display; the dict keeps the full-precision values.
    print (key, ':', round(value, 3), '\n')

Coefficients for Linear Regression without dropping columns : 

age : -0.017 

sex_M : 3.161 

overtime_Y : -10.922 



In [10]:
# Lasso estimator; `alpha` is passed explicitly as 0.1, other settings are defaults.
lasso_reg = Lasso(alpha=0.1)

In [11]:
# Fit the Lasso model on the full set of dummy columns (no level dropped).
lasso_coeff_not_dropping_one_col = encode_cat_var_fit_model(df, df_cat, catVars, target, lasso_reg, drop_one_col=0)
# The label names Lasso, matching the estimator passed above.
print('Coefficients for Lasso Regression without dropping columns : \n')
for key, value in lasso_coeff_not_dropping_one_col.items():
    # Round only for display; the dict keeps the full-precision values.
    print (key, ':', round(value, 3), '\n')

Coefficients for Linear Regression without dropping columns : 

age : -0.003 

sex_F : -2.266 

sex_M : 0.0 

overtime_N : 10.149 

overtime_Y : -0.0 

