In [1]:
%matplotlib inline


# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing plotting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [3]:
# Load the car-mpg data set and prepare it for modelling.
#
# The file marks missing values with a literal '?'. Declaring that marker via
# `na_values` makes pandas parse the affected columns as numeric right away.
# Replacing '?' after loading leaves those columns as object dtype holding
# numeric *strings*, and taking the median of such a column raises a TypeError
# on current pandas versions.
mpg_df = pd.read_csv("car-mpg.csv", na_values="?")

# The car name is a free-text identifier, not a predictor.
mpg_df = mpg_df.drop(columns="car_name")

# Turn the numeric origin code into readable labels, then one-hot encode it
# (yields the origin_america / origin_asia / origin_europe columns).
mpg_df["origin"] = mpg_df["origin"].replace({1: "america", 2: "europe", 3: "asia"})
mpg_df = pd.get_dummies(mpg_df, columns=["origin"])

# Impute every remaining missing value with its column median. fillna() with a
# Series fills column-by-column, matching on the column label.
mpg_df = mpg_df.fillna(mpg_df.median(numeric_only=True))

In [14]:
# Predictors: every column of mpg_df except the dependent variable 'mpg'.
X = mpg_df.drop(columns=['mpg'])

# Target: selected with a list so that y stays a one-column DataFrame
# rather than collapsing to a Series.
y = mpg_df.loc[:, ['mpg']]


In [15]:
from sklearn import preprocessing

# Standardize every predictor column of X. scale() hands back a plain NumPy
# array, so it is wrapped in a DataFrame again to keep the column names.
# NOTE(review): the scaling statistics are computed on the full data set, before
# the train/test split; ideally they would be fit on the training rows only and
# then applied to the test rows, so no test information leaks into training.
X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.columns)

In [16]:

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.30,
    random_state=1,
)

In [26]:
# Fit the un-regularized linear model on the training split
# (fit() returns the estimator itself, so it can be chained).
regression_model = LinearRegression().fit(X_train, y_train)

# y_train is a one-column frame, so coef_ is 2-D with a single row;
# pair that row with the feature names and report each weight.
for col_name, weight in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

The coefficient for cyl is -0.6707258146672874
The coefficient for disp is 3.014349700777794
The coefficient for hp is -0.8303813133253106
The coefficient for wt is -6.2181813408093225
The coefficient for acc is 0.17053969974945324
The coefficient for yr is 3.0907788393987805
The coefficient for origin_america is -0.7337708836958005
The coefficient for origin_asia is 0.3506361951144703
The coefficient for origin_europe is 0.5654047730253051


In [None]:
# Let us compare the un-regularized coefficients above with the Ridge and Lasso coefficients

## RIDGE COEFFICIENTS

In [36]:
# Ridge regression (L2 penalty, alpha = 0.3) fitted on the training split;
# print the coefficient array for comparison with the plain linear model.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)
    

Ridge model: [[-0.62494625  2.86181924 -0.85010003 -6.11414714  0.14971413  3.07891605
  -0.72666287  0.34868313  0.55841538]]


## LASSO COEFFICIENTS

In [59]:
# Lasso regression (L1 penalty, alpha = 0.7) fitted on the training split.
lasso = Lasso(alpha=0.7).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

# Observe that many of the coefficients are exactly 0: the L1 penalty has
# effectively dropped those dimensions from the model.

Lasso model: [-0.         -0.         -0.14388802 -4.741098    0.          2.36269918
 -0.37362768  0.          0.        ]


## Let us compare their scores

In [31]:
# R^2 of the un-regularized model: test split first, then training split.
for features, target in ((X_test, y_test), (X_train, y_train)):
    print(regression_model.score(features, target))

0.8433135132808833
0.8141025501610559


In [38]:

# R^2 of the ridge model: test split first, then training split.
for features, target in ((X_test, y_test), (X_train, y_train)):
    print(ridge.score(features, target))

0.8437999817350272
0.8140828080856514


In [60]:
# R^2 of the lasso model: test split first, then training split.
for features, target in ((X_test, y_test), (X_train, y_train)):
    print(lasso.score(features, target))

0.8353355541674008
0.7915795114093802


In [None]:
## Note - with Lasso, the test score is nearly as good as for the other two models (about 0.835 vs 0.843), though the training score is lower.
# Further, the number of dimensions is much smaller in the Lasso model than in the ridge or un-regularized model.