In [1]:
%matplotlib inline


# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [2]:
# Location of the raw car-mpg dataset; edit this single constant to run the notebook elsewhere.
DATA_PATH = "D:/K2Analytics/Python/Featurization/Datafiles/car-mpg.csv"

mpg_df = pd.read_csv(DATA_PATH)

# The car name is dropped and therefore never used as a predictor.
mpg_df = mpg_df.drop('car_name', axis=1)

# Decode the numeric origin codes into labels, then one-hot encode them.
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])

# '?' is treated as the missing-value marker. Replacing it with NaN alone leaves the
# affected column holding numeric *strings* (object dtype), so convert every column to a
# numeric dtype explicitly. pd.to_numeric raises if any other non-numeric text remains,
# which surfaces dirty data here instead of inside the median/scaling steps.
mpg_df = mpg_df.replace('?', np.nan)
mpg_df = mpg_df.apply(pd.to_numeric)

# Impute the remaining gaps with each column's median.
mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()), axis=0)

# separate independent and dependent variables

In [3]:
# Dependent variable: 'mpg', selected with a list so that y stays a one-column DataFrame.
y = mpg_df.loc[:, ['mpg']]

# Independent variables: every remaining column of mpg_df.
X = mpg_df.drop(columns='mpg')


In [4]:
from sklearn import preprocessing

# Scale every predictor column of X. preprocessing.scale returns a NumPy array,
# so the result is wrapped back into a DataFrame carrying the original column names.
# NOTE: the scaling is computed on the full dataset, before the train/test split;
# ideally the training and test sets should be scaled separately.
X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.columns)



In [5]:

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.30,
    random_state=1,
)

# fit a simple linear model

In [6]:
# Unregularized linear regression fitted on the training split.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

# coef_ is indexed as a 2-D array here; row 0 holds the weights, one per predictor column.
for feature_name, weight in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(feature_name, weight))

The coefficient for cyl is 2.5059518049385052
The coefficient for disp is 2.5357082860560483
The coefficient for hp is -1.7889335736325294
The coefficient for wt is -5.551819873098725
The coefficient for acc is 0.11485734803440854
The coefficient for yr is 2.931846548211609
The coefficient for car_type is 2.977869737601944
The coefficient for origin_america is -0.5832955290166003
The coefficient for origin_asia is 0.3474931380432235
The coefficient for origin_europe is 0.3774164680868855


# Create a regularized RIDGE model and note the coefficients

In [7]:
# Ridge regression with regularization strength alpha=0.3, fitted on the training split.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)
    

Ridge model: [[ 2.47057467  2.44494419 -1.78573889 -5.47285499  0.10115618  2.92319984
   2.94492098 -0.57949986  0.34667456  0.37344909]]


# Create a regularized LASSO model and note the coefficients

In [8]:
# Lasso regression with regularization strength alpha=0.1, fitted on the training split.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

# Observe: several coefficients are exactly 0, i.e. those dimensions have been dropped from the model.

Lasso model: [ 1.10693517  0.         -0.71587138 -4.2127655  -0.          2.73245903
  1.66333749 -0.63587683  0.          0.        ]


## Let us compare their scores

In [9]:
# Score of the unregularized model: training split on the first line, test split on the second.
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)


0.834377025696
0.851342138778


In [10]:
# Score of the ridge model: training split on the first line, test split on the second.
print(
    ridge.score(X_train, y_train),
    ridge.score(X_test, y_test),
    sep="\n",
)

0.834361793131
0.851888217161


In [11]:
# Score of the lasso model: training split on the first line, test split on the second.
print(
    lasso.score(X_train, y_train),
    lasso.score(X_test, y_test),
    sep="\n",
)

0.821144513478
0.857723420104


In [12]:
# The three models give broadly similar results, but the regularized models are less complex.  Complexity is a function of the number of variables and the size of their coefficients
## Note - with Lasso, we get an equally good result on the test set, though not on the training set.  Furthermore, the LASSO model uses noticeably fewer dimensions
# than the ridge or un-regularized models

# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [13]:
from sklearn.preprocessing import PolynomialFeatures

In [14]:
# Degree-2 expansion restricted to interaction terms (interaction_only=True).
# The unrestricted alternative would be PolynomialFeatures(2).
poly = PolynomialFeatures(degree=2, interaction_only=True)

In [15]:
# Expand the scaled predictors with the degree-2 interaction transformer.
X_poly = poly.fit_transform(X_scaled)

# Re-split using the same test_size and random_state as the earlier split.
# NOTE: this rebinds X_train / X_test / y_train / y_test to the polynomial versions,
# so the linear-feature splits are no longer available once this cell has run.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.30, random_state=1
)
X_train.shape

(278, 56)

# Fit a simple non-regularized linear model on the polynomial features

In [16]:
# Refit the existing unregularized model on the polynomial features and show its weights.
# NOTE(review): the recorded output contains weights on the order of 1e11-1e12, presumably
# because some expanded columns are (near-)collinear - confirm before interpreting them.
print(regression_model.fit(X_train, y_train).coef_[0])


[ -9.67853872e-13  -1.02547610e+12  -4.44627550e+00  -2.25335309e+00
  -2.97461239e+00  -1.56303884e+00   2.99807210e+00  -1.36540038e+12
  -2.76502640e+11   3.37537063e+12  -2.51120872e+12  -1.14688873e+00
  -1.46820068e+00   8.01849365e-03   2.57632446e+00  -1.91624451e+00
  -3.28545774e+12  -5.63708876e+12  -2.02022600e+12  -1.92831070e+12
   4.05090332e-01   1.94641113e-01  -4.12567139e-01   3.58755493e+00
  -2.03466797e+00  -7.22273158e+11  -5.95274878e+11  -5.68191339e+11
   2.46704102e-01  -6.71813965e-01  -1.92431641e+00  -7.59445190e-01
  -3.61046618e+11  -2.97563296e+11  -2.84024900e+11  -1.74697876e-01
   5.25756836e-01  -3.31704712e+00   1.39983900e+12   1.15370339e+12
   1.10121273e+12   5.86059570e-01   1.53323364e+00   2.66618900e+11
   2.19738933e+11   2.09741354e+11   4.02069092e-01   1.19857431e+11
   9.87827349e+10   9.42883640e+10  -7.33210878e+11   9.38450032e+11
   8.95752870e+11   8.20702724e+10  -4.45507361e+12   2.24803837e+12]


In [17]:
# Ridge regression (alpha=0.3) refitted on the polynomial training features.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

Ridge model: [[ 0.          3.73512981 -2.93500874 -2.13974194 -3.56547812 -1.28898893
   3.01290805  2.04739082  0.0786974   0.21972225 -0.3302341  -1.46231096
  -1.17221896  0.00856067  2.48054694 -1.67596093  0.99537516 -2.29024279
   4.7699338  -2.08598898  0.34009408  0.35024058 -0.41761834  3.06970569
  -2.21649433  1.86339518 -2.62934278  0.38596397  0.12088534 -0.53440382
  -1.88265835 -0.7675926  -0.90146842  0.52416091  0.59678246 -0.26349448
   0.5827378  -3.02842915 -0.36548074  0.5956112  -0.15941014  0.49168856
   1.45652375 -0.43819158 -0.20964198  0.77665496  0.36489921 -0.4750838
   0.3551047   0.23188557 -1.42941282  2.06831543 -0.34986402 -0.32320394
   0.39054656  0.06283411]]


In [18]:
# Score of the polynomial ridge model: training split first, test split second.
print(
    ridge.score(X_train, y_train),
    ridge.score(X_test, y_test),
    sep="\n",
)


0.9143225702
0.86133980537


In [19]:
# Lasso regression refitted on the polynomial training features, with a smaller
# regularization strength (alpha=0.01) than the 0.1 used on the linear features.
lasso = Lasso(alpha=0.01).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)


Lasso model: [ 0.          0.52263805 -0.5402102  -1.99423315 -4.55360385 -0.85285179
  2.99044036  0.00711821 -0.          0.76073274 -0.         -0.
 -0.19736449  0.          2.04221833 -1.00014513  0.         -0.
  4.28412669 -0.          0.          0.31442062 -0.          2.13894094
 -1.06760107  0.         -0.          0.          0.         -0.44991392
 -1.55885506 -0.         -0.68837902  0.          0.17455864 -0.34653644
  0.3313704  -2.84931966  0.         -0.34340563  0.00815105  0.47019445
  1.25759712 -0.69634581  0.          0.55528147  0.2948979  -0.67289549
  0.06490671  0.         -1.19639935  1.06711702  0.         -0.88034391
  0.         -0.        ]


In [20]:
# Score of the polynomial lasso model: training split first, test split second.
print(
    lasso.score(X_train, y_train),
    lasso.score(X_test, y_test),
    sep="\n",
)


0.90982861939
0.869529685877
