In [7]:
#Reading file
url="https://raw.githubusercontent.com/rajsiddarth119/Generalized_Linear_Models/master/CustomerData.csv"
import pandas as pd
data=pd.read_csv(url,header=0).dropna()
#Dropping ID column
data.drop(['CustomerID'],inplace=True,axis=1)
data.head()

Unnamed: 0,City,NoOfChildren,MinAgeOfChild,MaxAgeOfChild,Tenure,FrquncyOfPurchase,NoOfUnitsPurchased,FrequencyOFPlay,NoOfGamesPlayed,NoOfGamesBought,FavoriteChannelOfTransaction,FavoriteGame,TotalRevenueGenerated
0,1,2,3,8,210,11,11,2344,108,10,Uniform,Uniform,107.51
1,1,2,3,6,442,20,20,245,22,7,Favorite,Uniform,382.4
2,1,4,3,5,424,18,18,1059,130,18,Favorite,Uniform,135.01
3,1,1,6,6,261,11,9,365,34,11,Favorite,Uniform,125.0
4,1,3,6,9,422,44,31,1066,102,44,Uniform,Uniform,335.05


In [8]:
data.dtypes

City                              int64
NoOfChildren                      int64
MinAgeOfChild                     int64
MaxAgeOfChild                     int64
Tenure                            int64
FrquncyOfPurchase                 int64
NoOfUnitsPurchased                int64
FrequencyOFPlay                   int64
NoOfGamesPlayed                   int64
NoOfGamesBought                   int64
FavoriteChannelOfTransaction     object
FavoriteGame                     object
TotalRevenueGenerated           float64
dtype: object

In [9]:
#Converting city to factors
data['City']=data['City'].astype('category')
#Converting City,FavoriteChannelOfTransaction,FavoriteGame using dummies
dummy_data=pd.get_dummies(data[['City','FavoriteChannelOfTransaction','FavoriteGame']])
dummy_data.head()

Unnamed: 0,City_1,City_2,FavoriteChannelOfTransaction_Favorite,FavoriteChannelOfTransaction_Uniform,FavoriteGame_Favorite,FavoriteGame_NONE,FavoriteGame_Uniform
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [10]:
#Final data frame
data.drop(['City','FavoriteChannelOfTransaction','FavoriteGame'],inplace=True,axis=1)
final_data=pd.concat([data,dummy_data],axis=1)
final_data.dtypes

NoOfChildren                               int64
MinAgeOfChild                              int64
MaxAgeOfChild                              int64
Tenure                                     int64
FrquncyOfPurchase                          int64
NoOfUnitsPurchased                         int64
FrequencyOFPlay                            int64
NoOfGamesPlayed                            int64
NoOfGamesBought                            int64
TotalRevenueGenerated                    float64
City_1                                   float64
City_2                                   float64
FavoriteChannelOfTransaction_Favorite    float64
FavoriteChannelOfTransaction_Uniform     float64
FavoriteGame_Favorite                    float64
FavoriteGame_NONE                        float64
FavoriteGame_Uniform                     float64
dtype: object

In [11]:
Y=final_data.loc[:,['TotalRevenueGenerated']]
X=final_data.loc[:,final_data.columns.difference(Y.columns)]

#Splitting into train and test set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [12]:
#Ridge regression with initial alpha =1
from sklearn.linear_model import Ridge
import numpy as np

ridge_model=Ridge(normalize=True,random_state=10,alpha=1,fit_intercept=True)

In [42]:
#Fitting Ridge model
model=ridge_model.fit(X=X_train,y=y_train)

#Predicting on train data
pred=model.predict(X_train)
pred_test=model.predict(X_test)

#Error metrics on train data
#Writing function for calculating MAPE
from sklearn.utils import check_array

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true,y_pred=check_array(y_true),check_array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

print ("MAPE on training data is : %f" %(round(mean_absolute_percentage_error(y_train,pred),2)))

#Predicting on test data
pred_test=model.predict(X_test)
print ("MAPE on test data is : %f" %(round(mean_absolute_percentage_error(y_test,pred_test),2)))



MAPE on training data is : 18.830000
MAPE on test data is : 18.680000


In [43]:
#For different values of alpha : grid search
alp=10**np.linspace(10,-2,100)*0.5

from sklearn.linear_model import RidgeCV
modelCV=RidgeCV(alphas=alp,normalize=True)
modelCV.fit(X_train,y_train)
print (modelCV.coef_)
print ("alpha value is %f"%(modelCV.alpha_))

[[ -4.34606988e+00   4.34606988e+00   7.43609049e+00  -7.43609049e+00
    3.89775990e+00   1.73916110e+01  -6.65328460e+00   2.75234709e-03
    9.39636721e+00  -2.32417300e-01   1.00458613e+00   2.82972964e+00
   -1.05233634e+01  -2.89703697e-02   9.31769779e+00  -1.49253342e-02]]
alpha value is 0.005000


In [44]:
#Grid Search Method 2
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alp))
grid.fit(X_train,y_train)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=0.0050000000000000001, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=True, random_state=10, solver='auto',
   tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  5.00000e+09,   3.78232e+09, ...,   6.60971e-03,   5.00000e-03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
0.693745113698
0.005


In [45]:
#Training Model with best alpha value
ridge_model=Ridge(normalize=True,random_state=10,alpha=modelCV.alpha_,fit_intercept=True)

#Fitting Ridge model
model2=ridge_model.fit(X=X_train,y=y_train)

#Predicting on train data
pred2=model2.predict(X_train)
pred_test2=model2.predict(X_test)

#Error metrics on train data

print ("MAPE on training data is : %f" %(round(mean_absolute_percentage_error(y_train,pred2),2)))

#Predicting on test data
pred_test2=model2.predict(X_test)
print ("MAPE on test data is : %f" %(round(mean_absolute_percentage_error(y_test,pred_test2),2)))



MAPE on training data is : 18.830000
MAPE on test data is : 18.680000


Implementation of lasso is similar to Ridge regression

In [61]:
help(Lasso)

Help on class Lasso in module sklearn.linear_model.coordinate_descent:

class Lasso(ElasticNet)
 |  Linear Model trained with L1 prior as regularizer (aka the Lasso)
 |  
 |  The optimization objective for Lasso is::
 |  
 |      (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
 |  
 |  Technically the Lasso model is optimizing the same objective function as
 |  the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty).
 |  
 |  Read more in the :ref:`User Guide <lasso>`.
 |  
 |  Parameters
 |  ----------
 |  alpha : float, optional
 |      Constant that multiplies the L1 term. Defaults to 1.0.
 |      ``alpha = 0`` is equivalent to an ordinary least square, solved
 |      by the :class:`LinearRegression` object. For numerical
 |      reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.
 |      Given this, you should use the :class:`LinearRegression` object.
 |  
 |  fit_intercept : boolean
 |      whether to calculate the intercept for this model. If set
 |     

In [63]:
#Lasso Regression
from sklearn.linear_model import Lasso,LassoCV
import numpy as np
LassoCV_model=LassoCV(alphas=alp,normalize=True)
LassoCV_model.fit(X_train,y_train)
print (LassoCV_model.coef_)
print ("alpha value is %f"%(LassoCV_model.alpha_))

#Training Lasso Model with best alpha value
Lasso_model=Lasso(normalize=True,random_state=10,alpha=LassoCV_model.alpha_,fit_intercept=True)

#Fitting Ridge model
model5=Lasso_model.fit(X=X_train,y=y_train)

#Predicting on train data
pred5=model5.predict(X_train)
pred_test5=model5.predict(X_test)

#Error metrics on train data

print ("MAPE on training data is : %f" %(round(mean_absolute_percentage_error(y_train,pred5),2)))

#Predicting on test data

print ("MAPE on test data is : %f" %(round(mean_absolute_percentage_error(y_test,pred_test5),2)))



[ -8.69903784e+00   1.20886707e-16   1.46903308e+01  -0.00000000e+00
   0.00000000e+00   1.10986580e+01  -9.38753674e+00   2.29464522e-03
   9.90793257e+00  -1.80225839e-01   8.97800204e-01   2.34466378e+00
  -1.09232707e+01  -1.79010018e-02   9.15679817e+00  -1.23723497e-02]
alpha value is 0.005000
MAPE on training data is : 42.430000
MAPE on test data is : 42.310000




In [58]:
#Model with Elastic net Regularization

from sklearn.linear_model import ElasticNetCV


#Training Model with best alpha value
elastic_model=ElasticNetCV(alphas=alp,random_state=12,normalize=True)

#Fitting Ridge model
model3=elastic_model.fit(X=X_train,y=y_train)

#Predicting on train data
pred3,pred_test3=model3.predict(X_train),model3.predict(X_test)

#Error metrics on train data

print ("MAPE on training data is : %f" %(round(mean_absolute_percentage_error(y_train,pred3),2)))

#Predicting on test data
pred_test2=model2.predict(X_test)
print ("MAPE on test data is : %f" %(round(mean_absolute_percentage_error(y_test,pred_test3),2)))
model3.alpha_
model3.coef_

MAPE on training data is : 33.650000
MAPE on test data is : 33.400000




array([  1.92283091e+00,  -1.92287387e+00,   2.14866310e+00,
        -2.14856498e+00,   4.70623207e-01,   4.85648657e+00,
        -1.38906810e+00,   9.59626547e-04,   7.59669729e-01,
        -1.22952181e-02,   7.28685016e-02,   8.44611114e-01,
         4.31546023e-01,   2.62416402e-02,   1.06482384e+00,
         9.23284182e-03])