In [1]:
#Sarah Wofford
#Digital Avertising - MSBuAn - CU Boulder
##LASSO Regression Model

In [2]:
#--------------------------------------------#
#Import packages and items into Python to aid#
#LASSO regression model                      #
#--------------------------------------------#

#Data is available on GitHub

# Importing items and packages to work help make the regression work with LASSO
import pandas as pd
from sklearn.cross_validation import train_test_split
import sklearn.linear_model
from sklearn.linear_model import LassoLarsCV
import matplotlib.pyplot as plt
from statistics import mean

#--------------------------------------------#
#  Import and clean up data before analysis  #
#--------------------------------------------#


# import data into Python
data = 'finalmaster-ratios.csv'

#import data into Panda as a data frame
alldata = pd.read_csv('finalmaster-ratios.csv')

# #Purchases = outcome variable in data set
# 189 predictor variables in data set

# Get a list of the variables from alldata
allvariablenames = list(alldata.columns.values)


# need to remove the first 8 variables from the list

allvariablenames.pop(0) #remove #Purchases
allvariablenames.pop(0) #remove B01001001
allvariablenames.pop(0) #remove B01001002
allvariablenames.pop(0) #remove B01001003
allvariablenames.pop(0) #remove B01001004
allvariablenames.pop(0) #remove B01001005
allvariablenames.pop(0) #remove B01001006
allvariablenames.pop(0) #remove B01001007
#--------------------------------------------#




'B01001007'

In [3]:
#--------------------------------------------#
#Get the columns ready in Pandas dataframe   #
#--------------------------------------------#

# load predictors into data frame
predictors = alldata[allvariablenames]

# load target into data frame
target = alldata['# Purchases']


#--------------------------------------------#
#Split data into train and test sets         #
# 30% should be retained for test set        #
#--------------------------------------------#

pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target, test_size=.3, random_state=123)

#1 pred_train = the predictors training set
#2 pred_test = the predictors test set
#3 tar_train = the targeting training ste
#4 tar_test = the target test set

#--------------------------------------------#

In [4]:
#--------------------------------------------#
#  Build the LASSO model                     #
#--------------------------------------------#

#Use the following perameters for the LASSO Model:
#    -cv = 10 (this performs a 10-fold cross validation,
#      to make sure our results aren't do to random ordering of the data)
#    -precompute=False (not necessary)

model = LassoLarsCV(cv=10, precompute=False)
model.fit(pred_train, tar_train)







LassoLarsCV(copy_X=True, cv=10, eps=2.220446049250313e-16, fit_intercept=True,
      max_iter=500, max_n_alphas=1000, n_jobs=1, normalize=True,
      positive=False, precompute=False, verbose=False)

In [5]:
#--------------------------------------------#
# Build the coefficient chart for examination#
#--------------------------------------------#

predictors_model=pd.DataFrame(allvariablenames) #converting the list of all variables into a new dataframe to analyze for regression
predictors_model.columns = ['label'] #defines the column label for the list of variables in the data frame
predictors_model['coeff'] = model.coef_ #pulls the coefficients for each variable from the lasso "model" into a new column named "coeff"

for index, row in predictors_model.iterrows(): #for each index position and row in predictors.model.iterrows() (.iterrows returns a series for each row)
    if row ['coeff'] > 0: #if the coefficient in the row being iterated through is greater than 0
        print(row.values) #print the value of the coefficient for the variable in that row

['B01001014' 0.8557908775529921]
['B01001036' 2.505392496591849]
['B01001037' 0.8894214357013622]
['B01001038' 1.5315839680821497]
['B02001005' 0.4125408937426837]
['B13014026' 0.4800240326923769]
['B13014027' 0.6977454940063235]
['B13016001' 874922971.7249781]
['B19001017' 1.4834465563617387]


In [6]:
#QUESTION: If I had to report only two census variables to my boss that most steeply predicted sales,
#              what would those be?
#       I would report B13016001 (Women aged 15 to 50 years 
#       (who had birth in the last 12 months by age)) and B01001036 (Females, 30-44 years old).
#       Both had the highest coefficients from the census data 
#       and the LASSO model (874922971.724 and 2.5053924 respectively).  
#       I would emphasize that based on the outcome of prediction, that it would be wise 
#       to target variable B13016001 the most (based on the data) since the coefficient was 
#        significantly higher than any other variable.


In [7]:
#----------------------------------------------------------------#
# calculate the mean squared error for the training and test set
#----------------------------------------------------------------#
def mean_squared_error(targets,predictions):
    return mean((targets-predictions)**2)

train_error = mean_squared_error(tar_train, model.predict(pred_train))
print ('training data MSE')
print(train_error)

# run the above code,then do the same thing for test sets (2) and (4) above.
predtest_error = mean_squared_error(pred_test, model.predict(tar_test))
print ('Predict Test MSE')
print(predtest_error)

targtest_error = mean_squared_error(tar_test, model.predict(pred_test))
print ('Target Test MSE')
print(targtest_error)


training data MSE
22025.312777378716


ValueError: Expected 2D array, got 1D array instead:
array=[   1    3    4    0  685    4    0    6   40    9    1   45    4   15
    8    4    4   10    3  556    3    3  133    2    3   16    4   15
    2   12    2   16    0    2    0    1   14    6    0    3   16    3
    4    0  700   76    1    4   15    3    1  233    8    1    2    2
    0    1    1    1   22   10    4    2    1   19    2   84  359  742
    0    0  684  206   36   12    3  111   37    4    0   14    2   12
  177    1   32    7    0    0    2   14    3    4    0   19   86    4
    0    3    6    3    7    2    1    0    2    0 2861   83    0    0
    2    1    2    0    7    2   20    6   64    2   37    0    3    1
    1   13   11    1   39    7   10    9    4    0    3    6    4   12
  217   16    0    0    5    5    3    0    9   14  121   10    0  107
    0  779  115    2    0    1   80   24  127    4   14    2    0   99
   10   14    0    1    2   23  328    1    5    6    4   10    2    1
    1   95   19    9    0    0    2   43   32    6    0    4    0    1
    9  136    0    0   15   45   12    0   69   16    0    0    0    2
    0   51   11    0    7   15   72   21  145    6].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [8]:
#----------------------------------------------------------------#
#Based on R Squared values, does the census data predict sales?  #
#----------------------------------------------------------------#
#No, there is simply not enough data to make such a decision.

# calculate the r-squared for the training set:

rsquared_train=model.score(pred_train,tar_train)
print ('training data R-Square')
print(rsquared_train)

#repeat for the test set.
rsquared_test=model.score(pred_test, tar_test)
print ('test data R-Square')
print(rsquared_test)

# Compare the two R-squareds.

#calculate the y-intercept to help interpret baseline sales number
print("y intercept: ")
print(model.intercept_)

training data R-Square
0.24002827375880997
test data R-Square
0.17587122769388464
y intercept: 
22.194697684317433


In [None]:
#----------------------------------------------------------------#
#What is our baseline sales number? 
#What does that mean, practically?
#----------------------------------------------------------------#
#Based on the output of the random sample generated above from my model, 
#the baseline sales number is 22 (products / units).  
#Optimally, 22 items are the baseline to predict sales for the product.  
#There is then opportunity to increase the amount of products sold if 
#sold to the correct demographic (census variable). 

