### Using Ecommerce_data.csv

Tasks:
Import the data > define the independent and dependent variables > split the data into train/test sets > run the regression > get the R-squared, intercept and coefficients > make predictions

In [1]:
#Import all the necessary packages:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
import sklearn
import matplotlib.pyplot as plt

from sklearn import datasets 
from sklearn import linear_model
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the e-commerce dataset from its CSV file (relative path) into a DataFrame.
data_df = pd.read_csv(
    filepath_or_buffer='LSE_DA301_Week_1_files/Data/Ecommerce_data.csv',
)
# Bare last expression so the notebook renders the DataFrame as the cell output.
data_df

Unnamed: 0,Sale,por_OS,por_NON,recc,avg_no_it,age,dis,diff_reg,tax,bk,lowstat,Median_s
0,0.63,18.0,2.31,0,6.575,65.2,4.0900,1,296,396.90,4.98,24.0
1,2.73,0.0,7.07,0,6.421,78.9,4.9671,2,242,396.90,9.14,21.6
2,2.73,0.0,7.07,0,7.185,61.1,4.9671,2,242,392.83,4.03,34.7
3,3.24,0.0,2.18,0,6.998,45.8,6.0622,3,222,394.63,2.94,33.4
4,6.91,0.0,2.18,0,7.147,54.2,6.0622,3,222,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...
501,6.26,0.0,11.93,0,6.593,69.1,2.4786,1,273,391.99,9.67,22.4
502,4.53,0.0,11.93,0,6.120,76.7,2.2875,1,273,396.90,9.08,20.6
503,6.08,0.0,11.93,0,6.976,91.0,2.1675,1,273,396.90,5.64,23.9
504,10.96,0.0,11.93,0,6.794,89.3,2.3889,1,273,393.45,6.48,22.0


In [3]:
# Independent variables: the 'avg_no_it' and 'tax' columns as a two-column DataFrame.
X = data_df.loc[:, ['avg_no_it', 'tax']]
# Dependent variable: the 'Median_s' column.
y = data_df.loc[:, 'Median_s']

In [4]:
# Use 80% of the rows for training (the remainder becomes the test set);
# the fixed random_state keeps the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=5,
)

In [5]:
# Fit a linear regression on the training split; fit() returns the estimator
# itself, so construction and fitting are chained into one statement.
mlr = LinearRegression().fit(x_train, y_train)
# Predictions for the training rows (rendered as the cell output).
mlr.predict(x_train)

array([19.68077167, 38.76268832, 15.79429553, 23.49922927, 19.2739039 ,
        8.11113341, 22.59791218, 14.06288888, 26.39447012, 21.86039213,
       19.79545594, 30.26529047, 21.22777796, 21.44808858, 20.79264432,
       23.45370907, 21.97392517, 18.46339447, 23.11847005, 34.4541126 ,
       25.78072075, 19.76882387, 23.04084157, 26.82686538, 17.75595394,
       22.13454223, 20.1246662 , 37.45556138, 34.46353646, 25.41954164,
       14.71226966, 21.05474884, 20.37149849, 22.93705789, 11.66858537,
        5.63570732, 25.934629  , 26.27064708, 22.2364656 , 21.61934447,
       17.77360377, 18.96641292, 31.56527838, 15.15597811, 26.66444607,
       20.36881247, 24.43706724, 23.85868181, 18.76614735, 24.03793148,
       19.70204364, 17.6314042 , 21.96729204, 33.05748004, 11.14180696,
       25.42261122, 31.1691132 , 31.06150421, 21.95950768, 28.04224855,
       18.54177868, 31.03384325, 24.53573494, 19.40072084, 20.55261985,
       21.02538463, 30.45308592, -2.85702811, 24.909733  , 19.05

In [6]:
# Score on the training split, followed by the fitted intercept and coefficients.
print("R-squared: ", mlr.score(x_train, y_train))
print("Intercept: ", mlr.intercept_)
print("Coefficients: ", mlr.coef_)
# Pair each feature name with its coefficient (rendered as the cell output).
list(zip(x_train.columns, mlr.coef_))

R-squared:  0.516326158853115
Intercept:  -19.67017947454343
Coefficients:  [ 7.78435878 -0.0163768 ]


[('avg_no_it', 7.784358782154015), ('tax', -0.016376802195044227)]

### Next, check the model with OLS

In [7]:
# Re-fit the same two-feature model with statsmodels OLS to obtain the full
# regression summary (standard errors, t-statistics, p-values, ...).
#
# has_constant='add' forces the intercept column to be added to both design
# matrices. With the default ('skip'), add_constant silently leaves the data
# unchanged whenever a column already holds a single non-zero value within a
# split, which would give the train and test matrices different shapes and
# make predict() fail.
model = sm.OLS(y_train, sm.add_constant(x_train, has_constant='add')).fit()
# Out-of-sample predictions for the held-out test rows.
y_pred = model.predict(sm.add_constant(x_test, has_constant='add'))
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:               Median_s   R-squared:                       0.516
Model:                            OLS   Adj. R-squared:                  0.514
Method:                 Least Squares   F-statistic:                     214.0
Date:                Mon, 19 Sep 2022   Prob (F-statistic):           5.66e-64
Time:                        23:14:08   Log-Likelihood:                -1326.1
No. Observations:                 404   AIC:                             2658.
Df Residuals:                     401   BIC:                             2670.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -19.6702      3.347     -5.877      0.0

### Test for multicollinearity

In [8]:
# Training design matrix with an intercept column prepended.
x_temp = sm.add_constant(x_train)
# One variance inflation factor per column of the design matrix
# (the 'const' column is included in the table as well).
vif = pd.DataFrame({
    'VIF factor': [
        variance_inflation_factor(x_temp.values, col_idx)
        for col_idx in range(x_temp.shape[1])
    ],
    'features': x_temp.columns,
})
print(vif.round(1))

   VIF factor   features
0       108.1      const
1         1.1  avg_no_it
2         1.1        tax


### Test for homoscedasticity 

In [22]:
# Run the Breusch-Pagan test on the OLS residuals against the model's design matrix.
test = sms.het_breuschpagan(model.resid, model.model.exog)
# Labels for the four values returned by the test, in order.
terms = ['LM stat', 'LM Test p-value', 'F-stat', 'F-test p-value']
# Print the results of the Breusch-Pagan test as a label -> value mapping.
print({label: value for label, value in zip(terms, test)})


{'LM stat': 25.71361477472052, 'LM Test p-value': 2.6083117438457607e-06, 'F-stat': 13.62877429295051, 'F-test p-value': 1.8775385253290402e-06}
