In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing, metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

## Read in the cleaned dataset

In [2]:
# Load the cleaned train and test CSVs (UTF-8 encoded) into DataFrames.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('cleaned_data/cleaned_train.csv', 'cleaned_data/cleaned_test.csv')
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Columns treated as categorical features.
cat_cols = [
    # channel and device attributes
    "channelGrouping",
    "device.browser",
    "device.deviceCategory",
    "device.operatingSystem",
    # geography
    "geoNetwork.city",
    "geoNetwork.continent",
    "geoNetwork.country",
    "geoNetwork.metro",
    "geoNetwork.networkDomain",
    "geoNetwork.region",
    "geoNetwork.subContinent",
    # traffic source
    "trafficSource.adContent",
    "trafficSource.adwordsClickInfo.adNetworkType",
    "trafficSource.adwordsClickInfo.gclId",
    "trafficSource.adwordsClickInfo.page",
    "trafficSource.adwordsClickInfo.slot",
    "trafficSource.campaign",
    "trafficSource.keyword",
    "trafficSource.medium",
    "trafficSource.referralPath",
    "trafficSource.source",
    "trafficSource.adwordsClickInfo.isVideoAd",
    "trafficSource.isTrueDirect",
    # date parts
    "Year",
    "Month",
    "Week",
    "Day",
    "Dayofweek",
    "Dayofyear",
    "Is_month_end",
    "Is_month_start",
    "Is_quarter_end",
    "Is_quarter_start",
    "Is_year_end",
    "Is_year_start",
]

# Columns treated as numeric features.
num_cols = [
    "totals.hits",
    "totals.pageviews",
    "visitNumber",
    "visitStartTime",
    "totals.bounces",
    "totals.newVisits",
]

In [4]:
def score_metric(y_pred, targ):
    """Return the root-mean-squared error between predictions and targets.

    Both inputs are passed through ``np.expm1`` and then back through
    ``np.log1p`` before scoring. In between, any back-transformed prediction
    below zero is set to zero.

    Parameters
    ----------
    y_pred : array-like
        Predicted values on the log scale.
    targ : array-like
        True values on the log scale.

    Returns
    -------
    float
        Square root of ``metrics.mean_squared_error`` on the log1p values.
    """
    pred_raw = np.expm1(y_pred)
    true_raw = np.expm1(targ)
    # Floor negative back-transformed predictions at zero.
    pred_raw[pred_raw < 0] = 0
    true_log = np.log1p(true_raw)
    pred_log = np.log1p(pred_raw)
    mse = metrics.mean_squared_error(true_log, pred_log)
    return np.sqrt(mse)

def val_err(model, X, y):
    """Fit ``model`` on a hold-out split and report its validation score.

    The data is split with ``test_size=0.33`` and ``random_state=42``; the
    model is fitted on the larger part and scored on the smaller part with
    ``score_metric``. The score is printed and also returned.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X : feature matrix accepted by ``train_test_split``
    y : target values aligned with ``X``

    Returns
    -------
    The value returned by ``score_metric`` on the held-out part.
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        X, y, test_size=0.33, random_state=42
    )
    model.fit(X_fit, y_fit)
    val_preds = model.predict(X_val)
    score = score_metric(val_preds, y_val)
    print(score)
    return score

## Define the outcome variable as the log of the total transaction revenue

In [5]:
# Log-transformed revenue used as the regression target.
# Clipping to a minimum of 1 before taking the log sends every revenue value
# below 1 (including 0) straight to log(1) = 0. This gives the same result as
# taking the log first and then zeroing negative values, but avoids the
# intermediate -inf and the divide-by-zero RuntimeWarning from log(0).
y_rf = np.log(train['totals.transactionRevenue'].clip(lower=1))

  """Entry point for launching an IPython kernel.


## Select the top 5 most important variables, ranked by their feature-importance scores from the Random Forest analysis

In [6]:
# Feature columns used as regressors for the linear model.
cols = [
    'Month',
    'visitStartTime',
    'Dayofyear',
    'Week',
    'totals.hits',
]

# Design matrix restricted to the selected columns.
X = train.loc[:, cols]

## Split the dataset into train and test in a 70:30 split

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_rf,
    test_size=0.30,
    random_state=4200,
)

## Using the sklearn linear regression package to fit the model

In [8]:
# Ordinary least-squares model with default settings, fitted on the training split.
linreg = LinearRegression()
linreg.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Print out the fitted coefficients and the intercept

In [9]:
# Fitted coefficients, one per column of X_train (same order as `cols`).
linreg.coef_

array([-3.65830850e-03,  5.11328913e-09,  8.37453020e-06,  3.03756381e-04,
        7.86650359e-02])

In [10]:
# Fitted intercept term of the linear model.
linreg.intercept_

-7.71886696943933

## Predict on the test set

In [11]:
# Predictions of the fitted model on the held-out split.
y_pred = linreg.predict(X_test)

In [12]:
# Score the held-out predictions against the true targets with score_metric.
score_metric(y_pred, y_test)

1.8555400814973317

## Use the statsmodels.api package to find out summary statistics for the regression

In [13]:
# statsmodels' OLS does not add an intercept on its own, whereas the sklearn
# LinearRegression fitted above does (fit_intercept=True). Add an explicit
# constant column so this summary describes the same model as `linreg`.
mod = sm.OLS(y_train, sm.add_constant(X_train))
results = mod.fit()
print(results.summary())

                                OLS Regression Results                               
Dep. Variable:     totals.transactionRevenue   R-squared:                       0.144
Model:                                   OLS   Adj. R-squared:                  0.144
Method:                        Least Squares   F-statistic:                 2.669e+04
Date:                       Tue, 13 Nov 2018   Prob (F-statistic):               0.00
Time:                               01:45:09   Log-Likelihood:            -1.2878e+06
No. Observations:                     632557   AIC:                         2.576e+06
Df Residuals:                         632552   BIC:                         2.576e+06
Df Model:                                  4                                         
Covariance Type:                   nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------

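We see that the top 5 most important features are all statistically significant in the regression, although the model explains only about 14% of the variance in the outcome (R-squared = 0.144).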