<a href="https://colab.research.google.com/github/ryancarneyy/Spotifly/blob/ryanc/L08_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Chapter 11] Predicting online shopping purchase intent using using logistic regression

## **[DSLC stages]**: Analysis



In this document, you will find  code for fitting logistic regression to the online shopping data from the Online Shoppers Purchasing Intention Dataset—has been made publicly available by Sakar et al. 2019 from the UCI Machine Learning Repository.


The following code sets up the libraries and creates cleaned and pre-processed training, validation and test data that we will use in this document.

In [6]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt




pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 500
pd.options.display.max_rows = 100

In [2]:
shopping_train_preprocessed = pd.read_csv('shopping_train_preprocessed.csv', index_col=0)


In [3]:
shopping_train_preprocessed

Unnamed: 0,administrative_duration,informational,informational_duration,product_related,product_related_duration,bounce_rates,exit_rates,page_values,special_day,visitor_type,weekend,purchase,month_dec,month_feb,month_jul,month_june,month_mar,month_may,month_nov,month_oct,month_sep,operating_systems_2,operating_systems_3,operating_systems_4,operating_systems_other,browser_2,browser_4,browser_5,browser_other,region_2,region_3,region_4,region_5,region_6,region_7,region_8,region_9,traffic_type_10,traffic_type_11,traffic_type_13,traffic_type_2,traffic_type_3,traffic_type_4,traffic_type_5,traffic_type_6,traffic_type_8,traffic_type_other
0,0.000000,0.0,0.000000,81.0,34.877897,0.015000,0.027155,0.000000,0.0,1,0,False,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.000000,2.0,0.933333,9.0,6.316667,0.000000,0.030000,0.000000,0.0,0,0,False,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0.000000,0.0,0.000000,36.0,35.126389,0.014815,0.032870,0.000000,0.0,1,0,False,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0.389583,2.0,3.860000,35.0,16.602468,0.006081,0.041742,0.000000,0.0,1,1,False,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0.000000,0.0,0.000000,39.0,42.641667,0.000000,0.010811,0.000000,0.0,1,0,False,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7367,0.000000,0.0,0.000000,5.0,3.125000,0.000000,0.040000,0.000000,0.0,1,1,False,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
7368,0.370000,2.0,0.740000,37.0,6.680000,0.005128,0.002564,0.000000,0.0,1,0,False,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
7369,4.933333,0.0,0.000000,9.0,6.575000,0.000000,0.005556,28.398333,0.0,0,1,True,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
7370,1.533333,2.0,0.991667,39.0,36.795952,0.009756,0.023577,0.000000,0.0,1,0,False,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0



## Fitting logistic regression to the full training dataset


Next, compute a logistic regression fit for the entire training dataset:

In [4]:
lr_all = LogisticRegression(solver='liblinear')
lr_all.fit(X=shopping_train_preprocessed.drop(columns='purchase'),
           y=shopping_train_preprocessed['purchase'])
lr_all.intercept_, lr_all.coef_

(array([-2.06583111]),
 array([[ 9.89450878e-03,  8.61072961e-03,  1.70687514e-02,
          3.67187813e-03,  1.76200760e-03, -2.20956076e+00,
         -3.48261973e+00,  8.45649972e-02, -4.63316195e-01,
         -3.47793459e-01,  9.07015252e-02, -7.70239987e-01,
         -1.35284702e+00,  2.18080137e-01, -6.42471535e-01,
         -5.58456453e-01, -5.54590719e-01,  4.66684560e-01,
         -3.84085131e-02,  2.89546880e-02,  2.67032422e-02,
         -1.45236310e-01, -2.51111094e-01, -2.32196660e-01,
          2.12244959e-02,  1.69373472e-01,  9.33558184e-02,
          6.92990908e-02,  2.05129638e-01,  1.00856542e-01,
         -1.03945171e-01, -3.12794989e-01, -1.25818469e-02,
          1.10071151e-01, -1.91812983e-01, -3.21482488e-01,
          3.79243611e-01,  2.97013039e-01, -5.48815724e-01,
          2.54056710e-01, -3.59242413e-01, -6.12908732e-03,
          7.87102219e-02, -1.45917975e-01,  7.34232961e-01,
          9.63913446e-02]]))


## Evaluating binary predictions for a sample of 20 validation points

First, let's start by evaluating our predictions using just a random sample of 20 validation set sessions.

Let's create the same 20-session sample that was used in the book for evaluation.

In [7]:
shopping_val_preprocessed = pd.read_csv('shopping_val_preprocessed.csv', index_col=0)


In [8]:
shopping_val_preprocessed.to_csv('shopping_val_preprocessed.csv')

In [9]:
sample_index = [961, 1315, 408, 1678, 1810,
                  1566, 2036, 1005, 2198, 685,
                  1680, 1347, 2265, 286, 1393,
                  2267, 2247, 1576, 217, 420]
sample_index = [id - 1 for id in sample_index]
shopping_val_sample = shopping_val_preprocessed.loc[sample_index,:]


First, let's print out the observed and predicted (using LS and logistic regression fit to the full training set) purchase response for these 20 validation sessions.


In [10]:
pred_val_sample = pd.DataFrame(dict(
    purchase = shopping_val_sample['purchase'],
    lr_predict = lr_all.predict_proba(shopping_val_sample.drop(columns='purchase'))[:,1],
    lr_predict_binary = lr_all.predict(shopping_val_sample.drop(columns='purchase'))))
pred_val_sample

Unnamed: 0,purchase,lr_predict,lr_predict_binary
960,True,0.312302,False
1314,True,0.519242,True
407,True,0.887533,True
1677,True,0.629552,True
1809,True,0.595575,True
1565,False,0.051567,False
2035,False,0.420669,False
1004,False,0.046527,False
2197,False,0.03613,False
684,False,0.049173,False



### The confusion matrix

The confusion matrix for the LS (binary) fit, where the binary predictions are based (for now) on a threshold of 0.5, is



and for the logistic regression fit, the confusion matrix (again, for now, based on a threshold of 0.5) is:


In [11]:
conf_lr = metrics.confusion_matrix(y_true=pred_val_sample['purchase'],
                                   y_pred=pred_val_sample['lr_predict_binary'])
conf_lr

array([[12,  3],
       [ 1,  4]])



### Prediction accuracy


There are several ways that you can compute the prediction accuracy, such as from the confusion matrix, by adding up the diagonal entries and dividing by the total:

In [12]:
# Logistic regression
(conf_lr[0, 0] + conf_lr[1, 1]) / conf_lr.sum()

0.8


or using the `accuracy_score()` function from the `sklearn.metrics` library:


In [13]:
# Logistic regression
metrics.accuracy_score(y_true=pred_val_sample['purchase'],
                       y_pred=pred_val_sample['lr_predict_binary'])

0.8


### True positive and true negative rate


Similarly, the true positive rate can be computed from the confusion matrix:


In [14]:
# Logistic regression
(conf_lr[1, 1]) / conf_lr[1,:].sum()

0.8

The true positive rate (recall/sensitivity) can also be computing using the `recall_score()` function from the sklearn.metrics library:

In [15]:
metrics.recall_score(y_true=pred_val_sample['purchase'],
                     y_pred=pred_val_sample['lr_predict_binary'])

0.8



Similarly, the true negative rate can be computed from the confusion matrix:


In [16]:
# Logistic regression
(conf_lr[0, 0]) / conf_lr[0,:].sum()

0.8

There is no specific function for the true negative rate from sklearn, but you can use the `recall_score()` function by passing in the `pos_label=0` argument:

In [17]:
# Logistic regression
metrics.recall_score(y_true=pred_val_sample['purchase'],
                     y_pred=pred_val_sample['lr_predict_binary'],
                     pos_label=0)

0.8


### Predicted probability densities



We can also plot the distribution of the predicted probabilities using density plots.




Note, however, that using densities for so few samples is a bit misleading (there are only 5 data points in the "1" purchase class). A histogram would technically be more appropriate:

In [18]:
px.histogram(pred_val_sample, x='lr_predict', color='purchase',
             nbins=20, opacity=0.5,
             barmode='overlay')



### ROC curves


Computing ROC curves is easy with the `roc_curve()` function from the sklearn.metrics module. Let's plot an ROC curve for the LS and logistic regression predictions on the same plot



In [19]:
# compute the ROC curve variables
lr_fpr_sample, lr_tpr_sample, lr_thresholds_sample = metrics.roc_curve(pred_val_sample['purchase'], pred_val_sample['lr_predict'])


In [20]:
lr_thresholds_sample

array([       inf, 0.88753294, 0.78794426, 0.59557479, 0.52992327,
       0.51924237, 0.39731082, 0.3123021 , 0.03613003])

In [21]:
roc_lr_sample = pd.DataFrame({
    'False Positive Rate': lr_fpr_sample,
    'True Positive Rate': lr_tpr_sample,
    'Model': 'Logistic Regression'
}, index=lr_thresholds_sample)


roc_sample_df = pd.concat([roc_lr_sample])


px.line(roc_sample_df, y='True Positive Rate', x='False Positive Rate',
        color='Model',
        width=700, height=500
)


The AUC of each plot can be computed using the `roc_auc_score()` function from the sklearn.metrics module:

In [None]:
# Logistic regression
lr_auc_sample = metrics.roc_auc_score(pred_val_sample['purchase'], pred_val_sample['lr_predict'])
print('Logistic regression AUC:', lr_auc_sample.round(3))

Logistic regression AUC: 0.853


#**Using CV for Model Performance**

We can use CV to check how our model does on the entire validation data set.


In [None]:
from sklearn.model_selection import cross_val_score
X = shopping_val_preprocessed.drop(columns='purchase')
y = shopping_val_preprocessed['purchase']
# This does stratified Kfolds for us...
cross_val_score(lr_all, X, y, cv=5, scoring='roc_auc')

array([0.89145716, 0.82169849, 0.86223813, 0.86800583, 0.81129973])

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Use the shuffle and random state if want data shuffled before splitting
#skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skfolds = StratifiedKFold(n_splits=5)
i = 1
for train_index, test_index in skfolds.split(X, y):
    clone_lr = clone(lr_all)
    X_train_folds = X.iloc[train_index]
    y_train_folds = y.iloc[train_index]
    X_test_fold = X.iloc[test_index]
    print(test_index)
    clone_lr.fit(X_train_folds, y_train_folds)
    y_pred = clone_lr.predict(X_test_fold)
    X_test_fold = X.iloc[test_index]

    clone_lr.fit(X_train_folds, y_train_folds)
    y_pred = clone_lr.predict(X_test_fold)
    auc_sample = metrics.roc_auc_score(y.iloc[test_index], y_pred)
    print('Fold: ', i)
    print('AUC: ', auc_sample)
    print('Accuracy: ', metrics.accuracy_score(y.iloc[test_index], y_pred))
    i += 1

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 24