Online Shoppers Purchasing Intention

- Preliminary exploratory data analysis:

Demonstrate that the dataset can be read from the web into R 

In [3]:
# Install formattable only when it is not already available, so re-running
# the notebook does not reinstall the package every time.
if (!requireNamespace("formattable", quietly = TRUE)) {
  install.packages("formattable")
}

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [16]:
#load packages
library(tidyverse)
library(tidymodels)
library(formattable)
library(knitr)
library(caret)

In [8]:
# Read the shoppers dataset from the project's data/ directory.
shoppers <- read_csv(file = "data/online_shoppers_intention.csv")

Parsed with column specification:
cols(
  Administrative = [32mcol_double()[39m,
  Administrative_Duration = [32mcol_double()[39m,
  Informational = [32mcol_double()[39m,
  Informational_Duration = [32mcol_double()[39m,
  ProductRelated = [32mcol_double()[39m,
  ProductRelated_Duration = [32mcol_double()[39m,
  BounceRates = [32mcol_double()[39m,
  ExitRates = [32mcol_double()[39m,
  PageValues = [32mcol_double()[39m,
  SpecialDay = [32mcol_double()[39m,
  Month = [31mcol_character()[39m,
  OperatingSystems = [32mcol_double()[39m,
  Browser = [32mcol_double()[39m,
  Region = [32mcol_double()[39m,
  TrafficType = [32mcol_double()[39m,
  VisitorType = [31mcol_character()[39m,
  Weekend = [33mcol_logical()[39m,
  Revenue = [33mcol_logical()[39m
)



In [None]:
# Three criteria for tidy data:
# - Each row is a single observation
# - Each column is a single variable
# - Each value has its own cell

# Revenue: whether the visit ended in a purchase
# Weekend: whether the visit took place on a weekend

In [9]:
# Revenue is read as a logical column; it is the classification target, so
# store it as a factor.
shoppers <- mutate(shoppers, Revenue = as_factor(Revenue))
head(shoppers)

# Seed the RNG before the random split below.
set.seed(1)

# 75% of the rows go to training and 25% to testing, stratified on Revenue.
shoppers_split <- initial_split(shoppers, prop = 0.75, strata = Revenue)
shoppers_train <- shoppers_split %>% training()
shoppers_test <- shoppers_split %>% testing()

Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<lgl>,<fct>
0,0,0,0,1,0.0,0.2,0.2,0,0,Feb,1,1,1,1,Returning_Visitor,False,False
0,0,0,0,2,64.0,0.0,0.1,0,0,Feb,2,2,1,2,Returning_Visitor,False,False
0,0,0,0,1,0.0,0.2,0.2,0,0,Feb,4,1,9,3,Returning_Visitor,False,False
0,0,0,0,2,2.666667,0.05,0.14,0,0,Feb,3,2,2,4,Returning_Visitor,False,False
0,0,0,0,10,627.5,0.02,0.05,0,0,Feb,3,3,1,4,Returning_Visitor,True,False
0,0,0,0,19,154.216667,0.01578947,0.0245614,0,0,Feb,2,2,1,3,Returning_Visitor,False,False


In [10]:
# Numerical features used in the analysis model.
# OperatingSystems, Browser, Region and TrafficType are stored as numbers but
# are treated as categorical features in this analysis, so drop them here.
numeric_shoppers <- shoppers_train %>%
  select(where(is.numeric)) %>%
  select(-c(OperatingSystems, Browser, Region, TrafficType))

# Number of training observations (the same for every column).
num_shoppers_count <- nrow(numeric_shoppers)

# Per-column mean, minimum and maximum of the predictor variables.
shoppers_mean <- map_df(numeric_shoppers, mean, na.rm = TRUE)
shoppers_min <- map_df(numeric_shoppers, min, na.rm = TRUE)
shoppers_max <- map_df(numeric_shoppers, max, na.rm = TRUE)

# Number of missing values per predictor. is.na() alone is enough:
# is.null() on the whole data frame is a single FALSE and never adds anything.
num_missing_data1 <- colSums(is.na(numeric_shoppers))

# Make a data frame consisting of the summary statistics, one row per
# statistic. Convert to a plain data.frame first because setting row names
# on a tibble is deprecated.
t1 <- as.data.frame(
  rbind(
    num_shoppers_count,
    shoppers_mean,
    shoppers_min,
    shoppers_max,
    num_missing_data1
  )
)
rownames(t1) <- c("count", "mean", "min", "max", "# missing data")
numeric_table <- formattable(t1)
numeric_table

“Setting row names on a tibble is deprecated.”


Unnamed: 0_level_0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
count,9248.0,9248.0,9248.0,9248.0,9248.0,9248.0,9248.0,9248.0,9248.0,9248.0
mean,2.286873,79.62898,0.5009732,34.26938,31.50692,1187.485,0.02200924,0.04310152,5.974699,0.0620026
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,27.0,2657.31806,16.0,2256.91667,686.0,29970.466,0.2,0.2,361.763742,1.0
# missing data,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Table 1 : Numerical Features Used in the Analysis

In [11]:
# Make a plain data.frame copy of the training set.
shoppers_train_copy <- data.frame(shoppers_train)

# Categorical features used in the analysis model, each converted to a factor.
cat_shoppers <- shoppers_train_copy %>%
  select(
    OperatingSystems, Browser, Region, TrafficType,
    VisitorType, Weekend, Month, Revenue
  ) %>%
  mutate(across(everything(), as_factor))

# Number of distinct levels (categories) of each feature.
cat_shoppers_count <- map_df(cat_shoppers, nlevels)

# Number of missing values in each column. is.na() alone is enough:
# is.null() on the whole data frame is a single FALSE and never adds anything.
num_missing_data <- colSums(is.na(cat_shoppers))

# Stack both summaries into one table. Convert to a plain data.frame first
# because setting row names on a tibble is deprecated.
t2 <- as.data.frame(rbind(cat_shoppers_count, num_missing_data))
rownames(t2) <- c("count", "# missing data")
cat_table <- formattable(t2)
cat_table

“Setting row names on a tibble is deprecated.”


Unnamed: 0_level_0,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Month,Revenue
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
count,8,13,9,20,3,2,10,2
# missing data,0,0,0,0,0,0,0,0


Table 2 : Categorical Features Used in the Analysis

In [None]:
# Data frame consisting of the numerical features plus the Revenue target.
# Adding the column by name avoids dropping a helper column by position,
# which would silently remove the wrong column if the predictors change.
numeric_and_rev <- numeric_shoppers %>%
  mutate(Revenue = shoppers_train$Revenue)

# Standardization recipe: scale, then center, every predictor.
shoppers_recipe <- recipe(Revenue ~ ., data = numeric_and_rev) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# K-nearest-neighbours classifier; the number of neighbours is left to tune.
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# 5-fold cross-validation folds, stratified on Revenue.
shoppers_vfold <- vfold_cv(numeric_and_rev, v = 5, strata = Revenue)

In [None]:
# Tune the knn model: combine the recipe and the model spec in a workflow.
shoppers_wkflw <- workflow() %>%
  add_recipe(shoppers_recipe) %>%
  add_model(knn_spec)

# Candidate numbers of neighbours to evaluate.
gridvals <- tibble(neighbors = seq(1, 200))

# Cross-validated metrics for every candidate value.
shoppers_results <- shoppers_wkflw %>%
  tune_grid(resamples = shoppers_vfold, grid = gridvals) %>%
  collect_metrics()

# Pick the best number of neighbours. This is a classification model, so
# there is no "rmse" metric to filter on; use accuracy, where higher is
# better. The name k_min is kept so later cells that refer to it still work.
k_min <- shoppers_results %>%
  filter(.metric == "accuracy") %>%
  filter(mean == max(mean))
k_min

# Show the modelling data; it was already built before the recipe was
# created, so it does not need to be rebuilt here.
numeric_and_rev

In [19]:
# Train an rpart model on all training features with caret, then list the
# variable importance scores to see which features matter most for Revenue.
tr <- train(
  Revenue ~ .,
  data = shoppers_train,
  method = "rpart"
)
varImp(tr)

rpart variable importance

  only 20 most important variables shown (out of 29)

                             Overall
PageValues                   100.000
BounceRates                   16.825
ProductRelated                15.187
ExitRates                     13.704
ProductRelated_Duration       10.585
VisitorTypeReturning_Visitor   6.439
VisitorTypeNew_Visitor         6.055
MonthFeb                       0.000
MonthNov                       0.000
Browser                        0.000
MonthOct                       0.000
Administrative                 0.000
VisitorTypeOther               0.000
MonthDec                       0.000
MonthMar                       0.000
WeekendTRUE                    0.000
Informational_Duration         0.000
MonthJul                       0.000
Administrative_Duration        0.000
Informational                  0.000