# Mod 5 Online Shoppers Intent Project 

# Contents:
1. 
2.
3.
4.
5.
6.
7.


## 1. Business Case: 

## 2. Import Data

In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from textwrap import wrap
%matplotlib inline


from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import export_graphviz

from IPython.display import Image  
from pydotplus import graph_from_dot_data

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


In [2]:
df = pd.read_csv('online_shoppers_intention.csv')
df.shape

(12330, 18)

## 3. Data Scrubbing

In [3]:
df.isnull().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

In [4]:
# Check for Duplicated values
df.duplicated().sum()
# 125 values duplicated
# Will leave them in as they could be duplicates by chance and 
# they are very small portion of our data


125

# 4. Exploration

## 4.1 Data Set Information:

The dataset consists of feature vectors belonging to 12,330 sessions.
The dataset was formed so that each session
would belong to a different user in a 1-year period to avoid
any tendency to a specific campaign, special day, user
profile, or period.


### 4.2 Attribute Information:

The dataset consists of 10 numerical and 8 categorical attributes.
The 'Revenue' attribute can be used as the class label.

"Administrative", "Administrative Duration", "Informational", "Informational Duration", "Product Related" and "Product Related Duration" represent the number of different types of pages visited by the visitor in that session and total time spent in each of these page categories. The values of these features are derived from the URL information of the pages visited by the user and updated in real time when a user takes an action, e.g. moving from one page to another. The "Bounce Rate", "Exit Rate" and "Page Value" features represent the metrics measured by "Google Analytics" for each page in the e-commerce site. The value of "Bounce Rate" feature for a web page refers to the percentage of visitors who enter the site from that page and then leave ("bounce") without triggering any other requests to the analytics server during that session. The value of "Exit Rate" feature for a specific web page is calculated as for all pageviews to the page, the percentage that were the last in the session. The "Page Value" feature represents the average value for a web page that a user visited before completing an e-commerce transaction. The "Special Day" feature indicates the closeness of the site visiting time to a specific special day (e.g. Mother’s Day, Valentine's Day) in which the sessions are more likely to be finalized with transaction. The value of this attribute is determined by considering the dynamics of e-commerce such as the duration between the order date and delivery date. For example, for Valentina’s day, this value takes a nonzero value between February 2 and February 12, zero before and after this date unless it is close to another special day, and its maximum value of 1 on February 8. The dataset also includes operating system, browser, region, traffic type, visitor type as returning or new visitor, a Boolean value indicating whether the date of the visit is weekend, and month of the year.



In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
Administrative             12330 non-null int64
Administrative_Duration    12330 non-null float64
Informational              12330 non-null int64
Informational_Duration     12330 non-null float64
ProductRelated             12330 non-null int64
ProductRelated_Duration    12330 non-null float64
BounceRates                12330 non-null float64
ExitRates                  12330 non-null float64
PageValues                 12330 non-null float64
SpecialDay                 12330 non-null float64
Month                      12330 non-null object
OperatingSystems           12330 non-null int64
Browser                    12330 non-null int64
Region                     12330 non-null int64
TrafficType                12330 non-null int64
VisitorType                12330 non-null object
Weekend                    12330 non-null bool
Revenue                    12330 non-null bool
dtypes: bool(

## 4.3 Screen for Categorical variables:


In [6]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [7]:
# Here we are checking the ratio of unique values to the total number count for each column
df.nunique()/df.count()
# proportions of nuniques to total counts < 0.05 suggest categorical variables

Administrative             0.002190
Administrative_Duration    0.270479
Informational              0.001379
Informational_Duration     0.102028
ProductRelated             0.025223
ProductRelated_Duration    0.774615
BounceRates                0.151825
ExitRates                  0.387429
PageValues                 0.219303
SpecialDay                 0.000487
Month                      0.000811
OperatingSystems           0.000649
Browser                    0.001054
Region                     0.000730
TrafficType                0.001622
VisitorType                0.000243
Weekend                    0.000162
Revenue                    0.000162
dtype: float64

In [8]:
#Loop to visually inspect value counts for all variables
# for col in df.columns:
#     print(f'This is {col} value counts: \n{df[col].value_counts()}.\n')

In [9]:
# Subset categorical and continuous features from dataframe for visualisations:
df_cont = df[['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration', 'ExitRates', 'PageValues', 'BounceRates']]
df_cat = df.drop(df_cont.columns, axis=1)

# 5. Baseline Model

## 5.1 Decision Tree

In [10]:
# Feature and Target subsetting:
X = df.drop(columns=['Revenue'], axis =1)
y = df['Revenue']

In [11]:
#train_test_split and create a global seed
seed=42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = seed)

In [12]:
# One-hot encode the categorical variables in the training data and show the resulting DataFrame with proper column names
ohe = OneHotEncoder()
#subset continuous and categorical variables:
train_cont = X_train[['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration', 'ExitRates', 'PageValues', 'BounceRates']]
train_cat = X_train.drop(df_cont.columns, axis=1)


#Fit transform the variables and place them in a dataframe
encoded_vars = ohe.fit_transform(train_cat).toarray()
ohe_df = pd.DataFrame(encoded_vars, columns=ohe.get_feature_names(train_cat.columns))
ohe_df.shape


(9864, 401)

In [13]:
# reset index and make a copy of continuous dataframe.
X_train_ohe = train_cont.copy()
X_train_ohe.reset_index(drop=True, inplace=True)

# Concat into continuous and encoded categoricals into one training dataset:
X_train_ohe = pd.concat([X_train_ohe, ohe_df], axis=1)
X_train_ohe.shape

(9864, 407)

## 5.2 Decision Tree fit

In [14]:
# Decision tree classifier fit 
clf = DecisionTreeClassifier(random_state = seed, criterion='entropy')

clf.fit(X_train_ohe, y_train)
#KFold cross validator instantiated:
cv = KFold(n_splits=5, random_state=seed)

# Use Crossvalidation to obtain Performance metrics: F1 Score and Accuracy
F1_score = round(np.mean(cross_val_score(clf, X_train_ohe, y_train, cv=cv, scoring='f1')), 4)
Acc_score = round(np.mean(cross_val_score(clf, X_train_ohe, y_train, cv=cv, scoring='accuracy')), 4)
roc_AUC_score = round(np.mean(cross_val_score(clf, X_train_ohe, y_train, cv=cv, scoring='roc_auc')), 4)
#Print F1 and Accuracy and ROC_AUC score for crossvalidation baseline decision tree
F1_score, Acc_score, roc_AUC_score

(0.5674, 0.8692, 0.7445)

In [17]:
## 5.4 Baseline Model - Plot the Decision Tree

# Create DOT Data
# dot_data = export_graphviz(clf, out_file=None,
#                            feature_names=X_train_ohe.columns,
#                            class_names=np.unique(y).astype('str'),
#                            filled=True, rounded=True, special_characters=True)

# Draw Graph 
# graph = graph_from_dot_data(dot_data) 

# Show graph
# Image(graph.create_png())


# 6.0 Exploring Potential Improvements on Baseline Model



## 6.1 Decision tree with Gini impurity

In [18]:
# Gini impurity Decision tree classifier fit 
clf1 = DecisionTreeClassifier(random_state = seed, criterion='gini')
clf1.fit(X_train_ohe, y_train)

# Use Crossvalidation to obtain Performance metrics: F1 Score and Accuracy
F1_score = round(np.mean(cross_val_score(clf1, X_train_ohe, y_train, cv=cv, scoring='f1')), 4)
Acc_score = round(np.mean(cross_val_score(clf1, X_train_ohe, y_train, cv=cv, scoring='accuracy')), 4)
roc_AUC_score = round(np.mean(cross_val_score(clf1, X_train_ohe, y_train, cv=cv, scoring='roc_auc')), 4)

#Print F1 and Accuracy and ROC_AUC score for crossvalidation baseline decision tree
F1_score, Acc_score, roc_AUC_score

(0.5622, 0.8688, 0.7403)

## 6.2 Grid Search CV for Decision tree with Entropy impurity

In [19]:
# Entropy impurity performed better. Let's implement hyperparameter tuning with combinatoric grid searching.

#initial Param grid:

dt_param_grid = {
    'criterion': ['entropy'],
    'max_depth': [None, 2, 3, 4, 5, 6],
    'min_samples_split': [10, 500, 1000]
}


#insantiate the GridSearchCV
dt_grid_search = GridSearchCV(clf, dt_param_grid, cv=3, return_train_score=True)

# Fit to the data
dt_grid_search.fit(X_train_ohe, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='entropy',
                                              max_depth=None, max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['entropy'],
                         'max_depth': [None, 2, 3, 4, 5, 6],
                       

In [22]:
# Mean training score
dt_gs_training_score = np.mean(dt_grid_search.cv_results_['mean_train_score'])


print(f"Mean Training Score: {dt_gs_training_score :.2%}")

# Print best parameter combination found during grid search:
print("Best Parameter Combination Found During Grid Search:")
dt_grid_search.best_params_

Mean Training Score: 91.45%
Best Parameter Combination Found During Grid Search:


{'criterion': 'entropy',
 'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [37]:
# Primary Param grid prediction scores: (Accuracy, Precision, Recall and F1)
predictions = dt_grid_search.best_estimator_.predict(X_train_ohe)
accuracy_score1 = accuracy_score(y_train, predictions)
recall_score1 = recall_score(y_train, predictions)
precision_score1 = precision_score(y_train, predictions)
F1_score1 = f1_score(y_train, predictions)
print(f'Recall Score:{recall_score1}, Precision Score:{precision_score1}, Accuracy Score{accuracy_score1}, F1 Score {F1_score1}')

Recall Score:0.6239144956579826, Precision Score:0.6949404761904762, Accuracy Score0.9013584752635847, F1 Score 0.6575149595212954


In [30]:
# Secondary Param_Grid:

dt_param_grid1 = {
    'criterion': ['entropy'],
    'max_depth': [3],
    'min_samples_split': [2,3,5,7,9]
}
dt_grid_search1 = GridSearchCV(clf, dt_param_grid1, cv=3, return_train_score=True)

# Fit to the data
dt_grid_search1.fit(X_train_ohe, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='entropy',
                                              max_depth=None, max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['entropy'], 'max_depth': [3],
                         'min_samples_split': [2, 3, 5, 7, 9]},
  

In [31]:
# Secondary Param Grid Mean train score 
dt_gs_training_score1 = np.mean(dt_grid_search1.cv_results_['mean_train_score'])

print(f"Mean Training Score: {dt_gs_training_score1 :.2%}")

# Secondary Param grid best combinations:
print("Best Parameter Combination Found During Grid Search:")
dt_grid_search1.best_params_

Mean Training Score: 90.04%
Best Parameter Combination Found During Grid Search:


{'criterion': 'entropy', 'max_depth': 3, 'min_samples_split': 2}

In [32]:
#Predictions for secondary parameter grid and its related Recall, precision accuracy and F1 scores:
predictions = dt_grid_search1.best_estimator_.predict(X_train_ohe)
accuracy_score2 = accuracy_score(y_train, predictions)
recall_score2 = recall_score(y_train, predictions)
precision_score2 = precision_score(y_train, predictions)
F1_score2 = f1_score(y_train, predictions)
print(f'Recall Score:{recall_score2}, Precision Score:{precision_score2}, Accuracy Score{accuracy_score2}, F1 Score {F1_score2}')

Recall Score:0.6239144956579826, Precision Score:0.6949404761904762, Accuracy Score0.9013584752635847, F1 Score 0.6575149595212954


In [None]:
# #Roc plot:
# scores(model, X_train_ohe,y_train)
# roc_plot(model,X_train_ohe,y_train)

# 6.3 Ensemble Methods - Random Forests

In [39]:
# Instantiate Random Forest and cross validate fit with the training data:
rf_clf = RandomForestClassifier()
mean_rf_cv_score = np.mean(cross_val_score(rf_clf, X_train_ohe, y_train, cv=3))

#print resulting mean score.
print(f"Mean Cross Validation Score for Random Forest Classifier: {mean_rf_cv_score : .2%}")




Mean Cross Validation Score for Random Forest Classifier:  88.39%


In [None]:
#Random Forest Param Grid 1:

# Create the random grid
Random_grid = {'bootstrap': [True, False],
             'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
             'max_features': ['auto', 'sqrt'],
             'min_samples_leaf': [1, 2, 4],
             'min_samples_split': [2, 5, 10],
             'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
              }