# Mod 5 Online Shoppers Intent Project 

# Contents:
1. 
2.
3.
4.
5.
6.
7.


## 1. Business Case: 

## 2. Import Data

In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from textwrap import wrap
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz
from sklearn.preprocessing import OneHotEncoder
from IPython.display import Image  
from sklearn.tree import export_graphviz
from pydotplus import graph_from_dot_data

In [3]:
df = pd.read_csv('online_shoppers_intention.csv')
df.shape

(12330, 18)

## 3. Data Scrubbing

In [4]:
df.isnull().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

In [5]:
# Check for Duplicated values
df.duplicated().sum()
# 125 values duplicated
# Will leave them in as they could be duplicates by chance and 
# they are very small portion of our data


125

# 4. Exploration

## 4.1 Data Set Information:

The dataset consists of feature vectors belonging to 12,330 sessions.
The dataset was formed so that each session
would belong to a different user in a 1-year period to avoid
any tendency to a specific campaign, special day, user
profile, or period.


### 4.2 Attribute Information:

The dataset consists of 10 numerical and 8 categorical attributes.
The 'Revenue' attribute can be used as the class label.

"Administrative", "Administrative Duration", "Informational", "Informational Duration", "Product Related" and "Product Related Duration" represent the number of different types of pages visited by the visitor in that session and total time spent in each of these page categories. The values of these features are derived from the URL information of the pages visited by the user and updated in real time when a user takes an action, e.g. moving from one page to another. The "Bounce Rate", "Exit Rate" and "Page Value" features represent the metrics measured by "Google Analytics" for each page in the e-commerce site. The value of "Bounce Rate" feature for a web page refers to the percentage of visitors who enter the site from that page and then leave ("bounce") without triggering any other requests to the analytics server during that session. The value of "Exit Rate" feature for a specific web page is calculated as for all pageviews to the page, the percentage that were the last in the session. The "Page Value" feature represents the average value for a web page that a user visited before completing an e-commerce transaction. The "Special Day" feature indicates the closeness of the site visiting time to a specific special day (e.g. Mother’s Day, Valentine's Day) in which the sessions are more likely to be finalized with transaction. The value of this attribute is determined by considering the dynamics of e-commerce such as the duration between the order date and delivery date. For example, for Valentina’s day, this value takes a nonzero value between February 2 and February 12, zero before and after this date unless it is close to another special day, and its maximum value of 1 on February 8. The dataset also includes operating system, browser, region, traffic type, visitor type as returning or new visitor, a Boolean value indicating whether the date of the visit is weekend, and month of the year.



In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
Administrative             12330 non-null int64
Administrative_Duration    12330 non-null float64
Informational              12330 non-null int64
Informational_Duration     12330 non-null float64
ProductRelated             12330 non-null int64
ProductRelated_Duration    12330 non-null float64
BounceRates                12330 non-null float64
ExitRates                  12330 non-null float64
PageValues                 12330 non-null float64
SpecialDay                 12330 non-null float64
Month                      12330 non-null object
OperatingSystems           12330 non-null int64
Browser                    12330 non-null int64
Region                     12330 non-null int64
TrafficType                12330 non-null int64
VisitorType                12330 non-null object
Weekend                    12330 non-null bool
Revenue                    12330 non-null bool
dtypes: bool(

## 4.3 Screen for Categorical variables:


In [7]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [8]:
# Here we are checking the ratio of unique values to the total number count for each column
df.nunique()/df.count()
# proportions of nuniques to total counts < 0.05 suggest categorical variables

Administrative             0.002190
Administrative_Duration    0.270479
Informational              0.001379
Informational_Duration     0.102028
ProductRelated             0.025223
ProductRelated_Duration    0.774615
BounceRates                0.151825
ExitRates                  0.387429
PageValues                 0.219303
SpecialDay                 0.000487
Month                      0.000811
OperatingSystems           0.000649
Browser                    0.001054
Region                     0.000730
TrafficType                0.001622
VisitorType                0.000243
Weekend                    0.000162
Revenue                    0.000162
dtype: float64

In [9]:
#Loop to visually inspect value counts for all variables
for col in df.columns:
    print(f'This is {col} value counts: \n{df[col].value_counts()}.\n')

This is Administrative value counts: 
0     5768
1     1354
2     1114
3      915
4      765
5      575
6      432
7      338
8      287
9      225
10     153
11     105
12      86
13      56
14      44
15      38
16      24
17      16
18      12
19       6
24       4
22       4
23       3
20       2
21       2
26       1
27       1
Name: Administrative, dtype: int64.

This is Administrative_Duration value counts: 
0.000000      5903
4.000000        56
5.000000        53
7.000000        45
11.000000       42
              ... 
294.070513       1
90.875000        1
97.333333        1
53.166667        1
247.083333       1
Name: Administrative_Duration, Length: 3335, dtype: int64.

This is Informational value counts: 
0     9699
1     1041
2      728
3      380
4      222
5       99
6       78
7       36
9       15
8       14
10       7
12       5
14       2
11       1
13       1
24       1
16       1
Name: Informational, dtype: int64.

This is Informational_Duration value counts: 
0.0   

In [10]:
# Subset categorical and continuous features from dataframe for visualisations:
df_cont = df[['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration', 'ExitRates', 'PageValues', 'BounceRates']]
df_cat = df.drop(df_cont.columns, axis=1)

# 5. Baseline Model

## 5.1 Create X and y series and train test split Data

In [33]:
X = df.drop(columns=['Revenue'], axis =1)
y = df['Revenue']

In [34]:
#Create a global 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = seed)

## 5.2 Baseline Model - One Hot Encoding for Categorical Features

In [13]:
# One-hot encode the categorical variables in the training data and show the resulting DataFrame with proper column names
ohe = OneHotEncoder()
#subset continuous and categorical variables:
train_cont = X_train[['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration', 'ExitRates', 'PageValues', 'BounceRates']]
train_cat = X_train.drop(df_cont.columns, axis=1)


#Fit transform the variables and place them in a dataframe
encoded_vars = ohe.fit_transform(train_cat).toarray()
ohe_df = pd.DataFrame(encoded_vars, columns=ohe.get_feature_names(train_cat.columns))
ohe_df.shape


(9864, 401)

In [14]:
# reset index and make a copy of continuous dataframe.
X_train_ohe = train_cont.copy()
X_train_ohe.reset_index(drop=True, inplace=True)

# Concat into continuous and encoded categoricals into one training dataset:
X_train_ohe = pd.concat([X_train_ohe, ohe_df], axis=1)
X_train_ohe.shape

(9864, 407)

## 5.3 Baseline Model - fit 

In [15]:
clf = DecisionTreeClassifier(criterion='entropy')

clf.fit(X_train_ohe, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

## 5.4 Baseline Model - Plot the Decision Tree

In [16]:
# Create DOT Data
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=X_train_ohe.columns,
                           class_names=np.unique(y).astype('str'),
                           filled=True, rounded=True, special_characters=True)

# Draw Graph 
# graph = graph_from_dot_data(dot_data) 

# Show graph
# Image(graph.create_png())


## 5.5 Baseline Model - Evaluation of Predictive Performance

In [None]:
! git branch


* [32mAlek[m
  master[m
