In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1> Online Shoppers Intention Dataset </h1>
<img src="img/online_shoppers.jpg" width=900 height=500 />


# Summary
---
 - [Import libs](#Libs)
 - [Functions](#Functions)
 - [The subject](#TheProblem)
 - [Read Data](#Import)
 - [Let's look at the Data!](#Explore)
 - [Transform Data](#Convert)
    * [Test Dependency](#Dependency)
    * [Building New Features](#NewFeatures)
    
- [What variables are more important?](#ThielU)
- [Prepare Data to ML](#ML)
- [Modeling](#ML2)

<a id='Libs'></a>
<h1> Import Libs </h1>
<hr>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

##  Math & Statistics
import math
from collections import Counter
import scipy.stats as ss
from scipy.stats import chi2, \
                        chi2_contingency

## Machine Learning
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, \
                             AdaBoostClassifier, \
                             GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

#Metrics
from sklearn.metrics import accuracy_score, \
                            precision_score, \
                            recall_score, \
                            confusion_matrix

<a id='Functions'></a>
<h1> Functions </h1>
<hr>

In [None]:
# Function that execute the qui-square test
def test_dependency(alpha, cont):
    conf = 1-alpha
    X2, p, dof, expected = chi2_contingency(cont)
    critical = chi2.ppf(conf, dof)
    if abs(X2) >= critical:
        print('dependent (reject H0)')
    else:
        print('independent (fail to reject H0)')


def conditional_entropy(x,y):
    """Return the conditional entropy of ``x`` given ``y`` (natural log).

    ``x`` and ``y`` are paired sequences of hashable observations. The result
    is the sum over every observed pair ``(x, y)`` of
    ``p(x, y) * log(p(y) / p(x, y))``, with probabilities estimated from
    observed frequencies.
    """
    given_counts = Counter(y)
    pair_counts = Counter(zip(x, y))
    n = sum(given_counts.values())
    return sum(
        (count / n) * math.log((given_counts[pair[1]] / n) / (count / n))
        for pair, count in pair_counts.items()
    )

# Uncertainty coefficient, also known as Theil's U
def theil_u(x,y):
    """Return Theil's U of ``x`` given ``y``: ``(H(x) - H(x|y)) / H(x)``.

    ``H(x|y)`` comes from ``conditional_entropy`` and ``H(x)`` from
    ``scipy.stats.entropy`` applied to the observed frequencies of ``x``.
    When ``H(x)`` is exactly 0 the function returns 1.
    """
    cond_entropy = conditional_entropy(x, y)
    counts = Counter(x)
    n = sum(counts.values())
    marginal_entropy = ss.entropy([count / n for count in counts.values()])
    # A zero-entropy x cannot be normalised; it is reported as fully explained.
    if marginal_entropy == 0:
        return 1
    return (marginal_entropy - cond_entropy) / marginal_entropy

<a id='TheProblem'></a>

## **About Data**

Each data entry (row) refers to one online session. Sessions may or may not belong to the same user, and all of them fall within a one-year period. In total there are 12,330 sessions.

The data has a set of variables like the type of page visited, amount of time spent, proximity to special dates, Browser used, and also Google Analytics Metrics like "Bounce Rate", "Exit Rate", and "Page Value". 

For each entry, the column "Revenue" is marked as 1 if that session generated revenue.  

## **Some questions our stakeholders may ask in order to predict when a visitor is likely to buy, or to decide which strategies could increase the chance that a session generates revenue**
What was the behavior of sessions that resulted in purchases, compared with those that did not?

How much time did visitors spend on our site in revenue-generating sessions?

What kinds of pages did they access?

Are weekends, or the proximity of a special day, relevant to their decision?



<a id='Import'></a>
<h1> Read data </h1>
<hr>

In [None]:
# Load the sessions dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/online-shoppers-purchasing-intention-dataset/online_shoppers_intention.csv")

<a id='Explore'></a>
<h1> Let's look at the Data! </h1>
<hr>

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Dataset has no null values

In [None]:
# Concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics of the columns.
df.describe()

In [None]:
# Preview the first 4 rows.
df.head(4)

In [None]:
# Class balance of the target: number of sessions per Revenue value.
fig, axs = plt.subplots(figsize=(5,4))
g = sns.countplot(x='Revenue', data=df, ax=axs)
# Title the figure so it can be read on its own; the trailing `;` hides the Text repr.
axs.set_title('Sessions by Revenue');

#### The data is unbalanced, with roughly 85% (10422) of Non-Revenue vs 15% (1908) of Revenue sessions

In [None]:
# Number of sessions per month, split by the Revenue flag.
fig, axs = plt.subplots(figsize=(5,4))
g = sns.countplot(x='Month', data=df, ax=axs, hue='Revenue')
# Title the figure so it can be read on its own; the trailing `;` hides the Text repr.
axs.set_title('Sessions per Month, split by Revenue');

#### The months with the most online sessions are March, May, November and December. This may be caused by the important holidays and special days in these months, such as:
- **Women's Day (Mar)**
- **Mother's Day (May)**
- **Black Friday and Thanksgiving (Nov)**
- **Christmas and New Year's Eve (Dec)**

#### Although there is a discrepancy between the number of Revenue and Non-Revenue sessions, their distributions are similar, showing that most purchases were made in these months

In [None]:
# Number of sessions per SpecialDay value, split by the Revenue flag.
fig, axs = plt.subplots(figsize=(5,4))
g = sns.countplot(x='SpecialDay', data=df, ax=axs, hue='Revenue')
# Title the figure so it can be read on its own; the trailing `;` hides the Text repr.
axs.set_title('Sessions per SpecialDay value, split by Revenue');

#### If we take a look at the "SpecialDay" column, we can see that most of the revenue-generating sessions close to a special day were made 13 days or 1 day after it, which is consistent with the dynamics of e-commerce, such as the gap between the order date and the delivery date.

In [None]:
# Sum of the SpecialDay and Revenue columns within each month.
df[['SpecialDay','Month','Revenue']].groupby('Month').sum()

#### The number of sessions close to special days was greatest in May, and nearly 51% of these sessions generated revenue

In [None]:
# Contingency table of Weekend against Revenue; margins=True adds the row/column totals.
pd.crosstab(df['Weekend'],df['Revenue'], margins=True)

#### As for the day of the week, weekend sessions generate revenue about 2.5 percentage points more often (17.40% of weekend sessions against 14.90% of weekday sessions)

## Taking a look at Google Analytics metrics

In [None]:
# Mean BounceRates for each Revenue value.
df[['BounceRates','Revenue']].groupby('Revenue').mean()

#### The bounce rate measures the percentage of visitors who enter the site and then leave. A high bounce rate on a landing page such as your home page is a bad sign: it means that visitors arrive and leave without taking any action.
#### Bounce Rate is greater in non-revenue sessions (25% for non-revenue against 5% for revenue). Sessions in which visitors stay on the site instead of leaving right away are more likely to generate revenue

In [None]:
# Mean ExitRates for each Revenue value.
df[['ExitRates','Revenue']].groupby('Revenue').mean()

#### Exit Rate refers to the percentage of visitors to a page who leave the website from that page.
#### For revenue sessions, nearly 20% of the sessions started and ended on the same page; for non-revenue sessions, 50%. In other words, engaged visitors, who see more than one page of your site, have a higher chance of generating revenue


In [None]:
# Mean PageValues for each Revenue value.
df[['PageValues','Revenue']].groupby('Revenue').mean()

#### Page Value is the average value for a page that a user visited before landing on the goal page or completing an Ecommerce transaction. For revenued sessions, the average Page Value is 27.00 and for non-revenued is about 2.00

<a id='Convert'></a>
<h1> Transform Data </h1>
<hr>

<h4> In order to use Machine Learning Algorithms, we need to convert categorical-nominal to numerical data</h4>

In [None]:
# Encode the categorical and boolean columns as integers for the ML algorithms.
#
# `Series.map` turns every value that is not a key of the mapping into NaN, so
# applying it a second time to an already-encoded column would wipe the column
# out. Each column is therefore encoded only while it still holds its raw
# values, which makes this cell safe to run more than once.
month_codes = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'June': 6,
               'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
visitor_codes = {'Returning_Visitor': 0, 'New_Visitor': 1, 'Other': 2}
flag_codes = {True: 1, False: 0}

for column, codes in [('Month', month_codes), ('VisitorType', visitor_codes)]:
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)

for column in ['Weekend', 'Revenue']:
    # pandas treats bool columns as numeric, so check for integers instead.
    if not pd.api.types.is_integer_dtype(df[column]):
        df[column] = df[column].map(flag_codes)

# A label missing from the mappings would silently become NaN; fail loudly instead.
encoded_columns = ['Month', 'VisitorType', 'Weekend', 'Revenue']
assert df[encoded_columns].notna().all().all(), \
    "unmapped category found while encoding " + ", ".join(encoded_columns)

#### As we have categorical variables, we can check whether Revenue depends on each of them. This can be done using the chi-square test

<a id='Dependency'></a>
<h2> Test dependency of Revenue and categorical variables </h2>

In [None]:
# Chi-square test of Revenue against every categorical column, at the 5% level.
cat_var = [
    'SpecialDay', 'Month', 'OperatingSystems', 'Browser',
    'Region', 'TrafficType', 'VisitorType', 'Weekend',
]
for column in cat_var:
    contingency = pd.crosstab(df['Revenue'], df[column])
    # end=" " keeps the verdict printed by test_dependency on the same line.
    print(f"Revenue and {column} are:", end=" ")
    test_dependency(0.05, contingency)

#### The chi-square test shows that Region and Revenue are independent. This variable may therefore not be useful in ML algorithms

<a id='NewFeatures'></a>
<h2> Building New Features </h2>

In [None]:
# Totals across the three page types.
df['TotalPages'] = df['Administrative'] + df['Informational'] + df['ProductRelated']

df['TotalTime'] = df['Administrative_Duration'] + df['Informational_Duration'] + df['ProductRelated_Duration']

# Average time per page for each page type, 0 when no page of that type was visited.
# Zero page counts are masked to NaN before dividing: a plain division gives
# NaN for 0/0 but +inf for a positive duration over 0 pages, and `fillna(0)`
# does not catch inf (which would later break the scaler).
page_types = {'Adm': 'Administrative', 'Info': 'Informational', 'Pr': 'ProductRelated'}
for prefix, page_col in page_types.items():
    pages = df[page_col]
    duration = df[page_col + '_Duration']
    df[prefix + '_time_per_page'] = (duration / pages.where(pages != 0)).fillna(0)

<a id='ThielU'></a>
<h1> What variables are most important to explain Revenued sessions?</h1>
<hr>

Theil's U, or the Uncertainty Coefficient, is a measure of nominal association, sometimes written U(x|y). It is the fraction of the entropy of variable x that is explained by variable y. Theil's U lies in the range [0,1], where 0 means that feature y provides no information about feature x, and 1 means that feature y provides full information about feature x's value.

In [None]:
# Theil's U of Revenue given each column, U(Revenue | column), as a one-row table.
#
# The table is built directly as floats. Filling an object-dtype frame cell by
# cell and then calling `fillna(np.nan, inplace=True)` only yields numeric
# columns through pandas' implicit object downcasting, which is deprecated, and
# the heatmap needs numeric data.
#
# NOTE: a column whose values are (almost) all distinct drives the conditional
# entropy towards 0, so it scores close to 1 whether or not it is really
# informative. Read the scores of continuous columns with that in mind.
columns = df.columns
revenue = df['Revenue'].tolist()
theilu = pd.DataFrame(
    {col: [theil_u(revenue, df[col].tolist())] for col in columns},
    index=['Revenue'],
    dtype=float,
)
plt.figure(figsize=(20,1))
sns.heatmap(theilu,annot=True,fmt='.2f')
plt.show()

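#### It seems that TotalTime, Adm_time_per_page, and Pr_time_per_page are the most significant for Revenue. Note, however, that continuous columns with almost only distinct values get a Theil's U close to 1 by construction, so these scores should be interpreted with care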

<a id="ML"></a>
<h1>Prepare Data to ML</h1>
<hr>

In [None]:
# Features and target. Region is dropped along with the target column.
X = df.drop(columns=['Revenue','Region'])
y = np.array(df['Revenue'])

# Split into train/test datasets FIRST. Scaling and oversampling are fitted on
# the training part only; doing either before the split leaks test information
# into training (random oversampling duplicates minority rows, so copies of
# the same session would end up on both sides of the split and inflate every
# test metric). `stratify=y` keeps the class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=50, stratify=y)

# Normalizing data: learn mean/std on the training set, reuse them on the test set
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fixing class unbalancing on the training set only; the test set keeps the
# real class distribution. Seeded so that the resampling is reproducible.
ovs = RandomOverSampler(random_state=50)
X_train, y_train = ovs.fit_resample(X_train, y_train)

<a id="ML2"></a>
<h1>Selecting the Best Algorithm and Modeling</h1>
<hr>

In [None]:
# Candidate models. The randomised ones are seeded (same seed as the train/test
# split) so that the ranking below is reproducible across runs.
classifiers = [
    KNeighborsClassifier(8),
    SVC(),
    DecisionTreeClassifier(random_state=50),
    RandomForestClassifier(random_state=50),
    AdaBoostClassifier(random_state=50),
    GradientBoostingClassifier(random_state=50),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()]

log_cols = ["Classifier", "Accuracy"]
acc_dict = {}

# Search best model: fit each candidate on the training set and score it on the test set
for clf in classifiers:
    name = clf.__class__.__name__
    clf.fit(X_train, y_train)
    test_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    # Accumulate in case the same classifier class appears more than once.
    acc_dict[name] = acc_dict.get(name, 0) + acc

# Build the results table in one go; DataFrame.append no longer exists in pandas 2.x.
log = pd.DataFrame(list(acc_dict.items()), columns=log_cols)
log.sort_values(by='Accuracy', ascending=False)


In [None]:
# Fit a random forest on the training set and predict the test set.
# The seed (same value as the train/test split) makes the reported metrics reproducible.
model = RandomForestClassifier(random_state=50)
model.fit(X_train,y_train)
pred = model.predict(X_test)

In [None]:
# Metrics: accuracy, recall and precision of the test predictions, plus the confusion matrix
ac, rc, pr = (
    score(y_test, pred)
    for score in (accuracy_score, recall_score, precision_score)
)
cfm = confusion_matrix(y_test, pred)

In [None]:
# Confusion matrix shown as shares of all test samples (the four cells sum to 100%).
cfm_share = cfm / cfm.sum()
g = sns.heatmap(cfm_share, cmap='Blues', annot=True, fmt='.2%')
g.set_xlabel('Predicted')
g.set_ylabel('Class')

In [None]:
#### FN are 0.24% and FP are 3.25%. This means that the ML algorithm is good at predicting when a session will not generate revenue



In [None]:
# Recall on the test set.
rc

In [None]:
# Precision on the test set.
pr

## References:

https://en.wikipedia.org/wiki/Uncertainty_coefficient,

https://www.statisticshowto.com/uncertainty-coefficient/

https://www.kaggle.com/shakedzy/alone-in-the-woods-using-theil-s-u-for-survival