In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


## Data Cleaning

In [2]:
# Read the online-shoppers CSV directly from the UCI archive and preview it.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00468/online_shoppers_intention.csv"
)
df = pd.read_csv(url)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
# Report the frame's dimensions: rows first, then columns.
# Note: the column count includes the Revenue target column.
print(f"The data has {df.shape[0]} data points")
print(f"The data has {df.shape[1]} features")

The data has 12330 data points
The data has 18 features


In [4]:
# Drop the 'Browser' column.
# Rebinding `df` (rather than inplace=True) together with errors="ignore"
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError because the column is already gone.
df = df.drop(columns=["Browser"], errors="ignore")
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,1,4,Returning_Visitor,True,False


In [5]:
# Re-check the dimensions now that a column has been dropped.
print(f"The data has {df.shape[0]} data points")
print(f"The data has {df.shape[1]} features")

The data has 12330 data points
The data has 17 features


In [6]:
# Summary statistics (count, mean, std, quartiles, ...) with default settings.
df.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,9.0,20.0


In [7]:
# Summarise the categorical (object-dtype) features
df.describe(include=["object"])

Unnamed: 0,Month,VisitorType
count,12330,12330
unique,10,3
top,May,Returning_Visitor
freq,3364,10551


In [8]:
# Total number of missing cells in the whole frame (0 means no nulls)
df.isna().sum().sum()

0

## Feature Extraction

In [9]:
def ordinal_encode(df, column, ordering):
    """Return a copy of ``df`` with ``column`` replaced by positions in ``ordering``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left unmodified.
    column : str
        Name of the column to encode.
    ordering : sequence
        Allowed (hashable) values in rank order. Each value is encoded as the
        index of its first occurrence in this sequence.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``column`` holds the integer positions.

    Raises
    ------
    ValueError
        If ``column`` contains a value that does not appear in ``ordering``.
    """
    df = df.copy()

    # Build the value -> position lookup once instead of scanning the list
    # for every row. setdefault keeps the first position of a duplicated
    # entry, matching list.index semantics.
    positions = {}
    for rank, value in enumerate(ordering):
        positions.setdefault(value, rank)

    # Validate up front so an unexpected category produces a clear error
    # rather than silently becoming NaN in the mapped column.
    unknown = [value for value in df[column].unique() if value not in positions]
    if unknown:
        raise ValueError(
            "column {!r} contains values missing from ordering: {!r}".format(column, unknown)
        )

    df[column] = df[column].map(positions)
    return df

def onehot_encode(df, column, prefix):
    """Return a new frame where ``column`` is replaced by one-hot indicator columns.

    The indicator columns are produced by ``pd.get_dummies`` with the given
    ``prefix`` and are appended after the existing columns; the original
    ``column`` is then removed. The input frame is not modified.
    """
    indicator_cols = pd.get_dummies(df[column], prefix=prefix)
    widened = pd.concat([df, indicator_cols], axis=1)
    return widened.drop(columns=column)

In [10]:
# Calendar order used to turn month labels into ordinal positions.
# NOTE(review): "June" is spelled out while the other entries are three-letter
# abbreviations; presumably this mirrors the raw data values. Confirm.
month_ordering = [
    "Jan", "Feb", "Mar", "Apr", "May", "June",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
# Prefix for the one-hot columns derived from VisitorType.
visitor_prefix = "V"

In [11]:
# Month -> ordinal position, VisitorType -> one-hot indicator columns.
df = onehot_encode(
    ordinal_encode(df, "Month", month_ordering),
    "VisitorType",
    visitor_prefix,
)

# Cast the Weekend and Revenue columns to integers.
df = df.astype({"Weekend": int, "Revenue": int})

## Training and Testing

In [12]:
# Separate the predictor columns (x) from the Revenue target (y)
target_column = "Revenue"
x = df.drop(columns=[target_column]).copy()
y = df[target_column].copy()

In [13]:
# Standardize every feature column; x becomes the scaler's transformed output.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so statistics from the eventual test rows influence the
# scaling. Consider fitting on the training split only (or a Pipeline).
scaler = StandardScaler()
x = scaler.fit(x).transform(x)

In [14]:
# 70/30 train/test split; the fixed random_state makes the split reproducible.
# (The same statement was previously pasted twice; with an identical seed the
# second call only recomputed the same split, so a single call is kept.)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=20,
)

## K-Nearest Neighbors Model

In [19]:
# K-Nearest Neighbors: fit on the training split, score on the held-out split
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(x_train, y_train)
knnScore = knn.score(x_test, y_test)

# Cross-validation accuracy over the full x / y (cv=5).
# Averaging with .mean() keeps the divisor tied to the number of folds
# instead of repeating the literal 5.
knnValidationList = cross_val_score(knn, x, y, cv=5, scoring='accuracy')
knnValidation = knnValidationList.mean()
print(f' Knn Accuracy = {knnScore}')
print(f' Knn Validation = {knnValidation}')


 Knn Accuracy = 0.881859962151933
 Knn Validation = 0.8760746147607461


## Naive Bayes Model

In [20]:
# Gaussian Naive Bayes: fit on the training split, score on the held-out split
nb = GaussianNB()
nb.fit(x_train, y_train)
nbScore = nb.score(x_test, y_test)

# Cross-validation accuracy over the full x / y (cv=5).
# Averaging with .mean() keeps the divisor tied to the number of folds
# instead of repeating the literal 5.
nbValidationList = cross_val_score(nb, x, y, cv=5, scoring='accuracy')
nbValidation = nbValidationList.mean()

print(f' NB Accuracy = {nbScore}')
print(f' NB Validation = {nbValidation}')

 NB Accuracy = 0.7629088942957556
 NB Validation = 0.770235198702352


## SVC Model

In [21]:
# Support Vector Classifier (default settings): fit on the training split,
# score on the held-out split
sv = SVC()
sv.fit(x_train, y_train)
svScore = sv.score(x_test, y_test)

# Cross-validation accuracy over the full x / y (cv=5).
# Averaging with .mean() keeps the divisor tied to the number of folds
# instead of repeating the literal 5.
svValidationList = cross_val_score(sv, x, y, cv=5, scoring='accuracy')
svValidation = svValidationList.mean()

print(f' SVC Accuracy = {svScore}')
print(f' SCV Validation = {svValidation}')

 SVC Accuracy = 0.8961881589618816
 SCV Validation = 0.8892944038929441


## Analysing accuracy and validation

In [23]:
# Side-by-side recap of hold-out accuracy and cross-validation accuracy
model_results = [
    ("KNN", knnScore, knnValidation),
    ("NB", nbScore, nbValidation),
    ("SV", svScore, svValidation),
]
for position, (label, accuracy, validation) in enumerate(model_results):
    # A separator line goes between models, not before the first one.
    if position > 0:
        print("------------")
    print(f"{label} Accuracy: {accuracy}")
    print(f"{label} Validation: {validation}")

KNN Accuracy: 0.881859962151933
KNN Validation: 0.8760746147607461
------------
NB Accuracy: 0.7629088942957556
NB Validation: 0.770235198702352
------------
SV Accuracy: 0.8961881589618816
SV Validation: 0.8892944038929441
