In [1]:
import pandas as pd 
import numpy as np 

In [2]:
import matplotlib.pyplot as plt 

**Dataset Website**
https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset

**Dataset Information**
The dataset consists of feature vectors belonging to 12,330 sessions. 
The dataset was formed so that each session
would belong to a different user in a 1-year period to avoid
any tendency to a specific campaign, special day, user
profile, or period. 



# Datset Import 

In [3]:
df = pd.read_csv("online_shoppers_intention.csv")
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
df.info() #we have to convert the datatypes of object and bool into integer for furthur processing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [5]:
df.describe() #since the mean has large variations so we have to perform the standarscaler function to resolve this. 

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


# Data Preprocessing 

Since there are objects so we have to convert them into nnumerical values to perform furthur process. 

In [6]:
def change_objects(dataset,dty = ["object", "bool"]):
    for i in dataset.columns:
        if dataset[i].dtype in dty:
            uni = dataset[i].unique()
            data = dict()
            count = 0 
            for j in uni:
                data[j] = count
                count += 1
            dataset[i] = dataset[i].map(data)

change_objects(df)

Dividing the dataset into two parts train and test using train test split from the sklearn library. 

In [7]:
X = df.iloc[:,:-1] #spliting the data into X and y features indicating the input and the output of the supervised ML.
y = df.iloc[:,-1]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x_train,x_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = True)

Now since there is very much distance in the mean values of the dataset after the transformation of the objects into there labels
so we are going to use the standard scaler of the sklear to get a proper set of data from it 

In [10]:
df.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,4.566504,2.124006,2.357097,3.147364,4.069586,0.151176,0.232603,0.154745
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,2.982143,0.911325,1.717277,2.401591,4.025169,0.376989,0.422509,0.361676
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,2.0,1.0,2.0,0.0,0.0,0.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,4.0,2.0,2.0,3.0,2.0,0.0,0.0,0.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,7.0,3.0,2.0,4.0,4.0,0.0,0.0,0.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,9.0,8.0,13.0,9.0,20.0,2.0,1.0,1.0


In [11]:
from sklearn.preprocessing import StandardScaler

s_scaler = StandardScaler()
X_train_scaled = s_scaler.fit_transform(x_train)
x_test_scaled = s_scaler.transform(x_test)

In [12]:
X_train_scaled, x_test_scaled

(array([[-0.69507529, -0.45717087, -0.39653192, ..., -0.51277367,
         -0.39929833, -0.5540843 ],
        [-0.69507529, -0.45717087, -0.39653192, ..., -0.51277367,
         -0.39929833, -0.5540843 ],
        [-0.69507529, -0.45717087, -0.39653192, ..., -0.26480821,
         -0.39929833, -0.5540843 ],
        ...,
        [-0.39541521, -0.39782009, -0.39653192, ..., -0.01684275,
         -0.39929833, -0.5540843 ],
        [-0.09575513, -0.24520378, -0.39653192, ...,  0.97501911,
         -0.39929833, -0.5540843 ],
        [ 0.20390495, -0.17831638,  3.52034253, ..., -0.51277367,
         -0.39929833, -0.5540843 ]]),
 array([[-0.69507529, -0.45717087, -0.39653192, ...,  0.23112272,
          2.26017417,  1.80477954],
        [ 0.80322511,  7.54388038, -0.39653192, ..., -0.76073914,
         -0.39929833, -0.5540843 ],
        [-0.69507529, -0.45717087, -0.39653192, ..., -0.26480821,
         -0.39929833, -0.5540843 ],
        ...,
        [ 0.20390495,  0.30873689, -0.39653192, ..., -

# Model Building 

In [13]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Logistic Regression
Logistic Regression is a widely-used binary classification algorithm in machine learning. Despite its name, it's employed for predicting the probability of an instance belonging to a particular class. It models the relationship between the dependent binary variable and one or more independent variables using the logistic function, which maps any real-valued number into a value between 0 and 1. By setting a threshold, predictions are made, with values above the threshold classified as one class and those below as the other. Logistic Regression is simple, interpretable, and effective for tasks like spam detection, medical diagnosis, and credit scoring.

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
lr_model = LogisticRegression(random_state = 64)
lr_model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
lr_model_pred = lr_model.predict(x_test)
lr_model.score(x_test,y_test)

0.8905109489051095

In [17]:
target_name = ["False", "True"]
print(classification_report(y_test, lr_model_pred, target_names=target_name))

              precision    recall  f1-score   support

       False       0.91      0.97      0.94      2115
        True       0.70      0.41      0.52       351

    accuracy                           0.89      2466
   macro avg       0.80      0.69      0.73      2466
weighted avg       0.88      0.89      0.88      2466



In [18]:
conf_matrix = confusion_matrix(y_test, lr_model_pred)
print(conf_matrix)

[[2052   63]
 [ 207  144]]


## DTC or Decision Tree Classifier model
The Decision Tree Classifier is a versatile supervised machine learning model used for both classification and regression tasks. It recursively splits the dataset based on the most significant features, creating a tree-like structure where each internal node represents a decision based on a feature, and each leaf node represents the predicted outcome. The algorithm selects the optimal splits by maximizing information gain or minimizing impurity. Decision trees are interpretable, handle non-linear relationships well, and are prone to overfitting. Ensembles like Random Forests mitigate this by aggregating multiple trees. They find applications in various fields, including finance, healthcare, and pattern recognition.

In [19]:
from sklearn.tree import DecisionTreeClassifier

### Gini

In [20]:
dt_classifier_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
dt_classifier_gini.fit(x_train, y_train)

In [21]:
dtc_pred = dt_classifier_gini.predict(x_test)

In [22]:
target_name = ["False", "True"]
print(classification_report(y_test, dtc_pred, target_names=target_name))

              precision    recall  f1-score   support

       False       0.92      0.91      0.92      2115
        True       0.51      0.53      0.52       351

    accuracy                           0.86      2466
   macro avg       0.71      0.72      0.72      2466
weighted avg       0.86      0.86      0.86      2466



In [23]:
conf_matrix = confusion_matrix(y_test, dtc_pred)
print(conf_matrix)

[[1934  181]
 [ 164  187]]


### Entropy

In [59]:
dt_classifier_ent = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_classifier_ent.fit(x_train,y_train)

In [60]:
dtc_pred_ent = dt_classifier_ent.predict(x_test)

In [61]:
target_name = ["False", "True"]
print(classification_report(y_test, dtc_pred_ent, target_names=target_name))

              precision    recall  f1-score   support

       False       0.93      0.92      0.92      2115
        True       0.54      0.58      0.56       351

    accuracy                           0.87      2466
   macro avg       0.73      0.75      0.74      2466
weighted avg       0.87      0.87      0.87      2466



In [62]:
conf_matrix = confusion_matrix(y_test, dtc_pred_ent)
print(conf_matrix)

[[1942  173]
 [ 149  202]]


## KNN Classifer
The K-Nearest Neighbors (KNN) classifier is a simple and intuitive machine learning model used for classification tasks. It operates on the principle that similar instances in a dataset are likely to share the same class. During prediction, KNN calculates the class based on the majority vote of its k nearest neighbors in the feature space. The model is non-parametric and does not make assumptions about the underlying data distribution. KNN is versatile and effective, particularly in scenarios with well-defined clusters. However, it can be sensitive to the choice of the distance metric and the value of k, requiring careful consideration in practice.

In [28]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=2)

In [29]:
knn_classifier.fit(x_train, y_train)

In [30]:
y_pred_knn = knn_classifier.predict(x_test)

In [31]:
target_name = ["False", "True"]
print(classification_report(y_test, dtc_pred_ent, target_names=target_name))

              precision    recall  f1-score   support

       False       0.93      0.92      0.92      2115
        True       0.54      0.58      0.56       351

    accuracy                           0.87      2466
   macro avg       0.73      0.75      0.74      2466
weighted avg       0.87      0.87      0.87      2466



In [32]:
conf_matrix = confusion_matrix(y_test, y_pred_knn)
print(conf_matrix)

[[2052   63]
 [ 266   85]]


# Summary
This section displays the model and there different parameters according to the model we build before using the sklearn library.

In [68]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [82]:
Summary = pd.DataFrame(columns = ["Model Name" , "Accuracy" , "Recall_score" ,"F1_score" ,"Precision"])

In [83]:
def summary_df(model,name):
    y_pred = model.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    pscore = precision_score(y_test,y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    data={
        "Model Name" : name, 
        "Accuracy":acc , 
        "Recall_score": recall,"F1_score": f1 ,"Precision": pscore
    }
    Summary.loc[len(Summary)] = data

summary_df(lr_model,"Logistic Regression")
summary_df(dt_classifier_gini,"DT (Gini)")
summary_df(dt_classifier_ent,"DT (Entropy)")
summary_df(knn_classifier,"KNN")

In [84]:
Summary.set_index("Model Name" , inplace = True)
Summary.head()
Summary.style.highlight_max(color = "green",axis=0)

Unnamed: 0_level_0,Accuracy,Recall_score,F1_score,Precision
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.890511,0.410256,0.516129,0.695652
DT (Gini),0.860097,0.532764,0.520167,0.508152
DT (Entropy),0.869424,0.575499,0.556474,0.538667
KNN,0.866586,0.242165,0.340681,0.574324


This Summary dataset clearly describes the maximum values by highlighting the max values of each column. So the Logistic Regression model is one of the best possible model to work on since it has maximum number of accuracy and precision with decent amount of Recall and F1 score.