In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# Data Cleaning

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
print("The data has {} data points".format(df.shape[0]))
print("The data has {} features".format(df.shape[1]))

The data has 12330 data points
The data has 18 features


In [4]:
# Drop rows with missing values
df.dropna(inplace=True)

In [5]:
# Drop the'Browser' column
df.drop(columns = ['Browser'], inplace=True)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,1,4,Returning_Visitor,True,False


In [6]:
print("The data has {} data points".format(df.shape[0]))
print("The data has {} features".format(df.shape[1]))

The data has 12330 data points
The data has 17 features


In [7]:
(df.describe())

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,9.0,20.0


In [8]:
#Representing Categorical Features
df.describe(include=[object])

Unnamed: 0,Month,VisitorType
count,12330,12330
unique,10,3
top,May,Returning_Visitor
freq,3364,10551


In [9]:
#checking if data contains Null values
df.isnull().sum().sum() 

0

In [10]:
# Identify categorical features
categorical_features = df.select_dtypes(include=['object']).columns
print("Categorical Features:")
print(categorical_features)

Categorical Features:
Index(['Month', 'VisitorType'], dtype='object')


In [11]:
# Identify numerical features
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
print("Numerical Features:")
print(numerical_features)

Numerical Features:
Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
       'OperatingSystems', 'Region', 'TrafficType'],
      dtype='object')


In [12]:
column_type = df['Revenue'].dtype
print(column_type)

bool


In [13]:
# Convert categorical variables to numerical using one-hot encoding
data = pd.get_dummies(df)

In [14]:
# Split the dataset into features (X) and target variable (y)
X = data.drop('Revenue', axis=1)
y = data['Revenue'].astype(int)

In [15]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Review the y variable Series
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Revenue, dtype: int32

# Check the balance of the labels variable (y) by using the value_counts function.

In [17]:
# Check the balance of our target values
y.value_counts()

0    10422
1     1908
Name: Revenue, dtype: int64

# Split the data into training and testing sets

In [18]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


# Create a Logistic Regression Model with the Original Data

### Fit a logistic regression model by using the training data (X_train and y_train)

In [19]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_model = LogisticRegression(random_state=1)

# Fit the model using training data
lr_model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=1)

### Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model.

In [20]:
# Make a prediction using the testing data
y_pred = lr_model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

# Evaluate the model’s performance by doing the following:

- Calculate the accuracy score of the model.

- Generate a confusion matrix.

- Print the classification report.

In [21]:
# Print the balanced_accuracy score of the model
accuracy_score  = balanced_accuracy_score(y_test, y_pred)
print(accuracy_score)

0.6827521002338405


In [22]:
# Generate a confusion matrix for the model
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[3069   80]
 [ 335  215]]


In [23]:
# Print the classification report for the model
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.97      0.94      3149
           1       0.73      0.39      0.51       550

    accuracy                           0.89      3699
   macro avg       0.82      0.68      0.72      3699
weighted avg       0.88      0.89      0.87      3699



# Predict a Logistic Regression Model with Resampled Training Data

Use the RandomOverSampler module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points.

In [24]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state = 1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [25]:
# Count the distinct values of the resampled labels data
y_resampled.value_counts()

0    7273
1    7273
Name: Revenue, dtype: int64

Use the LogisticRegression classifier and the resampled data to fit the model and make predictions.

In [26]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
ros_model = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the resampled training data
ros_model.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
y_pred = ros_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Evaluate the model’s performance by doing the following:

- Calculate the accuracy score of the model.

- Generate a confusion matrix.

- Print the classification report.

In [27]:
# Print the balanced_accuracy score of the model 
accuracy_score  = balanced_accuracy_score(y_test, y_pred)
print(accuracy_score)

0.8173284448165363


In [28]:
# Generate a confusion matrix for the model
ros_matrix = confusion_matrix(y_test, y_pred)
print(ros_matrix)

[[2823  326]
 [ 144  406]]


In [29]:
# Print the classification report for the model
ros_report = classification_report(y_test, y_pred)

print(ros_report)

              precision    recall  f1-score   support

           0       0.95      0.90      0.92      3149
           1       0.55      0.74      0.63       550

    accuracy                           0.87      3699
   macro avg       0.75      0.82      0.78      3699
weighted avg       0.89      0.87      0.88      3699

