In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score, KFold

In [2]:
# Dataset
df = pd.read_csv("ecommerce_clickstream_transactions.csv")
df.head(100)

Unnamed: 0,UserID,SessionID,Timestamp,EventType,ProductID,Amount,Outcome
0,1,1,2024-07-07 18:00:26.959902,page_view,,,
1,1,1,2024-03-05 22:01:00.072000,page_view,,,
2,1,1,2024-03-23 22:08:10.568453,product_view,prod_8199,,
3,1,1,2024-03-12 00:32:05.495638,add_to_cart,prod_4112,,
4,1,1,2024-02-25 22:43:01.318876,add_to_cart,prod_3354,,
...,...,...,...,...,...,...,...
95,2,2,2024-04-16 05:04:41.305679,purchase,prod_9382,130.052528,purchase
96,2,2,2024-04-10 21:19:06.834986,page_view,,,
97,2,3,2024-01-03 01:39:14.214806,click,,,
98,2,3,2024-03-26 17:29:34.941192,logout,,,


In [3]:
# Convert the column in one line
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [4]:
# Remove the 'prod_' prefix from the 'Product ID' column
df['ProductID'] = df['ProductID'].str.replace('prod_', '', regex=False)

print(df)

       UserID  SessionID                  Timestamp     EventType ProductID  \
0           1          1 2024-07-07 18:00:26.959902     page_view       NaN   
1           1          1 2024-03-05 22:01:00.072000     page_view       NaN   
2           1          1 2024-03-23 22:08:10.568453  product_view      8199   
3           1          1 2024-03-12 00:32:05.495638   add_to_cart      4112   
4           1          1 2024-02-25 22:43:01.318876   add_to_cart      3354   
...       ...        ...                        ...           ...       ...   
74812    1000         10 2024-05-11 22:48:45.500117      purchase      1238   
74813    1000         10 2024-03-29 04:09:32.514318        logout       NaN   
74814    1000         10 2024-02-09 02:58:56.128697         login       NaN   
74815    1000         10 2024-04-30 16:19:48.002633      purchase      2515   
74816    1000         10 2024-04-01 02:19:29.148727        logout       NaN   

           Amount   Outcome  
0             NaN    

In [5]:
df.head(100)

Unnamed: 0,UserID,SessionID,Timestamp,EventType,ProductID,Amount,Outcome
0,1,1,2024-07-07 18:00:26.959902,page_view,,,
1,1,1,2024-03-05 22:01:00.072000,page_view,,,
2,1,1,2024-03-23 22:08:10.568453,product_view,8199,,
3,1,1,2024-03-12 00:32:05.495638,add_to_cart,4112,,
4,1,1,2024-02-25 22:43:01.318876,add_to_cart,3354,,
...,...,...,...,...,...,...,...
95,2,2,2024-04-16 05:04:41.305679,purchase,9382,130.052528,purchase
96,2,2,2024-04-10 21:19:06.834986,page_view,,,
97,2,3,2024-01-03 01:39:14.214806,click,,,
98,2,3,2024-03-26 17:29:34.941192,logout,,,


In [6]:
# Convert the column to numeric, forcing errors to NaN
df['ProductID'] = pd.to_numeric(df['ProductID'], errors='coerce')

print(df)

       UserID  SessionID                  Timestamp     EventType  ProductID  \
0           1          1 2024-07-07 18:00:26.959902     page_view        NaN   
1           1          1 2024-03-05 22:01:00.072000     page_view        NaN   
2           1          1 2024-03-23 22:08:10.568453  product_view     8199.0   
3           1          1 2024-03-12 00:32:05.495638   add_to_cart     4112.0   
4           1          1 2024-02-25 22:43:01.318876   add_to_cart     3354.0   
...       ...        ...                        ...           ...        ...   
74812    1000         10 2024-05-11 22:48:45.500117      purchase     1238.0   
74813    1000         10 2024-03-29 04:09:32.514318        logout        NaN   
74814    1000         10 2024-02-09 02:58:56.128697         login        NaN   
74815    1000         10 2024-04-30 16:19:48.002633      purchase     2515.0   
74816    1000         10 2024-04-01 02:19:29.148727        logout        NaN   

           Amount   Outcome  
0        

In [7]:
# Replace NaN with a computed value (e.g., max Product ID + 1)
df['ProductID'] = df['ProductID'].fillna(df['ProductID'].max() + 1)

print(df)

       UserID  SessionID                  Timestamp     EventType  ProductID  \
0           1          1 2024-07-07 18:00:26.959902     page_view    10000.0   
1           1          1 2024-03-05 22:01:00.072000     page_view    10000.0   
2           1          1 2024-03-23 22:08:10.568453  product_view     8199.0   
3           1          1 2024-03-12 00:32:05.495638   add_to_cart     4112.0   
4           1          1 2024-02-25 22:43:01.318876   add_to_cart     3354.0   
...       ...        ...                        ...           ...        ...   
74812    1000         10 2024-05-11 22:48:45.500117      purchase     1238.0   
74813    1000         10 2024-03-29 04:09:32.514318        logout    10000.0   
74814    1000         10 2024-02-09 02:58:56.128697         login    10000.0   
74815    1000         10 2024-04-30 16:19:48.002633      purchase     2515.0   
74816    1000         10 2024-04-01 02:19:29.148727        logout    10000.0   

           Amount   Outcome  
0        

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74817 entries, 0 to 74816
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   UserID     74817 non-null  int64         
 1   SessionID  74817 non-null  int64         
 2   Timestamp  74817 non-null  datetime64[ns]
 3   EventType  74817 non-null  object        
 4   ProductID  74817 non-null  float64       
 5   Amount     10682 non-null  float64       
 6   Outcome    10682 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 4.0+ MB


In [9]:
df.head(1000)

Unnamed: 0,UserID,SessionID,Timestamp,EventType,ProductID,Amount,Outcome
0,1,1,2024-07-07 18:00:26.959902,page_view,10000.0,,
1,1,1,2024-03-05 22:01:00.072000,page_view,10000.0,,
2,1,1,2024-03-23 22:08:10.568453,product_view,8199.0,,
3,1,1,2024-03-12 00:32:05.495638,add_to_cart,4112.0,,
4,1,1,2024-02-25 22:43:01.318876,add_to_cart,3354.0,,
...,...,...,...,...,...,...,...
995,13,9,2024-03-08 12:45:15.880717,add_to_cart,7836.0,,
996,13,9,2024-01-28 08:38:35.797425,purchase,1362.0,276.823636,purchase
997,13,9,2024-03-17 09:05:29.380305,page_view,10000.0,,
998,13,9,2024-05-06 04:25:42.171537,page_view,10000.0,,


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74817 entries, 0 to 74816
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   UserID     74817 non-null  int64         
 1   SessionID  74817 non-null  int64         
 2   Timestamp  74817 non-null  datetime64[ns]
 3   EventType  74817 non-null  object        
 4   ProductID  74817 non-null  float64       
 5   Amount     10682 non-null  float64       
 6   Outcome    10682 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 4.0+ MB


In [11]:
# Drop the 'Amount' column
df = df.drop(columns=['Amount'])

print(df)

       UserID  SessionID                  Timestamp     EventType  ProductID  \
0           1          1 2024-07-07 18:00:26.959902     page_view    10000.0   
1           1          1 2024-03-05 22:01:00.072000     page_view    10000.0   
2           1          1 2024-03-23 22:08:10.568453  product_view     8199.0   
3           1          1 2024-03-12 00:32:05.495638   add_to_cart     4112.0   
4           1          1 2024-02-25 22:43:01.318876   add_to_cart     3354.0   
...       ...        ...                        ...           ...        ...   
74812    1000         10 2024-05-11 22:48:45.500117      purchase     1238.0   
74813    1000         10 2024-03-29 04:09:32.514318        logout    10000.0   
74814    1000         10 2024-02-09 02:58:56.128697         login    10000.0   
74815    1000         10 2024-04-30 16:19:48.002633      purchase     2515.0   
74816    1000         10 2024-04-01 02:19:29.148727        logout    10000.0   

        Outcome  
0           NaN  
1  

In [12]:
df.head(1000)

Unnamed: 0,UserID,SessionID,Timestamp,EventType,ProductID,Outcome
0,1,1,2024-07-07 18:00:26.959902,page_view,10000.0,
1,1,1,2024-03-05 22:01:00.072000,page_view,10000.0,
2,1,1,2024-03-23 22:08:10.568453,product_view,8199.0,
3,1,1,2024-03-12 00:32:05.495638,add_to_cart,4112.0,
4,1,1,2024-02-25 22:43:01.318876,add_to_cart,3354.0,
...,...,...,...,...,...,...
995,13,9,2024-03-08 12:45:15.880717,add_to_cart,7836.0,
996,13,9,2024-01-28 08:38:35.797425,purchase,1362.0,purchase
997,13,9,2024-03-17 09:05:29.380305,page_view,10000.0,
998,13,9,2024-05-06 04:25:42.171537,page_view,10000.0,


In [13]:
# Replace NaN values in the 'Outcome' column with 'No purchase'
df['Outcome'] = df['Outcome'].fillna('No purchase')

print(df)

       UserID  SessionID                  Timestamp     EventType  ProductID  \
0           1          1 2024-07-07 18:00:26.959902     page_view    10000.0   
1           1          1 2024-03-05 22:01:00.072000     page_view    10000.0   
2           1          1 2024-03-23 22:08:10.568453  product_view     8199.0   
3           1          1 2024-03-12 00:32:05.495638   add_to_cart     4112.0   
4           1          1 2024-02-25 22:43:01.318876   add_to_cart     3354.0   
...       ...        ...                        ...           ...        ...   
74812    1000         10 2024-05-11 22:48:45.500117      purchase     1238.0   
74813    1000         10 2024-03-29 04:09:32.514318        logout    10000.0   
74814    1000         10 2024-02-09 02:58:56.128697         login    10000.0   
74815    1000         10 2024-04-30 16:19:48.002633      purchase     2515.0   
74816    1000         10 2024-04-01 02:19:29.148727        logout    10000.0   

           Outcome  
0      No purchase

In [14]:
# Initialize the OrdinalEncoder
encoder = OrdinalEncoder()

# Perform ordinal encoding on the 'EventType' column
df['EventType_encoded'] = encoder.fit_transform(df[['EventType']])

print(df)

       UserID  SessionID                  Timestamp     EventType  ProductID  \
0           1          1 2024-07-07 18:00:26.959902     page_view    10000.0   
1           1          1 2024-03-05 22:01:00.072000     page_view    10000.0   
2           1          1 2024-03-23 22:08:10.568453  product_view     8199.0   
3           1          1 2024-03-12 00:32:05.495638   add_to_cart     4112.0   
4           1          1 2024-02-25 22:43:01.318876   add_to_cart     3354.0   
...       ...        ...                        ...           ...        ...   
74812    1000         10 2024-05-11 22:48:45.500117      purchase     1238.0   
74813    1000         10 2024-03-29 04:09:32.514318        logout    10000.0   
74814    1000         10 2024-02-09 02:58:56.128697         login    10000.0   
74815    1000         10 2024-04-30 16:19:48.002633      purchase     2515.0   
74816    1000         10 2024-04-01 02:19:29.148727        logout    10000.0   

           Outcome  EventType_encoded  

In [15]:
# Initialize the OrdinalEncoder
encoder = OrdinalEncoder()

# Perform ordinal encoding on the 'EventType' column
df['Outcome'] = encoder.fit_transform(df[['Outcome']])

print(df)

       UserID  SessionID                  Timestamp     EventType  ProductID  \
0           1          1 2024-07-07 18:00:26.959902     page_view    10000.0   
1           1          1 2024-03-05 22:01:00.072000     page_view    10000.0   
2           1          1 2024-03-23 22:08:10.568453  product_view     8199.0   
3           1          1 2024-03-12 00:32:05.495638   add_to_cart     4112.0   
4           1          1 2024-02-25 22:43:01.318876   add_to_cart     3354.0   
...       ...        ...                        ...           ...        ...   
74812    1000         10 2024-05-11 22:48:45.500117      purchase     1238.0   
74813    1000         10 2024-03-29 04:09:32.514318        logout    10000.0   
74814    1000         10 2024-02-09 02:58:56.128697         login    10000.0   
74815    1000         10 2024-04-30 16:19:48.002633      purchase     2515.0   
74816    1000         10 2024-04-01 02:19:29.148727        logout    10000.0   

       Outcome  EventType_encoded  
0  

In [16]:
df.head()

Unnamed: 0,UserID,SessionID,Timestamp,EventType,ProductID,Outcome,EventType_encoded
0,1,1,2024-07-07 18:00:26.959902,page_view,10000.0,0.0,4.0
1,1,1,2024-03-05 22:01:00.072000,page_view,10000.0,0.0,4.0
2,1,1,2024-03-23 22:08:10.568453,product_view,8199.0,0.0,5.0
3,1,1,2024-03-12 00:32:05.495638,add_to_cart,4112.0,0.0,0.0
4,1,1,2024-02-25 22:43:01.318876,add_to_cart,3354.0,0.0,0.0


In [17]:
# Drop the 'Amount' column
df = df.drop(columns=['Timestamp','EventType'])

print(df)

       UserID  SessionID  ProductID  Outcome  EventType_encoded
0           1          1    10000.0      0.0                4.0
1           1          1    10000.0      0.0                4.0
2           1          1     8199.0      0.0                5.0
3           1          1     4112.0      0.0                0.0
4           1          1     3354.0      0.0                0.0
...       ...        ...        ...      ...                ...
74812    1000         10     1238.0      1.0                6.0
74813    1000         10    10000.0      0.0                3.0
74814    1000         10    10000.0      0.0                2.0
74815    1000         10     2515.0      1.0                6.0
74816    1000         10    10000.0      0.0                3.0

[74817 rows x 5 columns]


In [18]:
# Splitting the dataset
X = df.drop('Outcome', axis=1)
y = df['Outcome']


In [28]:
df

Unnamed: 0,UserID,SessionID,ProductID,Outcome,EventType_encoded
0,1,1,10000.0,0.0,4.0
1,1,1,10000.0,0.0,4.0
2,1,1,8199.0,0.0,5.0
3,1,1,4112.0,0.0,0.0
4,1,1,3354.0,0.0,0.0
...,...,...,...,...,...
74812,1000,10,1238.0,1.0,6.0
74813,1000,10,10000.0,0.0,3.0
74814,1000,10,10000.0,0.0,2.0
74815,1000,10,2515.0,1.0,6.0


In [27]:
# Training a simple classification model (Logistic Regression)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [21]:
# Perform K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = cross_val_score(model, X, y, cv=kfold, scoring='r2')
cv_scores

array([1., 1., 1., 1., 1.])

In [22]:
# Splitting the dataset
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Training a simple classification model (Logistic Regression)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [24]:
# Predicting and evaluating the model
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

1.0
1.0
1.0
1.0


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [26]:
# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9846297781341887
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99     12781
         1.0       0.92      0.98      0.95      2183

    accuracy                           0.98     14964
   macro avg       0.96      0.98      0.97     14964
weighted avg       0.99      0.98      0.98     14964

Confusion Matrix:
[[12585   196]
 [   34  2149]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
