In [None]:
#load libraries

import random
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [None]:
# Seed both RNGs from a single constant so the value is defined in one place.
# NOTE: this fixes the draws from `random` / `np.random` only; timestamps produced by
# random_timestamp() are still anchored to datetime.now(), so they shift between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the product dataset (pandas is already imported as `pd` in the imports cell,
# so it is not re-imported here).
product_df = pd.read_csv('cannabis-prod-data.csv')

# Map key = product_id, value = availability.
# The simulation loop later decrements these values on "buy" events, so re-run this
# cell before re-running the simulation to start from the original stock levels.
# NOTE(review): assumes product_id is unique in the CSV -- confirm against the data.
product_avail_map = product_df.set_index('product_id')['availability'].to_dict()

In [None]:
# Product ids are taken from the keys of the availability map, so they always match
# the product dataset (the author's note expects p001, p002, ..., p100 -- not verified here).

product_ids = list(product_avail_map)

In [None]:
# Generate 1000 user ids: "u0001" through "u1000" (zero-padded to four digits)
user_ids = list(map("u{:04d}".format, range(1, 1001)))

In [None]:
# Possible event types and their sampling weights:
#   click - the user only clicked on the product
#   save  - the user saved the product to a wishlist
#   cart  - the user added the product to the cart without buying
#   buy   - the user bought the product
# NOTE(review): the weights sum to 0.93, not 1.0. The simulation passes them to
# random.choices as relative weights, so the effective probability of each event is
# its weight / 0.93 -- confirm whether one of the values was meant to be larger.

event_types = dict(
    click=0.35,
    save=0.03,
    cart=0.20,
    buy=0.35,
)

In [None]:
# Ratings are bimodal: a user who rates a product tends to either dislike it or like it,
# so the middle score (3) gets the smallest weight. Each pair below is (score, probability).

rating_choices, rating_probs = map(list, zip(
    (1, 0.25),
    (2, 0.20),
    (3, 0.10),
    (4, 0.25),
    (5, 0.20),
))

In [None]:
def random_timestamp(now=None, days=90):
    """Return a random timestamp within the last `days` days.

    Parameters
    ----------
    now : datetime, optional
        End of the sampling window. Defaults to ``datetime.now()`` at call time;
        pass a fixed value to make the generated timestamps reproducible.
    days : int, optional
        Length of the window in days (default 90). Must be at least 1,
        otherwise ``random.randint`` raises ``ValueError``.

    Returns
    -------
    datetime
        A value in ``[now - days, now)`` at one-second granularity.
    """
    if now is None:
        now = datetime.now()
    start = now - timedelta(days = days)
    # randint is inclusive on both ends: the day offset must stop at days - 1,
    # otherwise adding up to 23:59:59 on top of it lands after `now` (in the future)
    random_days = random.randint(0, days - 1)
    random_hours = random.randint(0, 23)
    random_minutes = random.randint(0, 59)
    random_seconds = random.randint(0, 59)
    return start + timedelta(days = random_days, hours = random_hours,
                             minutes = random_minutes, seconds = random_seconds)

In [None]:
# Accumulator for the simulated interaction rows (one dict per event)
interactions = list()

In [None]:
# Simulate interactions: every user touches a random subset of products and
# generates a handful of events for each of them.
# NOTE: this cell appends to `interactions` and decrements `product_avail_map` in place,
# so re-running it alone adds a second batch of rows against already-depleted stock.

# the event population and weights do not change during the simulation
event_names = list(event_types.keys())
event_weights = list(event_types.values())

for user in user_ids:
    # each user interacts with 5 to 20 distinct products
    products_sample = random.sample(product_ids, random.randint(5, 20))

    for product in products_sample:
        # 1 to 5 events per (user, product) pair
        for _ in range(random.randint(1, 5)):
            (event,) = random.choices(event_names, event_weights)
            timestamp = random_timestamp()

            # rating and quantity stay empty unless the event fills them in below
            rating = quantity = None

            avail = product_avail_map.get(product, 0)
            if event in ("cart", "buy"):
                if avail > 0:
                    # quantity is 1..10, never more than what is still available
                    quantity = random.randint(1, min(avail, 10))
                    if event == "buy":
                        # only a purchase consumes stock and produces a rating
                        product_avail_map[product] -= quantity
                        rating = random.choices(rating_choices,
                                                weights=rating_probs, k=1)[0]
                else:
                    # nothing left to cart or buy: record the event as a plain click
                    event = "click"

            interactions.append({
                "user_id": user,
                "product_id": product,
                "event_type": event,
                "timestamp": timestamp,
                "rating": rating,
                "quantity": quantity,
            })

In [None]:
# Build the interactions DataFrame, order it chronologically and renumber the rows 0..n-1
df_interactions = (
    pd.DataFrame(interactions)
    .sort_values(by="timestamp")
    .reset_index(drop=True)
)

In [None]:
# Preview the 10 earliest interactions (the frame is sorted by timestamp)
print(df_interactions.iloc[:10])

  user_id product_id event_type                  timestamp  rating  quantity
0   u0763       p086      click 2025-05-14 04:08:08.493663     NaN       NaN
1   u0168       p004      click 2025-05-14 04:11:23.186478     NaN       NaN
2   u0656       p081      click 2025-05-14 04:12:25.424196     NaN       NaN
3   u0171       p029      click 2025-05-14 04:14:34.187681     NaN       NaN
4   u0776       p064      click 2025-05-14 04:15:25.502724     NaN       NaN
5   u0818       p084      click 2025-05-14 04:19:25.528061     NaN       NaN
6   u0623       p016      click 2025-05-14 04:24:40.403648     NaN       NaN
7   u0706       p031      click 2025-05-14 04:27:10.460950     NaN       NaN
8   u0914       p099      click 2025-05-14 04:39:06.581217     NaN       NaN
9   u0796       p059      click 2025-05-14 04:39:26.514710     NaN       NaN


In [None]:
# Number of "buy" events (non-null user_id values among the buy rows)
print(df_interactions.loc[df_interactions["event_type"].eq("buy"), "user_id"].count())

7921


In [None]:
# Number of distinct users with at least one "buy" event
print(df_interactions.loc[df_interactions["event_type"].eq("buy"), "user_id"].nunique())

963


In [None]:
# Show the "buy" rows (pandas truncates the printed frame to its head and tail)
print(df_interactions.loc[df_interactions["event_type"].eq("buy")])

      user_id product_id event_type                  timestamp  rating  \
10      u0498       p060        buy 2025-05-14 04:39:35.327513     4.0   
11      u0190       p067        buy 2025-05-14 04:39:46.195577     3.0   
15      u0522       p026        buy 2025-05-14 04:44:18.345471     1.0   
18      u0500       p068        buy 2025-05-14 04:52:44.329086     1.0   
25      u0483       p021        buy 2025-05-14 05:21:38.320810     2.0   
...       ...        ...        ...                        ...     ...   
37279   u0090       p058        buy 2025-08-13 00:48:14.141003     1.0   
37290   u0844       p030        buy 2025-08-13 01:45:52.541626     1.0   
37298   u0259       p010        buy 2025-08-13 02:34:08.219186     5.0   
37301   u0017       p030        buy 2025-08-13 02:41:58.996793     5.0   
37303   u0445       p083        buy 2025-08-13 02:53:32.297517     1.0   

       quantity  
10          4.0  
11          6.0  
15          2.0  
18          2.0  
25          1.0  
...

In [None]:
# Summarise the interactions frame: column dtypes, non-null counts and memory usage
df_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37312 entries, 0 to 37311
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user_id     37312 non-null  object        
 1   product_id  37312 non-null  object        
 2   event_type  37312 non-null  object        
 3   timestamp   37312 non-null  datetime64[ns]
 4   rating      7921 non-null   float64       
 5   quantity    12559 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 1.7+ MB


In [None]:
# Write the simulated interactions to CSV (without the DataFrame index column)
output_csv = 'cannabis-user-data.csv'
df_interactions.to_csv(output_csv, index = False)

# Offer the file as a browser download when running inside Google Colab.
# Outside Colab the google.colab package is not installed, so instead of failing
# after the CSV has been written, just report where the file was saved.
try:
    from google.colab import files
except ImportError:
    print(f"Saved {output_csv} (google.colab not available, skipping download)")
else:
    files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>