Documentation:

Why this project:
- Predict what the customer wants to buy, and recommend what else they can buy
	- For example, if a customer buys a phone, we can recommend a phone cover, screen guard, etc.
- Find list of searches or carts or user opening the project page, that are not converted to sales
	- If a customer searches for a product, but does not buy it, we can predict the reason, like the price is cheaper in another website.
- Find the list of searches that returned no results, and find most searched products that are not in our inventory.
	- With this data, we can stock-up those products

### Importing packages

In [28]:
import pandas as pd
import numpy as np
import string
import itertools

np.random.seed(0)

### Emulating the data

In [29]:
# generate products
n_products = 100_000

# Define the characters to use in product codes
characters = list(string.ascii_uppercase)

# Generate all possible product codes of length 3
product_codes = [''.join(product) for product in itertools.product(characters, repeat=3)]

# Repeat the product codes until we have n_products number of products
products = list(itertools.islice(itertools.cycle(product_codes), n_products))

print(products[:5])
print("Length of products: ", len(products))

# prices - generate random prices from 100 to 50_000 - with no decimal places
prices = np.random.randint(100, 50_000, size=n_products)
print(prices[:5])
print("Length of prices: ", len(prices))

['AAA', 'AAB', 'AAC', 'AAD', 'AAE']
Length of products:  100000
[32082 30987 26443 23017 19958]
Length of prices:  100000


In [30]:
# create random data
# number of active customers
n_customers = 1000
n_activities = n_customers

end_timestamp = pd.Timestamp.now()
start_timestamp = end_timestamp - pd.DateOffset(months=2)

timestamps = pd.date_range(
    start=start_timestamp, end=end_timestamp, periods=n_activities
).round('T')  # round to minutes

timestamps[:3]

DatetimeIndex(['2024-01-20 08:05:00', '2024-01-20 09:31:00',
               '2024-01-20 10:58:00'],
              dtype='datetime64[ns]', freq=None)

In [31]:
user_searches_data = []
add_to_cart_data = []
remove_from_cart_data = []
cart_order_data = []

for activity_idx in range(n_activities):
	customer_id = np.random.randint(1, n_customers + 1)
	activity = np.random.choice(['search', 'add_to_cart', 'place_order', 'remove_from_cart'], p=[0.7, 0.15, 0.1, 0.05])
 
	# remove_from_cart can be done only if there are products in the cart
	if activity == 'remove_from_cart' and len(add_to_cart_data) == 0:
		activity = 'add_to_cart'
 
	if activity == 'search':
		# 10% chance for 0 results. 30% for only 1 result
		resultCount_probabilities = [0.1] + [0.3] + [0.6/98]*98  # 10% chance for 0, 30% for 1, and the rest distributed among 2-99
		resultCount = np.random.choice(range(100), p=resultCount_probabilities)

		user_searches_data.append({
			"customerID": customer_id,
			"keyword": np.random.choice(products),
			"resultCount": None if resultCount == 0 else resultCount,
			"timestamp": timestamps[activity_idx],
		})
	
	elif activity == 'add_to_cart':
		add_to_cart_data.append({
			"customerID": customer_id,
			"productID": np.random.randint(1, n_products + 1),
			"timestamp": timestamps[activity_idx],
		})
	
	elif activity == 'place_order':
		n_products_to_order = np.random.randint(1, 6)
		product_ids = np.random.choice(n_products, n_products_to_order, replace=False)
		
		cart_order_data.append({
			"customerID": customer_id,
			"productIDs": product_ids.tolist(),
			"timestamp": timestamps[activity_idx],
		})
	
	elif activity == 'remove_from_cart':
		# select a random product from add_to_cart_data
		sample = np.random.choice(add_to_cart_data)

		remove_from_cart_data.append({
			"customerID": sample["customerID"],
			"productID": sample["productID"],
			"timestamp": timestamps[activity_idx],
		})

user_searches_df = pd.DataFrame(user_searches_data)
add_to_cart_df = pd.DataFrame(add_to_cart_data)
remove_from_cart_df = pd.DataFrame(remove_from_cart_data)
cart_order_df = pd.DataFrame(cart_order_data)

user_searches_df.head()

Unnamed: 0,customerID,keyword,resultCount,timestamp
0,627,JVO,31.0,2024-01-20 09:31:00
1,734,IIJ,59.0,2024-01-20 10:58:00
2,499,SSC,16.0,2024-01-20 12:24:00
3,388,SWL,1.0,2024-01-20 15:17:00
4,173,YJO,72.0,2024-01-20 16:44:00


In [32]:
add_to_cart_df.head()

Unnamed: 0,customerID,productID,timestamp
0,494,49627,2024-01-20 13:51:00
1,721,38672,2024-01-20 19:37:00
2,916,44354,2024-01-20 22:30:00
3,732,8236,2024-01-21 10:02:00
4,173,22627,2024-01-22 00:27:00


In [33]:
remove_from_cart_df.head()

Unnamed: 0,customerID,productID,timestamp
0,916,44354,2024-01-21 07:09:00
1,392,19190,2024-01-23 11:02:00
2,732,8236,2024-01-24 21:38:00
3,916,44354,2024-01-27 07:17:00
4,521,58973,2024-01-27 20:16:00


In [34]:
cart_order_df.head()

Unnamed: 0,customerID,productIDs,timestamp
0,717,"[76943, 54912]",2024-01-20 08:05:00
1,464,"[85028, 44767, 6325, 38108, 49638]",2024-01-21 02:49:00
2,309,"[64077, 89996, 13253, 16105, 73641]",2024-01-21 05:42:00
3,748,"[20403, 49491, 66343, 57975, 94861]",2024-01-21 17:14:00
4,37,"[99032, 42677, 59340]",2024-01-21 21:34:00


### Data pre-processing

In [35]:
# user_searches_df
# fill NA in resultCount with 0
user_searches_df['resultCount'] = user_searches_df['resultCount'].fillna(0)

# replace resultCount with no_result using resultCount
user_searches_df['no_result'] = user_searches_df['resultCount'].apply(lambda x: True if x == 0 else False)
user_searches_df.drop(columns=['resultCount'], inplace=True)

# convert customerID to int
user_searches_df['customerID'] = user_searches_df['customerID'].astype(int)

# remove duplicates
user_searches_df.drop_duplicates(inplace=True)

# if keyword is empty string, replace with None
user_searches_df['keyword'] = user_searches_df['keyword'].replace("", np.nan)
# drop if keyword is NA
user_searches_df.dropna(subset=['keyword'], inplace=True)

user_searches_df.head()

Unnamed: 0,customerID,keyword,timestamp,no_result
0,627,JVO,2024-01-20 09:31:00,False
1,734,IIJ,2024-01-20 10:58:00,False
2,499,SSC,2024-01-20 12:24:00,False
3,388,SWL,2024-01-20 15:17:00,False
4,173,YJO,2024-01-20 16:44:00,False


In [36]:
# add_to_cart_df

# add column is_removed and use data from remove_from_cart_df
# match by customerID and productID
add_to_cart_df["is_removed"] = add_to_cart_df.apply(
    lambda x: (
        True
        if remove_from_cart_df[
            (remove_from_cart_df["customerID"] == x["customerID"])
            & (remove_from_cart_df["productID"] == x["productID"])
        ].shape[0]
        > 0
        else False
    ),
    axis=1,
)

add_to_cart_df.head()

Unnamed: 0,customerID,productID,timestamp,is_removed
0,494,49627,2024-01-20 13:51:00,False
1,721,38672,2024-01-20 19:37:00,False
2,916,44354,2024-01-20 22:30:00,True
3,732,8236,2024-01-21 10:02:00,True
4,173,22627,2024-01-22 00:27:00,True
