Documentation:

Why this project:
- Predict what the customer wants to buy, and recommend what else they can buy
	- For example, if a customer buys a phone, we can recommend a phone cover, screen guard, etc.
- Find list of searches or carts or user opening the project page, that are not converted to sales
	- If a customer searches for a product, but does not buy it, we can predict the reason, like the price is cheaper in another website.
- Find the list of searches that returned no results, and find most searched products that are not in our inventory.
	- With this data, we can stock-up those products

### Importing packages

In [1]:
# %pip install pandas numpy scikit-learn

In [2]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

import string
import itertools

np.random.seed(0)

### Defining constant values

In [3]:
n_products = 10_000  # 100_000
# number of active customers
n_customers = 100  # 1000
n_activities = n_customers

### Emulating the real data

In [4]:
# generate products

# Define the characters to use in product codes
characters = list(string.ascii_uppercase)

# Generate all possible product codes of length 3
product_codes = [''.join(product) for product in itertools.product(characters, repeat=3)]

# Repeat the product codes until we have n_products number of products
products = list(itertools.islice(itertools.cycle(product_codes), n_products))

print(products[:5])
print('Length of products: ', len(products))

# prices - generate random prices from 100 to 50_000 - with no decimal places
prices = np.random.randint(100, 50_000, size=n_products)
print(prices[:5])
print('Length of prices: ', len(prices))

products_df = pd.DataFrame({'product': products, 'price': prices})
products_df.head()

['AAA', 'AAB', 'AAC', 'AAD', 'AAE']
Length of products:  10000
[ 2832 43667 42713 45991 21343]
Length of prices:  10000


Unnamed: 0,product,price
0,AAA,2832
1,AAB,43667
2,AAC,42713
3,AAD,45991
4,AAE,21343


In [5]:
# create random data

end_timestamp = pd.Timestamp.now()
start_timestamp = end_timestamp - pd.DateOffset(months=2)

timestamps = pd.date_range(
    start=start_timestamp, end=end_timestamp, periods=n_activities
).round('min')  # round to minutes

timestamps[:3]

DatetimeIndex(['2024-01-20 10:23:00', '2024-01-21 00:56:00',
               '2024-01-21 15:29:00'],
              dtype='datetime64[ns]', freq=None)

In [6]:
user_search_data = []
add_to_cart_data = []
remove_from_cart_data = []
cart_order_data = []

for activity_idx in range(n_activities):
	customer_id = np.random.randint(1, n_customers + 1)
	activity = np.random.choice(['search', 'add_to_cart', 'place_order', 'remove_from_cart'], p=[0.7, 0.15, 0.1, 0.05])
 
	# remove_from_cart can be done only if there are products in the cart
	if activity == 'remove_from_cart' and len(add_to_cart_data) == 0:
		activity = 'add_to_cart'
 
	if activity == 'search':
		# 10% chance for 0 results. 30% for only 1 result
		resultCount_probabilities = [0.1] + [0.3] + [0.6/98]*98  # 10% chance for 0, 30% for 1, and the rest distributed among 2-99
		resultCount = np.random.choice(range(100), p=resultCount_probabilities)

		user_search_data.append({
			'customerID': customer_id,
			'keyword': np.random.choice(products),
			'resultCount': None if resultCount == 0 else resultCount,
			'timestamp': timestamps[activity_idx],
		})
	
	elif activity == 'add_to_cart':
		add_to_cart_data.append({
			'customerID': customer_id,
			'productID': np.random.randint(1, n_products + 1),
			'timestamp': timestamps[activity_idx],
		})
	
	elif activity == 'place_order':
		n_products_to_order = np.random.randint(1, 6)
		product_ids = np.random.choice(n_products, n_products_to_order, replace=False)
		
		cart_order_data.append({
			'customerID': customer_id,
			'productIDs': product_ids.tolist(),
			'timestamp': timestamps[activity_idx],
		})
	
	elif activity == 'remove_from_cart':
		# select a random product from add_to_cart_data
		sample = np.random.choice(add_to_cart_data)

		remove_from_cart_data.append({
			'customerID': sample['customerID'],
			'productID': sample['productID'],
			'timestamp': timestamps[activity_idx],
		})

user_search_df = pd.DataFrame(user_search_data)
add_to_cart_df = pd.DataFrame(add_to_cart_data)
remove_from_cart_df = pd.DataFrame(remove_from_cart_data)
cart_order_df = pd.DataFrame(cart_order_data)

user_search_df.head()

Unnamed: 0,customerID,keyword,resultCount,timestamp
0,33,KEG,,2024-01-21 00:56:00
1,52,KEX,1.0,2024-01-21 15:29:00
2,59,AOT,77.0,2024-01-22 06:02:00
3,72,EJJ,56.0,2024-01-24 16:13:00
4,4,CEE,1.0,2024-01-25 21:18:00


In [7]:
add_to_cart_df.head()

Unnamed: 0,customerID,productID,timestamp
0,25,1129,2024-01-23 11:07:00
1,69,8777,2024-01-25 06:45:00
2,45,6677,2024-01-28 07:29:00
3,90,3324,2024-01-31 22:45:00
4,6,8159,2024-02-01 13:18:00


In [8]:
remove_from_cart_df.head()

Unnamed: 0,customerID,productID,timestamp
0,41,899,2024-03-05 06:45:00


In [9]:
cart_order_df.head()

Unnamed: 0,customerID,productIDs,timestamp
0,54,"[4013, 3539, 875, 5237, 9590]",2024-01-20 10:23:00
1,26,[6138],2024-01-22 20:34:00
2,83,"[946, 7802, 1931]",2024-01-24 01:40:00
3,70,"[7459, 3661, 637, 4946]",2024-01-26 11:51:00
4,11,"[7576, 1703]",2024-02-02 18:23:00


### Data pre-processing

In [10]:
def apply_time_columns(df):
  # 0: 10pm-6am: night = N, 1: 6am-12pm: morning = M, 2: 12pm-6pm: afternoon = A, 3: 6pm-10pm: evening = E
  df['time_of_day'] = df['timestamp'].apply(lambda x: 'N' if x.hour < 6 else 'M' if x.hour < 12 else 'A' if x.hour < 18 else 'E')
  df['is_weekend'] = df['timestamp'].apply(lambda x: x.dayofweek >= 5)


In [11]:
# products_df

scaler = MinMaxScaler()
products_df[['scaled_price']] = scaler.fit_transform(products_df[['price']])
cost_levels = [1, 2, 3, 4, 5]
products_df['cost_level'] = pd.qcut(products_df['price'], 5, labels=cost_levels)

# One-hot encoding
products_df = pd.get_dummies(products_df, columns=['product'])

products_df.head()

Unnamed: 0,price,scaled_price,cost_level,product_AAA,product_AAB,product_AAC,product_AAD,product_AAE,product_AAF,product_AAG,...,product_OUG,product_OUH,product_OUI,product_OUJ,product_OUK,product_OUL,product_OUM,product_OUN,product_OUO,product_OUP
0,2832,0.054562,1,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,43667,0.873096,5,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,42713,0.853973,5,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,45991,0.91968,5,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,21343,0.425613,3,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# user_search_df

# fill NA in resultCount with 0
user_search_df['resultCount'] = user_search_df['resultCount'].fillna(0)

# replace resultCount with no_result using resultCount
user_search_df['no_result'] = user_search_df['resultCount'].apply(lambda x: True if x == 0 else False)
user_search_df.drop(columns=['resultCount'], inplace=True)

# convert customerID to int
user_search_df['customerID'] = user_search_df['customerID'].astype(int)

# remove duplicates
user_search_df.drop_duplicates(inplace=True)

# if keyword is empty string, replace with None
user_search_df['keyword'] = user_search_df['keyword'].replace('', np.nan)
# drop if keyword is NA
user_search_df.dropna(subset=['keyword'], inplace=True)

# Feature engineering
apply_time_columns(user_search_df)

user_search_df.head()

Unnamed: 0,customerID,keyword,timestamp,no_result,time_of_day,is_weekend
0,33,KEG,2024-01-21 00:56:00,True,N,True
1,52,KEX,2024-01-21 15:29:00,False,A,True
2,59,AOT,2024-01-22 06:02:00,False,M,False
3,72,EJJ,2024-01-24 16:13:00,False,A,False
4,4,CEE,2024-01-25 21:18:00,False,E,False


In [13]:
# add_to_cart_df

# add column 'is_removed' and use data from remove_from_cart_df
# match by customerID and productID
add_to_cart_df['is_removed'] = add_to_cart_df.apply(
    lambda x: (
        True
        if remove_from_cart_df[
            (remove_from_cart_df['customerID'] == x['customerID'])
            & (remove_from_cart_df['productID'] == x['productID'])
        ].shape[0]
        > 0
        else False
    ),
    axis=1,
)

# Feature engineering
apply_time_columns(add_to_cart_df)

add_to_cart_df.head()

Unnamed: 0,customerID,productID,timestamp,is_removed
0,25,1129,2024-01-23 11:07:00,False
1,69,8777,2024-01-25 06:45:00,False
2,45,6677,2024-01-28 07:29:00,False
3,90,3324,2024-01-31 22:45:00,False
4,6,8159,2024-02-01 13:18:00,False


In [None]:
# remove_from_cart_df

In [None]:
# cart_order_df