In [220]:
import pandas as pd
import os
import sys
from tqdm import tqdm
import numpy as np
from datetime import datetime
import pytz
import implicit

# from sklearn.model_selection import train_test_split
from implicit.evaluation import train_test_split as implicit_train_test_split
from implicit.evaluation import precision_at_k
from scipy.sparse import csr_matrix

from implicit import recommender_base

%load_ext jupyter_black

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [2]:
df = pd.read_csv("events.csv")

In [3]:
# Check Preview Data

In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
event_time,2020-09-24 11:57:06 UTC,2020-09-24 11:57:26 UTC,2020-09-24 11:57:27 UTC,2020-09-24 11:57:33 UTC,2020-09-24 11:57:36 UTC
event_type,view,view,view,view,view
product_id,1996170,139905,215454,635807,3658723
category_id,2144415922528452715,2144415926932472027,2144415927158964449,2144415923107266682,2144415921169498184
category_code,electronics.telephone,computers.components.cooler,,computers.peripherals.printer,
brand,,zalman,,pantum,cameronsino
price,31.9,17.16,9.81,113.81,15.87
user_id,1515915625519388267,1515915625519380411,1515915625513238515,1515915625519014356,1515915625510743344
user_session,LJuJVLEjPT,tdicluNnRY,4TMArHtXQy,aGFYrNgC08,aa4mmk0kwQ


In [5]:
# Check NaN

In [6]:
df.isna().sum() / df.shape[0]

event_time       0.000000
event_type       0.000000
product_id       0.000000
category_id      0.000000
category_code    0.266875
brand            0.239924
price            0.000000
user_id          0.000000
user_session     0.000186
dtype: float64

In [7]:
# Check Unique Cat Id and Cat Code

In [8]:
df.category_id.unique().shape

(718,)

In [9]:
df.category_code.unique().shape  # only 1/7 have code

(108,)

In [10]:
df.product_id.unique().shape

(53453,)

In [11]:
# Check unique user ID

In [12]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-09-24 11:57:06 UTC,view,1996170,2144415922528452715,electronics.telephone,,31.9,1515915625519388267,LJuJVLEjPT
1,2020-09-24 11:57:26 UTC,view,139905,2144415926932472027,computers.components.cooler,zalman,17.16,1515915625519380411,tdicluNnRY
2,2020-09-24 11:57:27 UTC,view,215454,2144415927158964449,,,9.81,1515915625513238515,4TMArHtXQy
3,2020-09-24 11:57:33 UTC,view,635807,2144415923107266682,computers.peripherals.printer,pantum,113.81,1515915625519014356,aGFYrNgC08
4,2020-09-24 11:57:36 UTC,view,3658723,2144415921169498184,,cameronsino,15.87,1515915625510743344,aa4mmk0kwQ


In [13]:
df.user_id.unique().shape

(407283,)

In [14]:
# Check data period

In [15]:
df["event_time"] = df["event_time"].apply(
    lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S %Z").replace(tzinfo=pytz.UTC)
)

In [16]:
print(df["event_time"].min())
print(df["event_time"].max())

2020-09-24 11:57:06+00:00
2021-02-28 23:59:09+00:00


In [17]:
df["year"] = df["event_time"].dt.year
df["month"] = df["event_time"].apply(lambda x: x.strftime("%m"))
df["period"] = df.apply(lambda x: str(x["year"]) + "-" + x["month"], axis=1)

In [18]:
# Check num of interactions monthly

In [19]:
df.groupby(by=["period"]).size()

period
2020-09     28074
2020-10    161544
2020-11    188225
2020-12    152720
2021-01    187587
2021-02    166979
dtype: int64

In [36]:
# Check kind of event

In [20]:
df["event_type"].unique()

array(['view', 'cart', 'purchase'], dtype=object)

In [None]:
# Data Exploration

In [124]:
# Categort Analysis

In [21]:
view_count = (
    (
        df.groupby("category_id")["event_type"]
        .apply(lambda x: (x == "view").sum())
        .reset_index(name="view_count")
    )
    .sort_values(by="view_count", ascending=False)
    .reset_index(drop=True)
)

In [22]:
cart_count = (
    (
        df.groupby("category_id")["event_type"]
        .apply(lambda x: (x == "cart").sum())
        .reset_index(name="cart_count")
    )
    .sort_values(by="cart_count", ascending=False)
    .reset_index(drop=True)
)

In [23]:
purchase_count = (
    (
        df.groupby("category_id")["event_type"]
        .apply(lambda x: (x == "purchase").sum())
        .reset_index(name="purchase_count")
    )
    .sort_values(by="purchase_count", ascending=False)
    .reset_index(drop=True)
)

In [24]:
agg_df = pd.merge(
    view_count, cart_count, left_on="category_id", right_on="category_id"
).merge(purchase_count, on="category_id")

In [25]:
agg_df["add_to_cart"] = agg_df["cart_count"] / agg_df["view_count"]
agg_df["checkout_ratio"] = agg_df["purchase_count"] / agg_df["cart_count"]

In [26]:
agg_df["checkout_ratio"] = agg_df["checkout_ratio"].fillna(0)
agg_df["checkout_ratio"] = agg_df["checkout_ratio"].apply(
    lambda x: 0 if x > 10000000000 else x
)

In [27]:
product_to_price = (
    df.drop_duplicates(subset=["product_id"])
    .groupby(by=["category_id"])
    .price.agg(["mean", "median", "count"])
    .reset_index()
)

product_to_price = product_to_price.rename(
    columns={
        "mean": "mean_price",
        "median": "median_price",
        "count": "num_unique_product",
    }
)

In [28]:
agg_df = agg_df.merge(product_to_price, on=["category_id"])

In [29]:
agg_df.head()

Unnamed: 0,category_id,view_count,cart_count,purchase_count,add_to_cart,checkout_ratio,mean_price,median_price,num_unique_product
0,2144415922427789416,97145,12684,6888,0.130568,0.543046,398.784304,308.43,539
1,2144415925011480748,32552,2975,2679,0.091392,0.900504,62.174043,27.55,3042
2,2144415922528452715,30442,3327,2759,0.10929,0.829276,31.161895,25.17,1657
3,2144415924491387038,23225,2113,1266,0.09098,0.599148,177.290902,112.05,399
4,2144415925196030129,22387,794,322,0.035467,0.405542,196.299719,102.54,569


In [122]:
# User Analysis

In [30]:
cached = df.groupby(by=["user_id"]).agg(
    num_view=("event_type", lambda x: (x == "view").sum()),
    num_cart=("event_type", lambda x: (x == "cart").sum()),
    num_purchase=("event_type", lambda x: (x == "purchase").sum()),
)

In [31]:
cached = cached.reset_index()

In [32]:
cached.reset_index()[["num_view", "num_cart", "num_purchase"]].corr("spearman")

Unnamed: 0,num_view,num_cart,num_purchase
num_view,1.0,0.345112,0.258649
num_cart,0.345112,1.0,0.701672
num_purchase,0.258649,0.701672,1.0


In [33]:
cached_2 = (
    df[df.event_type == "cart"]
    .groupby(by=["user_id"])
    .agg(
        num_unique_product_cart=("product_id", "nunique"),
        num_unique_category_cart=("category_id", "nunique"),
    )
    .reset_index()
)

In [34]:
cached_3 = cached.merge(cached_2, on=["user_id"], how="left")

In [35]:
cached_3["num_unique_product_cart"] = cached_3["num_unique_product_cart"].fillna(0)
cached_3["num_unique_category_cart"] = cached_3["num_unique_category_cart"].fillna(0)

In [36]:
cached_3.dropna()[
    [
        "num_view",
        "num_cart",
        "num_purchase",
        "num_unique_product_cart",
        "num_unique_category_cart",
    ]
].corr()

Unnamed: 0,num_view,num_cart,num_purchase,num_unique_product_cart,num_unique_category_cart
num_view,1.0,0.449734,0.310665,0.411137,0.34153
num_cart,0.449734,1.0,0.594417,0.919823,0.767005
num_purchase,0.310665,0.594417,1.0,0.562469,0.546721
num_unique_product_cart,0.411137,0.919823,0.562469,1.0,0.875415
num_unique_category_cart,0.34153,0.767005,0.546721,0.875415,1.0


In [37]:
view_threshold = 15
purchase_threshold = 5
unique_threshold = 3


def categorize(row):
    view_category = "High" if row["num_view"] >= view_threshold else "Low"
    purchase_category = "High" if row["num_purchase"] >= purchase_threshold else "Low"
    unique_category = (
        "High" if row["num_unique_category_cart"] >= unique_threshold else "Low"
    )

    return (
        f"{view_category} View, {purchase_category} Purchase, {unique_category} Unique"
    )

In [58]:
cached_3[cached_3.num_purchase > 0][
    [
        "num_view",
        "num_cart",
        "num_purchase",
        "num_unique_product_cart",
        "num_unique_category_cart",
    ]
].describe()

Unnamed: 0,num_view,num_cart,num_purchase,num_unique_product_cart,num_unique_category_cart
count,21304.0,21304.0,21304.0,21304.0,21304.0
mean,5.496386,1.528351,1.753004,1.206863,1.022155
std,9.714407,1.843867,1.742418,1.118467,0.554479
min,0.0,0.0,1.0,0.0,0.0
25%,2.0,1.0,1.0,1.0,1.0
50%,3.0,1.0,1.0,1.0,1.0
75%,6.0,2.0,2.0,1.0,1.0
max,318.0,85.0,56.0,33.0,11.0


In [39]:
cached_3["user_type"] = cached_3.apply(categorize, axis=1)

In [40]:
cached_3["user_type"].value_counts().sort_index()

High View, High Purchase, High Unique       124
High View, High Purchase, Low Unique        242
High View, Low Purchase, High Unique        173
High View, Low Purchase, Low Unique        2849
Low View, High Purchase, High Unique         41
Low View, High Purchase, Low Unique         519
Low View, Low Purchase, High Unique         191
Low View, Low Purchase, Low Unique       403144
Name: user_type, dtype: int64

In [129]:
cached_3[cached_3.num_purchase > 2]

Unnamed: 0,user_id,num_view,num_cart,num_purchase,num_unique_product_cart,num_unique_category_cart,user_type
29,1515915625353534622,5,2,3,1.0,1.0,"Low View, Low Purchase, Low Unique"
94,1515915625354561351,15,3,3,1.0,1.0,"High View, Low Purchase, Low Unique"
116,1515915625355179497,17,4,3,3.0,1.0,"High View, Low Purchase, Low Unique"
127,1515915625355398801,107,3,3,1.0,1.0,"High View, Low Purchase, Low Unique"
142,1515915625355805313,107,6,9,2.0,1.0,"High View, High Purchase, Low Unique"
...,...,...,...,...,...,...,...
406929,1515915625610973155,6,1,4,1.0,1.0,"Low View, Low Purchase, Low Unique"
406947,1515915625610976222,33,17,3,10.0,1.0,"High View, Low Purchase, Low Unique"
406953,1515915625610977027,3,1,3,1.0,1.0,"Low View, Low Purchase, Low Unique"
407094,1515915625610997879,3,2,3,2.0,1.0,"Low View, Low Purchase, Low Unique"


In [41]:
# Product Wise Analysis

In [42]:
temp = df.groupby(by=["product_id"]).agg(
    num_view=("event_type", lambda x: (x == "view").sum()),
    num_cart=("event_type", lambda x: (x == "cart").sum()),
    num_purchase=("event_type", lambda x: (x == "purchase").sum()),
)

In [43]:
temp

Unnamed: 0_level_0,num_view,num_cart,num_purchase
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
102,19,3,5
105,664,10,3
109,42,0,0
526,30,4,5
561,2,0,0
...,...,...,...
4183875,183,20,7
4183876,16,1,0
4183877,6,1,0
4183878,6,0,0


In [44]:
temp_2 = (
    df[df.event_type == "cart"]
    .groupby(by=["product_id"])
    .agg(
        num_unique_user=("user_id", "nunique"),
    )
    .reset_index()
)

In [45]:
temp_2.num_unique_user.describe()

count    9733.000000
mean        4.679852
std        22.403661
min         1.000000
25%         1.000000
50%         1.000000
75%         3.000000
max      1062.000000
Name: num_unique_user, dtype: float64

In [54]:
temp_2[temp_2.num_unique_user > 5].shape

(1246, 2)

In [48]:
temp_3 = temp.merge(temp_2, on=["product_id"], how="left")

In [50]:
temp_3["num_unique_user"] = temp_3["num_unique_user"].fillna(0)

In [53]:
temp_3[temp_3.num_purchase > 0][
    ["num_view", "num_cart", "num_purchase", "num_unique_user"]
].corr()

Unnamed: 0,num_view,num_cart,num_purchase,num_unique_user
num_view,1.0,0.930896,0.868575,0.936993
num_cart,0.930896,1.0,0.958048,0.99764
num_purchase,0.868575,0.958048,1.0,0.95085
num_unique_user,0.936993,0.99764,0.95085,1.0


In [57]:
temp_3[temp_3.num_purchase > 0][
    ["num_view", "num_cart", "num_purchase", "num_unique_user"]
].describe()

Unnamed: 0,num_view,num_cart,num_purchase,num_unique_user
count,6435.0,6435.0,6435.0,6435.0
mean,68.207148,7.686092,5.803574,6.414452
std,268.658666,32.836924,20.418118,27.387671
min,0.0,0.0,1.0,0.0
25%,8.0,1.0,1.0,1.0
50%,20.0,2.0,2.0,2.0
75%,51.0,5.0,4.0,4.0
max,12805.0,1220.0,564.0,1062.0


In [61]:
item_view_threshold = 60
item_purchase_threshold = 5
item_unique_threshold = 5


def item_categorize(row):
    view_category = "High" if row["num_view"] >= item_view_threshold else "Low"
    purchase_category = (
        "High" if row["num_purchase"] >= item_purchase_threshold else "Low"
    )
    unique_category = (
        "High" if row["num_unique_user"] >= item_unique_threshold else "Low"
    )

    return (
        f"{view_category} View, {purchase_category} Purchase, {unique_category} Unique"
    )

In [62]:
temp_3["item_type"] = temp_3.apply(item_categorize, axis=1)

In [63]:
temp_3["item_type"].value_counts().sort_index()

High View, High Purchase, High Unique      893
High View, High Purchase, Low Unique        48
High View, Low Purchase, High Unique       218
High View, Low Purchase, Low Unique        980
Low View, High Purchase, High Unique       229
Low View, High Purchase, Low Unique        326
Low View, Low Purchase, High Unique        190
Low View, Low Purchase, Low Unique       50569
Name: item_type, dtype: int64

In [234]:
temp_3["add_to_cart"] = temp_3["num_cart"] / temp_3["num_view"]

In [64]:
# Build ML Model using Turicreate by Apple

In [77]:
add_to_cart = df[df.event_type == "cart"].reset_index(drop=True)

In [82]:
cart_df = (
    add_to_cart.groupby(by=["user_id", "product_id"])
    .size()
    .to_frame("num_purchase")
    .reset_index()
)

In [98]:
user_list = cached_3[
    (cached_3.user_type != "Low View, Low Purchase, Low Unique")
    & (cached_3.user_type != "High View, High Purchase, High Unique ")
]["user_id"].tolist()

In [100]:
cart_df = cart_df[cart_df.user_id.isin(user_list)].reset_index(drop=True)

In [101]:
sparse_data = pd.crosstab(
    cart_df["user_id"],
    cart_df["product_id"],
    values=cart_df["num_purchase"],
    aggfunc="sum",
).fillna(0)

In [103]:
sparse_csr = csr_matrix(sparse_data.values)

In [121]:
train_data, test_data = implicit_train_test_split(sparse_csr, train_percentage=0.8)
model = implicit.als.AlternatingLeastSquares(
    factors=20, regularization=0.15, iterations=15
)
model.fit(train_data)

precision = precision_at_k(model, train_data.T, test_data.T, K=100)
print(f"Precision at K: {precision}")

100%|██████████| 15/15 [00:00<00:00, 1340.23it/s]
100%|██████████| 880/880 [00:00<00:00, 91110.75it/s]

Precision at K: 0.03488372093023256





In [207]:
def popular_item_model(user_item_matrix, K):
    popular_items = (
        sparse_data.sum(axis=0)
        .to_frame("num_purchase")
        .reset_index()
        .sort_values(by=["num_purchase"], ascending=False)
    ).index.tolist()
    return [list(popular_items[:K]) for _ in range(user_item_matrix.shape[0])]


def calculate_precision(predicted, actual):
    true_positives = sum(item in actual for item in predicted)
    false_positives = sum(item not in actual for item in predicted)

    if true_positives + false_positives == 0:
        return 0.0  # Handle the case where there are no positive predictions

    precision = true_positives / (true_positives + false_positives)
    return precision

In [232]:
# Get the top K recommended items for each user using the ALS model
user_top_k_items_als = [
    loaded_model.recommend(
        user, train_data[user], N=100, filter_already_liked_items=True
    )
    for user in range(train_data.shape[0])
]

user_top_k_items_als = [x[0].tolist() for x in user_top_k_items_als]


user_top_k_items_popular = popular_item_model(sparse_data, K=100)

precision_als = [
    calculate_precision(user_top_k_items_als[x], test_data[x].toarray()[0])
    for x in range(train_data.shape[0])
    if test_data[x].toarray()[0].sum() > 0
]

precision_popular_items = [
    calculate_precision(user_top_k_items_popular[x], test_data[x].toarray()[0])
    for x in range(train_data.shape[0])
    if test_data[x].toarray()[0].sum() > 0
]


print(
    f"Precision at K for ALS Implicit Model: {sum(precision_als) / len(precision_als)}"
)
print(
    f"Precision at K for Popular Item Model: {sum(precision_popular_items) / len(precision_popular_items)}"
)

Precision at K for ALS Implicit Model: 0.001098003629764066
Precision at K for Popular Item Model: 0.0


In [219]:
model.save("recommender_v1.model")

  check_blas_config()


In [231]:
loaded_model = implicit.als.AlternatingLeastSquares().load("recommender_v1.model.npz")

In [243]:
high_selling_items = (
    temp_3[temp_3.item_type == "High View, High Purchase, High Unique"]
    .sort_values(by=["add_to_cart"], ascending=False)
    .reset_index(drop=True)
)
high_selling_items.to_parquet("high_selling_items.parquet", index=False)

In [247]:
# Low View, High Purchase, High Unique

undersale_items = (
    temp_3[temp_3.item_type == "Low View, High Purchase, High Unique"]
    .sort_values(by=["add_to_cart"], ascending=False)
    .reset_index(drop=True)
)
undersale_items.to_parquet("undersale_items.parquet", index=False)

In [258]:
import pickle

user_to_id = {x: y for (y, x) in enumerate(sparse_data.index.tolist())}
user_to_type = {x: y for (x, y) in zip(cached_3["user_id"], cached_3["user_type"])}
id_to_item = {x: y for (x, y) in enumerate(sparse_data.columns.tolist())}


with open("user_to_id.pickle", "wb") as handle:
    pickle.dump(user_to_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open("user_to_type.pickle", "wb") as handle:
    pickle.dump(user_to_type, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open("id_to_item.pickle", "wb") as handle:
    pickle.dump(id_to_item, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
np.save("user_items_matrix.npy", sparse_data.values)

In [None]:
# Try Service

In [329]:
import requests

api_url = "http://127.0.0.1:5000/recommend_items"

data = {"user_id": 1515915625385482819}

response = requests.post(api_url, json=data)

In [None]:
cached_3[cached_3.user_type == "High View, High Purchase, Low Unique"]