## Application of BPR on Zazzle Data

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to library code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import itertools
import numpy as np
import pandas as pd
import matplotlib as mpl
from subprocess import call
from copy import deepcopy
from pybpr import *
import scipy.sparse as sp
from functools import reduce
import matplotlib.pyplot as plt
from functools import partial
from datetime import datetime, timedelta
from scipy.special import expit

ModuleNotFoundError: No module named 'pybpr'

In [None]:
# Read the five Zazzle interaction tables from parquet files under DATA_DIR.
DATA_DIR = '/projects/zazzle/rsandhu/pybpr/examples/output/zazzle_data'


def _read_table(file_name):
    """Load one parquet file located in DATA_DIR into a DataFrame."""
    return pd.read_parquet(os.path.join(DATA_DIR, file_name))


df_v = _read_table('view_data.parquet')
df_c = _read_table('click_data.parquet')
df_o = _read_table('order_data.parquet')
df_v_not_c = _read_table('viewed_not_clicked_data.parquet')
df_c_not_o = _read_table('clicked_not_ordered_data.parquet')

In [None]:
# Group the interaction tables by name so later cells can iterate over them.
zazzle_data = dict(
    clicked=df_c,
    ordered=df_o,
    clicked_not_ordered=df_c_not_o,
    viewed_not_clicked=df_v_not_c,
)

# Print the earliest and latest 'date_created' of every table; the column is
# handed to datetime.fromtimestamp, i.e. it is read as POSIX timestamps.
for iname, idf in zazzle_data.items():
    created = idf['date_created']
    min_time, max_time = (
        datetime.fromtimestamp(stamp) for stamp in (created.min(), created.max())
    )
    print(iname, ': ', min_time, ' - ', max_time)

In [None]:
# Build the train/test time-weighting functions by freezing the window and
# scale arguments of get_time_weighting (imported from pybpr), then store both
# weights as new columns on every interaction table.
# Alternative train window kept from the original notebook (disabled):
#   datetime_min=datetime(2023,1,1), datetime_max=datetime(2023,2,1), scaling_days=10
train_window = dict(
    datetime_min=datetime(2021, 1, 1),
    datetime_max=datetime(2023, 1, 1),
    scaling_days=180,
)
test_window = dict(
    datetime_min=datetime(2023, 1, 1),
    datetime_max=datetime(2025, 1, 1),
    scaling_days=-180,
)
fn_train = partial(get_time_weighting, **train_window)
fn_test = partial(get_time_weighting, **test_window)

# Adds the weight columns to the tables in place.
for idf in zazzle_data.values():
    idf['time_wgt_train'] = fn_train(idf['date_created'])
    idf['time_wgt_test'] = fn_test(idf['date_created'])

In [None]:
# Evaluate both weighting functions on a grid of 100 dates spaced 10 days
# apart, starting 2021-08-01, and plot them against calendar time.
grid_start = datetime(2021, 8, 1)
sample_datetimes = [grid_start + timedelta(days=10 * step) for step in range(100)]
dates = mpl.dates.date2num(sample_datetimes)  # x-axis values in matplotlib date units
list_of_dates = np.array([moment.timestamp() for moment in sample_datetimes])  # POSIX seconds

fig, ax = plt.subplots(figsize=(6, 3))
for weight_fn, line_style, curve_label in (
    (fn_train, '-r', 'Train'),
    (fn_test, '-b', 'Test'),
):
    ax.plot(dates, weight_fn(list_of_dates), line_style, label=curve_label)
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%Y-%m'))
ax.set_xlabel('Time')
ax.set_ylabel('Time weighting')
ax.grid(True)
fig.autofmt_xdate()
ax.legend()

In [23]:
# Build one UserItemInteractions object per interaction table. Rows enter the
# train matrix when their train weight exceeds 0.01 and the test matrix when
# their test weight exceeds 0.01.
num_users = df_v['user_id'].nunique()
num_items = df_v['product_id'].nunique()

zazzle_ui = {}
for iname, idf in zazzle_data.items():
    in_train = idf['time_wgt_train'].gt(0.01)
    interactions = UserItemInteractions(
        users_index=idf['user_idx'][in_train],
        items_index=idf['product_idx'][in_train],
        num_users=num_users,
        num_items=num_items,
        weights=idf['time_wgt_train'][in_train],
        name=iname
    )
    zazzle_ui[iname] = interactions

    in_test = idf['time_wgt_test'].gt(0.01)
    interactions.create_test_matrix(
        users_index=idf['user_idx'][in_test],
        items_index=idf['product_idx'][in_test]
    )
    print(interactions)


---clicked---
# of users, train: 13555/16462
# of items, train: 257311/779192
# of users, test: 6353/16462
# of items, test: 117639/779192
# of interactions, train: 621358
# of interactions, test: 212287
Memory used by train/test mat: 1.19/0.4 MB

---ordered---
# of users, train: 13179/16462
# of items, train: 110833/779192
# of users, test: 5598/16462
# of items, test: 46891/779192
# of interactions, train: 197803
# of interactions, test: 68795
Memory used by train/test mat: 0.38/0.13 MB

---clicked_not_ordered---
# of users, train: 12759/16462
# of items, train: 201722/779192
# of users, test: 5980/16462
# of items, test: 89738/779192
# of interactions, train: 423555
# of interactions, test: 143492
Memory used by train/test mat: 0.81/0.27 MB

---viewed_not_clicked---
# of users, train: 13748/16462
# of items, train: 764264/779192
# of users, test: 6769/16462
# of items, test: 682202/779192
# of interactions, train: 20949067
# of interactions, test: 7144259
Memory used by train/test 

In [24]:
# Spot check the time weighting for a single user: list the user's positive
# and negative interactions with their train weights, then draw 1000 samples
# and report how often each positive item is selected.
pos_int = 'ordered'
neg_int = 'clicked_not_ordered'
iuser = 1823


def _print_user_rows(label, frame, user):
    """Print time, product index and train weight of every row of `user`.

    Rows are listed in descending index order. Returns the filtered frame.
    """
    print(label)
    user_rows = frame[frame['user_idx'] == user].sort_index(ascending=False)
    for _, irow in user_rows.iterrows():
        itime = datetime.fromtimestamp(irow['date_created'])
        print(itime, ' : ', irow['product_idx'], ' = ',
              np.around(irow['time_wgt_train'], 2))
    return user_rows


idf = _print_user_rows('positive', zazzle_data[pos_int], iuser)
jdf = _print_user_rows('negative', zazzle_data[neg_int], iuser)

print('selection')
# Use pos_int / neg_int here as well, so the sampled matrices always match
# the tables printed above (they were previously hard-coded by name).
# NOTE(review): this cell reads `.mat` while the training cells read
# `.mat_train` - confirm which attribute is intended.
sel_list = [
    time_explicit_negative_sampler(
        iuser=iuser,
        pos_uimat=zazzle_ui[pos_int].mat,
        neg_uimat=zazzle_ui[neg_int].mat
    )[0]  # keep only the first element of the returned pair
    for _ in range(1000)
]
entry, count = np.unique(np.array(sel_list), return_counts=True)
count = count / count.sum()  # relative selection frequency
for ix, iy in zip(entry, count):
    print(ix, np.around(iy, 2))

positive
2022-09-16 05:38:11  :  425228  =  0.81
2022-09-16 05:38:11  :  736955  =  0.81
2022-09-16 05:38:11  :  743133  =  0.81
2022-09-16 05:38:11  :  527840  =  0.81
2022-07-22 12:11:39  :  530783  =  0.75
2022-07-22 12:11:39  :  413853  =  0.75
2022-07-22 12:11:39  :  405897  =  0.75
2022-07-22 12:11:39  :  642950  =  0.75
2022-07-22 12:11:39  :  507193  =  0.75
2022-07-22 12:11:39  :  491377  =  0.75
negative
2022-09-16 05:20:00  :  690793  =  0.81
2022-09-16 05:18:15  :  550388  =  0.81
2022-09-16 05:10:11  :  436756  =  0.81
2022-09-16 05:07:25  :  481925  =  0.81
2022-09-16 05:01:21  :  553371  =  0.81
2022-09-16 04:55:48  :  714018  =  0.81
2022-07-22 12:06:33  :  464754  =  0.75
2022-07-22 11:57:07  :  419772  =  0.75
selection
405897 0.11
413853 0.1
425228 0.1
491377 0.1
507193 0.08
527840 0.1
530783 0.12
642950 0.07
736955 0.11
743133 0.11


In [25]:
# Choose the positive/negative interaction pair used for training and testing.
# Pair 1 (disabled) - swap in these three lines to use it:
#   mat_train_pos = zazzle_ui['ordered'].mat_train
#   mat_train_neg = zazzle_ui['clicked_not_ordered'].mat_train
#   mat_test = zazzle_ui['ordered'].mat_test
# Pair 2 (active): 'clicked' as positives, 'viewed_not_clicked' as negatives.
clicked_ui = zazzle_ui['clicked']
mat_train_pos = clicked_ui.mat_train
mat_train_neg = zazzle_ui['viewed_not_clicked'].mat_train
mat_test = clicked_ui.mat_test

# Template BPR configuration reused by the sampler experiments below.
bpr_base = BPR(
    num_features=200,
    num_iters=500,
    batch_size=10000,
    initial_std=0.0001,
    reg_lambda=0.0,
    learning_rate=0.1,
)
bpr_base

BPR(mname='bpr_model', num_features=200, num_iters=500, batch_size=10000, initial_std=0.0001, reg_lambda=0.0, learning_rate=0.1, verbose=False)

In [28]:
# Uniform negative sampler: negatives are drawn using only the positive
# training matrix.
neg_sampler_uni = partial(
    uniform_negative_sampler,
    uimat=mat_train_pos
)

# Work on an independent copy of the template. A plain assignment would make
# bpr_uni the very same object as bpr_base (and as every other model assigned
# from it), so the experiments would overwrite each other's state.
bpr_uni = deepcopy(bpr_base)
bpr_uni.initiate(num_users=num_users, num_items=num_items)

# The metric call does not change between rounds, so build it once.
mfunc = partial(
    bpr_uni.get_metric_v1,
    perc_active_users=0.99,
    perc_active_items=0.99,
    num_recs=60,
    max_users_per_batch=2000
)

# Two rounds of training, each followed by an evaluation on the test matrix.
for _ in range(2):
    results = bpr_fit(
        bpr_obj=bpr_uni,
        neg_sampler=neg_sampler_uni,
        ncores=104
    )
    _ = mfunc(uimat=mat_test)

BPR-Train: 100%|██████████| 5000000/5000000 [00:08<00:00, 595207.14it/s]
BPR-Score: 100%|██████████| 4/4 [01:09<00:00, 17.28s/it]
BPR-Train: 100%|██████████| 5000000/5000000 [00:08<00:00, 579445.10it/s]
BPR-Score: 100%|██████████| 4/4 [01:07<00:00, 16.98s/it]


In [31]:
# Show the metrics tracked on bpr_uni (the saved output lists one array per
# evaluation round of the cell above).
bpr_uni.metric_tracker

[array([0., 0., 0.]), array([0., 0., 0.])]

In [12]:
# Explicit negative sampler: positives come from mat_train_pos and negatives
# from mat_train_neg.
neg_sampler_exp = partial(
    explicit_negative_sampler,
    pos_uimat=mat_train_pos,
    neg_uimat=mat_train_neg
)

# Work on an independent copy of the template. A plain assignment would make
# bpr_exp the very same object as bpr_base (and as every other model assigned
# from it), so the experiments would overwrite each other's state.
bpr_exp = deepcopy(bpr_base)
bpr_exp.initiate(num_users=num_users, num_items=num_items)

# The metric call does not change between rounds, so build it once.
mfunc = partial(
    bpr_exp.get_metric_v1,
    perc_active_users=0.5,
    perc_active_items=0.5,
    num_recs=60,
    max_users_per_batch=100
)

# Five rounds of training, each followed by an evaluation on the test matrix.
for _ in range(5):
    results = bpr_fit(
        bpr_obj=bpr_exp,
        neg_sampler=neg_sampler_exp,
        ncores=104
    )
    _ = mfunc(uimat=mat_test)

BPR-Train: 100%|██████████| 5000000/5000000 [00:54<00:00, 91517.34it/s]
BPR-Score: 100%|██████████| 64/64 [01:04<00:00,  1.01s/it]
BPR-Train: 100%|██████████| 5000000/5000000 [00:54<00:00, 91207.55it/s]
BPR-Score: 100%|██████████| 64/64 [01:11<00:00,  1.12s/it]
BPR-Train: 100%|██████████| 5000000/5000000 [00:54<00:00, 91533.61it/s]
BPR-Score: 100%|██████████| 64/64 [01:18<00:00,  1.23s/it]
BPR-Train: 100%|██████████| 5000000/5000000 [00:54<00:00, 91369.43it/s]
BPR-Score: 100%|██████████| 64/64 [01:16<00:00,  1.19s/it]
BPR-Train: 100%|██████████| 5000000/5000000 [00:54<00:00, 91298.85it/s]
BPR-Score: 100%|██████████| 64/64 [01:16<00:00,  1.20s/it]


In [14]:
# Show the metrics tracked on bpr_exp (the saved output lists one array per
# evaluation round of the cell above).
bpr_exp.metric_tracker

[array([0., 0., 0.]),
 array([0., 0., 0.]),
 array([0., 0., 0.]),
 array([0., 0., 0.]),
 array([0., 0., 0.])]

In [10]:
# Time-explicit negative sampler: positives come from mat_train_pos and
# negatives from mat_train_neg.
neg_sampler_texp = partial(
    time_explicit_negative_sampler,
    pos_uimat=mat_train_pos,
    neg_uimat=mat_train_neg
)

# Work on an independent copy of the template. A plain assignment would make
# bpr_texp the very same object as bpr_base (and as every other model assigned
# from it), so the experiments would overwrite each other's state.
bpr_texp = deepcopy(bpr_base)
bpr_texp.initiate(num_users=num_users, num_items=num_items)

# The metric call does not change between rounds, so build it once.
mfunc = partial(
    bpr_texp.get_metric_v1,
    perc_active_users=0.5,
    perc_active_items=0.5,
    num_recs=60,
    max_users_per_batch=100
)

# Five rounds of training, each followed by an evaluation. The evaluation
# uses mat_test, like the uniform and explicit sampler cells; scoring against
# mat_train_pos would measure fit on the training data and could not be
# compared with the other samplers.
for _ in range(5):
    results = bpr_fit(
        bpr_obj=bpr_texp,
        neg_sampler=neg_sampler_texp,
        ncores=104
    )
    _ = mfunc(uimat=mat_test)

BPR-Train:  25%|██▌       | 1250080/5000000 [00:30<01:10, 53184.14it/s]

Process ForkPoolWorker-2:
Process ForkPoolWorker-75:
Traceback (most recent call last):
Process ForkPoolWorker-68:
  File "/home/rsandhu/.conda-envs/bpr_env/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/rsandhu/.conda-envs/bpr_env/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/rsandhu/.conda-envs/bpr_env/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/rsandhu/.conda-envs/bpr_env/lib/python3.12/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/home/rsandhu/.conda-envs/bpr_env/lib/python3.12/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
           ^^^^^^^^^^^^^^^^
  File "/kfs2/projects/zazzle/rsandhu/pybpr/pybpr/bpr.py", line 357, in bpr_update
    item_ith, item_jth = 

BPR-Train:  25%|██▌       | 1274119/5000000 [01:32<04:31, 13729.07it/s]

Process ForkPoolWorker-5:
Traceback (most recent call last):
  File "/home/rsandhu/.conda-envs/bpr_env/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Process ForkPoolWorker-73:
  File "/home/rsandhu/.conda-envs/bpr_env/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/rsandhu/.conda-envs/bpr_env/lib/python3.12/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/home/rsandhu/.conda-envs/bpr_env/lib/python3.12/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
           ^^^^^^^^^^^^^^^^
  File "/kfs2/projects/zazzle/rsandhu/pybpr/pybpr/bpr.py", line 357, in bpr_update
    item_ith, item_jth = negative_sampler(iuser=this_user)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/kfs2/projects/zazzle/rsandhu/pybpr/pybpr/sampling.py", line 118, in time_explicit_neg




KeyboardInterrupt: 

In [None]:
# Display the metrics tracked on bpr_uni again.
bpr_uni.metric_tracker

In [None]:
# Histogram (40 bins over the range 1-60) of the row sums of mat_test.
# NOTE(review): rows presumably correspond to users - confirm.
_ = plt.hist(np.asarray(mat_test.sum(axis=1)), bins=40, range=(1,60))

In [None]:
# The 50 largest row sums of mat_test, in descending order.
np.flip(np.sort(np.asarray(mat_test.sum(axis=1)).flatten()))[:50]

In [None]:
# Check whether mat_test is stored with dtype float16.
mat_test.dtype == np.float16

In [None]:

# # explicit negative sampler
# neg_sampler_exp = partial(
#     explicit_negative_sampler,
#     pos_uimat=mat_train_pos,
#     neg_uimat=mat_train_neg
# )
# # timed explicit negative sampler
# neg_sampler_texp = partial(
#     time_explicit_negative_sampler,
#     pos_uimat=mat_train_pos,
#     neg_uimat=mat_train_neg
# )

# bpr1 = BPR(
#     num_features=200,
#     reg_lambda=0.0,
#     num_iters=500,
#     learning_rate = 0.1,
#     batch_size=10000,
#     initial_std=0.0001,
# )


In [None]:
# Display the number of distinct items (num_items is set from
# df_v.product_id.nunique() in another cell).
num_items

In [None]:
# def get_time_weighting(x, cutoff_days, scaling_days):
#     scale = timedelta(days=scaling_days).total_seconds()
#     cutoff = timedelta(days=cutoff_days).total_seconds()
#     iday = timedelta(days=x).total_seconds()
#     return expit(-(iday-cutoff)/scale)

# fig, ax = plt.subplots(figsize=(5,3))
# x = np.linspace(0,40,41)
# weekly_fn = partial(get_time_weighting, cutoff_days=7, scaling_days=1)
# ax.plot(x, [weekly_fn(ix) for ix in x],'-r', label='Weekly BPR')
# monthly_fn = partial(get_time_weighting, cutoff_days=25, scaling_days=2)
# ax.plot(x, [monthly_fn(ix) for ix in x],'-b', label='Monthly BPR')
# ax.grid(True)
# ax.legend()
# ax.set_xlabel('Time lag (days)')
# ax.set_ylabel('Time weighting')

In [None]:
# Display the 'date_created' column of the click table. The closing quote of
# the column name was missing, which made this cell a SyntaxError.
df_c['date_created']

In [None]:
# NOTE(review): `tlags` is not assigned in any visible cell of this notebook,
# so this cell raises NameError on a fresh kernel unless it is defined
# elsewhere - confirm, then define it or remove this cell.
tlags

In [None]:
# Build train/test splits for the four interaction tables without time
# weighting. All four share the same user/item counts and test ratio.
num_users = df_v['user_id'].nunique()
num_items = df_v['product_id'].nunique()

test_ratio = 0.0


def _make_interactions(frame):
    """Create a UserItemInteractions from `frame` and generate its train/test split."""
    interactions = UserItemInteractions(
        users_index=frame['user_idx'],
        items_index=frame['product_idx'],
        num_users=num_users,
        num_items=num_items
    )
    interactions.generate_train_test(user_test_ratio=test_ratio)
    return interactions


# Same construction order as before: viewed-not-clicked, clicked,
# clicked-not-ordered, ordered.
data_viewed_not_clicked = _make_interactions(df_v_not_c)
data_clicked = _make_interactions(df_c)
data_clicked_not_ordered = _make_interactions(df_c_not_o)
data_ordered = _make_interactions(df_o)

## BPR 

In [None]:
# Configure a BPR model and initialise it with the user and item counts of
# data_clicked.
bpr1 = BPR(
    num_features=200,
    num_iters=500,
    batch_size=15000,
    initial_std=0.0001,
    reg_lambda=0.0,
    learning_rate=0.1,
)
bpr1.initiate(
    num_users=data_clicked.num_users,
    num_items=data_clicked.num_items,
)

In [None]:
# Train bpr1 with explicitly sampled negatives and record the metric returned
# after every training round.
pos_data = data_clicked
neg_data = data_clicked_not_ordered
# NOTE(review): by their names, 'clicked' may contain the rows of
# 'clicked_not_ordered', in which case positives and negatives overlap -
# confirm that this pairing is intended.

# Alternative kept from the original notebook (disabled):
#   neg_sampler = partial(uniform_negative_sampler, uimat=training_data)
neg_sampler = partial(
    explicit_negative_sampler,
    pos_uimat=pos_data.mat,
    neg_uimat=neg_data.mat
)

# The metric call is identical in every round, so it is built once up front.
mfunc = partial(
    bpr1.get_metric_v1,
    perc_active_users=0.25,
    perc_active_items=0.25,
    num_recs=60,
    max_users_per_batch=1000
)

metric_log_train = []
for _ in range(4):
    results = bpr_fit(
        bpr_obj=bpr1,
        neg_sampler=neg_sampler,
        ncores=104
    )
    # Scored against the same matrix that supplies the positives.
    metric_log_train.append(mfunc(uimat=pos_data.mat))
metric_log_train = np.asarray(metric_log_train)

In [None]:
# Display the per-round training metrics collected for bpr1.
metric_log_train

In [None]:
# Save bpr1 into OUT_DIR.
# NOTE(review): OUT_DIR is an absolute, user-specific path - consider making
# it configurable.
OUT_DIR ='/projects/zazzle/rsandhu/pybpr/examples/output'
bpr1.save_model(dir_name=OUT_DIR)

In [None]:
# Load the model stored in OUT_DIR back into bpr1.
bpr1.load_model(OUT_DIR)

In [None]:
# Create a BPR with default arguments and load the model stored in OUT_DIR
# into it.
bpr2 = BPR()
bpr2.load_model(OUT_DIR)

In [None]:
# Check that the reloaded model matches bpr1 in both `umat` and `imat`.
# A notebook cell displays only its last expression, so the two checks are
# returned together as a tuple; as separate statements the first result was
# computed and then silently discarded.
np.all(bpr2.umat == bpr1.umat), np.all(bpr2.imat == bpr1.imat)

In [None]:
# Display the repr of the reloaded model.
bpr2

In [None]:
# Display the repr of the original model for comparison.
bpr1