## Application of BPR on Zazzle Data

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import itertools
import numpy as np
import pandas as pd
from subprocess import call
from pybpr import *
import scipy.sparse as sp
from functools import reduce
import matplotlib.pyplot as plt
from functools import partial

## Process data from Zazzle

In [3]:
%%time
# Collect the 80 click/view parquet shards stored under ./data/NREL.
data_dir = os.path.join(os.path.curdir, 'data', 'NREL')
files = [
    os.path.join(data_dir, f'Clicks_{str(shard).zfill(4)}_part_00.parquet')
    for shard in range(80)
]
# Read every shard and stack them into one frame.
df = pd.concat(list(map(pd.read_parquet, files)))
# Split on the boolean `is_click` column: flagged rows are clicks,
# everything else is treated as a view.
cdf = df.loc[df['is_click']].copy()   # click data
vdf = df.loc[~df['is_click']].copy()  # view (non-click) data

CPU times: user 6.08 s, sys: 1.65 s, total: 7.73 s
Wall time: 6.3 s


In [4]:
%%time
# Collect the 80 order-item parquet shards found under `data_dir`.
files = [
    os.path.join(data_dir, f'OrderItems_{str(shard).zfill(4)}_part_00.parquet')
    for shard in range(80)
]
# Stack all shards into the order frame.
odf = pd.concat(list(map(pd.read_parquet, files)))

CPU times: user 116 ms, sys: 30.7 ms, total: 146 ms
Wall time: 129 ms


In [5]:
%%time
# Collect the 80 product parquet shards found under `data_dir`.
files = [
    os.path.join(data_dir, f'Products_{str(shard).zfill(4)}_part_00.parquet')
    for shard in range(80)
]
# Stack all shards into the product frame.
pdf = pd.concat(list(map(pd.read_parquet, files)))

CPU times: user 2.54 s, sys: 668 ms, total: 3.21 s
Wall time: 2.12 s


In [6]:
# Remove the columns that are not needed downstream; errors='ignore' turns the
# drop into a no-op when a column is already absent.
vdf = vdf.drop(columns=['is_click', 'cleaned_url'], errors='ignore')
cdf = cdf.drop(columns=['is_click', 'cleaned_url'], errors='ignore')
# Nest the interaction levels: every order also counts as a click,
# and every click also counts as a view.
cdf = pd.concat([cdf, odf])
vdf = pd.concat([vdf, cdf])

In [23]:
# Preview the first rows of the order frame.
# NOTE(review): this cell was executed out of order (In [23]); the recorded
# output already shows the `user_idx` / `product_idx` columns that are only
# created further down in the notebook.
odf.head()

Unnamed: 0,user_id,date_created,product_id,user_idx,product_idx
0,011fd642-8268-4b30-a9ab-038ffa5a33b6,1675585562,256455388267258097,67,538605
1,011fd642-8268-4b30-a9ab-038ffa5a33b6,1675585562,228981329284404608,67,200357
2,011fd642-8268-4b30-a9ab-038ffa5a33b6,1678206962,256009884926995133,67,375492
3,011fd642-8268-4b30-a9ab-038ffa5a33b6,1678206962,256944911110360906,67,716408
4,011fd642-8268-4b30-a9ab-038ffa5a33b6,1678206962,256996950913031084,67,735182


In [7]:
# Keep one row per (user, product) pair in each frame; when a pair repeats,
# the last occurrence is the one retained.
cdf = cdf.drop_duplicates(subset=['user_id', 'product_id'], keep='last')
vdf = vdf.drop_duplicates(subset=['user_id', 'product_id'], keep='last')
odf = odf.drop_duplicates(subset=['user_id', 'product_id'], keep='last')

In [8]:
# Per frame: number of interaction rows, distinct users (u), distinct products (i).
print(f"Views:  int={len(vdf)} u={vdf['user_id'].nunique()} i={vdf['product_id'].nunique()}")
print(f"Clicks: int={len(cdf)} u={cdf['user_id'].nunique()} i={cdf['product_id'].nunique()}")
print(f"Orders: int={len(odf)} u={odf['user_id'].nunique()} i={odf['product_id'].nunique()}")

Views:  int=28927112 u=16511 i=779192
Clicks: int=833698 u=16402 i=313452
Orders: int=266638 u=16302 i=139654


In [9]:
# Activity filter computed on the view data: a user is "valid" when it has more
# than 10 view rows, and a product is "valid" when it has more than 10 view rows.
idf = vdf.groupby('user_id')['product_id'].count() > 10
valid_users = idf.index[idf.to_numpy()].to_numpy()
jdf = vdf.groupby('product_id')['user_id'].count() > 10
valid_items = jdf.index[jdf.to_numpy()].to_numpy()
print(f'Valid Users = {len(valid_users)}')
print(f'Valid Items = {len(valid_items)}')

Valid Users = 16414
Valid Items = 736254


In [10]:
# Restrict all three frames to the users that passed the view-activity filter.
# An intersection of the users present in every frame used to be computed here
# with reduce(np.intersect1d, ...) and was then immediately overwritten by
# `valid_users`; that dead (and costly) computation is removed. The effective
# filter is unchanged.
common_users = valid_users
vdf = vdf[vdf.user_id.isin(common_users)].copy()
cdf = cdf[cdf.user_id.isin(common_users)].copy()
odf = odf[odf.user_id.isin(common_users)].copy()
# Remaining distinct users in views / clicks / orders.
print(vdf.user_id.nunique(), cdf.user_id.nunique(), odf.user_id.nunique())

# Same treatment for products: the discarded intersection is gone and the
# view-activity filter `valid_items` is what is actually applied.
common_items = valid_items
vdf = vdf[vdf.product_id.isin(common_items)].copy()
cdf = cdf[cdf.product_id.isin(common_items)].copy()
odf = odf[odf.product_id.isin(common_items)].copy()
# Remaining distinct products in views / clicks / orders.
print(vdf.product_id.nunique(), cdf.product_id.nunique(), odf.product_id.nunique())

16414 16338 16244
736254 305854 136873


In [11]:
%%time
# Build integer indices for users and products. The view frame defines the
# category set, and the very same categories are reused for every other frame
# so that a given id maps to the same integer code everywhere.
for cname in ['user_id', 'product_id']:
    vdf[cname] = vdf[cname].astype('category')
    # Take the categories straight from the dtype rather than from `.unique()`:
    # this is exactly the ordering behind `vdf`'s codes, it does not depend on
    # how `Categorical.unique()` orders/retains categories in a given pandas
    # version, and it avoids re-scanning the large view frame for each target.
    shared_categories = vdf[cname].cat.categories
    # f'{cname}x' yields the column names 'user_idx' and 'product_idx'.
    vdf[f'{cname}x'] = vdf[cname].cat.codes.astype(int)
    cdf[cname] = pd.Categorical(
        cdf[cname],
        categories=shared_categories,
        ordered=False
    )
    cdf[f'{cname}x'] = cdf[cname].cat.codes.astype(int)
    odf[cname] = pd.Categorical(
        odf[cname],
        categories=shared_categories,
        ordered=False
    )
    odf[f'{cname}x'] = odf[cname].cat.codes.astype(int)

# Index the product table with the same product categories. Ids that are not
# among the categories become missing values and receive the code -1.
cname = 'product_id'
pdf[cname] = pd.Categorical(
    pdf[cname],
    categories=vdf[cname].cat.categories,
    ordered=False
)
pdf[f'{cname}x'] = pdf[cname].cat.codes.astype(int)

CPU times: user 2.94 s, sys: 238 ms, total: 3.18 s
Wall time: 3.18 s


In [12]:
# Spot-check the indexing: all view rows whose product index is 33.
vdf.loc[vdf['product_idx'] == 33]

Unnamed: 0,user_id,date_created,product_id,user_idx,product_idx
245167,6db7097f-7109-4e0a-b1d7-6941f9361231,1668170220,106001757029220378,6988,33
405687,c2c9f150-1462-4e06-b6ac-d8fb984470c9,1668763238,106001757029220378,12461,33
528956,e92b45f2-ea6c-4daa-bd3a-8bfab2e126dc,1671377316,106001757029220378,14975,33
483320,cdff49f8-8663-4aa4-9ce8-e2e6dad503d9,1667721154,106001757029220378,13198,33
107846,37c1651e-269a-46f4-b11e-02985ed1a0d7,1667992571,106001757029220378,3576,33
...,...,...,...,...,...
552998,f5f6014e-4705-440c-a61e-81159fc8d8fb,1669613849,106001757029220378,15806,33
63005,2cc74a67-5f36-49b0-aead-43b3e0b6d222,1669725218,106001757029220378,2854,33
178142,3b265f20-8b23-4d24-be12-b6abebaf11a4,1665512560,106001757029220378,3791,33
2932,d7e49f54-a018-42a7-9767-d7db39c35030,1668407876,106001757029220378,13842,33


In [13]:
# Same spot-check on the click data: product index 33 should point at the
# same product_id as in the view frame.
cdf.loc[cdf['product_idx'] == 33]

Unnamed: 0,user_id,date_created,product_id,user_idx,product_idx
552998,f5f6014e-4705-440c-a61e-81159fc8d8fb,1669613849,106001757029220378,15806,33
63005,2cc74a67-5f36-49b0-aead-43b3e0b6d222,1669725218,106001757029220378,2854,33
178142,3b265f20-8b23-4d24-be12-b6abebaf11a4,1665512560,106001757029220378,3791,33
2932,d7e49f54-a018-42a7-9767-d7db39c35030,1668407876,106001757029220378,13842,33
136,0c27b758-e2d1-4aac-b1b8-a5bc9a830fc1,1669122077,106001757029220378,779,33


## Put Zazzle data in UserItemInteractions class objects

In [14]:
%%time
# Matrix dimensions are taken from the view data: indices are 0-based codes,
# so each size is the largest index plus one.
num_users = 1 + vdf['user_idx'].max()
num_items = 1 + vdf['product_idx'].max()

# Wrap the view interactions in the pybpr container.
view_data = UserItemInteractions(
    users_index=vdf['user_idx'],
    items_index=vdf['product_idx'],
    name='ZAZZLE VIEW DATA',
)
# Create the train/test split.
# NOTE(review): user_test_ratio=0.2 presumably holds out 20% — confirm in pybpr.
view_data.generate_train_test(user_test_ratio=0.2)

----UserItemInteractions--ZAZZLE VIEW DATA
# of users (active/total): 16414/16414
# of items (active/total): 736254/736254
# of interactions: 28497248
Sparsity in the UI mat: 0.002358
Memory used by sparse UI mat: 27.18 MB

Generating train-test split..done
CPU times: user 1min, sys: 1.82 s, total: 1min 2s
Wall time: 1min 2s


In [15]:
%%time
# Wrap the click interactions, passing the view-derived dimensions explicitly
# (`num_users`, `num_items`) rather than letting them be derived from the clicks.
click_data = UserItemInteractions(
    users_index=cdf['user_idx'],
    items_index=cdf['product_idx'],
    num_users=num_users,
    num_items=num_items,
    name='ZAZZLE CLICK DATA',
)
# Create the train/test split with the same ratio as the view data.
click_data.generate_train_test(user_test_ratio=0.2)

----UserItemInteractions--ZAZZLE CLICK DATA
# of users (active/total): 16317/16414
# of items (active/total): 305854/736254
# of interactions: 824856
Sparsity in the UI mat: 6.8e-05
Memory used by sparse UI mat: 0.79 MB

Generating train-test split..done
CPU times: user 3.38 s, sys: 13.1 ms, total: 3.39 s
Wall time: 3.4 s


In [16]:
%%time
# Wrap the order interactions, passing the view-derived dimensions explicitly
# (`num_users`, `num_items`) rather than letting them be derived from the orders.
order_data = UserItemInteractions(
    users_index=odf['user_idx'],
    items_index=odf['product_idx'],
    num_users=num_users,
    num_items=num_items,
    name='ZAZZLE ORDER DATA',
)
# Create the train/test split with the same ratio as the view data.
order_data.generate_train_test(user_test_ratio=0.2)

----UserItemInteractions--ZAZZLE ORDER DATA
# of users (active/total): 16218/16414
# of items (active/total): 136873/736254
# of interactions: 263623
Sparsity in the UI mat: 2.2e-05
Memory used by sparse UI mat: 0.25 MB

Generating train-test split..done
CPU times: user 2.11 s, sys: 11.7 ms, total: 2.12 s
Wall time: 2.11 s


## BPR 

In [53]:
# Two BPR models that differ only in the number of latent features (30 vs 50).
bpr1 = BPR(
    num_features=30,
    reg_lambda=0.0001,
    num_iters=500,
    learning_rate = 0.02,
    batch_size=200,
    initial_std=0.001,
)
bpr1.initiate(uimat=view_data.mat)

bpr2 = BPR(
    num_features=50,
    reg_lambda=0.0001,
    num_iters=500,
    learning_rate = 0.02,
    batch_size=200,
    initial_std=0.001,
)
# Fix: this line previously called bpr1.initiate(...) a second time, which
# left bpr2 uninitialised (and needlessly re-initialised bpr1).
bpr2.initiate(uimat=view_data.mat)


In [18]:
# Stage 1: fit bpr1 on the view training matrix. Per the original notes,
# bpr1.umat / bpr1.imat hold random values before this call and the
# view-optimised factors after it.
results = bpr_fit(bpr_obj=bpr1, iumat=view_data.mat_train, ncores=104)
# Stage 2: fit the same object on the click training matrix WITHOUT calling
# bpr1.initiate(...) in between, so (per the original notes) the factors learned
# from the view data are reused as the starting point. `results` from stage 1
# is overwritten here.
# NOTE(review): ncores=104 is hard-coded — presumably sized for the original machine; confirm.
results = bpr_fit(bpr_obj=bpr1, iumat=click_data.mat_train, ncores=104)
# Optional third stage on the order data (currently disabled):
#results = bpr_fit(bpr_obj=bpr1, iumat=order_data.mat_train, ncores=104)

Fit: 100%|██████████| 10000/10000 [00:02<00:00, 4348.50it/s]


In [54]:
# Display the current user and item factor matrices of bpr1.
# NOTE(review): this cell ran as In [54], right after the In [53] cell that
# re-creates and re-initialises bpr1, so the values shown (magnitude ~1e-3,
# in line with initial_std=0.001) are presumably the initial weights, not
# fitted ones — confirm before interpreting.
bpr1.umat, bpr1.imat

(array([[ 4.71435164e-04, -1.19097569e-03,  1.43270697e-03, ...,
         -1.81702723e-03, -1.83108540e-04,  1.05896919e-03],
        [-3.97840228e-04,  3.37437654e-04,  1.04757857e-03, ...,
         -3.55130253e-05,  5.65738306e-04,  1.54565880e-03],
        [-9.74236334e-04, -7.03448771e-05,  3.07968855e-04, ...,
          9.84919842e-04,  2.70835849e-04,  1.39198619e-03],
        ...,
        [ 9.40525513e-04, -1.61735474e-03,  1.11887761e-03, ...,
         -2.08147712e-03, -1.13396655e-05,  1.40192829e-03],
        [ 3.22198654e-04, -5.85506895e-04, -7.57560851e-04, ...,
          3.69511923e-04,  6.40454740e-04,  1.30747529e-03],
        [-1.05409526e-03, -4.49854298e-04, -3.60128825e-04, ...,
          1.21015351e-03,  1.53883427e-03, -1.23499802e-03]]),
 array([[ 2.40535992e-03, -1.55488771e-04,  1.26797171e-03, ...,
         -1.06594109e-03, -7.30695079e-04,  2.93247923e-04],
        [-5.46111734e-06,  2.32809644e-04,  4.36806429e-04, ...,
          9.69326898e-04, -9.73523851e

In [19]:
# Evaluate the learned user/item factors against the full order matrix.
compute_metric(
    bpr1.umat,
    bpr1.imat,
    order_data.mat,
    user_count=2000,
    num_recs=60,
)

1200

In [36]:
# Ask the order-data container for 60 top items for the first positional
# argument (30), using bpr1's factors.
# NOTE(review): 30 is presumably a user index, and exclude_liked=True presumably
# skips items that user already ordered — confirm in pybpr.
rec_items = order_data.get_top_items_for_this_user(
    30,
    user_mat=bpr1.umat,
    item_mat=bpr1.imat,
    num_items=60,
    exclude_liked=True,
)

In [40]:
# Sanity check: shape of bpr1's user factor matrix.
bpr1.umat.shape

(16414, 30)

In [41]:
# Scratch arithmetic.
# NOTE(review): the recorded output below (100000) does not equal
# 2000*5000 = 10,000,000, so that output is stale; re-run before relying on it.
2000*5000

100000

In [50]:
# Number of stored entries in the view training matrix.
view_data.mat_train.nnz

22791229

In [48]:
# Ratio of 2000*50000 to the number of empty cells in the view matrix
# (16414 x 736254 total cells minus the stored entries). The two sizes are
# hard-coded copies of the user/item totals reported for the view data.
# NOTE(review): the meaning of 2000 and 50000 is not stated here — presumably
# a user count and a per-user sample count; confirm.
(2000*50000)/(16414*736254 - view_data.mat.nnz)

0.00829436646327899

In [44]:
# Number of stored entries in the full view matrix.
view_data.mat.nnz

28497248

In [None]:
# Display the view-data interaction object (this cell has not been executed yet).
view_data