## Application of BPR on Zazzle Data

In [1]:
# Enable IPython's autoreload extension (mode 2) so that edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import itertools
import numpy as np
import pandas as pd
from subprocess import call
from pybpr import *
import scipy.sparse as sp
from functools import reduce
import matplotlib.pyplot as plt
from functools import partial

## Process data from Zazzle

In [None]:
%%time
# Read the 80 click-log shards and split the rows on the `is_click` flag.
data_dir = os.path.join(os.path.curdir, 'data', 'NREL')
files = [
    os.path.join(data_dir, f'Clicks_{ix:04d}_part_00.parquet')
    for ix in range(80)
]
df = pd.concat(map(pd.read_parquet, files))
cdf = df.loc[df['is_click']].copy()   # rows flagged as clicks
vdf = df.loc[~df['is_click']].copy()  # every other row is treated as a view

In [None]:
%%time
# Read the 80 order-item shards into a single frame.
files = [
    os.path.join(data_dir, f'OrderItems_{ix:04d}_part_00.parquet')
    for ix in range(80)
]
odf = pd.concat(map(pd.read_parquet, files))

In [None]:
%%time
# Read the 80 product shards into a single frame.
files = [
    os.path.join(data_dir, f'Products_{ix:04d}_part_00.parquet')
    for ix in range(80)
]
pdf = pd.concat(map(pd.read_parquet, files))

In [None]:
# Drop the bookkeeping columns (ignored when absent), then nest the frames:
# orders are appended to clicks, and clicks (now including orders) to views.
vdf = vdf.drop(columns=['is_click', 'cleaned_url'], errors='ignore')
cdf = pd.concat([
    cdf.drop(columns=['is_click', 'cleaned_url'], errors='ignore'),
    odf,
])  # ensures order data is part of click data
vdf = pd.concat([vdf, cdf])  # ensures click data is part of view data

In [None]:
# Show the first rows of the order-items frame.
odf.head()

In [None]:
# Keep one row per (user, product) pair in each frame; the last occurrence wins.
cdf = cdf.drop_duplicates(subset=['user_id', 'product_id'], keep='last')
vdf = vdf.drop_duplicates(subset=['user_id', 'product_id'], keep='last')
odf = odf.drop_duplicates(subset=['user_id', 'product_id'], keep='last')

In [None]:
# Row count plus distinct users / products for each interaction frame.
print(f'Views:  int={len(vdf)} u={vdf["user_id"].nunique()} i={vdf["product_id"].nunique()}')
print(f'Clicks: int={len(cdf)} u={cdf["user_id"].nunique()} i={cdf["product_id"].nunique()}')
print(f'Orders: int={len(odf)} u={odf["user_id"].nunique()} i={odf["product_id"].nunique()}')

In [None]:
# Users and products with more than 10 (non-null) entries in the view data.
idf = vdf.groupby('user_id')['product_id'].count() > 10
valid_users = idf.index[idf.to_numpy()].to_numpy()
jdf = vdf.groupby('product_id')['user_id'].count() > 10
valid_items = jdf.index[jdf.to_numpy()].to_numpy()
print(f'Valid Users = {len(valid_users)}')
print(f'Valid Items = {len(valid_items)}')

In [None]:
# Restrict views, clicks and orders to the users and products that passed the
# activity filter (`valid_users` / `valid_items`).
#
# This cell used to compute reduce(np.intersect1d, ...) across all frames and
# then overwrite the result on the next line. That intersection was never
# used, so the costly computation has been dropped.
common_users = valid_users
vdf = vdf[vdf.user_id.isin(common_users)].copy()
cdf = cdf[cdf.user_id.isin(common_users)].copy()
odf = odf[odf.user_id.isin(common_users)].copy()
print(vdf.user_id.nunique(), cdf.user_id.nunique(), odf.user_id.nunique())

# Products are filtered after users, using the list computed before the user
# filter was applied.
common_items = valid_items
vdf = vdf[vdf.product_id.isin(common_items)].copy()
cdf = cdf[cdf.product_id.isin(common_items)].copy()
odf = odf[odf.product_id.isin(common_items)].copy()
print(vdf.product_id.nunique(), cdf.product_id.nunique(), odf.product_id.nunique())

In [None]:
%%time
# Build integer indices (`user_idx`, `product_idx`) for users and products.
# The view frame defines the vocabulary; clicks, orders and products are
# encoded against that same vocabulary so an index means the same id everywhere.
for cname in ['user_id', 'product_id']:
    vdf[cname] = vdf[cname].astype('category')
    vdf[f'{cname}x'] = vdf[cname].cat.codes.astype(int)
    # Use the parent's category list itself. `vdf[cname].unique()` yields the
    # values in order of appearance, which differs from the order behind
    # `vdf[cname].cat.codes`, so the child codes did not line up with the
    # parent's codes.
    parent_categories = vdf[cname].cat.categories
    for child in (cdf, odf):
        child[cname] = pd.Categorical(
            child[cname],
            categories=parent_categories,
            ordered=False
        )
        child[f'{cname}x'] = child[cname].cat.codes.astype(int)

# Products are encoded against the same product vocabulary. Ids that are not
# among the categories get code -1 (pandas' code for missing categories).
cname = 'product_id'
pdf[cname] = pd.Categorical(
    pdf[cname],
    categories=vdf[cname].cat.categories,
    ordered=False
)
pdf[f'{cname}x'] = pdf[cname].cat.codes.astype(int)

In [None]:
# Same indexing done through the library helper. With suffix 'x_new' the
# resulting columns are named like `user_idx_new`, next to the manual ones.
vdf, (cdf, odf) = generate_user_item_indices(
    parent_df=vdf,
    children_dfs=[cdf, odf],
    userid_column='user_id',
    itemid_column='product_id',
    index_suffix='x_new',
)

In [None]:
# List the rows where the manually built `user_idx` differs from `user_idx_new`.
vdf['user_idx'].compare(vdf['user_idx_new'])

In [None]:
# Spot check: click rows whose product index is 33.
cdf[cdf.product_idx==33]

In [None]:
# Spot check: view rows whose product index is 33, for comparison with the click rows.
vdf[vdf.product_idx==33]

In [None]:
# Same spot check on the click rows for product index 33 (repeats an earlier cell).
cdf[cdf.product_idx==33]

In [None]:
# Persist the processed view / click / order frames as parquet files.
# The output directory is created first, because `to_parquet` does not create
# missing parent directories and would fail on a fresh checkout.
out_dir = os.path.join(os.path.curdir, 'output', 'zazzle_data')
os.makedirs(out_dir, exist_ok=True)
vdf.to_parquet(os.path.join(out_dir, 'view_data.parquet'))
cdf.to_parquet(os.path.join(out_dir, 'click_data.parquet'))
odf.to_parquet(os.path.join(out_dir, 'order_data.parquet'))

## Plotting

In [None]:
# One CDF curve per interaction type, built from the per-user interaction counts.
fig, ax = plt.subplots(figsize=(5, 3))
ax.grid(True)
ax.set_xlabel('Number of interactions')
iter_dict = {'Views': vdf, 'Clicks': cdf, 'Orders': odf}
for ilbl, idf in iter_dict.items():
    # number of product rows recorded for each user
    data = idf.groupby('user_idx')['product_idx'].count().to_numpy()
    xbins, ylocs = get_cdf(data, bins=100, range=(0, 200))
    ax.plot(xbins, ylocs, label=ilbl)
ax.legend()
ax.set_ylabel('CDF')

## Put Zazzle data into UserItemInteractions objects

In [None]:
%%time
# Matrix dimensions come from the view frame, which contains the click and
# order rows as well.
num_users = vdf.user_idx.max() + 1
num_items = vdf.product_idx.max() + 1
view_data = UserItemInteractions(
    name='ZAZZLE VIEW DATA',
    users_index=vdf.user_idx,
    items_index=vdf.product_idx,
)
# No train/test split is generated for the view data:
# view_data.generate_train_test(user_test_ratio=0.2)

In [None]:
# Display the user-item matrix held by the view data object.
view_data.mat

In [None]:
%%time
# Click interactions, using the same dimensions as the view data, followed by
# a train/test split with user_test_ratio=0.2.
click_data = UserItemInteractions(
    name='ZAZZLE CLICK DATA',
    users_index=cdf.user_idx,
    items_index=cdf.product_idx,
    num_users=num_users,
    num_items=num_items,
)
click_data.generate_train_test(user_test_ratio=0.2)

In [None]:
%%time
# Order interactions, using the same dimensions as the view data, followed by
# a train/test split with user_test_ratio=0.2.
order_data = UserItemInteractions(
    name='ZAZZLE ORDER DATA',
    users_index=odf.user_idx,
    items_index=odf.product_idx,
    num_users=num_users,
    num_items=num_items,
)
order_data.generate_train_test(user_test_ratio=0.2)

In [None]:
# Element-wise comparison of the view and click matrices (entries where they differ).
view_data.mat != click_data.mat

## BPR 

In [None]:
# BPR model configuration for the order data (100 features, no regularisation),
# initialised from the order matrix.
bpr1 = BPR(
    num_features=100,
    num_iters=50,
    batch_size=10000,
    learning_rate=0.02,
    reg_lambda=0.0,
    initial_std=0.001,
)
bpr1.initiate(uimat=order_data.mat)

In [None]:
# Evaluate the freshly initialised factors against the click data
# (60 recommendations, 50% most active users / items).
xx = click_data.get_metric_v1(
    umat=bpr1.umat,
    imat=bpr1.imat,
    num_recs=60,
    perc_active_users=0.5,
    perc_active_items=0.5,
)

In [None]:
# Compare the metric result with the click matrix.
# NOTE(review): assumes `xx` is a matrix of the same shape as click_data.mat — confirm.
xxx = xx != click_data.mat

In [None]:
# Display the click user-item matrix.
click_data.mat

In [None]:
# Row sums of `xx`, flattened to a 1-D array.
rec1 = np.asarray(xx.sum(axis=1)).ravel()

In [None]:
# Row sums of the click matrix (one value per user row), flattened to 1-D.
rec2 = np.asarray(click_data.mat.sum(axis=1)).ravel()

In [None]:
# Pick the entries at positions 2, 3, 4, 5 and 10 of `rec2`.
rec2.take([2,3,4,5,10])

In [None]:
# Scratch check of np.repeat: each of 10 and 20 repeated ten times.
np.repeat([10,20],10)

In [None]:
# Two small 3x3 boolean sparse matrices for experimenting with sparse arithmetic.
# `sp.csr_matrix` is used because `scipy.sparse` is imported as `sp` in this
# notebook; a bare `csr_matrix` only works if the star import happens to
# export that name.
bb = sp.csr_matrix(
    ([True, True, True, True], ([0, 2, 1, 0], [0, 0, 1, 2])),
    shape=(3, 3),
    dtype=bool
)
cc = sp.csr_matrix(
    ([True, True, True], ([1, 2, 2], [1, 1, 2])),
    shape=(3, 3),
    dtype=bool
)

In [None]:
# Dense view of both toy matrices.
bb.toarray(), cc.toarray()

In [None]:
# Dense view of the difference of the two boolean toy matrices.
# NOTE(review): subtraction on boolean data may not behave like a set difference — confirm.
(bb-cc).toarray()

In [None]:
# Per-row sums of the toy difference, flattened to 1-D.
dd = np.asarray((bb - cc).sum(axis=1)).ravel()

In [None]:
# Removed the unfinished expression `self.mat.` that was left in this cell:
# the trailing dot is a syntax error and `self` does not exist at notebook
# scope, so the cell stopped a full "Run All".

In [None]:
# Ten successive fitting rounds on the order matrix; after each round the
# factors are scored against the click data and the metric is printed.
for _ in range(10):
    results = bpr_fit(bpr_obj=bpr1, iumat=order_data.mat, ncores=104)
    imetric = click_data.get_metric_v1(
        umat=bpr1.umat,
        imat=bpr1.imat,
        num_recs=60,
        perc_active_users=0.5,
        perc_active_items=0.5,
    )
    print(imetric)

In [None]:
%%time
# Score the current factors against the order data
# (60 recommendations, 25% most active users / items).
metric1 = order_data.get_metric_v1(
    umat=bpr1.umat,
    imat=bpr1.imat,
    num_recs=60,
    perc_active_users=0.25,
    perc_active_items=0.25,
)

In [None]:
# Display the metric computed on the order data.
metric1

In [None]:
# Number of users times 60 (the num_recs value used in the metric calls).
order_data.num_users*60

In [None]:
# Display the order user-item matrix.
order_data.mat

In [None]:
# Count the non-zero entries of the sum of `xx` and the order matrix.
# NOTE(review): assumes `xx` is a matrix with the same shape as order_data.mat — confirm.
(xx + order_data.mat).count_nonzero()

In [None]:
# nnz(xx) + nnz(orders) - nnz(xx + orders).
# Presumably the number of entries present in both matrices; that reading only
# holds if neither matrix stores explicit zeros — confirm.
xx.nnz + order_data.mat.nnz - (xx + order_data.mat).count_nonzero()

In [None]:
# User indices as returned by users_sorted_by_activity().
# NOTE(review): sort direction is not visible here — confirm.
active_users = order_data.users_sorted_by_activity()

In [None]:
# Select the user-factor rows for `active_users` and multiply them by the
# transposed item factors to get one score per (user, item) pair.
umat_sliced = bpr1.umat.take(active_users, axis=0)
rec_mat = umat_sliced.dot(bpr1.imat.T)

In [None]:
# Largest score in the whole score matrix.
np.amax(rec_mat)

In [None]:
# Indices that sort each row of the score matrix in ascending order (last axis by default).
np.argsort(rec_mat)

In [None]:
# Compute the performance metric on the click data for 2000 users with 60
# recommendations each.
# NOTE(review): other cells call `get_metric_v1`; check that `get_zazzle_metric_v1` still exists.
click_data.get_zazzle_metric_v1(umat=bpr1.umat, imat=bpr1.imat, user_count=2000, num_recs=60)

In [None]:
# Top 60 items for user 30 from the current factors, with exclude_liked=True.
# NOTE(review): presumably excludes items the user already interacted with — confirm.
rec_items = order_data.get_top_items_for_this_user(30, user_mat=bpr1.umat, item_mat=bpr1.imat, num_items=60, exclude_liked=True)

In [None]:
# Shape of the user-factor matrix.
bpr1.umat.shape

In [None]:
# Scratch arithmetic. NOTE(review): a later cell uses 2000*50000 — confirm which was intended.
2000*5000

In [None]:
# Number of stored entries in the view training matrix.
# NOTE(review): the generate_train_test call for view_data is commented out in
# this notebook, so `mat_train` may not exist — confirm.
view_data.mat_train.nnz

In [None]:
# Ratio of 2000*50000 to the number of cells of a 16414 x 736254 matrix that
# are not stored in view_data.mat.
# NOTE(review): the dimensions are hard-coded; presumably they were the
# users x items shape of the view matrix at the time — confirm.
(2000*50000)/(16414*736254 - view_data.mat.nnz)

In [None]:
# Number of stored entries in the view matrix.
view_data.mat.nnz

In [None]:
# The integers 0..9 as a list. Built directly rather than with an append loop,
# which also avoids leaking the loop counter `i` into the notebook namespace.
xx = list(range(10))

In [None]:
# Display the current value of `xx`.
xx

In [36]:
# Toy data set: 8 users x 10 items with 28 interactions. Ids are written
# 1-based and shifted to 0-based with `-1`. Users 1-4 only touch items 1-5
# and users 5-8 only touch items 6-10.
test1 = UserItemInteractions(
    users_index=np.array([1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
                          5, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8])-1,
    items_index=np.array([1, 2, 3, 4, 5, 1, 2, 3, 3, 4, 5, 2, 3, 4,
                          6, 7, 8, 9, 10, 6, 7, 8, 8, 9, 10, 7, 8, 9])-1,
    num_items=10,
    num_users=8
)

----UserItemInteractions--Sample
# of users (active/total): 8/8
# of items (active/total): 10/10
# of interactions: 28
Sparsity in the UI mat: 0.35
Memory used by sparse UI mat: 0.0 MB



In [37]:
# Dense 0/1 view of the toy user-item matrix (rows = users, columns = items).
test1.mat.toarray().astype(int)

array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 0]])

In [50]:
# Small 2-feature BPR model for the toy data. The recorded output shows the
# requested batch size of 500 being reset to 8.
bpr1 = BPR(
    num_features=2,
    num_iters=200,
    batch_size=500,
    learning_rate=0.2,
    reg_lambda=0.0,
    initial_std=0.01,
)
bpr1.initiate(uimat=test1.mat)

Setting batch size to 8


In [51]:
# Fit the toy model on a single core.
bpr_fit(bpr_obj=bpr1, iumat=test1.mat, ncores=1)

BPR-Fit: 100%|██████████| 1600/1600 [00:00<00:00, 27432.13it/s]


In [52]:
# Single best recommendation for user 0 of the toy data.
bpr1.get_recomendations_for_this_user(0, num_recs=1)

(10,)


[2]

In [None]:
# Metric on the toy data over all users and items with 3 recommendations.
# The call is unpacked into three values here.
aa, bb, cc = test1.get_metric_v1(
    umat=bpr1.umat,
    imat=bpr1.imat,
    num_recs=3,
    perc_active_users=1.0,
    perc_active_items=1.0,
)

In [None]:
# Display the three values returned by the toy metric call.
aa, bb, cc

In [None]:
# Full score matrix: user factors times transposed item factors.
xmat = bpr1.umat.dot(bpr1.imat.T)

In [None]:
# Scores rounded to one decimal place for easier reading.
yy = np.around(xmat,1)
yy

In [None]:
# Indices of the 3 highest scores in each row, in no particular order.
# Negating the scores turns argpartition's "smallest first" into "largest
# first"; with kth=3 the first three columns hold the three largest.
nn = np.argpartition(-1 * yy, 3, axis=1)[:, :3]
nn

In [None]:
# TODO: find a fast way to sort this matrix
# TODO: rewrite the bpr_* functions as a class and check whether a parallel pool works

In [None]:
# Reproducible 10 x 100000 matrix of standard-normal draws for the sorting experiments.
np.random.seed(123)
xx = np.random.standard_normal(size=(10, 100_000))

In [None]:
# Display the random matrix.
xx

In [None]:
#%%timeit
# Full argsort of every row, flipped left-to-right so the largest values come first.
yy1 = np.fliplr(np.argsort(xx, axis=1))

In [None]:
# Empirical CDF of the per-user interaction counts for orders and clicks.
#
# The intermediate arrays are named `hist_pdf` / `hist_cdf`. The earlier names
# `pdf` / `cdf` overwrote the product and click DataFrames of the same name,
# which broke any cell that was re-run afterwards.
fig, ax = plt.subplots()
for label, fmt, nbins, interactions in (
    ('Order', '-r', 100, order_data),
    ('Click', '-b', 200, click_data),
):
    # row sums of the user-item matrix, flattened to 1-D
    data = np.asarray(interactions.mat.sum(axis=1)).reshape(-1)
    count, bins_count = np.histogram(data, bins=nbins, range=(0, 200))
    hist_pdf = count / count.sum()
    hist_cdf = np.cumsum(hist_pdf)
    ax.plot(bins_count[1:], hist_cdf, fmt, label=label)
ax.set_xlim([0, 100])
ax.set_xlabel('Number of interactions')
ax.set_ylabel('CDF')
ax.legend()
ax.grid(True)

In [None]:
#%%timeit
# Indices of the 60 largest values in each row (unordered): partition so they
# occupy the last 60 columns, slice those off and flip them left-to-right.
yy2 = np.argpartition(xx, xx.shape[1] - 60, axis=1)[:, -60:][:, ::-1]
yy2.shape

In [None]:
# Top-60 indices for the second row.
yy2[1,:]

In [None]:
# Flattened entries 58..61, which straddle the boundary between row 0 and row 1 (60 entries per row).
np.ravel(yy2)[58:62]

In [None]:
# Top-60 indices for the first row.
yy2[0, :60]

In [None]:
# Toy 2 x 9 matrix for checking a per-row descending argsort.
xxx = np.array([[1, 2, 3, -1, 0, 6, 7, -4, -2],
                [10, 20, 30, -10, 0, 60, 70, -40, -20]])
# Flip along axis 1 so each row is in descending order. The previous `[::-1]`
# flipped axis 0, which only swapped the two rows and left each row ascending.
np.argsort(xxx, axis=1)[:, ::-1]

In [None]:
# With kth=6 the three largest of the nine values end up in the last three
# columns of each row. Flip along axis 1 so they come first; the previous
# `[::-1]` flipped axis 0 and only swapped the two rows.
np.argpartition(xxx, 6)[:, ::-1]