## Application of BPR on Zazzle Data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import itertools
import numpy as np
import pandas as pd
from subprocess import call
from pybpr import *
import scipy.sparse as sp
from functools import reduce
import matplotlib.pyplot as plt
from functools import partial

## Process data from Zazzle

In [3]:
%%time
# Directory holding the Zazzle parquet shards, relative to the working directory.
data_dir = os.path.join(os.path.curdir, 'data', 'NREL')

# The click log is stored as 80 shards with zero-padded four-digit ids.
files = []
for ix in range(80):
    files.append(os.path.join(data_dir, f'Clicks_{str(ix).zfill(4)}_part_00.parquet'))
df = pd.concat(pd.read_parquet(path) for path in files)

# Split on the boolean `is_click` flag: True rows are clicks, the remaining
# rows are treated as views. Copies are taken so later edits do not touch `df`.
is_click = df['is_click']
cdf = df.loc[is_click].copy()
vdf = df.loc[~is_click].copy()

CPU times: user 6.23 s, sys: 2.06 s, total: 8.29 s
Wall time: 10.5 s


In [4]:
%%time
# Order items: 80 shards under the same directory and naming scheme as the
# click shards; concatenated into one frame.
files = [
    os.path.join(data_dir, f'OrderItems_{str(ix).zfill(4)}_part_00.parquet')
    for ix in range(80)
]
odf = pd.concat(pd.read_parquet(path) for path in files)

CPU times: user 139 ms, sys: 88.4 ms, total: 227 ms
Wall time: 1.52 s


In [5]:
%%time
# Product table: 80 shards under the same directory and naming scheme as the
# click shards; concatenated into one frame.
files = [
    os.path.join(data_dir, f'Products_{str(ix).zfill(4)}_part_00.parquet')
    for ix in range(80)
]
pdf = pd.concat(pd.read_parquet(path) for path in files)

CPU times: user 2.54 s, sys: 1.02 s, total: 3.56 s
Wall time: 6.34 s


In [6]:
# Helper columns from the click log that are not needed from here on.
# errors='ignore' keeps this cell re-runnable once they are already gone.
helper_cols = ['is_click', 'cleaned_url']
vdf = vdf.drop(columns=helper_cols, errors='ignore')
cdf = cdf.drop(columns=helper_cols, errors='ignore')

# Make the three tables nested: orders are appended to clicks, and the
# resulting clicks are appended to views.
cdf = pd.concat([cdf, odf])
vdf = pd.concat([vdf, cdf])

In [7]:
# Keep a single row per (user_id, product_id) pair in each table; when a pair
# occurs several times, the last occurrence is the one retained.
pair_key = ['user_id', 'product_id']
cdf = cdf.drop_duplicates(subset=pair_key, keep='last')
vdf = vdf.drop_duplicates(subset=pair_key, keep='last')
odf = odf.drop_duplicates(subset=pair_key, keep='last')

In [8]:
def _interaction_summary(label, frame):
    """Format the row count and the distinct user/product counts of `frame`."""
    return (
        f'{label} int={frame.shape[0]} '
        f'u={frame.user_id.nunique()} i={frame.product_id.nunique()}'
    )


# Labels are padded so the three lines align in the output.
print(_interaction_summary('Views: ', vdf))
print(_interaction_summary('Clicks:', cdf))
print(_interaction_summary('Orders:', odf))

Views:  int=28927112 u=16511 i=779192
Clicks: int=833698 u=16402 i=313452
Orders: int=266638 u=16302 i=139654


In [9]:
# A user is "valid" when the view table holds more than 10 (non-null) products
# for them; a product is "valid" when it has more than 10 (non-null) users.
# idf / jdf are the boolean masks, indexed by user_id / product_id.
views_per_user = vdf.groupby('user_id')['product_id'].count()
idf = views_per_user > 10
valid_users = idf.index[idf.to_numpy()].to_numpy()

views_per_item = vdf.groupby('product_id')['user_id'].count()
jdf = views_per_item > 10
valid_items = jdf.index[jdf.to_numpy()].to_numpy()

print(f'Valid Users = {valid_users.shape[0]}')
print(f'Valid Items = {valid_items.shape[0]}')

Valid Users = 16414
Valid Items = 736254


In [10]:
# Restrict all three tables to the users that passed the activity threshold.
# Only `valid_users` is used here: users without any click or order are kept
# as long as they have enough views. (No intersection across the view/click/
# order tables is computed, since its result would not be used.)
common_users = valid_users
vdf = vdf[vdf.user_id.isin(common_users)].copy()
cdf = cdf[cdf.user_id.isin(common_users)].copy()
odf = odf[odf.user_id.isin(common_users)].copy()
print(vdf.user_id.nunique(), cdf.user_id.nunique(), odf.user_id.nunique())

# Same for products, using the items that passed the activity threshold.
# NOTE(review): valid_items was counted before the user filter above, so a
# kept product may now have 10 or fewer remaining views — confirm this is
# acceptable.
common_items = valid_items
vdf = vdf[vdf.product_id.isin(common_items)].copy()
cdf = cdf[cdf.product_id.isin(common_items)].copy()
odf = odf[odf.product_id.isin(common_items)].copy()
print(vdf.product_id.nunique(), cdf.product_id.nunique(), odf.product_id.nunique())

16414 16338 16244
736254 305854 136873


In [None]:
%%time
# Encode users and products as integer indices that are shared by every table.
#
# vdf is the reference table: its categorical columns define the index space,
# and the integer code of an id is its position in `vdf[cname].cat.categories`.
# The other tables must be encoded against that exact category list so that a
# given id receives the same code everywhere. Using `vdf[cname].unique()` as
# the category list would order the ids by first appearance, which does not
# match the codes stored in vdf and silently misaligns the tables.
for cname in ['user_id', 'product_id']:
    vdf[cname] = vdf[cname].astype('category')
    vdf[f'{cname}x'] = vdf[cname].cat.codes.astype(int)
    reference_categories = vdf[cname].cat.categories
    for frame in (cdf, odf):
        frame[cname] = pd.Categorical(
            frame[cname],
            categories=reference_categories,
            ordered=False
        )
        frame[f'{cname}x'] = frame[cname].cat.codes.astype(int)

# The product table only needs the product index. Products that do not occur
# in vdf are not in the category list and therefore receive the code -1.
cname = 'product_id'
pdf[cname] = pd.Categorical(
    pdf[cname],
    categories=vdf[cname].cat.categories,
    ordered=False
)
pdf[f'{cname}x'] = pdf[cname].cat.codes.astype(int)

In [None]:
# Spot check: all view rows for the product with integer index 33.
vdf.loc[vdf['product_idx'] == 33]

In [None]:
# Spot check: all click rows for the product with integer index 33.
cdf.loc[cdf['product_idx'] == 33]

## Put Zazzle data in UserItemInteractions class objects

In [None]:
%%time
# Size of the index space, taken from the view table: largest code + 1 for
# users and for products. Reused when building the click and order objects.
num_users, num_items = (vdf[col].max() + 1 for col in ('user_idx', 'product_idx'))

# Wrap the view interactions and create the train/test split.
# NOTE(review): user_test_ratio=0.2 is passed straight to pybpr; its exact
# meaning is defined there — confirm.
view_kwargs = {
    'name': 'ZAZZLE VIEW DATA',
    'users_index': vdf['user_idx'],
    'items_index': vdf['product_idx'],
}
view_data = UserItemInteractions(**view_kwargs)
view_data.generate_train_test(user_test_ratio=0.2)

In [None]:
%%time
# Wrap the click interactions. num_users / num_items come from the view table
# and are passed explicitly so this object is built with the view table's
# dimensions.
click_kwargs = {
    'name': 'ZAZZLE CLICK DATA',
    'users_index': cdf['user_idx'],
    'items_index': cdf['product_idx'],
    'num_users': num_users,
    'num_items': num_items,
}
click_data = UserItemInteractions(**click_kwargs)
click_data.generate_train_test(user_test_ratio=0.2)

In [None]:
%%time
# Wrap the order interactions. num_users / num_items come from the view table
# and are passed explicitly so this object is built with the view table's
# dimensions.
order_kwargs = {
    'name': 'ZAZZLE ORDER DATA',
    'users_index': odf['user_idx'],
    'items_index': odf['product_idx'],
    'num_users': num_users,
    'num_items': num_items,
}
order_data = UserItemInteractions(**order_kwargs)
order_data.generate_train_test(user_test_ratio=0.2)

## Fit and evaluate the BPR model

In [None]:
# Hyper-parameters for the BPR model, collected in one place for easy tuning.
bpr_settings = dict(
    num_features=30,
    reg_lambda=0.0001,
    num_iters=50,
    learning_rate=0.02,
    batch_size=200,
    initial_std=0.001,
)
bpr1 = BPR(**bpr_settings)

# Initialise the model from the full order interaction matrix.
bpr1.initiate(uimat=order_data.mat)

In [None]:
# Fit bpr1 on the training split of the order data.
# The worker count was tuned for a 104-core node; cap it at the number of CPUs
# the current host reports so the cell does not oversubscribe smaller machines
# (on a host with at least 104 CPUs the value is unchanged).
ncores = min(104, os.cpu_count() or 1)
results = bpr_fit(bpr_obj=bpr1, iumat=order_data.mat_train, ncores=ncores)

In [None]:
# computes the performance metric
compute_metric(bpr1.umat, bpr1.imat, view_data.mat, user_count=2000, num_recs=60)

In [None]:
# Scratch check, unrelated to the pipeline above: position of the largest
# element of the list (index 3 here).
np.asarray([0, 3, 1, 5]).argmax()