## Application of BPR on Zazzle Data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import itertools
import numpy as np
import pandas as pd
from subprocess import call
from pybpr import *
import scipy.sparse as sp
from functools import reduce
import matplotlib.pyplot as plt
from functools import partial, reduce

In [3]:
list(range(0,20,6))

[0, 6, 12, 18]

## Process data from Zazzle

In [4]:
%%time
data_dir = '/kfs2/projects/zazzle/raw_data/NREL'
#data_dir = os.path.join(os.path.curdir, 'data', 'NREL')
files = [os.path.join(data_dir,f'Clicks_{str(ix).zfill(4)}_part_00.parquet') for ix in range(80)]
df_viewed = pd.concat([pd.read_parquet(ifile, engine='fastparquet') for ifile in files])

CPU times: user 3.56 s, sys: 1.63 s, total: 5.19 s
Wall time: 18.6 s


In [5]:
%%time
files = [os.path.join(data_dir,f'OrderItems_{str(ix).zfill(4)}_part_00.parquet') for ix in range(80)]
df_ordered = pd.concat([pd.read_parquet(ifile) for ifile in files])

CPU times: user 81.9 ms, sys: 87.6 ms, total: 169 ms
Wall time: 3.26 s


In [6]:
%%time
files = [os.path.join(data_dir,f'Products_{str(ix).zfill(4)}_part_00.parquet') for ix in range(80)]
df_item_info = pd.concat([pd.read_parquet(ifile) for ifile in files])

CPU times: user 1.84 s, sys: 564 ms, total: 2.4 s
Wall time: 11.5 s


In [14]:
#df_item_info.vision_style_id_1.unique()

In [8]:
# %%time
# files = [os.path.join(data_dir,f'AddToCarts_{str(ix).zfill(4)}_part_00.parquet') for ix in range(80)]
# tdf = pd.concat([pd.read_parquet(ifile) for ifile in files])

In [9]:
#vdf.drop(columns=['is_click','cleaned_url'], inplace=True, errors='ignore')
#cdf.drop(columns=['is_click','cleaned_url'], inplace=True, errors='ignore')
df_clicked_not_ordered = df_viewed[df_viewed['is_click']].copy()
df_viewed_not_clicked = df_viewed[~df_viewed['is_click']].copy()
df_clicked = pd.concat([df_ordered, df_clicked_not_ordered]) # ensures order data is part of click data

In [10]:
df_list = [df_viewed, df_clicked, df_ordered, df_viewed_not_clicked, df_clicked_not_ordered]
for idf in df_list:
    idf.drop_duplicates(subset=['user_id','product_id'], keep='last', inplace=True)
    #idf.drop(columns=['is_click','cleaned_url'], inplace=True, errors='ignore')
    print(idf.product_id.nunique(), idf.user_id.nunique(), idf.shape)

779192 16511 (28927112, 5)
313452 16402 (833698, 5)
139654 16302 (266638, 3)
779192 16508 (28873270, 5)
308622 16370 (805572, 5)


In [11]:
df1=df_clicked
df2=df_viewed_not_clicked[df1.columns]
common_rows = pd.merge(
    df1.reset_index(drop=True), 
    df2.reset_index(drop=True),
    how='inner'
)
common_rows

Unnamed: 0,user_id,date_created,product_id,is_click,cleaned_url


In [15]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 833698 entries, 0 to 640678
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       833698 non-null  object
 1   date_created  833698 non-null  int64 
 2   product_id    833698 non-null  int64 
 3   is_click      805572 non-null  object
 4   cleaned_url   805572 non-null  object
dtypes: int64(2), object(3)
memory usage: 38.2+ MB


In [16]:
ui = UserItemData(name='zazzle')
ui.add_positive_interactions(
        user_ids=df_ordered.user_id,
        item_ids=df_ordered.product_id
)
ui.add_negative_interactions(
    user_ids=df_clicked_not_ordered.user_id,
    item_ids=df_clicked_not_ordered.product_id
)
ui.add_user_features(
    user_ids=ui.user_ids_in_interactions,
    feature_ids=ui.user_ids_in_interactions
)
idf = df_item_info[df_item_info.product_id.isin(ui.item_ids_in_interactions)]
ui.add_item_features(
    item_ids=idf.product_id,
    feature_ids=idf.final_department_id
)

In [17]:
ui

UserItemData(zazzle)
  Fuser     :( 16402× 16402) nnz=    16,402 (0.006%), empty rows/cols=     0/     0
  Fitem     :(313452×   628) nnz=   309,582 (0.157%), empty rows/cols=  3870/     0
  Rpos      :( 16402×313452) nnz=   266,638 (0.005%), empty rows/cols=   100/173798
  Rneg      :( 16402×313452) nnz=   805,572 (0.016%), empty rows/cols=    32/  4830

In [18]:
ui.train_test_split(train_ratio_pos=0.8, train_ratio_neg=0.)

Train/Test Split (Rows): 100%|██████████| 165/165 [00:00<00:00, 180.56it/s]
Train/Test Split (Cols): 100%|██████████| 193934/193934 [11:27<00:00, 282.25it/s]


In [None]:
adam = partial(
    torch.optim.Adam,
    lr=0.02,
    weight_decay=0.0
)
rs = RecSys(
    data = ui,
    model= HybridMF(ui.n_user_features, ui.n_item_features, n_latent=64),
    optimizer=partial(torch.optim.Adam, lr=0.05, weight_decay=0.0),
    output_dir='/kfs2/projects/zazzle/pybpr/notebooks/output/ml-run',
    loss_function=bpr_loss,
    log_level=2
)
rs

In [None]:
rs.fit(n_iter=10, eval_every=20, batch_size=10000, eval_user_size=1000)

In [None]:
rs.metrics

In [None]:
X = ui.Rpos
rows, cols = X.nonzero()
values = X.data


indices = np.arange(len(values))

# Find all rows and columns with only one non-zero element
unique_rows, row_counts = np.unique(rows, return_counts=True)
unique_cols, col_counts = np.unique(cols, return_counts=True)

critical_rows = unique_rows[row_counts == 1]
critical_cols = unique_cols[col_counts == 1]

In [None]:
critical_cols.shape

In [None]:
df_item_info.final_department_id.nunique()

In [None]:
list(ui._id_to_idx_mappings['user'][0].keys())

In [None]:
xx = ui.Rpos.multiply(ui.Rneg)
xx.eliminate_zeros()
xx

In [None]:
idf = df_views[df_views['is_click']].copy()
print(idf.product_id.nunique(), idf.user_id.nunique())

In [None]:
df_item_info.product_id.nunique()

In [None]:
for icol in ['product_id','final_department_id']:
    pdf[icol] = pdf[icol].astype('category')
    pdf[f'{icol}_ids'] = pdf[icol].cat.codes.astype(int)
for icol in ['product_id','user_id']:
    odf[icol] = odf[icol].astype('category')
    odf[f'{icol}_ids'] = odf[icol].cat.codes.astype(int)

In [None]:
pdf.final_department_id.unique()

In [None]:
pdf.head()

In [None]:
rdf=odf
n_items = rdf.product_id.nunique()
n_users = rdf.user_id.nunique()
n_item_mfeatures = pdf.final_department_id.nunique()
ui_metadata = UserItemData(
    name='User-Metadata-only',
    n_users=n_users, 
    n_items = n_items,
    n_user_features=n_users,
    n_item_features=n_item_mfeatures
    
)
ui_metadata.add_interactions(
    user_indices=rdf.user_id_ids, 
    item_indices=rdf.product_id_ids,
    positive=True
)
# ui_metadata.add_interactions(
#     user_indices=rdf.UserID_ids[rdf.Rating<4.], 
#     item_indices=rdf.MovieID_ids[rdf.Rating<4.],
#     positive=False
# )
ui_metadata.add_user_features(
    user_indices=list(range(n_users)), 
    feature_indices=list(range(n_users))
)
ui_metadata.add_item_features(
    item_indices=tdf.product_id_ids, 
    feature_indices=tdf.final_department_id_ids,
    #feature_weights=tdf.Relevance
)
ui_metadata.train_test_split(0.2)
ui_metadata

In [None]:
# for i in range(100):
#     istring = pdf.vision_embedding2.iloc[100+i]
#     #print(istring)
#     if len(istring)>2:
#         ivec = [float(s) for s in istring.split(',')]
#         print(i, np.dot(ivec, ivec))

In [None]:
[s for s in pdf.vision_embedding2.iloc[107].split(',')]

In [None]:
pdf.info()

In [None]:
pdf.final_department_id.nunique()

In [None]:
# remove repeated entries
cdf.drop_duplicates(subset=['user_id','product_id'], keep='last', inplace=True)
vdf.drop_duplicates(subset=['user_id','product_id'], keep='last', inplace=True)
odf.drop_duplicates(subset=['user_id','product_id'], keep='last', inplace=True)

In [None]:
print(f'Views:  int={vdf.shape[0]} u={vdf.user_id.nunique()} i={vdf.product_id.nunique()}')
print(f'Clicks: int={cdf.shape[0]} u={cdf.user_id.nunique()} i={cdf.product_id.nunique()}')
print(f'Orders: int={odf.shape[0]} u={odf.user_id.nunique()} i={odf.product_id.nunique()}')

In [None]:
# make sure atleast some interaction data for each user/item
idf = vdf.groupby('user_id')['product_id'].count().gt(5)
valid_users = idf[idf].index.to_numpy()
jdf = vdf.groupby('product_id')['user_id'].count().gt(5)
valid_items = jdf[jdf].index.to_numpy()
print(f'Valid Users = {valid_users.shape[0]}')
print(f'Valid Items = {valid_items.shape[0]}')

In [None]:
common_users = reduce(np.intersect1d, (vdf.user_id, cdf.user_id, odf.user_id, valid_users))
common_users = valid_users
vdf = vdf[vdf.user_id.isin(common_users)].copy()
cdf = cdf[cdf.user_id.isin(common_users)].copy()
odf = odf[odf.user_id.isin(common_users)].copy()
print(vdf.user_id.nunique(), cdf.user_id.nunique(), odf.user_id.nunique())
common_items = reduce(np.intersect1d, (vdf.product_id, cdf.product_id, odf.product_id, valid_items))
common_items=valid_items
vdf = vdf[vdf.product_id.isin(common_items)].copy()
cdf = cdf[cdf.product_id.isin(common_items)].copy()
odf = odf[odf.product_id.isin(common_items)].copy()
print(vdf.product_id.nunique(), cdf.product_id.nunique(), odf.product_id.nunique())

In [None]:
# %%time
# # get indices for users and products
# for cname in ['user_id','product_id']:
#     vdf[cname] = vdf[cname].astype('category')
#     vdf[f'{cname}x'] = vdf[cname].cat.codes.astype(int)
#     cdf[cname] = pd.Categorical(
#         cdf[cname], 
#         categories=vdf[cname].unique(), 
#         ordered=False
#     )
#     cdf[f'{cname}x'] = cdf[cname].cat.codes.astype(int)
#     odf[cname] = pd.Categorical(
#         odf[cname], 
#         categories=vdf[cname].unique(), 
#         ordered=False
#     )
#     odf[f'{cname}x'] = odf[cname].cat.codes.astype(int)
# cname='product_id'
# pdf[cname] = pd.Categorical(
#     pdf[cname], 
#     categories=vdf[cname].unique(), 
#     ordered=False
# )
# pdf[f'{cname}x'] = pdf[cname].cat.codes.astype(int)
# # odf.groupby('product_id')['user_id'].count().sort_values()

In [None]:
# add index columns
vdf, (cdf, odf, pdf) = generate_user_item_indices(
    parent_df = vdf, 
    children_dfs = [cdf, odf, pdf],
    userid_column = 'user_id',
    itemid_column = 'product_id',
    index_suffix='x'
)

In [None]:
vdf.reset_index(inplace=True, drop=True)
cdf.reset_index(inplace=True, drop=True)
odf.reset_index(inplace=True, drop=True)

In [None]:
out_dir = '/kfs2/projects/zazzle/pybpr/examples/output'
vdf.to_parquet(os.path.join(out_dir, 'view_data.parquet'))
cdf.to_parquet(os.path.join(out_dir, 'click_data.parquet'))
odf.to_parquet(os.path.join(out_dir, 'order_data.parquet'))
pdf.to_parquet(os.path.join(out_dir, 'item_data.parquet'))

## Create pos/neg pair datasets

In [None]:
idf = vdf.merge(cdf, on=['user_id','product_id'], how='left', indicator=True)
df_viewed_but_not_clicked = vdf[(idf['_merge']=='left_only').values]

In [None]:
idf = cdf.merge(odf, on=['user_id','product_id'], how='left', indicator=True)
df_clicked_but_not_ordered = cdf[(idf['_merge']=='left_only').values]

In [None]:
# check if compatible
df_viewed_but_not_clicked.shape[0], cdf.shape[0], vdf.shape[0]
df_clicked_but_not_ordered.shape[0], odf.shape[0], cdf.shape[0]

In [None]:
df_viewed_but_not_clicked.to_parquet(os.path.join(os.path.curdir, 'output', 'zazzle_data', 'viewed_not_clicked_data.parquet'))
df_clicked_but_not_ordered.to_parquet(os.path.join(os.path.curdir, 'output', 'zazzle_data', 'clicked_not_ordered_data.parquet'))

## Plotting

In [None]:
fig, ax = plt.subplots(figsize=(5,3))
iter_dict = {'Views': vdf, 'Clicks':cdf, 'Orders':odf}
for ilbl, idf in iter_dict.items():
    data = idf.groupby('user_idx')['product_idx'].count().values
    xbins, ylocs = get_cdf(data, bins=100, range=(0,200))
    ax.plot(xbins, ylocs, label=ilbl) 
ax.legend()
ax.grid(True)
ax.set_xlabel('Number of interactions')
ax.set_ylabel('CDF')

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
roc_auc_score(y_true=[1,0,1,0,1,0], y_score=[1,0.9,5,4,5,4]).item()

In [None]:
roc_auc_score(y_true=[1,0], y_score=[0,1]).item()

In [None]:
from pybpr import *
from scipy import sparse

In [None]:
# Create a sample sparse matrix
rows, cols = 10, 50
matrix = sparse.random(rows, cols, density=0.05, format='csr')

# Sample one random entry per row
row_indices, col_indices, values = sample_random_entry_per_row(matrix)
row_indices, col_indices

In [None]:
xx = matrix.tolil()

In [None]:
for ix in xx.rows:
    print(ix)

In [None]:
np.arange(10)