## Application of BPR on Zazzle Data

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell is executed so edits to imported source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import itertools
import numpy as np
import pandas as pd
from subprocess import call
from pybpr import *
import scipy.sparse as sp
from functools import reduce
import matplotlib.pyplot as plt
from functools import partial

## Load the Zazzle data into `UserItemInteractions` objects

In [3]:
DATA_DIR = '/kfs2/projects/zazzle/rsandhu/pybpr/examples/data/zazzle_big'


def _load_parquet(file_name):
    """Read one parquet file located directly under DATA_DIR."""
    return pd.read_parquet(os.path.join(DATA_DIR, file_name))


# One frame per event table: views, clicks, orders, and the two
# "did X but not Y" tables that are read from their own files.
df_v = _load_parquet('view_data.parquet')
df_c = _load_parquet('click_data.parquet')
df_o = _load_parquet('order_data.parquet')
df_v_not_c = _load_parquet('viewed_not_clicked_data.parquet')
df_c_not_o = _load_parquet('clicked_not_ordered_data.parquet')

In [4]:
# A cell only displays its final expression, so the id-based counts were
# previously computed and silently discarded. Keep both pairs and verify that
# they agree: the id-based counts are used below as matrix dimensions while
# the matrices themselves are indexed by the *_idx columns.
counts_by_id = (df_v.user_id.nunique(), df_v.product_id.nunique())
counts_by_idx = (df_v.user_idx.nunique(), df_v.product_idx.nunique())
assert counts_by_id == counts_by_idx, (counts_by_id, counts_by_idx)
counts_by_idx

(16462, 779192)

In [5]:
# Matrix dimensions shared by every interaction set, taken from the number of
# distinct raw ids in the view table.
# NOTE(review): the matrices are indexed by user_idx / product_idx, so this
# presumably relies on those indices being contiguous in [0, n) — confirm.
num_users = df_v.user_id.nunique()
num_items = df_v.product_id.nunique()

# Value passed as user_test_ratio to generate_train_test for every set.
test_ratio = 0.0


def _build_interactions(df):
    """Wrap one event table in a UserItemInteractions object.

    Reads the 'user_idx' and 'product_idx' columns of ``df``, builds the
    object with the notebook-level ``num_users`` / ``num_items``, calls
    ``generate_train_test`` with the notebook-level ``test_ratio`` and
    returns the object.
    """
    interactions = UserItemInteractions(
        users_index=df['user_idx'],
        items_index=df['product_idx'],
        num_users=num_users,
        num_items=num_items
    )
    interactions.generate_train_test(user_test_ratio=test_ratio)
    return interactions


# Built in the same order as the original copy-pasted stanzas so any summary
# printed during construction appears in the same sequence.
data_viewed_not_clicked = _build_interactions(df_v_not_c)  # viewed not clicked
data_clicked = _build_interactions(df_c)                   # clicked
data_clicked_not_ordered = _build_interactions(df_c_not_o)  # clicked not ordered
data_ordered = _build_interactions(df_o)                   # ordered


----UserItemInteractions--Sample
# of users (active/total): 16455/16462
# of items (active/total): 779192/779192
# of interactions: 28093326
Sparsity in the UI mat: 0.00219
Memory used by sparse UI mat: 26.79 MB

----UserItemInteractions--Sample
# of users (active/total): 16375/16462
# of items (active/total): 313420/779192
# of interactions: 833645
Sparsity in the UI mat: 6.5e-05
Memory used by sparse UI mat: 0.8 MB

----UserItemInteractions--Sample
# of users (active/total): 15446/16462
# of items (active/total): 250741/779192
# of interactions: 567047
Sparsity in the UI mat: 4.4e-05
Memory used by sparse UI mat: 0.54 MB

----UserItemInteractions--Sample
# of users (active/total): 16278/16462
# of items (active/total): 139623/779192
# of interactions: 266598
Sparsity in the UI mat: 2.1e-05
Memory used by sparse UI mat: 0.25 MB


## BPR 

In [6]:
# Hyperparameters for the first model, gathered in one mapping so they can be
# read (and tweaked) at a glance.
bpr1_hparams = dict(
    num_features=200,
    reg_lambda=0.0,
    num_iters=500,
    learning_rate=0.1,
    batch_size=15000,
    initial_std=0.0001,
)
bpr1 = BPR(**bpr1_hparams)

# Size the model for the full user/item index space used by the data sets.
bpr1.initiate(num_users=num_users, num_items=num_items)

In [7]:
# Interaction sets handed to the explicit negative sampler.
# NOTE(review): going by the variable names, the negatives
# (clicked-not-ordered) look like a subset of the positives (clicked) —
# confirm this pairing is intended rather than e.g. ordered vs.
# clicked-not-ordered.
pos_data = data_clicked
neg_data = data_clicked_not_ordered

neg_sampler = partial(
    explicit_negative_sampler,
    pos_uimat=pos_data.mat,
    neg_uimat=neg_data.mat
)

# The scoring callable does not change between rounds, so build it once
# instead of on every loop iteration. The fixed seed is passed on every call.
mfunc = partial(
    bpr1.get_metric_v1,
    perc_active_users=0.5,
    perc_active_items=0.5,
    num_recs=60,
    max_users_per_batch=160,
    percentiles=[0.25, 0.5, 0.75],
    seed=1234
)

# Alternate fitting and scoring for 10 rounds, logging one score vector per
# round (one entry per requested percentile, as seen in the printed output).
metric_log_train = []
for _ in range(10):
    # NOTE(review): ncores=104 is hard-coded; presumably it matches the node
    # this notebook was run on — adjust for other machines.
    results = bpr_fit(
        bpr_obj=bpr1,
        neg_sampler=neg_sampler,
        ncores=104
    )
    iscore = mfunc(uimat=pos_data.mat)
    metric_log_train.append(iscore)
    print(iscore)

# Stack the per-round score vectors into a single array for later use.
metric_log_train = np.asarray(metric_log_train)

BPR-Train: 100%|██████████| 7500000/7500000 [00:11<00:00, 647046.78it/s]
BPR-Score: 100%|██████████| 52/52 [00:40<00:00,  1.29it/s]
[0.00769231 0.03182456 0.0697285 ]
BPR-Train: 100%|██████████| 7500000/7500000 [00:11<00:00, 643123.98it/s]
BPR-Score: 100%|██████████| 52/52 [00:41<00:00,  1.25it/s]
[0.03163094 0.07814036 0.1755354 ]
BPR-Train: 100%|██████████| 7500000/7500000 [00:11<00:00, 644451.60it/s]
BPR-Score: 100%|██████████| 52/52 [00:41<00:00,  1.27it/s]
[0.06220211 0.14851007 0.26101208]
BPR-Train: 100%|██████████| 7500000/7500000 [00:11<00:00, 646940.51it/s]
BPR-Score: 100%|██████████| 52/52 [00:40<00:00,  1.28it/s]
[0.09404842 0.19460033 0.29784134]
BPR-Train: 100%|██████████| 7500000/7500000 [00:11<00:00, 627729.08it/s]
BPR-Score: 100%|██████████| 52/52 [00:41<00:00,  1.25it/s]
[0.1196142  0.21527029 0.31494886]
BPR-Train: 100%|██████████| 7500000/7500000 [00:12<00:00, 622368.39it/s]
BPR-Score: 100%|██████████| 52/52 [00:42<00:00,  1.23it/s]
[0.13495134 0.22530432 0.32633533]

In [8]:
OUT_DIR = '/projects/zazzle/rsandhu/pybpr/examples/output'

# Make sure the target directory exists before saving, so a fresh checkout or
# a cleaned output folder does not break this cell. This is a no-op when the
# directory is already there.
os.makedirs(OUT_DIR, exist_ok=True)
bpr1.save_model(dir_name=OUT_DIR)

Saving the model in /projects/zazzle/rsandhu/pybpr/examples/output


In [9]:
# Load the model files under OUT_DIR back into the existing bpr1 object.
bpr1.load_model(OUT_DIR)

Loading the model from /projects/zazzle/rsandhu/pybpr/examples/output
NpzFile '/projects/zazzle/rsandhu/pybpr/examples/output/bpr_model.npz' with keys: umat, imat (16462, 200)


In [10]:
# Second model constructed with no arguments, then populated from the model
# files under OUT_DIR.
bpr2 = BPR()
bpr2.load_model(OUT_DIR)

Loading the model from /projects/zazzle/rsandhu/pybpr/examples/output
NpzFile '/projects/zazzle/rsandhu/pybpr/examples/output/bpr_model.npz' with keys: umat, imat (16462, 200)


In [11]:
# Round-trip check on both factor matrices. Previously the umat comparison was
# evaluated and discarded (only a cell's last expression is displayed), so
# only imat was effectively checked. np.array_equal also returns False on a
# shape mismatch instead of broadcasting.
umat_matches = np.array_equal(bpr2.umat, bpr1.umat)
imat_matches = np.array_equal(bpr2.imat, bpr1.imat)
umat_matches and imat_matches

True

In [12]:
# Display the repr of the model that was constructed with BPR() and then
# loaded from disk.
# NOTE(review): the load output lists only the keys umat and imat, so the
# hyperparameters shown here are presumably those of the BPR() call rather
# than the ones used for training — confirm whether they should be persisted.
bpr2

BPR(mname='bpr_model', num_features=200, num_iters=100, batch_size=32, initial_std=0.0001, reg_lambda=0.0, learning_rate=0.001, verbose=False)

In [13]:
# Display the repr of the trained model, including the hyperparameters it was
# constructed with.
bpr1

BPR(mname='bpr_model', num_features=200, num_iters=500, batch_size=15000, initial_std=0.0001, reg_lambda=0.0, learning_rate=0.1, verbose=False)