In [1]:
from scipy.stats import gamma
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import recometrics
from utils import convert_to_index_lst, batch_mapk
import hpfrec

In [2]:
def generate_poisson_matrix(latent_factor_size, n_users, n_items, a, a_prime, b_prime, c, c_prime, d_prime, weighting_array=None):
    """
    Generate a Poisson matrix with the given parameters.
    """
    if weighting_array is None:
        weighting_array = np.ones((latent_factor_size))
    user_factors = np.empty((0, latent_factor_size))
    item_factors = np.empty((0, latent_factor_size))
    for _ in range(n_users):
        activity = gamma.rvs(a=a_prime, scale=a_prime / b_prime)
        prefs = gamma.rvs(a=a, scale=activity, size=latent_factor_size)
        user_factors = np.vstack((user_factors, prefs))

    for _ in range(n_items):
        popularity = gamma.rvs(a=c_prime, scale=c_prime / d_prime)
        prefs = gamma.rvs(a=c, scale=popularity, size=latent_factor_size)
        item_factors = np.vstack((item_factors, prefs))
    print(user_factors)
    print(item_factors)

    item_scores_weighted = (user_factors * weighting_array) @ item_factors.T
    item_scores_unweighted = user_factors @ item_factors.T
    return item_scores_weighted, item_scores_unweighted

In [3]:
latent_factor_size = 30

In [4]:
weighting_array = np.array([-1] * 10 + [1] * 20)

In [5]:
item_scores_weighted, item_scores_unweighted = generate_poisson_matrix(latent_factor_size=latent_factor_size, n_users=1000, n_items=1000, a=0.3, a_prime=0.3, b_prime=1, c=0.3, c_prime=0.3, d_prime=1, weighting_array=weighting_array)

[[1.66320726e-03 4.34633090e-03 1.58059317e-08 ... 1.01006098e-02
  9.01220555e-07 1.12906053e-02]
 [2.50132428e-01 9.68952006e-02 2.15599535e-07 ... 8.13223007e-02
  4.72316378e-05 5.03075853e-01]
 [1.19770350e-03 4.18694589e-03 4.65460129e-04 ... 7.44402238e-09
  2.09056886e-02 1.30026807e-02]
 ...
 [3.04023098e-04 6.77889577e-08 9.81581321e-05 ... 1.84853577e-06
  4.11460562e-03 1.10647850e-05]
 [1.29358978e-01 3.35038509e-05 1.10012372e-04 ... 1.39284015e-04
  1.21571385e-02 1.31987640e-05]
 [4.78529201e-02 2.37249350e-02 4.53257824e-02 ... 1.77734068e-06
  1.17028461e-02 8.54580019e-02]]
[[1.85657409e-06 3.85041288e-08 1.45587821e-04 ... 1.39110801e-04
  9.08937500e-04 2.10123393e-06]
 [3.35496228e-06 6.44352412e-03 1.60539300e-02 ... 5.41133605e-02
  4.42876008e-03 8.53142165e-03]
 [1.22717445e-04 6.90257637e-03 3.94671517e-03 ... 3.25826838e-05
  9.98831184e-04 8.87310048e-04]
 ...
 [6.03357317e-04 3.42819846e-03 3.00267899e-02 ... 1.06492884e-01
  8.36872698e-05 2.81391999e-02]

In [6]:
item_scores_weighted.min(), item_scores_weighted.max(), item_scores_weighted.std()

(-6.491745970722523, 8.025739875979186, 0.0661880329643494)

In [7]:
item_scores_unweighted.min(), item_scores_unweighted.max(), item_scores_unweighted.std()

(2.505055174571272e-31, 10.015659212733654, 0.09847930341445094)

In [8]:
(item_scores_unweighted > 0.01).sum()

181902

In [9]:
(item_scores_weighted > 0.01).sum()

92730

In [10]:
uw_cutoff = 0.01
w_cutoff = 0.01

In [11]:
del_lst = []
for i in range(len(item_scores_unweighted)):
    proxy_pref = np.where(item_scores_unweighted[i] > uw_cutoff)[0]
    if len(proxy_pref) < 10:
        del_lst.append(i)
item_scores_unweighted = np.delete(item_scores_unweighted, del_lst, axis=0)
item_scores_weighted = np.delete(item_scores_weighted, del_lst, axis=0)

In [12]:
item_scores_unweighted.shape, item_scores_weighted.shape

((633, 1000), (633, 1000))

In [13]:
proxy_watch_matrix = sparse.csr_matrix((item_scores_unweighted > uw_cutoff).astype(int))

In [14]:
# Use recometrics to create train/test split
X_train, X_test = recometrics.split_reco_train_test(proxy_watch_matrix, split_type="all", items_test_fraction=0.2)

In [15]:
# COMMENTED OUT BECAUSE CAN TAKE DIRECT COO ARRAY

# Convert X_test to np array and then index list. 
# Generate true prefs
# Convert X_train to dataframe 



X_train_arr = X_train.toarray()
concat_lst = []
for i in range(X_train_arr.shape[0]):
    for j in range(X_train_arr.shape[1]):
        if X_train_arr[i][j] > 0:
            concat_lst.append(pd.DataFrame({"UserId": [i], "ItemId": [j], "Count": [X_train_arr[i][j]]}))
X_train_df = pd.concat(concat_lst)
X_train_df

item_set = X_train_df['ItemId'].unique()

In [16]:
proxy_prefs = convert_to_index_lst(X_test.toarray())
proxy_prefs[0]

[78, 224, 267, 278, 301, 327, 377, 753, 980]

In [17]:
true_prefs = []
for i in range(item_scores_weighted.shape[0]):
    true_pref = np.where(item_scores_weighted[i] > w_cutoff)[0] #FIXME: Try turning this into 0.001 if no results still.
    true_prefs.append(list(true_pref))
true_prefs[0]

[212, 224, 327, 370, 489, 555, 597, 623, 707, 753, 807]

In [18]:
# Check whether strictly subset/this is a potential hypothesis for why no observed curve.

#FIXME: Get this to compare with entire set.

# count = 0
# for i in range(len(proxy_prefs)):
#     if len(np.setdiff1d(np.array(true_prefs[i]), np.array(proxy_prefs[i]))) > 0:
#         count += 1
# count        

In [19]:
proxy_losses, true_losses = [], []
def validation_hook(solver):
    recommendations = [solver.topN(i) for i in range(item_scores_weighted.shape[0])]

    # PROXY PREFS AND TRUE PREFS MUST BE A LISt OF INDEXES
    global proxy_prefs
    global true_prefs
    proxy_loss, true_loss = batch_mapk(recommendations, proxy_prefs, true_prefs)
    proxy_losses.append(proxy_loss)
    true_losses.append(true_loss)
    # Calculate validation loss

In [20]:
X_train_coo = X_train.tocoo()

In [21]:
# FIXME: This isn't finishing running. Hypothesis: matrix has too many sparse columns.

In [22]:
# Hypothesis: Has to do with my changes. Try first with original code.

In [23]:
recommender = hpfrec.HPF()
recommender.fit(X_train_df)

**********************************
Hierarchical Poisson Factorization
**********************************

Number of users: 633
Number of items: 746
Latent factors to use: 30

Initializing parameters...
Allocating Phi matrix...
Initializing optimization procedure...


In [None]:
recommender = hpfrec.HPF()
recommender.fit(X_train_coo, callback=validation_hook)

TypeError: HPF.fit() got an unexpected keyword argument 'callback'

In [None]:
proxy_losses, true_losses

TypeError: HPF.fit() missing 1 required positional argument: 'counts_df'