In [3]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import polara
from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

from dataprep import transform_indices
from evaluation import topn_recommendations, model_evaluate, downvote_seen_items

from polara.lib.tensor import hooi
from polara.lib.sparse import tensor_outer_at

from sa_hooi import sa_hooi, form_attention_matrix, get_scaling_weights, generate_position_projector

from scipy.sparse import csr_matrix

In [1]:
!pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

Collecting polara
  Cloning https://github.com/evfro/polara.git (to revision develop) to /tmp/pip-install-e0gwdjlv/polara_f31ec3749fd9403bac3ebbf7baaa5481
  Running command git clone -q https://github.com/evfro/polara.git /tmp/pip-install-e0gwdjlv/polara_f31ec3749fd9403bac3ebbf7baaa5481
  Running command git checkout -b develop --track origin/develop
  Switched to a new branch 'develop'
  Branch 'develop' set up to track remote branch 'develop' from 'origin'.
Building wheels for collected packages: polara
  Building wheel for polara (setup.py) ... [?25l[?25hdone
  Created wheel for polara: filename=polara-0.7.2.dev0-py3-none-any.whl size=87177 sha256=6223186f87246ed35b0c79d2b261d12f8acdc7edeba704f3e049ca776221b721
  Stored in directory: /tmp/pip-ephem-wheel-cache-a5zk_8kf/wheels/19/88/81/920c4189a6b0b92f19b02f18fcb44ded22ae852f7b09ab2e28
Successfully built polara
Installing collected packages: polara
Successfully installed polara-0.7.2.dev0


# Data preparation

In [4]:
data = get_movielens_data(include_time=True)

In [5]:
data.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Data splitting

In [7]:
test_timepoint = data['timestamp'].quantile(
    q=0.9, interpolation='nearest'
)

In [8]:
test_data_ = data.query('timestamp >= @test_timepoint')

In [None]:
test_data_.nunique()

userid        1209
movieid       3407
rating           5
timestamp    63411
dtype: int64

In [9]:
train_data_ = data.query(
    'userid not in @test_data_.userid.unique() and timestamp < @test_timepoint'
)

In [10]:
training, data_index = transform_indices(train_data_.copy(), 'userid', 'movieid')

In [11]:
test_data = reindex(test_data_, data_index['items'])

Filtered 113 invalid observations.


In [None]:
test_data.nunique()

userid        1208
movieid       3365
rating           5
timestamp    63322
dtype: int64

In [12]:
testset_, holdout_ = leave_one_out(
    test_data, target='timestamp', sample_top=True, random_state=0
)
testset_valid_, holdout_valid_ = leave_one_out(
    testset_, target='timestamp', sample_top=True, random_state=0
)

In [13]:
test_users = np.intersect1d(testset_valid_.userid.unique(), holdout_valid_.userid.unique())
testset_valid = testset_valid_.query('userid in @test_users').sort_values('userid')
holdout_valid = holdout_valid_.query('userid in @test_users').sort_values('userid')

In [14]:
testset_valid.nunique()

userid        1137
movieid       3357
rating           5
timestamp    61488
dtype: int64

In [15]:
holdout_valid.shape

(1137, 4)

In [16]:
assert holdout_valid.set_index('userid')['timestamp'].ge(
    testset_valid
    .groupby('userid')
    ['timestamp'].max()
).all()

In [17]:
data_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    n_users = len(data_index['users']),
    n_items = len(data_index['items']),
    n_ratings = training['rating'].nunique(),
    min_rating = training['rating'].min(),
    test_users = holdout_valid[data_index['users'].name].drop_duplicates().values
)

# CoFFee with attention

In [145]:
def tf_model_build(config, data, data_description, attention_matrix):
    userid = data_description["users"]
    itemid = data_description["items"]
    feedback = data_description["feedback"]

    idx = data[[userid, itemid, feedback]].values
    idx[:, -1] = idx[:, -1] - data_description['min_rating'] # works only for integer ratings!
    val = np.ones(idx.shape[0], dtype='f8')
    
    n_users = data_description["n_users"]
    n_items = data_description["n_items"]
    n_ratings = data_description["n_ratings"]
    shape = (n_users, n_items, n_ratings)
    core_shape = config['mlrank']
    num_iters = config["num_iters"]
    
    item_popularity = (
        data[itemid]
        .value_counts(sort=False)
        .reindex(range(n_items))
        .fillna(1)
        .values
    )
    scaling_weights = get_scaling_weights(item_popularity, scaling=config["scaling"])
    
    u0, u1, u2 = sa_hooi(
        idx, val, shape, config["mlrank"],
        attention_matrix = attention_matrix,
        scaling_weights = scaling_weights,
        max_iters = config["num_iters"],
        parallel_ttm = False,
        randomized = config["randomized"],
        growth_tol = config["growth_tol"],
        seed = config["seed"],
        iter_callback = None,
    )
    
    return u0, u1, u2, attention_matrix
        

In [146]:
config = {
    "scaling": 1,
    "mlrank": (30, 30, 5),
    "n_ratings": data_description['n_ratings'],
    "num_iters": 5,
    "attention_decay": 1,
    "randomized": True,
    "growth_tol": 1e-4,
    "seed": 42
}

In [None]:
tf_params = tf_model_build(config, training, data_description, attention_matrix)

NameError: name 'attention_matrix' is not defined

In [148]:
userid = data_description['users']
seen_data = testset_valid

In [147]:
def tf_scoring(params, data, data_description):
    user_factors, item_factors, feedback_factors, attention_matrix = params
    userid = data_description["users"]
    itemid = data_description["items"]
    feedback = data_description["feedback"]

    data = data.sort_values(userid)
    useridx = data[userid]
    
    n_users = useridx.nunique()
    n_items = data_description['n_items']
    n_ratings = data_description['n_ratings']
    
    scores = np.zeros((n_users, n_items))
    inv_attention = np.linalg.inv(attention_matrix.A)
    for i, u in tqdm(enumerate(np.unique(useridx))):
        data_u = data[data.userid==u]
        P = csr_matrix((np.ones(data_u.shape[0]), (data_u[itemid].values, data_u[feedback].values - data_description['min_rating'])), (n_items, n_ratings))
        
        res = item_factors @ (item_factors.T @ (P @ (attention_matrix @ (feedback_factors @ (inv_attention.T @ feedback_factors).T))))
        
        scores[i] = np.sum(res[:, 2:], axis=1) #- np.sum(res[:, :2], axis=1)
        
    return scores

In [None]:
# tf_scores = tf_scoring(tf_params, seen_data, data_description)
# tf_scores.shape

In [None]:
# downvote_seen_items(tf_scores, seen_data, data_description)

In [None]:
# tf_recs = topn_recommendations(tf_scores, topn=10)
# model_evaluate(tf_recs, holdout_valid, data_description)

In [None]:
# userid = data_description['users']
# seen_data = testset_valid

# Attention experiments

In [141]:
from IPython.utils import io
from scipy.sparse import csr_matrix

In [None]:
data_description['n_ratings']

5

In [142]:
def run_exp(attention_matrix, params=None):

    with io.capture_output() as captured:

        tf_params = tf_model_build(config, training, data_description, attention_matrix)
        tf_scores = tf_scoring(tf_params, seen_data, data_description)
        downvote_seen_items(tf_scores, seen_data, data_description)

        tf_recs = topn_recommendations(tf_scores, topn=10)

    out = model_evaluate(tf_recs, holdout_valid, data_description)

    print('HR={:.4f}, MRR={:.4f}, COV={:.4f}'.format(*out))
    
    if params is not None:
        print(params, '\n')

### Predefined attention grid

In [None]:
attentions_params_list = [
    {'decay_factor': 1, 'exponential_decay': False, 'reverse': True},
    {'decay_factor': 1, 'exponential_decay': True, 'reverse': True},
    {'decay_factor': 1, 'exponential_decay': False, 'reverse': False},
    {'decay_factor': 1, 'exponential_decay': True, 'reverse': False}
]

In [None]:
for params in attentions_params_list:
    attention_matrix = form_attention_matrix(
                data_description['n_ratings'],
                **params,
                format = 'csr'
            )
    run_exp(attention_matrix, params)    

HR=0.0633, MRR=0.0206, COV=0.1329
{'decay_factor': 1, 'exponential_decay': False, 'reverse': True} 

HR=0.0519, MRR=0.0193, COV=0.1136
{'decay_factor': 1, 'exponential_decay': True, 'reverse': True} 

HR=0.0633, MRR=0.0225, COV=0.1543
{'decay_factor': 1, 'exponential_decay': False, 'reverse': False} 

HR=0.0580, MRR=0.0197, COV=0.1554
{'decay_factor': 1, 'exponential_decay': True, 'reverse': False} 



### Euclidean distance attention

In [None]:
eucl_matrix = np.zeros((5, 5))

for i in range(5):
    for j in range(5):
        eucl_matrix[i, j] = abs(i - j) / np.exp(abs(i - j)) if i != j else 5 + 1e-2
        
a = np.linalg.cholesky(eucl_matrix)

for i in range(5):
    a[i, i] = 1e-5

attention_matrix = csr_matrix(a)

In [None]:
run_exp(attention_matrix)  

HR=0.0642, MRR=0.0233, COV=0.1483


### Rating distribution attention

In [None]:
rating_dist = []

total_cnt = training.shape[0]

for i in range(5):
    val = training.query(f'rating == {i + 1}').count()[0] / total_cnt
    
    rating_dist.append(val)

rating_dist

[0.056644931644931645,
 0.10500693000693001,
 0.25943898443898444,
 0.3442087192087192,
 0.2347004347004347]

In [None]:
rat_dist_matrix = np.zeros((5, 5))

for i in range(5):
    for j in range(5):
        diff = abs(rating_dist[i] - rating_dist[j])
        rat_dist_matrix[i, j] = diff / np.exp(diff) if i != j else 1. + 1e-1
        
a = np.linalg.cholesky(rat_dist_matrix)

for i in range(5):
    a[i, i] = 1e-5

attention_matrix = csr_matrix(a)
a

array([[ 1.00000000e-05,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [ 4.39343845e-02,  1.00000000e-05,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [ 1.57865247e-01,  1.19666749e-01,  1.00000000e-05,
         0.00000000e+00,  0.00000000e+00],
       [ 2.05660310e-01,  1.71084548e-01,  2.42154277e-02,
         1.00000000e-05,  0.00000000e+00],
       [ 1.42079211e-01,  1.02755253e-01, -1.02838852e-02,
         5.08948653e-02,  1.00000000e-05]])

In [None]:
run_exp(attention_matrix)

HR=0.0774, MRR=0.0255, COV=0.1508


### Trigonometry scale attention

In [None]:
def rescale_score(x, func=None):
    
    if func is None:
        func = np.arctan
    
    return func(x)

In [None]:
eucl_matrix = np.zeros((5, 5))

for i in range(5):
    for j in range(5):
        
        k, l = rescale_score(i + 1), rescale_score(j + 1)
        
        diff = abs(k - l)
        
        eucl_matrix[i, j] = diff / np.exp(diff) if i != j else 5 + 1e-2
        
a = np.linalg.cholesky(eucl_matrix)

for i in range(5):
    a[i, i] = 1e-5

attention_matrix = csr_matrix(a)

In [None]:
run_exp(attention_matrix)

HR=0.0730, MRR=0.0243, COV=0.1527


### Conditional probability approach

In [63]:
train_new = training.sort_values('timestamp')

In [None]:
train_new_part = train_new.query('rating == 1| rating == 2')

In [71]:
train_new_part = train_new.query('rating == 1| rating == 2')
users = train_new_part.userid.unique()
users[0]
train_new_part.query('userid == @users[0]') 

Unnamed: 0,userid,movieid,rating,timestamp
999790,4830,3128,2,956705323
999824,4830,1130,2,956705351
999842,4830,2422,2,956705508
999837,4830,725,2,956706051


In [92]:
train_new_part = train_new.query('rating == 1| rating == 2')
users = train_new_part.userid.unique()
count12_tot = 0
count12_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 1 and train_new_part_user.loc[i-1, 'rating' ]== 2:
     count12_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 2 and train_new_part_user.loc[i-1, 'rating' ]== 1:
      count12_loc += 1
  count12_tot += count12_loc

In [93]:
train_new_part = train_new.query('rating == 1| rating == 3')
users = train_new_part.userid.unique()
count13_tot = 0
count13_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 1 and train_new_part_user.loc[i-1, 'rating' ]== 3:
     count13_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 3 and train_new_part_user.loc[i-1, 'rating' ]== 1:
      count13_loc += 1
  count13_tot += count13_loc

In [94]:
train_new_part = train_new.query('rating == 1| rating == 4')
users = train_new_part.userid.unique()
count14_tot = 0
count14_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 1 and train_new_part_user.loc[i-1, 'rating' ]== 4:
     count14_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 4 and train_new_part_user.loc[i-1, 'rating' ]== 1:
      count14_loc += 1
  count14_tot += count14_loc

In [95]:
train_new_part = train_new.query('rating == 1| rating == 5')
users = train_new_part.userid.unique()
count15_tot = 0
count15_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 1 and train_new_part_user.loc[i-1, 'rating' ]== 5:
     count15_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 5 and train_new_part_user.loc[i-1, 'rating' ]== 1:
      count15_loc += 1
  count15_tot += count15_loc

In [96]:
train_new_part = train_new.query('rating == 2| rating == 3')
users = train_new_part.userid.unique()
count23_tot = 0
count23_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 2 and train_new_part_user.loc[i-1, 'rating' ]== 3:
     count23_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 3 and train_new_part_user.loc[i-1, 'rating' ]== 2:
      count23_loc += 1
  count23_tot += count23_loc

In [97]:
train_new_part = train_new.query('rating == 2| rating == 4')
users = train_new_part.userid.unique()
count24_tot = 0
count24_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 2 and train_new_part_user.loc[i-1, 'rating' ]== 4:
     count24_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 4 and train_new_part_user.loc[i-1, 'rating' ]== 2:
      count24_loc += 1
  count24_tot += count24_loc

In [98]:
train_new_part = train_new.query('rating == 2| rating == 5')
users = train_new_part.userid.unique()
count25_tot = 0
count25_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 2 and train_new_part_user.loc[i-1, 'rating' ]== 5:
     count25_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 5 and train_new_part_user.loc[i-1, 'rating' ]== 2:
      count25_loc += 1
  count25_tot += count25_loc

In [99]:
train_new_part = train_new.query('rating == 3| rating == 4')
users = train_new_part.userid.unique()
count34_tot = 0
count34_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 3 and train_new_part_user.loc[i-1, 'rating' ]== 4:
     count34_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 4 and train_new_part_user.loc[i-1, 'rating' ]== 3:
      count34_loc += 1
  count34_tot += count34_loc

In [100]:
train_new_part = train_new.query('rating == 3| rating == 5')
users = train_new_part.userid.unique()
count35_tot = 0
count35_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 3 and train_new_part_user.loc[i-1, 'rating' ]== 5:
     count35_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 5 and train_new_part_user.loc[i-1, 'rating' ]== 3:
      count35_loc += 1
  count35_tot += count35_loc

In [101]:
train_new_part = train_new.query('rating == 4| rating == 5')
users = train_new_part.userid.unique()
count45_tot = 0
count45_loc = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  train_new_part_user = train_new_part_user.reset_index()
  for i in range(1, len(train_new_part_user)):
    if train_new_part_user.loc[i, 'rating' ] == 4 and train_new_part_user.loc[i-1, 'rating' ]== 5:
     count45_loc += 1
    if train_new_part_user.loc[i, 'rating' ] == 5 and train_new_part_user.loc[i-1, 'rating' ]== 4:
      count45_loc += 1
  count45_tot += count45_loc

In [102]:
train_new_part = train_new.query('rating == 1')
users = train_new_part.userid.unique()
count11_tot = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  count11_tot += len(train_new_part_user)-1

In [103]:
train_new_part = train_new.query('rating == 2')
users = train_new_part.userid.unique()
count22_tot = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  count22_tot += len(train_new_part_user)-1

In [104]:
train_new_part = train_new.query('rating == 3')
users = train_new_part.userid.unique()
count33_tot = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  count33_tot += len(train_new_part_user)-1

In [105]:
train_new_part = train_new.query('rating == 4')
users = train_new_part.userid.unique()
count44_tot = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  count44_tot += len(train_new_part_user)-1

In [106]:
train_new_part = train_new.query('rating == 5')
users = train_new_part.userid.unique()
count55_tot = 0
for user in users:
  train_new_part_user = train_new_part.query('userid == @user') 
  count55_tot += len(train_new_part_user)-1

In [135]:
rat_dist_matrix = np.zeros((5, 5))

rat_dist_matrix[0][0] = count11_tot
rat_dist_matrix[0][1] = rat_dist_matrix[1][0]= count12_tot
rat_dist_matrix[0][2] = rat_dist_matrix[2][0]= count13_tot
rat_dist_matrix[0][3] = rat_dist_matrix[3][0]= count14_tot
rat_dist_matrix[0][4] = rat_dist_matrix[4][0]= count15_tot
rat_dist_matrix[1][1] = count22_tot
rat_dist_matrix[1][2] = rat_dist_matrix[2][1] = count23_tot
rat_dist_matrix[1][3] = rat_dist_matrix[3][1] = count24_tot
rat_dist_matrix[1][4] = rat_dist_matrix[4][1] = count25_tot
rat_dist_matrix[2][2] = count33_tot
rat_dist_matrix[2][3] = rat_dist_matrix[3][2] = count34_tot
rat_dist_matrix[2][4] = rat_dist_matrix[4][2] = count35_tot
rat_dist_matrix[3][3] = count44_tot
rat_dist_matrix[3][4] = rat_dist_matrix[4][3] = count45_tot
rat_dist_matrix[4][4] = count55_tot       
#a = np.linalg.cholesky(rat_dist_matrix)

#for i in range(5):
#    a[i, i] = 1e-5

#attention_matrix = csr_matrix(a)

In [117]:
rat_dist_matrix 

array([[3.23670000e+04, 8.13348610e+07, 9.57270910e+07, 8.60867170e+07,
        6.59491590e+07],
       [8.13348610e+07, 6.22000000e+04, 2.02155670e+08, 1.80041353e+08,
        1.22274451e+08],
       [9.57270910e+07, 2.02155670e+08, 1.59916000e+05, 4.03570832e+08,
        2.53789508e+08],
       [8.60867170e+07, 1.80041353e+08, 4.03570832e+08, 2.13720000e+05,
        3.62494017e+08],
       [6.59491590e+07, 1.22274451e+08, 2.53789508e+08, 3.62494017e+08,
        1.44211000e+05]])

In [136]:
summ = np.sum(rat_dist_matrix)
rat_dist_matrix /= summ

In [124]:
summ_new = count11_tot+count12_tot+count13_tot+ count14_tot+count15_tot+ count23_tot+ count24_tot+ count25_tot+count34_tot+count35_tot+count45_tot++count11_tot+count22_tot+count33_tot+count44_tot+count55_tot

In [125]:
rat_dist_matrix /= summ_new

In [132]:
np.random.uniform(low=0.0, high=1.0)

0.5652188487209112

In [137]:
rat_dist_matrix[0,0] += np.random.uniform(low=0.0, high=1.0)
rat_dist_matrix[1,1] += np.random.uniform(low=0.0, high=1.0)
rat_dist_matrix[2,2] += np.random.uniform(low=0.0, high=1.0)
rat_dist_matrix[3,3] += np.random.uniform(low=0.0, high=1.0)
rat_dist_matrix[4,4] += np.random.uniform(low=0.0, high=1.0)

In [138]:
a = np.linalg.cholesky(rat_dist_matrix)

for i in range(5):
    a[i, i] = 1e-5

attention_matrix = csr_matrix(a)

In [139]:
attention_matrix

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [149]:
run_exp(attention_matrix)

HR=0.0695, MRR=0.0249, COV=0.1560


In [None]:
train_new.group_by()

# Sequential TF

## Preprocessing

In [None]:
n_pos = 200

In [None]:
def assign_positions(s, maxlen=n_pos):
    return np.arange(maxlen-len(s), maxlen)

In [None]:
training.head()

Unnamed: 0,userid,movieid,rating,timestamp
3768,0,2725,2,978129223
3769,0,1134,5,978128920
3770,0,1136,5,978128920
3771,0,1138,5,978129176
3772,0,1070,4,978129392


In [None]:
training_data = (
    training
    .sort_values('timestamp')
    .assign(
        pos = lambda df: df.groupby('userid')['movieid'].transform(assign_positions)
    )
    .sort_values(['userid', 'timestamp'])
    .query('pos>=0')
)

In [None]:
training_data.head()

Unnamed: 0,userid,movieid,rating,timestamp,pos
3805,0,786,5,978128786,130
3819,0,1789,2,978128821,131
3785,0,289,4,978128821,132
3832,0,1545,3,978128865,133
3816,0,3290,1,978128865,134


In [None]:
training_data.query('userid == 66').head()

Unnamed: 0,userid,movieid,rating,timestamp,pos
16861,66,301,4,977427611,111
16837,66,2269,3,977427611,112
16807,66,3620,2,977427611,113
16855,66,2320,1,977427611,114
16814,66,1917,3,977427611,115


In [None]:
testset_valid = (
    testset_valid
    .sort_values('timestamp')
    .assign(
        pos = lambda df: df.groupby('userid')['movieid'].transform(assign_positions)
    )
    .sort_values(['userid', 'timestamp'])
)

In [None]:
testset_valid

Unnamed: 0,userid,movieid,rating,timestamp,pos
31,1,2915,4,978300019,149
37,1,937,5,978300055,150
22,1,1156,5,978300055,151
27,1,1544,4,978300055,152
24,1,2104,3,978300103,153
...,...,...,...,...,...
1000119,6040,3365,4,997454367,195
999923,6040,224,5,997454398,196
1000019,6040,2659,4,997454429,197
999988,6040,1708,4,997454464,198


In [None]:
holdout_valid.head()

Unnamed: 0,userid,movieid,rating,timestamp
32,1,1412,4,978824330
136,2,1704,3,978300174
187,3,101,4,978298486
237,4,951,4,978294282
418,5,1671,3,978246576


## Model

In [None]:
data_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    positions = 'pos',
    n_users = len(data_index['users']),
    n_items = len(data_index['items']),
    n_pos = n_pos
)
data_description

{'users': 'userid',
 'items': 'movieid',
 'feedback': 'rating',
 'positions': 'pos',
 'n_users': 4831,
 'n_items': 3635,
 'n_pos': 200}

In [144]:
def seqtf_model_build(config, data, data_description):
    userid = data_description["users"]
    itemid = data_description["items"]
    positions = data_description["positions"]

    n_users = data_description["n_users"]
    n_items = data_description["n_items"]
    max_pos = data_description["n_pos"]
    shape = (n_users, n_items, max_pos)

    attention_matrix = form_attention_matrix(
        data_description["n_pos"],
        config["attention_decay"],
        format = 'csr'
    )

    item_popularity = (
        data[itemid]
        .value_counts(sort=False)
        .reindex(range(n_items))
        .fillna(1)
        .values
    )
    scaling_weights = get_scaling_weights(item_popularity, scaling=config["scaling"])

    idx = data[[userid, itemid, positions]].values
    val = np.ones(idx.shape[0], dtype='f8')

    user_factors, item_factors, feedback_factors = sa_hooi(
        idx, val, shape, config["mlrank"],
        attention_matrix = attention_matrix,
        scaling_weights = scaling_weights,
        max_iters = config["num_iters"],
        parallel_ttm = False,
        randomized = config["randomized"],
        growth_tol = config["growth_tol"],
        seed = config["seed"],
        iter_callback = None,
    )
    return user_factors, item_factors, feedback_factors, attention_matrix


In [None]:
config = {
    "scaling": 1,
    "mlrank": (30, 30, 5),
    "n_pos": n_pos,
    "num_iters": 5,
    "attention_decay": 1,
    "randomized": True,
    "growth_tol": 1e-4,
    "seed": 42
}

In [None]:
tf_params = seqtf_model_build(config, training_data, data_description)

growth of the core: 1.0
growth of the core: 0.3624574470983329
growth of the core: 0.030755154018483048
growth of the core: 0.0033607523844195937
growth of the core: 0.001020242607428142


In [None]:
def tf_scoring(params, data, data_description):
    user_factors, item_factors, pos_factors, attention_matrix = params
    last_position_projector = generate_position_projector(attention_matrix, pos_factors)

    userid = data_description["users"]
    itemid = data_description["items"]
    posid = data_description["positions"]

    tset_data = data.sort_values([userid, posid])
    useridx = tset_data[userid].values
    itemidx = tset_data[itemid].values
    indptr, = np.where(np.diff(useridx, prepend=0, append=1))
    scores = user_scoring(indptr, itemidx, item_factors, last_position_projector)
    return scores

def user_scoring(indptr, indices, item_factors, last_position_projector):
    sequences = np.array_split(indices, indptr[1:-1])
    n_items = item_factors.shape[0]
    scores = np.zeros((len(sequences), n_items))
    for u, seq in enumerate(sequences):
        scores[u] = sequences_score(seq, item_factors, last_position_projector)
    return scores

def sequences_score(seq, item_factors, last_position_projector):
    n_pos = len(last_position_projector)
    user_profile = item_factors[seq[-(n_pos-1):], :]
    n_items = user_profile.shape[0]
    scores = item_factors @ (user_profile.T @ last_position_projector[-(n_items+1):-1])
    return scores

In [None]:
tf_scores = tf_scoring(tf_params, testset_valid, data_description)

  warn("Constructing a DIA matrix with %d diagonals "


In [None]:
downvote_seen_items(tf_scores, testset_valid, data_description)

In [None]:
tf_recs = topn_recommendations(tf_scores, topn=10)
model_evaluate(tf_recs, holdout_valid, data_description)

(0.07827616534740545, 0.028646465915595207, 0.16203576341127923)