In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import polara
from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

from dataprep import transform_indices
from evaluation import topn_recommendations, model_evaluate, downvote_seen_items

from polara.lib.tensor import hooi
from polara.lib.sparse import tensor_outer_at

from sa_hooi import sa_hooi, form_attention_matrix, get_scaling_weights, generate_position_projector

from scipy.sparse import csr_matrix

# Data preparation

In [2]:
data = get_movielens_data(include_time=True)

In [3]:
data.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Data splitting

In [4]:
test_timepoint = data['timestamp'].quantile(
    q=0.9, interpolation='nearest'
)

In [5]:
test_data_ = data.query('timestamp >= @test_timepoint')

In [6]:
test_data_.nunique()

userid        1209
movieid       3407
rating           5
timestamp    63411
dtype: int64

In [7]:
train_data_ = data.query(
    'userid not in @test_data_.userid.unique() and timestamp < @test_timepoint'
)

In [8]:
training, data_index = transform_indices(train_data_.copy(), 'userid', 'movieid')

In [9]:
test_data = reindex(test_data_, data_index['items'])

Filtered 113 invalid observations.


In [10]:
test_data.nunique()

userid        1208
movieid       3365
rating           5
timestamp    63322
dtype: int64

In [11]:
testset_, holdout_ = leave_one_out(
    test_data, target='timestamp', sample_top=True, random_state=0
)
testset_valid_, holdout_valid_ = leave_one_out(
    testset_, target='timestamp', sample_top=True, random_state=0
)

In [12]:
test_users = np.intersect1d(testset_valid_.userid.unique(), holdout_valid_.userid.unique())
testset_valid = testset_valid_.query('userid in @test_users').sort_values('userid')
holdout_valid = holdout_valid_.query('userid in @test_users').sort_values('userid')

In [13]:
testset_valid.nunique()

userid        1137
movieid       3357
rating           5
timestamp    61488
dtype: int64

In [14]:
holdout_valid.shape

(1137, 4)

In [15]:
assert holdout_valid.set_index('userid')['timestamp'].ge(
    testset_valid
    .groupby('userid')
    ['timestamp'].max()
).all()

In [16]:
data_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    n_users = len(data_index['users']),
    n_items = len(data_index['items']),
    n_ratings = training['rating'].nunique(),
    min_rating = training['rating'].min(),
    test_users = holdout_valid[data_index['users'].name].drop_duplicates().values
)

# CoFFee with attention

In [17]:
def tf_model_build(config, data, data_description, attention_matrix):
    userid = data_description["users"]
    itemid = data_description["items"]
    feedback = data_description["feedback"]

    idx = data[[userid, itemid, feedback]].values
    idx[:, -1] = idx[:, -1] - data_description['min_rating'] # works only for integer ratings!
    val = np.ones(idx.shape[0], dtype='f8')
    
    n_users = data_description["n_users"]
    n_items = data_description["n_items"]
    n_ratings = data_description["n_ratings"]
    shape = (n_users, n_items, n_ratings)
    core_shape = config['mlrank']
    num_iters = config["num_iters"]
    
    item_popularity = (
        data[itemid]
        .value_counts(sort=False)
        .reindex(range(n_items))
        .fillna(1)
        .values
    )
    scaling_weights = get_scaling_weights(item_popularity, scaling=config["scaling"])
    
    u0, u1, u2 = sa_hooi(
        idx, val, shape, config["mlrank"],
        attention_matrix = attention_matrix,
        scaling_weights = scaling_weights,
        max_iters = config["num_iters"],
        parallel_ttm = False,
        randomized = config["randomized"],
        growth_tol = config["growth_tol"],
        seed = config["seed"],
        iter_callback = None,
    )
    
    return u0, u1, u2, attention_matrix
        

In [18]:
config = {
    "scaling": 1,
    "mlrank": (30, 30, 5),
    "n_ratings": data_description['n_ratings'],
    "num_iters": 5,
    "attention_decay": 1,
    "randomized": True,
    "growth_tol": 1e-4,
    "seed": 42
}

In [19]:
tf_params = tf_model_build(config, training, data_description, attention_matrix)

NameError: name 'attention_matrix' is not defined

In [21]:
userid = data_description['users']
seen_data = testset_valid

In [137]:
def tf_scoring(params, data, data_description):
    user_factors, item_factors, feedback_factors, attention_matrix = params
    userid = data_description["users"]
    itemid = data_description["items"]
    feedback = data_description["feedback"]

    data = data.sort_values(userid)
    useridx = data[userid]
    
    n_users = useridx.nunique()
    n_items = data_description['n_items']
    n_ratings = data_description['n_ratings']
    
    scores = np.zeros((n_users, n_items))
    inv_attention = np.linalg.inv(attention_matrix.A)
    for i, u in tqdm(enumerate(np.unique(useridx))):
        data_u = data[data.userid==u]
        P = csr_matrix((np.ones(data_u.shape[0]), (data_u[itemid].values, data_u[feedback].values - data_description['min_rating'])), (n_items, n_ratings))
        
        res = item_factors @ (item_factors.T @ (P @ (attention_matrix @ (feedback_factors @ (inv_attention.T @ feedback_factors).T))))
        
        scores[i] = np.sum(res[:, 2:], axis=1) #- np.sum(res[:, :2], axis=1)
        
    return scores

In [135]:
# tf_scores = tf_scoring(tf_params, seen_data, data_description)
# tf_scores.shape

In [84]:
# downvote_seen_items(tf_scores, seen_data, data_description)

In [136]:
# tf_recs = topn_recommendations(tf_scores, topn=10)
# model_evaluate(tf_recs, holdout_valid, data_description)

In [24]:
# userid = data_description['users']
# seen_data = testset_valid

# Attention experiments

In [180]:
from IPython.utils import io
from scipy.sparse import csr_matrix

In [182]:
data_description['n_ratings']

5

In [186]:
def run_exp(attention_matrix, params=None):

    with io.capture_output() as captured:

        tf_params = tf_model_build(config, training, data_description, attention_matrix)
        tf_scores = tf_scoring(tf_params, seen_data, data_description)
        downvote_seen_items(tf_scores, seen_data, data_description)

        tf_recs = topn_recommendations(tf_scores, topn=10)

    out = model_evaluate(tf_recs, holdout_valid, data_description)

    print('HR={:.4f}, MRR={:.4f}, COV={:.4f}'.format(*out))
    
    if params is not None:
        print(params, '\n')

### Predefined attention grid

In [184]:
attentions_params_list = [
    {'decay_factor': 1, 'exponential_decay': False, 'reverse': True},
    {'decay_factor': 1, 'exponential_decay': True, 'reverse': True},
    {'decay_factor': 1, 'exponential_decay': False, 'reverse': False},
    {'decay_factor': 1, 'exponential_decay': True, 'reverse': False}
]

In [187]:
for params in attentions_params_list:
    attention_matrix = form_attention_matrix(
                data_description['n_ratings'],
                **params,
                format = 'csr'
            )
    run_exp(attention_matrix, params)    

HR=0.0633, MRR=0.0206, COV=0.1329
{'decay_factor': 1, 'exponential_decay': False, 'reverse': True} 

HR=0.0519, MRR=0.0193, COV=0.1136
{'decay_factor': 1, 'exponential_decay': True, 'reverse': True} 

HR=0.0633, MRR=0.0225, COV=0.1543
{'decay_factor': 1, 'exponential_decay': False, 'reverse': False} 

HR=0.0580, MRR=0.0197, COV=0.1554
{'decay_factor': 1, 'exponential_decay': True, 'reverse': False} 



### Euclidean distance attention

In [367]:
eucl_matrix = np.zeros((5, 5))

for i in range(5):
    for j in range(5):
        
        k, l = center_and_rescale_score(i + 1), center_and_rescale_score(j + 1)
        
        diff = abs(k - l)
        
        eucl_matrix[i, j] = 1 / (diff + 1)

similarity = eucl_matrix
        
print(similarity)
    
a = np.linalg.cholesky(similarity)        

attention_matrix = csr_matrix(a)

[[1.         0.75657241 0.47457495 0.34571609 0.31110998]
 [0.75657241 1.         0.56009915 0.38898453 0.34571609]
 [0.47457495 0.56009915 1.         0.56009915 0.47457495]
 [0.34571609 0.38898453 0.56009915 1.         0.75657241]
 [0.31110998 0.34571609 0.47457495 0.75657241 1.        ]]


In [368]:
run_exp(attention_matrix)  

HR=0.0686, MRR=0.0246, COV=0.1552


### Rating distribution attention

In [179]:
rating_dist = []

total_cnt = training.shape[0]

for i in range(5):
    val = training.query(f'rating == {i + 1}').count()[0] / total_cnt
    
    rating_dist.append(val)

rating_dist

[0.056644931644931645,
 0.10500693000693001,
 0.25943898443898444,
 0.3442087192087192,
 0.2347004347004347]

In [375]:
rat_dist_matrix = np.zeros((5, 5))

for i in range(5):
    for j in range(5):
        diff = abs(rating_dist[i] - rating_dist[j])
        rat_dist_matrix[i, j] = 1 / (diff + 1)
        
a = np.linalg.cholesky(rat_dist_matrix)

for i in range(5):
    a[i, i] = 1e-5

attention_matrix = csr_matrix(a)
a

array([[1.00000000e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [9.53868989e-01, 1.00000000e-05, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [8.31397526e-01, 2.43760319e-01, 1.00000000e-05, 0.00000000e+00,
        0.00000000e+00],
       [7.76660550e-01, 2.20298323e-01, 4.45453340e-01, 1.00000000e-05,
        0.00000000e+00],
       [8.48856440e-01, 2.51472977e-01, 4.18181191e-01, 8.97911408e-04,
        1.00000000e-05]])

In [376]:
run_exp(attention_matrix)

HR=0.0695, MRR=0.0256, COV=0.1538


### Trigonometry scale attention

In [290]:
def center_and_rescale_score(x, func=None):
    
    if func is None:
        func = np.arctan
    
    return func(x - 3)

In [319]:
eucl_matrix = np.zeros((5, 5))

for i in range(5):
    for j in range(5):
        
        k, l = center_and_rescale_score(i + 1), center_and_rescale_score(j + 1)
        
        diff = abs(k - l)
        
        eucl_matrix[i, j] = diff

similarity = np.ones_like(eucl_matrix) - eucl_matrix / np.max(eucl_matrix)
        
a = np.linalg.cholesky(similarity)

# for i in range(5):
#     a[i, i] = 1e-5

attention_matrix = csr_matrix(a)

In [320]:
run_exp(attention_matrix)

HR=0.0431, MRR=0.0136, COV=0.1235


# Sequential TF

## Preprocessing

In [29]:
n_pos = 200

In [30]:
def assign_positions(s, maxlen=n_pos):
    return np.arange(maxlen-len(s), maxlen)

In [31]:
training.head()

Unnamed: 0,userid,movieid,rating,timestamp
3768,0,2725,2,978129223
3769,0,1134,5,978128920
3770,0,1136,5,978128920
3771,0,1138,5,978129176
3772,0,1070,4,978129392


In [32]:
training_data = (
    training
    .sort_values('timestamp')
    .assign(
        pos = lambda df: df.groupby('userid')['movieid'].transform(assign_positions)
    )
    .sort_values(['userid', 'timestamp'])
    .query('pos>=0')
)

In [33]:
training_data.head()

Unnamed: 0,userid,movieid,rating,timestamp,pos
3805,0,786,5,978128786,130
3819,0,1789,2,978128821,131
3785,0,289,4,978128821,132
3832,0,1545,3,978128865,133
3816,0,3290,1,978128865,134


In [34]:
training_data.query('userid == 66').head()

Unnamed: 0,userid,movieid,rating,timestamp,pos
16861,66,301,4,977427611,111
16837,66,2269,3,977427611,112
16807,66,3620,2,977427611,113
16855,66,2320,1,977427611,114
16814,66,1917,3,977427611,115


In [35]:
testset_valid = (
    testset_valid
    .sort_values('timestamp')
    .assign(
        pos = lambda df: df.groupby('userid')['movieid'].transform(assign_positions)
    )
    .sort_values(['userid', 'timestamp'])
)

In [36]:
testset_valid

Unnamed: 0,userid,movieid,rating,timestamp,pos
31,1,2915,4,978300019,149
37,1,937,5,978300055,150
22,1,1156,5,978300055,151
27,1,1544,4,978300055,152
24,1,2104,3,978300103,153
...,...,...,...,...,...
1000119,6040,3365,4,997454367,195
999923,6040,224,5,997454398,196
1000019,6040,2659,4,997454429,197
999988,6040,1708,4,997454464,198


In [37]:
holdout_valid.head()

Unnamed: 0,userid,movieid,rating,timestamp
32,1,1412,4,978824330
136,2,1704,3,978300174
187,3,101,4,978298486
237,4,951,4,978294282
418,5,1671,3,978246576


## Model

In [38]:
data_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    positions = 'pos',
    n_users = len(data_index['users']),
    n_items = len(data_index['items']),
    n_pos = n_pos
)
data_description

{'users': 'userid',
 'items': 'movieid',
 'feedback': 'rating',
 'positions': 'pos',
 'n_users': 4831,
 'n_items': 3635,
 'n_pos': 200}

In [39]:
def seqtf_model_build(config, data, data_description):
    userid = data_description["users"]
    itemid = data_description["items"]
    positions = data_description["positions"]

    n_users = data_description["n_users"]
    n_items = data_description["n_items"]
    max_pos = data_description["n_pos"]
    shape = (n_users, n_items, max_pos)

    attention_matrix = form_attention_matrix(
        data_description["n_pos"],
        config["attention_decay"],
        format = 'csr'
    )

    item_popularity = (
        data[itemid]
        .value_counts(sort=False)
        .reindex(range(n_items))
        .fillna(1)
        .values
    )
    scaling_weights = get_scaling_weights(item_popularity, scaling=config["scaling"])

    idx = data[[userid, itemid, positions]].values
    val = np.ones(idx.shape[0], dtype='f8')

    user_factors, item_factors, feedback_factors = sa_hooi(
        idx, val, shape, config["mlrank"],
        attention_matrix = attention_matrix,
        scaling_weights = scaling_weights,
        max_iters = config["num_iters"],
        parallel_ttm = False,
        randomized = config["randomized"],
        growth_tol = config["growth_tol"],
        seed = config["seed"],
        iter_callback = None,
    )
    return user_factors, item_factors, feedback_factors, attention_matrix


In [40]:
config = {
    "scaling": 1,
    "mlrank": (30, 30, 5),
    "n_pos": n_pos,
    "num_iters": 5,
    "attention_decay": 1,
    "randomized": True,
    "growth_tol": 1e-4,
    "seed": 42
}

In [41]:
tf_params = seqtf_model_build(config, training_data, data_description)

growth of the core: 1.0
growth of the core: 0.3624574470983329
growth of the core: 0.030755154018483048
growth of the core: 0.0033607523844195937
growth of the core: 0.001020242607428142


In [42]:
def tf_scoring(params, data, data_description):
    user_factors, item_factors, pos_factors, attention_matrix = params
    last_position_projector = generate_position_projector(attention_matrix, pos_factors)

    userid = data_description["users"]
    itemid = data_description["items"]
    posid = data_description["positions"]

    tset_data = data.sort_values([userid, posid])
    useridx = tset_data[userid].values
    itemidx = tset_data[itemid].values
    indptr, = np.where(np.diff(useridx, prepend=0, append=1))
    scores = user_scoring(indptr, itemidx, item_factors, last_position_projector)
    return scores

def user_scoring(indptr, indices, item_factors, last_position_projector):
    sequences = np.array_split(indices, indptr[1:-1])
    n_items = item_factors.shape[0]
    scores = np.zeros((len(sequences), n_items))
    for u, seq in enumerate(sequences):
        scores[u] = sequences_score(seq, item_factors, last_position_projector)
    return scores

def sequences_score(seq, item_factors, last_position_projector):
    n_pos = len(last_position_projector)
    user_profile = item_factors[seq[-(n_pos-1):], :]
    n_items = user_profile.shape[0]
    scores = item_factors @ (user_profile.T @ last_position_projector[-(n_items+1):-1])
    return scores

In [43]:
tf_scores = tf_scoring(tf_params, testset_valid, data_description)

  warn("Constructing a DIA matrix with %d diagonals "


In [44]:
downvote_seen_items(tf_scores, testset_valid, data_description)

In [45]:
tf_recs = topn_recommendations(tf_scores, topn=10)
model_evaluate(tf_recs, holdout_valid, data_description)

(0.07827616534740545, 0.028646465915595207, 0.16203576341127923)