In [1]:
%%capture
!pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

from dataprep import transform_indices
from evaluation import downvote_seen_items, topn_recommendations, model_evaluate
from sa_hooi import sa_hooi, form_attention_matrix, get_scaling_weights, generate_position_projector

# Preparing Data

In [3]:
# Load the MovieLens ratings; timestamps are kept because the train/test split below is time-based.
mldata = get_movielens_data(include_time=True)

In [4]:
# Preview the first rows of the raw ratings table.
mldata.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Data splitting

In [5]:
# Split point in time: the 0.95 quantile of all rating timestamps.
# interpolation='nearest' returns a timestamp that actually occurs in the data.
test_timepoint = mldata['timestamp'].quantile(
    q=0.95, interpolation='nearest'
)

In [6]:
# Candidate test observations: every rating made at or after the split timestamp.
test_data_ = mldata.query('timestamp >= @test_timepoint')

In [7]:
# Count distinct values per column in the test period (users, items, ratings, timestamps).
test_data_.nunique()

userid         813
movieid       3249
rating           5
timestamp    33729
dtype: int64

In [8]:
# Training observations: ratings made before the split timestamp by users
# who have no activity at all in the test period.
train_data_ = mldata.query(
    'userid not in @test_data_.userid.unique() and timestamp < @test_timepoint'
)

In [9]:
# Re-encode user and item ids of the training data. `data_index` holds the resulting
# 'users' and 'items' indices, which are reused below to reindex test items and size the model.
# NOTE(review): presumably ids become contiguous 0-based integers — confirm in dataprep.transform_indices.
# A copy is passed so that `train_data_` itself is not modified by the helper.
training, data_index = transform_indices(train_data_.copy(), 'userid', 'movieid')

In [10]:
# Map test item ids onto the training item index. Only items are reindexed here;
# user ids in the test data keep their original values.
# NOTE(review): observations the helper reports as invalid (see the cell output) are filtered out —
# presumably items absent from the training index; confirm against polara's `reindex`.
test_data = reindex(test_data_, data_index['items'])

Filtered 64 invalid observations.


In [11]:
# Distinct values per column after reindexing the test items.
test_data.nunique()

userid         813
movieid       3220
rating           5
timestamp    33678
dtype: int64

We hold out the last (most recent) item of each test user.  
Since these users are excluded from the training set, and the training set is additionally restricted to interactions made before the test timepoint,
there is no "recommendation from the future" issue: the model is unaware of future correlations.


In [12]:
# Two successive leave-one-out splits on the timestamp column:
#   1) test_data -> (testset_, holdout_): sets aside one item per user as the final test holdout;
#   2) testset_  -> (testset_valid_, holdout_valid_): sets aside one more item per user for validation.
# NOTE(review): with target='timestamp' and sample_top=True the held-out item is expected to be
# each user's latest interaction — confirm against polara's `leave_one_out`.
testset_, holdout_ = leave_one_out(
    test_data, target='timestamp', sample_top=True, random_state=0
)
testset_valid_, holdout_valid_ = leave_one_out(
    testset_, target='timestamp', sample_top=True, random_state=0
)

Let's focus on the validation part.

In [13]:
# Keep only users that appear in both the validation history and the validation holdout,
# and sort both frames by user id so that their per-user order agrees.
test_users = np.intersect1d(testset_valid_.userid.unique(), holdout_valid_.userid.unique())
testset_valid = testset_valid_.query('userid in @test_users').sort_values('userid')
holdout_valid = holdout_valid_.query('userid in @test_users').sort_values('userid')

In [14]:
# Distinct values per column in the validation histories.
testset_valid.nunique()

userid         750
movieid       3202
rating           5
timestamp    32400
dtype: int64

In [15]:
# Size of the validation holdout (rows, columns).
holdout_valid.shape

(750, 4)

In [16]:
# Sanity check: for every user, the held-out timestamp is not earlier than
# the latest timestamp remaining in that user's validation history.
assert holdout_valid.set_index('userid')['timestamp'].ge(
    testset_valid
    .groupby('userid')
    ['timestamp'].max()
).all()

## Assigning positional info

In [17]:
# Number of positions in the positional encoding: used as the default `maxlen` when assigning
# positions and as the size of the third (positional) dimension in `data_description`.
n_pos = 200

In [18]:
def assign_positions(s, maxlen=n_pos):
    """Return right-aligned positions for a sequence of interactions.

    The result has one consecutive integer per element of `s`, ending at
    `maxlen - 1`. When `s` is longer than `maxlen`, the leading positions
    are negative.

    Parameters
    ----------
    s : sized sequence
        The interactions of a single user; only its length is used.
    maxlen : int
        Exclusive upper bound of the generated positions.

    Returns
    -------
    numpy.ndarray
        Integers from `maxlen - len(s)` up to `maxlen - 1`.
    """
    first_position = maxlen - len(s)
    return np.arange(first_position, maxlen)


In [19]:
# Preview the re-encoded training data.
training.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,0,1084,5,978300760
1,0,631,3,978302109
2,0,835,3,978301968
3,0,3129,4,978300275
4,0,2126,5,978824291


In [20]:
# Give every training interaction a position within its user's chronological history.
# Positions are right-aligned: the latest interaction of a user gets position n_pos-1.
# For users with more than n_pos interactions the oldest ones receive negative positions
# and are removed by the final query, so only the n_pos most recent interactions remain.
training_data = (
    training
    .sort_values('timestamp')
    .assign(
        pos = lambda df: df.groupby('userid')['movieid'].transform(assign_positions)
    )
    .sort_values(['userid', 'timestamp'])
    .query('pos>=0')
)

In [21]:
# Preview the training data with the new `pos` column.
training_data.head()

Unnamed: 0,userid,movieid,rating,timestamp,pos
31,0,2925,4,978300019,147
27,0,1547,4,978300055,148
22,0,1158,5,978300055,149
37,0,939,5,978300055,150
24,0,2111,3,978300103,151


In [22]:
# Inspect the assigned positions for a single user (id 66).
training_data.query('userid == 66').head()

Unnamed: 0,userid,movieid,rating,timestamp,pos
10581,66,609,3,977850585,25
10460,66,2798,3,977850614,26
10500,66,2386,5,977850661,27
10565,66,1100,4,977850704,28
10428,66,1086,4,977850778,29


In [23]:
# Apply the same right-aligned positional encoding to the validation histories.
# Unlike the training data, rows with negative positions (interactions older than the
# n_pos most recent ones) are NOT filtered out here.
# Note that this cell rebinds `testset_valid` to the frame extended with the `pos` column.
testset_valid = (
    testset_valid
    .sort_values('timestamp')
    .assign(
        pos = lambda df: df.groupby('userid')['movieid'].transform(assign_positions)
    )
    .sort_values(['userid', 'timestamp'])
)

In [24]:
# Display the validation histories together with their assigned positions.
testset_valid

Unnamed: 0,userid,movieid,rating,timestamp,pos
2503,20,1529,3,1009669071,191
2512,20,1334,3,1009669071,192
2513,20,3425,2,1009669071,193
2517,20,2610,4,1009669071,194
2504,20,2399,4,1009669115,195
...,...,...,...,...,...
1000119,6040,3379,4,997454367,195
999923,6040,224,5,997454398,196
1000019,6040,2667,4,997454429,197
1000172,6040,1591,3,997454464,198


In [25]:
# Preview the held-out validation items (one row per user).
holdout_valid.head()

Unnamed: 0,userid,movieid,rating,timestamp
2507,20,3244,4,1009669181
3143,23,1906,3,993707016
5076,36,575,4,1040545109
6091,44,522,4,1004412155
8776,59,3035,3,1041968324


# Sequential TF

In [26]:
# Describes the data layout for the model-building and scoring functions below:
# column names for users, items, feedback and positions, plus the tensor dimensions.
data_description = dict(
    users = data_index['users'].name,  # name of the user id column
    items = data_index['items'].name,  # name of the item id column
    feedback = 'rating',
    positions = 'pos',
    n_users = len(data_index['users']),  # number of users in the training index
    n_items = len(data_index['items']),  # number of items in the training index
    n_pos = n_pos
)
data_description

{'users': 'userid',
 'items': 'movieid',
 'feedback': 'rating',
 'positions': 'pos',
 'n_users': 5227,
 'n_items': 3652,
 'n_pos': 200}

In [27]:
def seqtf_model_build(config, data, data_description):
    """Fit the sequential tensor factorization model on (user, item, position) triplets.

    Parameters
    ----------
    config : dict
        Model settings. Keys read here: "attention_decay", "scaling", "mlrank",
        "num_iters", "randomized", "growth_tol", "seed".
    data : pandas.DataFrame
        Training interactions containing the user, item and position columns
        named in `data_description`.
    data_description : dict
        Keys read here: "users", "items", "positions", "n_users", "n_items", "n_pos".

    Returns
    -------
    tuple
        `(user_factors, item_factors, position_factors, attention_matrix)`:
        the three factor matrices returned by `sa_hooi` (in the order it
        returns them) followed by the attention matrix used for training.
    """
    user_col = data_description["users"]
    item_col = data_description["items"]
    pos_col = data_description["positions"]

    # Dimensions of the (user, item, position) interaction tensor.
    tensor_shape = (
        data_description["n_users"],
        data_description["n_items"],
        data_description["n_pos"],
    )
    n_items = tensor_shape[1]

    attention_matrix = form_attention_matrix(
        data_description["n_pos"], config["attention_decay"], format='csr'
    )

    # Per-item interaction counts aligned with item indices 0..n_items-1;
    # items that never occur in `data` are given a count of 1.
    item_counts = data[item_col].value_counts(sort=False)
    item_popularity = item_counts.reindex(range(n_items)).fillna(1).values
    scaling_weights = get_scaling_weights(item_popularity, scaling=config["scaling"])

    # Binary tensor: every observed triplet has the value 1.
    idx = data[[user_col, item_col, pos_col]].values
    val = np.ones(len(idx), dtype='f8')

    factors = sa_hooi(
        idx, val, tensor_shape, config["mlrank"],
        attention_matrix = attention_matrix,
        scaling_weights = scaling_weights,
        max_iters = config["num_iters"],
        parallel_ttm = False,
        randomized = config["randomized"],
        growth_tol = config["growth_tol"],
        seed = config["seed"],
        iter_callback = None,
    )
    user_factors, item_factors, position_factors = factors
    return user_factors, item_factors, position_factors, attention_matrix


In [28]:
# Settings for the sequential tensor factorization model (consumed by `seqtf_model_build`).
config = {
    "scaling": 1,  # passed to get_scaling_weights as `scaling`
    "mlrank": (30, 30, 5),  # multilinear rank passed to sa_hooi
    "n_pos": n_pos,  # not read by seqtf_model_build, which takes n_pos from data_description
    "num_iters": 5,  # passed to sa_hooi as `max_iters`
    "attention_decay": 1,  # passed to form_attention_matrix
    "randomized": True,
    "growth_tol": 1e-4,
    "seed": 42
}

In [29]:
# Fit the model on the position-annotated training data; the result is the tuple
# (user factors, item factors, position factors, attention matrix).
tf_params = seqtf_model_build(config, training_data, data_description)

growth of the core: 1.0
growth of the core: 0.37456442180636484
growth of the core: 0.029937756743187664
growth of the core: 0.0027209342785914224
growth of the core: 0.0009144049581350129


In [30]:
def tf_scoring(params, data, data_description):
    """Score all items for every user found in `data`.

    Parameters
    ----------
    params : tuple
        `(user_factors, item_factors, pos_factors, attention_matrix)` as returned
        by `seqtf_model_build`. The user factors are not used for scoring.
    data : pandas.DataFrame
        Interaction histories with the user, item and position columns named
        in `data_description`.
    data_description : dict
        Keys read here: "users", "items", "positions".

    Returns
    -------
    numpy.ndarray
        One row of item scores per user, with users in ascending id order.
    """
    user_factors, item_factors, pos_factors, attention_matrix = params
    last_position_projector = generate_position_projector(attention_matrix, pos_factors)

    userid = data_description["users"]
    itemid = data_description["items"]
    posid = data_description["positions"]

    # Group rows by user and order each user's history by position.
    tset_data = data.sort_values([userid, posid])
    useridx = tset_data[userid].values
    itemidx = tset_data[itemid].values

    # Offsets where a new user's block starts, plus the end offset.
    # Neighbouring ids are compared directly instead of padding with sentinel ids
    # (0 and 1), which would hide a boundary whenever the first user id is 0 or
    # the last user id is 1.
    is_boundary = np.ones(len(useridx) + 1, dtype=bool)
    is_boundary[1:-1] = useridx[1:] != useridx[:-1]
    indptr, = np.where(is_boundary)

    scores = user_scoring(indptr, itemidx, item_factors, last_position_projector)
    return scores

def user_scoring(indptr, indices, item_factors, last_position_projector):
    """Compute item scores for a batch of users stored in a flat, CSR-like layout.

    Parameters
    ----------
    indptr : array-like of int
        Block offsets into `indices`; the first and last entries are ignored
        and the inner ones are used as split points between users.
    indices : numpy.ndarray
        Item indices of all users, concatenated user by user.
    item_factors : numpy.ndarray
        Item factor matrix; its number of rows sets the width of the result.
    last_position_projector : array-like
        Passed through to `sequences_score`.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of user blocks, number of items).
    """
    per_user_items = np.array_split(indices, indptr[1:-1])
    scores = np.zeros((len(per_user_items), item_factors.shape[0]))
    # Each `row` is a view into `scores`, so it is filled in place.
    for row, user_items in zip(scores, per_user_items):
        row[:] = sequences_score(user_items, item_factors, last_position_projector)
    return scores

def sequences_score(seq, item_factors, last_position_projector):
    """Score all items for one user from the user's most recent interactions.

    Only the last `len(last_position_projector) - 1` items of `seq` are used.
    Their factors are combined with the projector weights that sit immediately
    before the projector's final entry, and the combined latent vector is
    projected back onto all items.

    Parameters
    ----------
    seq : numpy.ndarray
        Item indices of a single user, ordered from oldest to newest.
    item_factors : numpy.ndarray
        Item factor matrix with one row per item.
    last_position_projector : array-like
        Per-position weights. NOTE(review): assumed to be one weight per
        position (1-D) — confirm against `generate_position_projector`.

    Returns
    -------
    numpy.ndarray
        Scores for every item.
    """
    max_history = len(last_position_projector) - 1
    recent_items = seq[-max_history:]
    profile_factors = item_factors[recent_items, :]
    history_len = profile_factors.shape[0]
    # Weights of the `history_len` positions preceding the final one.
    position_weights = last_position_projector[-(history_len + 1):-1]
    latent_profile = profile_factors.T @ position_weights
    return item_factors @ latent_profile

In [31]:
# Score all items for every validation user from their interaction histories.
tf_scores = tf_scoring(tf_params, testset_valid, data_description)

  warn("Constructing a DIA matrix with %d diagonals "


In [32]:
# Exclude items the users have already interacted with from the recommendations.
# NOTE(review): the return value is not captured, so `tf_scores` is presumably modified
# in place — confirm in evaluation.downvote_seen_items.
downvote_seen_items(tf_scores, testset_valid, data_description)

In [33]:
# Take the top-10 items per user and evaluate them against the validation holdout.
# NOTE(review): model_evaluate returns three numbers (see the cell output); their meaning
# (presumably hit rate, MRR and coverage) should be confirmed in the evaluation module.
tf_recs = topn_recommendations(tf_scores, topn=10)
model_evaluate(tf_recs, holdout_valid, data_description)

(0.056, 0.017387301587301587, 0.14786418400876233)

# Baseline

## PureSVD

In [34]:
def matrix_from_observations(data, data_description):
    """Build a sparse user-item feedback matrix from an interactions table.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations with integer user and item index columns and a feedback
        column, named by `data_description`.
    data_description : dict
        Keys read here: 'users', 'items', 'feedback' and, optionally, 'n_items'.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of dtype float64 with one row per user index (0 .. max user index).
        When 'n_items' is given, the matrix always has exactly that many columns;
        otherwise the column count is inferred from the largest item index.

    Notes
    -----
    Fixing the number of columns matters for data that does not contain the
    highest-indexed item (e.g. a test set): an inferred shape would then be
    narrower than the item factors it is multiplied with.
    """
    useridx = data[data_description['users']].values
    itemidx = data[data_description['items']].values
    values = data[data_description['feedback']].values

    n_items = data_description.get('n_items')
    shape = None  # let scipy infer the shape when the item count is unknown
    if n_items is not None and len(useridx) > 0:
        shape = (int(useridx.max()) + 1, int(n_items))
    return csr_matrix((values, (useridx, itemidx)), shape=shape, dtype='f8')


def build_svd_model(config, data, data_description):
    """Fit the PureSVD baseline: truncated SVD of the user-item feedback matrix.

    Parameters
    ----------
    config : dict
        Must contain 'rank', the number of singular triplets to compute.
    data : pandas.DataFrame
        Training observations, converted with `matrix_from_observations`.
    data_description : dict
        Column description forwarded to `matrix_from_observations`.

    Returns
    -------
    tuple
        `(item_factors, singular_values)`: a C-contiguous array with one row
        per item, and the singular values. Both are the reversed output of `svds`.
    """
    interactions = matrix_from_observations(data, data_description)
    # Only right singular vectors are requested; the first returned element is unused.
    _, sigma, right_vectors = svds(
        interactions, k=config['rank'], return_singular_vectors='vh'
    )
    # Reverse the order in which `svds` returns the singular triplets.
    item_factors = np.ascontiguousarray(right_vectors[::-1].T)
    singular_values = sigma[::-1]
    return item_factors, singular_values

In [35]:
# Fit the PureSVD baseline with 50 latent factors.
# Note: it is trained on `training` (complete user histories), not on the
# position-truncated `training_data` used for the tensor model.
svd_config = {'rank': 50}
svd_params = build_svd_model(svd_config, training, data_description)

In [36]:
def svd_model_scoring(params, data, data_description):
    """Score all items for the users in `data` with the PureSVD model (folding-in).

    Parameters
    ----------
    params : tuple
        `(item_factors, singular_values)` as returned by `build_svd_model`.
        The singular values are not used for scoring.
    data : pandas.DataFrame
        Test interactions with the user, item and feedback columns named in
        `data_description`. User ids may be arbitrary; they are re-encoded here.
    data_description : dict
        Column description; 'users' names the user id column.

    Returns
    -------
    numpy.ndarray
        One row of item scores per user, with users in ascending id order.
    """
    item_factors, _ = params
    userid = data_description['users']
    # Re-encode user ids to 0..n-1 in ascending id order. sort=True makes the row order
    # independent of how `data` happens to be ordered, so it matches frames sorted by user id.
    user_codes, _ = pd.factorize(data[userid], sort=True)
    test_data = data.assign(**{userid: user_codes})
    test_matrix = matrix_from_observations(test_data, data_description)
    # Project user rows onto the latent item space and back onto all items.
    scores = test_matrix.dot(item_factors) @ item_factors.T
    return scores

In [37]:
# Score all items for every validation user with the PureSVD baseline.
svd_scores = svd_model_scoring(svd_params, testset_valid, data_description)

In [38]:
# Exclude already seen items from the baseline recommendations.
# NOTE(review): the return value is not captured, so `svd_scores` is presumably modified
# in place — confirm in evaluation.downvote_seen_items.
downvote_seen_items(svd_scores, testset_valid, data_description)

In [39]:
# Top-10 recommendations of the baseline, evaluated on the same validation holdout
# and with the same metrics as the tensor model for a direct comparison.
svd_recs = topn_recommendations(svd_scores, topn=10)
model_evaluate(svd_recs, holdout_valid, data_description)

(0.052, 0.015026984126984125, 0.15580503833515882)