In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import diags
from scipy.sparse.linalg import svds
from tqdm import tqdm

from dataprep import transform_indices, reindex_data, generate_interactions_matrix
from evaluation import downvote_seen_items, topn_recommendations


from scipy.sparse import csr_matrix
from scipy.sparse import hstack as sp_hstack
from scipy.sparse import diags as spdiags
from scipy.sparse import eye as speye

from polara.preprocessing.dataframes import matrix_from_observations


In [3]:
data = pd.read_csv('archive/animelists_cleaned.csv')

In [4]:
df_users = pd.read_csv('archive/users_cleaned.csv')

In [5]:
username2userid = df_users.set_index('username').to_dict()['user_id']

In [6]:
data['userid'] = data['username'].apply(lambda x: username2userid[x])

In [7]:
data.head()

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags,userid
0,karthiga,21,586,0000-00-00,0000-00-00,9,1,,0,2013-03-03 10:52:53,,2255153
1,karthiga,59,26,0000-00-00,0000-00-00,7,2,,0,2013-03-10 13:54:51,,2255153
2,karthiga,74,26,0000-00-00,0000-00-00,7,2,,0,2013-04-27 16:43:35,,2255153
3,karthiga,120,26,0000-00-00,0000-00-00,7,2,,0,2013-03-03 10:53:57,,2255153
4,karthiga,178,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-03-27 15:59:13,,2255153


In [8]:
train_matrix, user_index, anime_index = matrix_from_observations(
    data, userid='userid', itemid='anime_id', feedback='my_score'
)

In [9]:
df = data[['userid', 'anime_id', 'my_score', 'my_last_updated']]

In [10]:
df = df.rename(columns={"userid": "userid", "anime_id": "movieid", 'my_last_updated':'timestamp', 'my_score':'rating'})
df = df.loc[df.rating > 0]
df

Unnamed: 0,userid,movieid,rating,timestamp
0,2255153,21,9,2013-03-03 10:52:53
1,2255153,59,7,2013-03-10 13:54:51
2,2255153,74,7,2013-04-27 16:43:35
3,2255153,120,7,2013-03-03 10:53:57
4,2255153,178,7,2013-03-27 15:59:13
...,...,...,...,...
31284025,4862000,15611,9,2015-09-07 17:33:03
31284026,4862000,27815,9,2015-09-07 17:32:05
31284027,299167,5945,8,2010-03-29 04:24:12
31284028,263803,1316,9,2009-12-23 05:45:14


In [11]:
def plot_monthly_activity(data, ax=None, label=None, timeid='my_last_updated'):
    return (data
        .set_index(pd.to_datetime(data[timeid], unit='s'))
        .resample('M').size()
        .plot(ax=ax, logy=True, label=label, xlabel='Month',
              title='Monthly rating activity', legend=label is not None)
    )

In [12]:
def timepoint_split(data, time_split_q=0.95):
    """
    Split data into training, testset, and holdout datasets based on a timepoint split
    and according to the `warm-start` evaluation strategy.

    Parameters
    ----------
    data : pd.DataFrame
        The input dataset containing columns `userid`, `movieid`, and `timestamp`.
    time_split_q : float, optional
        The quantile value used to split the dataset based on the `timestamp` column.
        Default is 0.95.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
        A tuple of three pandas DataFrames: training, testset, and holdout.
        `training` is a subset of `data` used for training the recommender system.
        `testset` is a subset of `data` used for generating recommendations for the test users.
        `holdout` is a subset excluded from `testset` containing only the most recent interactions for each test user.

    Notes
    -----
    The function splits the input `data` into three subsets: `training`, `testset`, and `holdout`.
    The split is performed based on the `timestamp` column of `data`, using `time_split_q` as the quantile value.
    The `holdout` dataset contains only the immediate interactions following the fixed timepoint for each test user from the `testset`.
    The set of users in `training` is disjoint with the set of users in the `testset`, which implements the `warm-start` scenario.
    """    
    timepoint = data.timestamp.quantile(q=time_split_q, interpolation='nearest')
    test_ = data.query('timestamp >= @timepoint')
    rest_ = data.drop(test_.index)
    holdout_ = (
        test_
        .sort_values('timestamp')
        .drop_duplicates(subset=['userid'], keep='first')
    )
    # the holdout dataframe contains interactions closest to certain timepoint from the right,
    # i.e., the corresponding items are the first in each test user profile after this timepoint
    training = rest_.query('userid not in @holdout_.userid')
    train_items = training.movieid.unique()
    testset_ = rest_.query('userid in @holdout_.userid and movieid in @train_items')
    test_users = testset_.userid.unique()
    holdout = holdout_.query(
        # if user is not in `test_users` then no evluation is possible,
        # if item is not in `train_items` it's cold start -> must be excluded
        'userid in @test_users and movieid in @train_items'
    ).sort_values('userid')
    testset = testset_.query(
        # make sure testset and holdout contain the same set of users
        'userid in @holdout.userid'
    ).sort_values('userid')
    return training, testset, holdout

In [13]:
topn = 5

In [14]:
train_val_, testset_val_, holdout_val_ = timepoint_split(df, time_split_q=0.96)

In [15]:
train_val_

Unnamed: 0,userid,movieid,rating,timestamp
0,2255153,21,9,2013-03-03 10:52:53
1,2255153,59,7,2013-03-10 13:54:51
2,2255153,74,7,2013-04-27 16:43:35
3,2255153,120,7,2013-03-03 10:53:57
4,2255153,178,7,2013-03-27 15:59:13
...,...,...,...,...
31284025,4862000,15611,9,2015-09-07 17:33:03
31284026,4862000,27815,9,2015-09-07 17:32:05
31284027,299167,5945,8,2010-03-29 04:24:12
31284028,263803,1316,9,2009-12-23 05:45:14


In [16]:
testset_val_

Unnamed: 0,userid,movieid,rating,timestamp
2910550,4,120,8,1970-01-01 00:00:00
2911167,4,2899,7,2011-07-14 03:32:17
2911165,4,2615,5,2012-07-18 12:11:13
2911164,4,2385,5,2007-05-28 23:36:51
2911163,4,2298,5,2007-05-29 22:42:41
...,...,...,...,...
31203572,6838054,11617,6,2017-12-28 19:11:28
31203564,6838054,1535,9,2017-12-28 19:25:32
31203561,6838054,226,8,2017-12-28 19:15:09
31203586,6838054,30276,7,2017-12-28 19:23:10


In [14]:
def transform_data(train, test, holdout=None, userid='userid', itemid='movieid'):
    train_new, data_index = transform_indices(train, 'userid', 'movieid')
    testset_new = reindex_data(test, data_index, fields='items')
    if holdout is None:
        return train_new, testset_new, data_index
    holdout_new = reindex_data(holdout, data_index, fields='items')
    return train_new, testset_new, holdout_new, data_index

In [15]:
train_val, testset_val, holdout_val, data_index = transform_data(train_val_, testset_val_, holdout_val_, userid='userid', itemid='movieid')

In [86]:
holdout_val

Unnamed: 0,userid,movieid,rating,timestamp
2910863,4,5855,7,2018-04-14 14:16:48
1371674,20,5874,7,2018-02-11 12:14:16
25885045,36,5867,8,2018-01-02 16:06:39
10577496,359,5953,7,2017-12-31 01:06:38
4925473,368,5418,8,2017-12-29 16:39:24
...,...,...,...,...
4651716,6835467,3987,9,2018-01-01 19:00:01
26279407,6835647,5502,9,2018-01-01 03:43:52
13947635,6837484,5218,10,2018-01-02 04:20:33
27633692,6838022,3590,7,2017-12-29 01:47:15


In [83]:
def evaluate(recommended_items, holdout, holdout_description, topn=5):
    itemid = holdout_description['items']
    holdout_items = holdout[itemid].values
    assert recommended_items.shape[0] == len(holdout_items)
    hits_mask = recommended_items[:, :topn] == holdout_items.reshape(-1, 1)
    # HR calculation
    hr = np.mean(hits_mask.any(axis=1))
    precision = []
    for i in range(recommended_items.shape[0]):
        precision.append(np.intersect1d(recommended_items[i], holdout_items).size / 5)
    
    
    # MRR calculation
    n_test_users = recommended_items.shape[0]
    hit_rank = np.where(hits_mask)[1] + 1.0
    mrr = np.sum(1 / hit_rank) / n_test_users
    # coverage calculation
    n_items = holdout_description['n_items']
    cov = np.unique(recommended_items).size / n_items
    return hr, mrr, cov, np.mean(precision)


In [79]:
holdout_val['movieid'].values.size

34479

In [84]:
recs = topn_recommendations(scores, topn=5)
evaluate(recs, holdout_val, data_description)

(0.048435279445459556,
 0.02726007134777691,
 0.4632646276595745,
 0.9348588996200586)

In [45]:
a = np.array([[1, 2], [3, 4]])
a.sum(axis=0)

array([4, 6])

In [17]:
def ssvd_grid_search(ranks, scalings, training, testset, holdout, data_description, topn = 20):
    max_rank = max(ranks)
    config = {'rank': max_rank}
    results = {}
    for scaling in tqdm(scalings):
        config['scaling'] = scaling
        item_factors, scaling_weights = build_ssvd_model(config, training, data_description)
        for rank in ranks:
            item_factors_trunc = item_factors[:, :rank]
            scores = ssvd_model_scoring((item_factors_trunc, scaling_weights), testset, data_description)
            recs = topn_recommendations(scores, topn=topn)
            results[(rank, scaling)] = evaluate(recs, holdout, data_description)
    return results

def build_ssvd_model(config, data, data_description):
    source_matrix = generate_interactions_matrix(data, data_description, rebase_users=False)
    scaled_matrix, scaling_weights = rescale_matrix(source_matrix, config['scaling'])
    *_, vt = svds(scaled_matrix, k=config['rank'], return_singular_vectors='vh')
    item_factors = np.ascontiguousarray(vt[::-1, :].T)
    return item_factors, scaling_weights

def rescale_matrix(matrix, scaling_factor):
    frequencies = matrix.getnnz(axis = 0)
    scaling_weights = np.power(frequencies, 0.5 * (scaling_factor - 1))
    return matrix.dot(diags(scaling_weights)), scaling_weights
    
def ssvd_model_scoring(params, data, data_description):
    item_factors, scaling_weights = params
    test_matrix = generate_interactions_matrix(data, data_description, rebase_users=True)
    scores = test_matrix.dot(item_factors) @ item_factors.T
    downvote_seen_items(scores, data, data_description)
    return scores

In [18]:
scalings = [0.2, 0.4, 0.6]
ranks = [b * 2 ** i for i in range(3, 9) for b in [2, 3]]
ranks

[16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 768]

In [19]:
data_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    n_users = len(data_index['users']),
    n_items = len(data_index['items'])
)

In [20]:
ssvd_results = ssvd_grid_search(ranks, scalings, train_val, testset_val, holdout_val, data_description, topn = 5)

100%|██████████| 3/3 [06:44<00:00, 134.80s/it]


In [21]:
ssvd_results

{(16, 0.2): (0.031961483801734386, 0.017380144435743497, 0.18617021276595744),
 (24, 0.2): (0.03338263870761913, 0.01910293221961194, 0.21974734042553193),
 (32, 0.2): (0.03547086632442936, 0.02022003731740093, 0.24750664893617022),
 (48, 0.2): (0.037037037037037035, 0.02176252211491052, 0.28407579787234044),
 (64, 0.2): (0.0388352330404014, 0.0220530371143788, 0.30851063829787234),
 (96, 0.2): (0.039908350010151104, 0.022947784641859297, 0.3390957446808511),
 (128, 0.2): (0.04350474201687984, 0.02486537699275888, 0.3617021276595745),
 (192, 0.2): (0.04541895066562255, 0.026564478861529243, 0.40242686170212766),
 (256, 0.2): (0.046550073958061426, 0.02734901437590031, 0.42470079787234044),
 (384, 0.2): (0.048435279445459556, 0.02726007134777691, 0.4632646276595745),
 (512, 0.2): (0.047942225702601586, 0.026793120450129063, 0.48487367021276595),
 (768, 0.2): (0.046231039183270976, 0.02580266249021143, 0.5272606382978723),
 (16, 0.4): (0.03413672090257838, 0.019241180622021133, 0.1707114

In [27]:
pd.Series(ssvd_results).idxmax()

(384, 0.2)

In [23]:
best_config = dict(
    zip(
        ['rank', 'scaling'],
        # pd.Series(ssvd_results).idxmax()
        (384, 0.2)
    )
)

In [24]:
ssvd_params = build_ssvd_model(best_config, train_val, data_description)

In [30]:
scores = ssvd_model_scoring(ssvd_params, testset_val, data_description)


In [57]:
recs = topn_recommendations(scores, topn=topn)
evaluate(recs, holdout_val, data_description)

[560 354 300 234 222] (5,) [0 0 0 ... 1 0 0] (34479,)


(0.048435279445459556,
 0.02726007134777691,
 0.4632646276595745,
 0.048435279445459556)

In [32]:
holdout_val

Unnamed: 0,userid,movieid,rating,timestamp
2910863,4,5855,7,2018-04-14 14:16:48
1371674,20,5874,7,2018-02-11 12:14:16
25885045,36,5867,8,2018-01-02 16:06:39
10577496,359,5953,7,2017-12-31 01:06:38
4925473,368,5418,8,2017-12-29 16:39:24
...,...,...,...,...
4651716,6835467,3987,9,2018-01-01 19:00:01
26279407,6835647,5502,9,2018-01-01 03:43:52
13947635,6837484,5218,10,2018-01-02 04:20:33
27633692,6838022,3590,7,2017-12-29 01:47:15


In [33]:
data_index['items']

Int64Index([    1,     5,     6,     7,     8,    15,    16,    17,    18,
               19,
            ...
            36501, 36502, 36519, 36520, 36524, 36539, 36542, 36825, 36848,
            36861],
           dtype='int64', name='movieid', length=6016)

In [35]:
user_recs = np.zeros((holdout_val.userid.nunique(), 5))
for i in range(len(user_recs)):
    user_recs[i, :] = np.random.choice(data_index['items'], 5)
evaluate(user_recs, holdout_val, data_description)

(0.00026102845210127906, 0.0001435656486557035, 1.0)

In [38]:
user_recs = np.zeros((holdout_val.userid.nunique(), 5))
for i in range(len(user_recs)):
    user_recs[i, :] = [20, 269, 22319, 6702, 1535]
evaluate(user_recs, holdout_val, data_description)

(0.0019722149714318862, 0.0019722149714318862, 0.0008311170212765958)

Int64Index([    1,     5,     6,     7,     8,    15,    16,    17,    18,
               19,
            ...
            36501, 36502, 36519, 36520, 36524, 36539, 36542, 36825, 36848,
            36861],
           dtype='int64', name='movieid', length=6016)

In [25]:
ssvd_params[0]

array([[-0.04290313,  0.0356894 , -0.04045547, ...,  0.01433141,
        -0.01826316, -0.0005752 ],
       [-0.02952815,  0.03688406, -0.02922705, ...,  0.01791157,
        -0.01926287, -0.00709241],
       [-0.03680309,  0.03662262, -0.02320313, ..., -0.03621926,
        -0.01317875, -0.02844686],
       ...,
       [-0.00012369, -0.00014894, -0.00023153, ..., -0.00093735,
         0.00146778,  0.00151106],
       [-0.00026807, -0.00087081, -0.00042965, ...,  0.00541199,
         0.00826579, -0.01536734],
       [-0.00027395, -0.00087632, -0.00052325, ...,  0.00758282,
         0.00303888, -0.00079305]])

In [31]:
np.save('item_matrix_v4', ssvd_params[0])

In [32]:
np.save('weights_v4', ssvd_params[1])

In [33]:
np.save('itemid_v2', data_index['items'])