In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, diags, isspmatrix
from sklearn.metrics.pairwise import cosine_similarity

from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

from dataprep import transform_indices, verify_time_split
from evaluation import topn_recommendations, model_evaluate, downvote_seen_items

In [190]:
def generate_interactions_matrix(data, data_description, rebase_users=False):
    '''
    Convert pandas dataframe with interactions into a sparse matrix.
    Allows reindexing user ids, which help ensure data consistency
    at the scoring stage (assumes user ids are sorted in scoring array).
    '''
    n_users = data_description['n_users']
    n_items = data_description['n_items']
    print(n_users, n_items)
    # get indices of observed data
    user_idx = data[data_description['users']].values
    if rebase_users:
        user_idx, user_index = pd.factorize(user_idx, sort=True)
        n_users = len(user_index)
    item_idx = data[data_description['items']].values
    feedback = data[data_description['feedback']].values
    # construct rating matrix
    print(user_idx.shape, item_idx.shape)
    return csr_matrix((feedback, (user_idx, item_idx)), shape=(n_users, n_items))

def to_numeric_id(data, field):
    '''
    Get new contiguous index by converting the data field
    into categorical values.
    '''
    idx_data = data[field].astype("category")
    idx = idx_data.cat.codes
    idx_map = idx_data.cat.categories.rename(field)
    return idx, idx_map

def cosine_similarity_zd(*args):
    '''Build cosine similarity matrix with zero diagonal.'''
    similarity = cosine_similarity(*args, dense_output=False)
    similarity.setdiag(0)
    similarity.eliminate_zeros()
    return similarity.tocsr()

# Task

Implement two variants of user-based KNN for the top-$n$ recommendations task when:
1. similarity matrix is symmetric,
2. similarity matrix is asymmetric.

Recall, there's no reason for implementing row-wise weighting scheme in user-based KNN. So choose the weighting scheme wisely.

 In your experiments:  
- Test your solution against both weak and strong generalization. 
  - In total you'll have 4 different experiments.
- Follow the "most-recent-item" sampling strategy for constructing holdout.
  - Explain potential issues of this scheme in relation to both weak and strong generalization.  
- Report evaluation metrics, compare the models, and analyse the results.  
- Use Movielens-1M data.

**Note**: you can reuse some code from seminars if necessary.

In [3]:
data = get_movielens_data(include_time=True)

# Weak generalization test

## Preparing data (1 pts)

Your task is
- split data into training and holdout parts
- build a new internal contiguous representation of user and item index based on the training data
- make sure same index is used in the holdout data

In [4]:
# split most recent holdout item from each user
training_, holdout_ = leave_one_out(
    data,
    target='timestamp',
    sample_top=True,
    random_state=0
)

# check correct time splitting
verify_time_split(training_, holdout_)

In [5]:
# reindex data to make contiguous index starting from 0 for user and item IDs
training, data_index = transform_indices(training_, 'userid', 'movieid')

# apply new index to the holdout data
holdout = reindex(holdout_, data_index.values(), filter_invalid=True)
holdout = holdout.sort_values('userid')

Filtered 2 invalid observations.


- Let's also populate data description dictionary for convenience.
- It allows using uniform names for users and items field.
  - This way the code does't depend on the actual names in you dataset.
  - So later you can easily switch to another dataset without changing the code fo the pipeline.


In [6]:
data_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    n_users = len(data_index['users']),
    n_items = len(data_index['items']),
    test_users = holdout[data_index['users'].name].values
)

As previously, let's also explicitly store our testset (i.e., ratings of test users excluding holdout items).

In [7]:
userid = data_description['users']
seen_idx_mask = training[userid].isin(data_description['test_users'])
testset = training[seen_idx_mask]

## Models implementation

### Symmetric case (5 pts)

- You can consult the code from seminars or implement your own solution as long as it is fast enough.

- Recall that subsampling of the neighborhood not only makes the algorithm run faster, but can also improve the results.  
- **Make sure to implement some kind of neighborhood subsampling.**

In [8]:
def truncate_similarity(similarity, k=100):
    '''
    For every row in similarity matrix, pick at most k entities
    with the highest similarity scores. Disregard everything else.
    '''
    similarity = similarity.tocsr()
    inds = similarity.indices
    ptrs = similarity.indptr
    data = similarity.data
    new_ptrs = [0]
    new_inds = []
    new_data = []
    for i in range(len(ptrs)-1):
        start, stop = ptrs[i], ptrs[i+1]
        if start < stop:
            data_ = data[start:stop]
            topk = min(len(data_), k)
            idx = np.argpartition(data_, -topk)[-topk:]
            new_data.append(data_[idx])
            new_inds.append(inds[idx+start])
            new_ptrs.append(new_ptrs[-1]+len(idx))
        else:
            new_ptrs.append(new_ptrs[-1])
    new_data = np.concatenate(new_data)
    new_inds = np.concatenate(new_inds)
    truncated = csr_matrix(
        (new_data, new_inds, new_ptrs),
        shape=similarity.shape
    )
    return truncated  

In [9]:
def build_uknn_model(config, data, data_description):
    user_item_mtx = generate_interactions_matrix(data, data_description)
    # compute similarity matrix and normalization coefficients
    user_similarity = truncate_similarity(
        cosine_similarity_zd(user_item_mtx),
        k = config['n_neighbors']
    )
    weighted = config['weighted']
    return user_item_mtx, user_similarity, weighted

def uknn_model_scoring(params, testset, testset_description):
    user_item_mtx, user_similarity, weighted = params
    test_users = testset_description['test_users']
    
    scores = user_similarity.dot(user_item_mtx)
    
    if not weighted:
        return scores.toarray()[test_users, :]
    
    normalizer = user_similarity.dot(user_item_mtx.astype('bool'))
    scores = np.nan_to_num(np.divide(scores, normalizer))
    return np.array(scores[test_users, :])

In [10]:
n_neighbors = 100

uknn_params_uw = build_uknn_model(
    {'weighted': False, 'n_neighbors': n_neighbors}, training, data_description
)
uknn_params_ew = build_uknn_model(
    {'weighted': True, 'n_neighbors': n_neighbors}, training, data_description
)

In [11]:
uknn_scores_uw = uknn_model_scoring(uknn_params_uw, None, data_description)
uknn_scores_ew = uknn_model_scoring(uknn_params_ew, None, data_description)

In [12]:
downvote_seen_items(uknn_scores_uw, testset, data_description)
downvote_seen_items(uknn_scores_ew, testset, data_description)

In [14]:
uknn_recs_uw = topn_recommendations(uknn_scores_uw)
uknn_recs_ew = topn_recommendations(uknn_scores_ew)

Note: recommending items from user history doesn't make sense.

In [15]:
modes = ['unweighted', 'elementwise']
uknn_recs = dict(zip(modes, [uknn_recs_uw, uknn_recs_ew]))


uknn_metrics = {}
for mode, recs in uknn_recs.items():
    uknn_metrics[mode] = metrics = model_evaluate(recs, holdout, data_description)
    print(
        f'Weighting mode: {mode}\n'\
        'HR={:.3}, MRR={:.3}, COV={:.3}\n'.format(*metrics)
    )

Weighting mode: unweighted
HR=0.085, MRR=0.0286, COV=0.176

Weighting mode: elementwise
HR=0.000994, MRR=0.000426, COV=0.755



### Asymmetric case (5 pts)

- Your task here is to implement user-based KNN with asymmetric similarity.

$$R = KD^{-\alpha}A $$

In [68]:
def build_uknn_model_asym(config, data, data_description):
    user_item_mtx = generate_interactions_matrix(data, data_description)
    # compute similarity matrix and normalization coefficients
    user_similarity = truncate_similarity(
        cosine_similarity_zd(user_item_mtx),
        k = config['n_neighbors']
    )
    
    D = np.array(user_similarity.sum(axis=-1)).squeeze()
    normalizer = diags(1 / D)
    #print(user_similarity.shape, normalizer.shape)
    user_similarity = user_similarity.dot(normalizer)
    
    return user_item_mtx, user_similarity


def uknn_model_scoring_asym(params, testset, testset_description):
    user_item_mtx, user_similarity = params
    test_users = testset_description['test_users']
    scores = user_similarity.dot(user_item_mtx)
    return scores[test_users, :].toarray()

In [69]:
uknn_params_asym = build_uknn_model_asym(
    {'weighted': False, 'n_neighbors': n_neighbors}, training, data_description
)

In [70]:
uknn_scores_asym = uknn_model_scoring_asym(uknn_params_asym, None, data_description)

 ## Evaluation (1 pts)

#### Generate top-$n$ recommendations for both models

In [76]:
downvote_seen_items(uknn_scores_uw, testset, data_description)
downvote_seen_items(uknn_scores_asym, testset, data_description)
uknn_recs = topn_recommendations(uknn_scores_uw)
uknn_recs_asym = topn_recommendations(uknn_scores_asym)

### Calculate metrics

In [77]:
modes = ['symmetric', 'asymmetric']
uknn_recs = dict(zip(modes, [uknn_recs, uknn_recs_asym]))


uknn_metrics = {}
for mode, recs in uknn_recs.items():
    if recs is None: continue
    uknn_metrics[mode] = metrics = model_evaluate(recs, holdout, data_description)
    print(
        f'Similarity type: {mode}\n'\
        'HR={:.3}, MRR={:.3}, COV={:.3}\n'.format(*metrics)
    )

Similarity type: symmetric
HR=0.085, MRR=0.0286, COV=0.176

Similarity type: asymmetric
HR=0.0874, MRR=0.0294, COV=0.19



# Strong generalization test

- Recall that in the strong generalization test you work with the warm-start scenario.
- It means that the set of test users is disjoint from the set of users in the training.
- You're provided with the basic functions to help you perform correct splitting, but there're still a few places where your input is required. Make sure you understand the logic of data splitting in this scenario.

## Preparing data (2 pts)

- Your task is to select a subset of users who have the most recent interactions in their history across entire dataset.
- You will apply holdout splitting to only this subset.
  - Think, why simply taking all users (as in weak generalization test) makes no sense in this scenario. 

In [78]:
def split_by_time(data, time_q=0.95, timeid='timestamp'):
    '''
    Split the input `data` DataFrame into two parts based on the timestamp, with the split point
    being determined by the quantile value `time_q`. The function returns a tuple `(before, after)`
    containing the two DataFrames. The `after` DataFrame contains the rows with timestamps greater
    than or equal to the split point, while the `before` DataFrame contains the remaining rows. 

    Details:
    The `quantile` method of the pandas DataFrame is used to calculate the time point (i.e., timestamp)
    that divides the data into two parts based on the given quantile value `time_q`. Specifically,
    the time point `split_timepoint` is calculated as the `time_q`th quantile of the values in the `timeid`
    column of the `data` DataFrame, using the interpolation method of `nearest`. This means that
    `split_timepoint` is the timestamp at or immediately after which `time_q` percent of the data points occur.    
    '''
    split_timepoint = data[timeid].quantile(q=time_q, interpolation='nearest')
    after = data.query(f'{timeid} >= @split_timepoint') 
    before = data.drop(after.index)
    return before, after

Firstly, you need to select a candidate subset of observations, from which you'll construct the the training, testset, and holdout datssets. Check the `split_by_time` function below and its description in the above cell.

In [79]:
before, after = split_by_time(data, time_q=0.95)

- Now it's time to perform holdout sampling based on the obtained timepoint splitting. 
- Remember, you only sample from the test users.

In [131]:
testset_part_, holdout_ = leave_one_out(
    after,          
    target='timestamp',
    sample_top=True,
    random_state=0
)

test_ids = testset_part_['userid'].values
holdout_ = holdout_[holdout_.userid.isin(test_ids)]

# verify correctness of time-based splitting,
# i.e., for each test user, the holdout contains only future interactions w.r.t to testset
verify_time_split(testset_part_, holdout_)

In [132]:
training_ = before[~before.userid.isin(test_ids)]

- Note that `testset_part_` only contains interactions of the test users **after the timepoint**.
- You need to combine it with the remaining histories of these users.

In [137]:
# combine all test users data into a single `testset_` Dataframe.
testset_ = pd.concat(
    [before[before.userid.isin(test_ids)], testset_part_],
    axis = 0,
    ignore_index=False
)

### Building internal representation of user and item index

Use the `transform_indices` function for building a contiguous index starting from 0.

In [140]:
training, data_index = transform_indices(training_, 'userid', 'movieid')

- Before applying new index to the test data:
  - note that the users in the `testset` must be the same as the users in the `holdout`.
- Below is the corresponding function `align_test_by_users` that ensures these two datasets' alignment.

In [141]:
def align_test_by_users(testset, holdout):
    test_users = np.intersect1d(holdout['userid'].values, testset['userid'].values)
    # only allow the same users to be present in both datasets
    testset = testset.query('userid in @test_users').sort_values('userid')
    holdout = holdout.query('userid in @test_users').sort_values('userid')
    return testset, holdout

Let's apply new item index to test data and finalize the test split:

In [142]:
holdout = reindex(holdout_, data_index['items'], filter_invalid=True)
testset = reindex(testset_, data_index['items'], filter_invalid=True)

testset, holdout = align_test_by_users(testset, holdout)

Filtered 5 invalid observations.
Filtered 108 invalid observations.


- Think why we do not apply new index to users here.

## Models implementation

- In this section you'll need to implement user-based KNN models for the warm-start scenario.
- Think carefully which data must be generated at the build time and which data must be generated in the scoring function.

### Symmetric case (5 pts)

In [204]:
def build_uknn_model(config, data, data_description):
    user_item_mtx = generate_interactions_matrix(data, data_description)
    weighted = config['weighted']
    return user_item_mtx, weighted

def uknn_model_scoring(params, testset, testset_description):
    user_item_mtx = params
    user_item_mtx_test = generate_interactions_matrix(testset, testset_description)
    print(user_item_mtx_test.shape)
    
    user_similarity = truncate_similarity(
        cosine_similarity_zd(user_item_mtx, user_item_mtx_test),
        k = config['n_neighbors']
    )
    
    scores = user_similarity.dot(user_item_mtx)
    
    if not weighted:
        return scores.toarray()
    
    normalizer = user_similarity.dot(user_item_mtx.astype('bool'))
    scores = np.nan_to_num(np.divide(scores, normalizer))
    return np.array(scores)

In [205]:
uknn_params = build_uknn_model(
    {'weighted': False, 'n_neighbors': n_neighbors}, training, data_description
)

6040 3704
(732119,) (732119,)


In [206]:
def cosine_similarity_zd(m1, m2):
    print(m1, m2.shape)
    '''Build cosine similarity matrix with zero diagonal.'''
    similarity = cosine_similarity(m1, m2, dense_output=False)
    similarity.setdiag(0)
    similarity.eliminate_zeros()
    return similarity.tocsr()

In [203]:
test_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    n_users = len(testset['userid']),
    n_items = len(testset['movieid'])
)


uknn_scores = uknn_model_scoring(uknn_params, testset, test_description)

263087 263087
(263087,) (263087,)
(<6040x3704 sparse matrix of type '<class 'numpy.int64'>'
	with 732119 stored elements in Compressed Sparse Row format>, False) (263087, 263087)


ValueError: setting an array element with a sequence.

In [209]:
training.userid.unique().shape

(5261,)

In [208]:
data_description

{'users': 'userid',
 'items': 'movieid',
 'feedback': 'rating',
 'n_users': 6040,
 'n_items': 3704,
 'test_users': array([   0,    1,    2, ..., 6037, 6038, 6039])}

### Asymmetric case (5 pts)

In [None]:
def build_uknn_model_asym(config, data, data_description):
    
    return ...

def uknn_model_scoring_asym(params, testset, testset_description):

    return ...

In [None]:
uknn_params_asym = ...

In [None]:
uknn_scores_asym = ...

 ## Evaluation (1 pts)

### Generate recommendations for both models

In [None]:
uknn_recs = ...

In [None]:
uknn_recs_asym = ...

### Calculate metrics

In [None]:
modes = ['symmetric', 'asymmetric']
uknn_recs = dict(zip(modes, [uknn_recs, None]))


uknn_metrics = {}
for mode, recs in uknn_recs.items():
    if recs is None: continue
    uknn_metrics[mode] = metrics = model_evaluate(recs, holdout, data_description)
    print(
        f'Similarity type: {mode}\n'\
        'HR={:.3}, MRR={:.3}, COV={:.3}\n'.format(*metrics)
    )

## Tuning (2 pts)
- Try to find a neighborhood size that gives you better results.
- Perform a simple grid-search experiment and report your findings.

# Final analysis (3 pts)

1. Provide an analysis on which model performs the best and explain why.
2. Explain the difference in computational complexity of your models. Consider how the training and the recommendation generation differ for different models in terms of
    - the amount of RAM,
    - the amount of disk storage,
    - the load on CPU.
3. How else would you modify the model to improve either the quality of recommendations or computational performance? Describe at least one modification and its envisioned effect.