# CUHK [STAT3009](https://www.bendai.org/STAT3009/) Notebook2: Correlation-based Recommender Systems

## Load dataset from pro-processed .npy files by using `np.load`

In [1]:
import numpy as np
import pandas as pd

train_pair = np.load('../dataset/train_pair.npy')
test_pair = np.load('../dataset/test_pair.npy')
train_ratings = np.load('../dataset/train_ratings.npy')
test_ratings = np.load('../dataset/test_ratings.npy')
n_user, n_item = train_pair[:,0].max()+1, train_pair[:,1].max()+1

## Implement Correlation-based (user-based) recommender systems
- Inpout: training set.

- Output: return predicted ratings for (user id, item id) user-item pairs in testing set.

- Goal: make prediction for testing set

## Algo 1: Correlation-based (user-based) recommender systems (stored sim-matrix)

In [2]:
pred_algo1 = np.zeros(len(test_ratings))
# Step 1: compute mean-centering ratings
MC_ratings = train_ratings.copy()
for u in range(n_user):
    # find all indice for user-u use np.where
    index_tmp = np.where(train_pair[:,0] == u)[0]
    if len(index_tmp) == 0:
        continue
    MC_ratings[index_tmp] = MC_ratings[index_tmp] - MC_ratings[index_tmp].mean()
print(MC_ratings)

[-0.96121666  2.89458581 -0.37234958 ... -2.72906623  2.05574082
  0.70626127]


In [16]:
# Step 2(a): compute pairwise cosine similarity between all users
from numpy.linalg import norm

def cossim_user(u,v,train_pair,train_ratings):
    index_u = np.where(train_pair[:,0] == u)[0]
    index_v = np.where(train_pair[:,0] == v)[0]
    item_u = train_pair[index_u][:,1]
    item_v = train_pair[index_v][:,1]
    # find co-rating items by `set`
    item_co = list(set(item_u).intersection(set(item_v)))
    if len(item_co) <= 0:
        # a tuning parameter
        return 0.0
    else:
        # find the co-rating vectors by using `np.isin`
        vec_u, vec_v = train_ratings[index_u], train_ratings[index_v]
        vec_co_u, vec_co_v = vec_u[np.isin(item_u, item_co)], vec_v[np.isin(item_v, item_co)]
        return np.dot(vec_co_u, vec_co_v) / norm(vec_co_u) / norm(vec_co_v)

In [4]:
from scipy.sparse import csr_matrix, lil_matrix
# dense sim_matrix
S = np.zeros((n_user,n_user))
# sparse matrix 
# S = lil_matrix((n_user, n_user))
for u in range(n_user):
    for v in range(u):
        # S[u,v] = cossim_user(u,v,train_pair,train_ratings)
        # use mean-centering ratings to compute the similarity
        S[u,v] = cossim_user(u,v,train_pair,MC_ratings)
S = S + S.T

In [5]:
## Step 2(b): use `dict` to store rated users for all items
index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]

In [6]:
## Step 3: make prediction if S is stored
glb_mean = train_ratings.mean()

for j in range(len(test_pair)):
    user_tmp, item_tmp = test_pair[j,0], test_pair[j,1]
    index_tmp = index_item[item_tmp]
    rated_users = train_pair[index_tmp][:,0]
    rated_ratings = train_ratings[index_tmp]
    if len(rated_users) == 0:
        # if no rated users
        pred_algo1[j] = glb_mean
    else:
        # sim_weight = S[user_tmp, rated_users].toarray()[0]
        sim_weight = S[user_tmp, rated_users]
        # print(sim_weight)
        if max(sim_weight) == 0:
            pred_algo1[j] = glb_mean
        else:
            pred_algo1[j] = np.sum(sim_weight*rated_ratings) / np.sum(sim_weight)

### Summarize ALGO1 as a Python function

- `Input`: 'train_pair', 'train_ratings', 'test_pair'

- `Return`: 'pred_ratings'

In [2]:
from scipy.sparse import csr_matrix, lil_matrix
from numpy.linalg import norm

def cossim_user(u,v,train_pair,train_ratings):
    index_u = np.where(train_pair[:,0] == u)[0]
    index_v = np.where(train_pair[:,0] == v)[0]
    item_u = train_pair[index_u][:,1]
    item_v = train_pair[index_v][:,1]
    # find co-rating items by `set`
    item_co = list(set(item_u).intersection(set(item_v)))
    if len(item_co) <= 0:
        # a tuning parameter
        return 0.0
    else:
        # find the co-rating vectors by using `np.isin`
        vec_u, vec_v = train_ratings[index_u], train_ratings[index_v]
        vec_co_u, vec_co_v = vec_u[np.isin(item_u, item_co)], vec_v[np.isin(item_v, item_co)]
        return np.dot(vec_co_u, vec_co_v) / norm(vec_co_u) / norm(vec_co_v)

def cor_RS_user(train_pair, train_ratings, test_pair):
    n_user, n_item = train_pair[:,0].max()+1, train_pair[:,1].max()+1
    ## Step 1: compute mean-centering ratings
    MC_ratings = train_ratings.copy()
    for u in range(n_user):
        # find all indice for user-u use `np.where`
        index_tmp = np.where(train_pair[:,0] == u)[0]
        if len(index_tmp) < 2:
            continue
        MC_ratings[index_tmp] = MC_ratings[index_tmp] - MC_ratings[index_tmp].mean()
    ## Step 2(a): compute similarity matrix
    # dense sim_matrix
    S = np.zeros((n_user,n_user))
    # sparse matrix 
    # S = lil_matrix((n_user, n_user))
    for u in range(n_user):
        for v in range(u):
            # S[u,v] = cossim_user(u,v,train_pair,train_ratings)
            # use mean-centering ratings to compute the similarity
            S[u,v] = cossim_user(u,v,train_pair,MC_ratings)
    S = S + S.T
    # Step 2(b): use `list` to store rated users for all items
    index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
    ## Step 3: make prediction if S is stored
    glb_mean = train_ratings.mean()
    pred_algo1 = np.zeros(len(test_ratings))
    for j in range(len(test_pair)):
        user_tmp, item_tmp = test_pair[j,0], test_pair[j,1]
        index_tmp = index_item[item_tmp]
        rated_users = train_pair[index_tmp][:,0]
        rated_ratings = train_ratings[index_tmp]
        if len(rated_users) == 0:
            # if no rated users
            pred_algo1[j] = glb_mean
        else:
            # sim_weight = S[user_tmp, rated_users].toarray()[0]
            sim_weight = S[user_tmp, rated_users]
            # print(sim_weight)
            if max(sim_weight) == 0:
                pred_algo1[j] = glb_mean
            else:
                pred_algo1[j] = np.sum(sim_weight*rated_ratings) / np.sum(sim_weight)
    return pred_algo1

## Algo 2: Correlation-based (user-based) recommender systems (without stored sim-matrix)

In [17]:
pred_algo2 = np.zeros(len(test_ratings))

## Updated Step 3: make prediction if S is stored
glb_mean = train_ratings.mean()

for u in range(n_user):
    index_list_tmp = np.where(test_pair[:,0] == u)[0]
    if len(index_list_tmp) == 0:
        # no record to predict for this user.
        continue
    for record in index_list_tmp:
        user_tmp, item_tmp = test_pair[record,0], test_pair[record,1]
        index_tmp = index_item[item_tmp]
        rated_users = train_pair[index_tmp][:,0]
        rated_ratings = train_ratings[index_tmp]
        if len(rated_users) == 0:
            # if no rated users
            pred_algo2[record] = glb_mean
        else:
            sim_weight = [cossim_user(user_tmp,v,train_pair,MC_ratings) for v in rated_ratings]
            # print(sim_weight)
            if max(sim_weight) == 0:
                pred_algo2[record] = glb_mean
            else:
                pred_algo2[record] = np.sum(sim_weight*rated_ratings) / np.sum(sim_weight)

## Baseline + correlation

- Pre-processed ratings by Baseline methods.

- Prediction by correlation-based algorithm.

## Evaluation: compute RMSE for baseline methods 

In [14]:
## RMSE for ALGO1
rmse_crs = np.sqrt(np.mean((pred_algo1 - test_ratings)**2))
print('RMSE for correlation-base RS (ALGO1): %.3f' %rmse_crs)

RMSE for correlation-base RS (ALGO1): 2.588


In [18]:
## RMSE for ALGO2
rmse_crs_new = np.sqrt(np.mean((pred_algo2 - test_ratings)**2))
print('RMSE for correlation-base RS (ALGO2): %.3f' %rmse_crs_new)

RMSE for correlation-base RS (ALGO2): 2.588
