# CUHK [STAT3009](https://www.bendai.org/STAT3009/) Notebook2: Correlation-based Recommender Systems

## Load and pro-processed dataset

In [1]:
import numpy as np
import pandas as pd

dtrain = pd.read_csv('./dataset/train.csv')
dtest = pd.read_csv('./dataset/test.csv')
## save real ratings for test set for evaluation.
test_rating = np.array(dtest['rating'])
## remove the ratings in the test set to simulate prediction
dtest = dtest.drop(columns='rating')

## convert string to user_id and item_id -> [user_id, item_id, rating]
# pre-process for training data
train_pair = dtrain[['user_id', 'movie_id']].values
train_rating = dtrain['rating'].values
# pre-process for testing set
test_pair = dtest[['user_id', 'movie_id']].values

n_user, n_item = max(train_pair[:,0].max(), test_pair[:,0].max())+1, max(train_pair[:,1].max(), test_pair[:,1].max())+1

## Implement Correlation-based (user-based) recommender systems
- Inpout: training set.

- Output: return predicted ratings for (user id, item id) user-item pairs in testing set.

- Goal: make prediction for testing set

- Note: time-space trade-off: [how to profiling and timing Code?](https://jakevdp.github.io/PythonDataScienceHandbook/01.07-timing-and-profiling.html)

## Algo: Correlation-based (user-based) recommender systems (stored sim-matrix by a sparse matrix)

In [2]:
# Step 2(a): compute pairwise cosine similarity between all users
from numpy.linalg import norm

def cossim_user(index_u,index_v,train_pair,train_rating):
    # index_u = np.where(train_pair[:,0] == u)[0]
    # index_v = np.where(train_pair[:,0] == v)[0]
    item_u = train_pair[index_u][:,1]
    item_v = train_pair[index_v][:,1]
    # find co-rating items by `set`
    item_co = list(set(item_u).intersection(set(item_v)))
    if len(item_co) < 3:
        # a tuning parameter
        return 0.0
    else:
        # find the co-rating vectors by using `np.isin`
        vec_u, vec_v = train_rating[index_u], train_rating[index_v]
        vec_co_u, vec_co_v = vec_u[np.isin(item_u, item_co)], vec_v[np.isin(item_v, item_co)]
        return np.dot(vec_co_u, vec_co_v) / norm(vec_co_u) / norm(vec_co_v)

In [3]:
pred_algo1 = np.zeros(len(test_rating))
## Step 2(b): use `list` to store rated users and item for all items and user, respectively.
index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
index_user = [np.where(train_pair[:,0] == u)[0] for u in range(n_user)]

from scipy.sparse import lil_matrix
# dense sim_matrix
# S = np.zeros((n_user,n_user))
# sparse matrix 
S = lil_matrix((n_user, n_user))
for u in range(n_user):
    if u%500 == 0:
        print('sim-vec for user_id %d' %u)
    for v in range(u):
        if (len(index_user[u]) == 0) or (len(index_user[u]) == 0):
            continue
        S[u,v] = cossim_user(index_user[u],index_user[v],train_pair,train_rating)
S = S + S.T

sim-vec for user_id 0
sim-vec for user_id 500
sim-vec for user_id 1000
sim-vec for user_id 1500


In [4]:
glb_mean = train_rating.mean()
## Step 3: make prediction if S is stored
for j in range(len(test_pair)):
    user_tmp, item_tmp = test_pair[j,0], test_pair[j,1]
    index_tmp = index_item[item_tmp]
    rated_users = train_pair[index_tmp][:,0]
    rated_ratings = train_rating[index_tmp]
    sim_weight = S[user_tmp, rated_users].toarray()[0]
    if (len(rated_users) == 0) or (max(sim_weight) == 0):
        # if no rated users or no similar users
        index_user_tmp = index_user[user_tmp]
        if len(index_user_tmp) == 0:
            pred_algo1[j] = glb_mean
        else:
            pred_algo1[j] = train_rating[index_user_tmp].mean()
        
        # find top 5 rated-users
        # sim_weight_knn = np.zeros(len(sim_weight))
        # top_index = np.argsort(sim_weight)[-5:]
        # sim_weight_knn[top_index] = sim_weight[top_index]
        # sim_weight = S[user_tmp, rated_users]
        # print(sim_weight)
    else:
        pred_algo1[j] = np.sum(sim_weight*rated_ratings) / np.sum(sim_weight)

## Algo 2: Correlation-based (user-based) recommender systems (without stored sim-matrix)

In [5]:
pred_algo2 = np.zeros(len(test_rating))

## Step 2(b): use `list` to store rated users and item for all items and user, respectively.
index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
index_user = [np.where(train_pair[:,0] == u)[0] for u in range(n_user)]
## Updated Step 3: make prediction if S is stored
glb_mean = train_rating.mean()

for user_tmp in range(n_user):
    if user_tmp%500 == 0:
        print('predict for user_id %d' %user_tmp)
    index_user_tmp = np.where(test_pair[:,0] == user_tmp)[0]
    train_index_user_tmp = index_user[user_tmp]
    if len(index_user_tmp) == 0:
        # no record to predict for this user.
        continue
    sim_weight = [cossim_user(index_user[user_tmp],index_user[v],train_pair,train_rating) for v in range(n_user)]
    sim_weight = np.array(sim_weight)
    for record in index_user_tmp:
        item_tmp = test_pair[record,1]
        index_item_tmp = index_item[item_tmp]
        rated_users = train_pair[index_item_tmp][:,0]
        rated_ratings = train_rating[index_item_tmp]
        sim_weight_rated = sim_weight[rated_users]
        if (len(rated_users) == 0) or (max(sim_weight_rated) == 0):
            # if no rated users, then predict by user-mean
            if len(train_index_user_tmp) == 0:
                pred_algo2[record] = glb_mean
            else:
                pred_algo2[record] = train_rating[train_index_user_tmp].mean()
        else:
            pred_algo2[record] = np.sum(sim_weight_rated*rated_ratings) / np.sum(sim_weight_rated)

predict for user_id 0
predict for user_id 500
predict for user_id 1000
predict for user_id 1500


## Baseline + correlation

- Pre-processed ratings by Baseline methods.

- Prediction by correlation-based algorithm.

## Evaluation: compute RMSE for baseline methods 

In [6]:
## RMSE for ALGO1
rmse_crs = np.sqrt(np.mean((pred_algo1 - test_rating)**2))
print('RMSE for correlation-base RS (ALGO1): %.3f' %rmse_crs)

RMSE for correlation-base RS (ALGO1): 1.053


In [7]:
## RMSE for ALGO2
rmse_crs_new = np.sqrt(np.mean((pred_algo2 - test_rating)**2))
print('RMSE for correlation-base RS (ALGO2): %.3f' %rmse_crs_new)

RMSE for correlation-base RS (ALGO2): 1.053


## Summarize `correlation-based RS` as Python functions

In [1]:
from numpy.linalg import norm
from scipy.sparse import lil_matrix

def cossim_user(index_u,index_v,train_pair,train_rating):
    # index_u = np.where(train_pair[:,0] == u)[0]
    # index_v = np.where(train_pair[:,0] == v)[0]
    item_u = train_pair[index_u][:,1]
    item_v = train_pair[index_v][:,1]
    # find co-rating items by `set`
    item_co = list(set(item_u).intersection(set(item_v)))
    if len(item_co) < 3:
        # a tuning parameter
        return 0.0
    else:
        # find the co-rating vectors by using `np.isin`
        vec_u, vec_v = train_rating[index_u], train_rating[index_v]
        vec_co_u, vec_co_v = vec_u[np.isin(item_u, item_co)], vec_v[np.isin(item_v, item_co)]
        return np.dot(vec_co_u, vec_co_v) / norm(vec_co_u) / norm(vec_co_v)

def cor_rs_user(train_pair, train_rating, test_pair):
    n_user, n_item = max(train_pair[:,0].max(), test_pair[:,0].max())+1, max(train_pair[:,1].max(), test_pair[:,1].max())+1
    index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
    index_user = [np.where(train_pair[:,0] == u)[0] for u in range(n_user)]
    pred = np.zeros(len(test_pair))
    glb_mean = train_rating.mean()
    # Step 1: compute sim-matrix
    S = lil_matrix((n_user, n_user))
    for u in range(n_user):
        for v in range(u):
            if (len(index_user[u]) == 0) or (len(index_user[v]) == 0):
                continue
            S[u,v] = cossim_user(index_user[u],index_user[v],train_pair,train_rating)
    S = S + S.T
    ## Step 3: make prediction if S is stored
    for j in range(len(test_pair)):
        user_tmp, item_tmp = test_pair[j,0], test_pair[j,1]
        index_tmp = index_item[item_tmp]
        rated_users = train_pair[index_tmp][:,0]
        rated_ratings = train_rating[index_tmp]
        sim_weight = S[user_tmp, rated_users].toarray()[0]
        ## only keep top 5 closest users
        top_ind = sim_weight.argsort()[-10:][::-1]
        sim_weight_knn = np.zeros(len(sim_weight))
        sim_weight_knn[top_ind] = sim_weight[top_ind]
        if (len(rated_users) == 0) or (max(sim_weight_knn) == 0):
            # if no rated users or no similar users
            index_user_tmp = index_user[user_tmp]
            if len(index_user_tmp) == 0:
                pred[j] = glb_mean
            else:
                pred[j] = train_rating[index_user_tmp].mean()
        else:
            pred[j] = np.sum(sim_weight_knn*rated_ratings) / np.sum(sim_weight_knn)
    return pred

def cossim_item(index_i,index_j,train_pair,train_rating):
    # index_u = np.where(train_pair[:,0] == u)[0]
    # index_v = np.where(train_pair[:,0] == v)[0]
    user_i = train_pair[index_i][:,0]
    user_j = train_pair[index_j][:,0]
    # find co-rating items by `set`
    user_co = list(set(user_i).intersection(set(user_j)))
    if len(user_co) < 3:
        # a tuning parameter
        return 0.0
    else:
        # find the co-rating vectors by using `np.isin`
        vec_i, vec_j = train_rating[index_i], train_rating[index_j]
        vec_co_i, vec_co_j = vec_i[np.isin(user_i, user_co)], vec_j[np.isin(user_j, user_co)]
        return np.dot(vec_co_i, vec_co_j) / norm(vec_co_i) / norm(vec_co_j)

def cor_rs_item(train_pair, train_rating, test_pair):
    n_user, n_item = max(train_pair[:,0].max(), test_pair[:,0].max())+1, max(train_pair[:,1].max(), test_pair[:,1].max())+1
    index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
    index_user = [np.where(train_pair[:,0] == u)[0] for u in range(n_user)]
    pred = np.zeros(len(test_pair))
    glb_mean = train_rating.mean()
    # Step 1: compute sim-matrix
    S = lil_matrix((n_item, n_item))
    for i in range(n_item):
        for j in range(i):
            if (len(index_item[i]) == 0) or (len(index_item[j]) == 0):
                continue
            S[i,j] = cossim_item(index_item[i],index_item[j],train_pair,train_rating)
    S = S + S.T
    ## Step 3: make prediction if S is stored
    for j in range(len(test_pair)):
        user_tmp, item_tmp = test_pair[j,0], test_pair[j,1]
        index_tmp = index_item[item_tmp]
        rated_items = train_pair[index_tmp][:,1]
        rated_ratings = train_rating[index_tmp]
        sim_weight = S[item_tmp, rated_items].toarray()[0]
        ## only keep top 5 closest items
        top_ind = sim_weight.argsort()[-5:][::-1]
        sim_weight_knn = np.zeros(len(sim_weight))
        sim_weight_knn[top_ind] = sim_weight[top_ind]
        if (len(rated_items) == 0) or (max(sim_weight_knn) == 0):
            # if no rated users or no similar users
            index_item_tmp = index_item[item_tmp]
            if len(index_item_tmp) == 0:
                pred[j] = glb_mean
            else:
                pred[j] = train_rating[index_item_tmp].mean()
        else:
            pred[j] = np.sum(sim_weight_knn*rated_ratings) / np.sum(sim_weight_knn)
    return pred