# CUHK [STAT3009](https://www.bendai.org/STAT3009/) Notebook2: Correlation-based Recommender Systems

## Load and pro-processed dataset

In [2]:
import numpy as np
import pandas as pd

dtrain = pd.read_csv('./dataset/train.csv')
dtest = pd.read_csv('./dataset/test.csv')
## save real ratings for test set for evaluation.
test_ratings = np.array(dtest['rating'])
## remove the ratings in the test set to simulate prediction
dtest = dtest.drop(columns='rating')

## convert string to user_id and item_id -> [user_id, item_id, rating]
# pre-process for training data
train_pair = dtrain[['user_id', 'movie_id']].values
train_ratings = dtrain['rating'].values
# pre-process for testing set
test_pair = dtest[['user_id', 'movie_id']].values

n_user, n_item = max(train_pair[:,0].max(), test_pair[:,0].max())+1, max(train_pair[:,1].max(), test_pair[:,1].max())+1

## Implement Correlation-based (user-based) recommender systems
- Inpout: training set.

- Output: return predicted ratings for (user id, item id) user-item pairs in testing set.

- Goal: make prediction for testing set

## Algo 1: Correlation-based (user-based) recommender systems (stored sim-matrix)

In [3]:
pred_algo1 = np.zeros(len(test_ratings))

[ 1  0  0 ...  0  0 -1]


In [11]:
# Step 2(a): compute pairwise cosine similarity between all users
from numpy.linalg import norm

def cossim_user(index_u,index_v,train_pair,train_ratings):
    # index_u = np.where(train_pair[:,0] == u)[0]
    # index_v = np.where(train_pair[:,0] == v)[0]
    item_u = train_pair[index_u][:,1]
    item_v = train_pair[index_v][:,1]
    # find co-rating items by `set`
    item_co = list(set(item_u).intersection(set(item_v)))
    if len(item_co) <= 3:
        # a tuning parameter
        return 0.0
    else:
        # find the co-rating vectors by using `np.isin`
        vec_u, vec_v = train_ratings[index_u], train_ratings[index_v]
        vec_co_u, vec_co_v = vec_u[np.isin(item_u, item_co)], vec_v[np.isin(item_v, item_co)]
        return np.dot(vec_co_u, vec_co_v) / norm(vec_co_u) / norm(vec_co_v)

In [7]:
%%time
index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
index_user = [np.where(train_pair[:,0] == u)[0] for u in range(n_user)]

from scipy.sparse import csr_matrix, lil_matrix
# dense sim_matrix
# S = np.zeros((n_user,n_user))
# sparse matrix 
S = lil_matrix((n_user, n_user))
for u in range(n_user):
    print('predict for user_id %d' %u)
    for v in range(u):
        if (len(index_user[u]) == 0) or (len(index_user[u]) == 0):
            continue
        S[u,v] = cossim_user(index_user[u],index_user[v],train_pair,train_ratings)
S = S + S.T

In [15]:
## Step 3: make prediction if S is stored
glb_mean = train_ratings.mean()

for j in range(len(test_pair)):
    user_tmp, item_tmp = test_pair[j,0], test_pair[j,1]
    index_tmp = index_item[item_tmp]
    rated_users = train_pair[index_tmp][:,0]
    rated_ratings = train_ratings[index_tmp]
    if len(rated_users) == 0:
        # if no rated users
        pred_algo1[j] = glb_mean
    else:
        sim_weight = S[user_tmp, rated_users].toarray()[0]
        # find top 5 rated-users
        # sim_weight_knn = np.zeros(len(sim_weight))
        # top_index = np.argsort(sim_weight)[-5:]
        # sim_weight_knn[top_index] = sim_weight[top_index]
        # sim_weight = S[user_tmp, rated_users]
        # print(sim_weight)
        if max(sim_weight) == 0:
            pred_algo1[j] = glb_mean
        else:
            pred_algo1[j] = np.sum(sim_weight*rated_ratings) / np.sum(sim_weight)

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

## Algo 2: Correlation-based (user-based) recommender systems (without stored sim-matrix)

In [10]:
pred_algo2 = np.zeros(len(test_ratings))

## Step 2(b): use `dict` to store rated users for all items
index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
index_user = [np.where(train_pair[:,0] == u)[0] for u in range(n_user)]
## Updated Step 3: make prediction if S is stored
glb_mean = train_ratings.mean()

for u in range(n_user):
    print('predict for user_id %d' %u)
    index_list_tmp = np.where(test_pair[:,0] == u)[0]
    if len(index_list_tmp) == 0:
        # no record to predict for this user.
        continue
    for record in index_list_tmp:
        user_tmp, item_tmp = test_pair[record,0], test_pair[record,1]
        index_tmp = index_item[item_tmp]
        rated_users = train_pair[index_tmp][:,0]
        rated_ratings = train_ratings[index_tmp]
        if len(rated_users) == 0:
            # if no rated users
            pred_algo2[record] = glb_mean
        else:
            sim_weight = [cossim_user(index_user[user_tmp],index_user[v],train_pair,train_ratings) for v in rated_users]
            # print(sim_weight)
            if max(sim_weight) == 0:
                pred_algo2[record] = glb_mean
            else:
                pred_algo2[record] = np.sum(sim_weight*rated_ratings) / np.sum(sim_weight)

  return np.dot(vec_co_u, vec_co_v) / norm(vec_co_u) / norm(vec_co_v)
  return np.dot(vec_co_u, vec_co_v) / norm(vec_co_u) / norm(vec_co_v)


KeyboardInterrupt: 

## Baseline + correlation

- Pre-processed ratings by Baseline methods.

- Prediction by correlation-based algorithm.

## Evaluation: compute RMSE for baseline methods 

In [12]:
## RMSE for ALGO1
rmse_crs = np.sqrt(np.mean((pred_algo1 - test_ratings)**2))
print('RMSE for correlation-base RS (ALGO1): %.3f' %rmse_crs)

RMSE for correlation-base RS (ALGO1): 3.769


In [18]:
## RMSE for ALGO2
rmse_crs_new = np.sqrt(np.mean((pred_algo2 - test_ratings)**2))
print('RMSE for correlation-base RS (ALGO2): %.3f' %rmse_crs_new)

RMSE for correlation-base RS (ALGO2): 2.588
