# CUHK [STAT3009](https://www.bendai.org/STAT3009/) Notebook4: ALS for Latent Factor Models II

## ALS for Latent factor model

- Parameters:
  - \#Users: `n`
  - \#Items: `m`
  - latent factors for users: `P` 
  - latent factors for items: `Q`
  - tuning parameter: `lam`
  - \#Latent factors: `K`



In [3]:
class LFM(object):
    """Latent factor model fitted by alternating least squares (ALS).

    The rating of ``(user, item)`` is predicted by the inner product of row
    ``user`` of ``P`` and row ``item`` of ``Q``. ``fit`` alternates between
    solving a ridge-regularized least-squares problem for every item factor
    (users fixed) and for every user factor (items fixed).

    Parameters
    ----------
    n_user : int
        Number of users; user ids are expected to be ``0 .. n_user - 1``.
    n_item : int
        Number of items; item ids are expected to be ``0 .. n_item - 1``.
    lam : float
        Tuning parameter; ``lam * I`` is added to every K x K linear system.
    K : int
        Number of latent factors.
    iterNum : int
        Maximum number of ALS sweeps (one sweep = item update + user update).
    tol : float
        Stop early once the scaled change in ``P`` and ``Q`` drops below it.
    """

    def __init__(self, n_user, n_item, lam=1., K=5, iterNum=10, tol=1e-4):
        # ``randn`` takes the dimensions as separate positional arguments;
        # passing a single tuple raises a TypeError.
        self.P = np.random.randn(n_user, K)
        self.Q = np.random.randn(n_item, K)
        self.index_item = []
        self.index_user = []
        self.n_user = n_user
        self.n_item = n_item
        self.lam = lam
        self.K = K
        self.iterNum = iterNum
        self.tol = tol

    def _update_rows(self, target, fixed, index_sets, partner_ids, ratings):
        """Solve one ridge system per row of ``target``, writing in place.

        ``index_sets[row]`` holds the positions of the training records that
        belong to ``row``; ``partner_ids`` maps a record position to the row
        of ``fixed`` it is paired with. Rows without any record are skipped
        and therefore keep their current value.
        """
        penalty = self.lam * np.identity(self.K)
        for row, record_idx in enumerate(index_sets):
            if len(record_idx) == 0:
                continue
            factors = fixed[partner_ids[record_idx]]
            gram = factors.T @ factors + penalty
            rhs = factors.T @ ratings[record_idx]
            # ``solve`` avoids forming the explicit inverse.
            target[row, :] = np.linalg.solve(gram, rhs)

    def fit(self, train_pair, train_rating):
        """Fit ``P`` and ``Q`` to the observed ratings by ALS.

        Parameters
        ----------
        train_pair : array-like of shape (n_obs, 2)
            Rows of ``[user_id, item_id]`` (0-based).
        train_rating : array-like of shape (n_obs,)
            Rating observed for each pair.

        Prints the change in the factors and the training RMSE after every
        sweep, and stops early when the change falls below ``self.tol``.
        """
        train_pair = np.asarray(train_pair, dtype=int).reshape(-1, 2)
        train_rating = np.asarray(train_rating, dtype=float)
        user_ids, item_ids = train_pair[:, 0], train_pair[:, 1]
        n_user, n_item = self.n_user, self.n_item
        K, lam, tol = self.K, self.lam, self.tol

        ## store user/item index set
        self.index_item = [np.where(item_ids == i)[0] for i in range(n_item)]
        self.index_user = [np.where(user_ids == u)[0] for u in range(n_user)]

        for _ in range(self.iterNum):
            Q_old, P_old = self.Q.copy(), self.P.copy()

            ## item update (users fixed)
            self._update_rows(self.Q, self.P, self.index_item,
                              user_ids, train_rating)
            diff1 = np.linalg.norm(self.Q - Q_old) / K / max(n_item, 1)

            ## user update (items fixed, using the freshly updated Q)
            self._update_rows(self.P, self.Q, self.index_user,
                              item_ids, train_rating)
            diff2 = np.linalg.norm(self.P - P_old) / K / max(n_user, 1)

            diff = diff1 + diff2
            RMSE = self.rmse(train_pair, train_rating)
            print("\n Regularized-SVD - K: %s lam: %s diff: %s RMSE: %s" %(K, lam, diff, RMSE))
            if diff < tol:
                break

    def predict(self, test_pair):
        """Return the predicted rating ``P[u] . Q[i]`` for every ``[u, i]`` row."""
        test_pair = np.asarray(test_pair, dtype=int).reshape(-1, 2)
        return np.sum(self.P[test_pair[:, 0]] * self.Q[test_pair[:, 1]], axis=1)

    def rmse(self, test_pair, test_rating):
        """Return the root mean squared error of ``predict`` on the given pairs.

        Returns ``nan`` when no pairs are supplied.
        """
        test_rating = np.asarray(test_rating, dtype=float)
        if test_rating.size == 0:
            return float('nan')
        residual = test_rating - self.predict(test_pair)
        return np.sqrt(np.mean(residual ** 2))

## Load and pre-process the dataset

In [5]:
import numpy as np
import pandas as pd

# Read the train / test splits from disk.
dtrain = pd.read_csv('./dataset/train.csv')
dtest = pd.read_csv('./dataset/test.csv')

# Keep the true test ratings aside for evaluation, then hide them from the
# test frame so that prediction is simulated on unlabeled pairs.
test_ratings = dtest['rating'].to_numpy(copy=True)
dtest = dtest.drop(columns=['rating'])

# Training data as arrays: (user_id, movie_id) pairs and their ratings.
train_pair = dtrain[['user_id', 'movie_id']].to_numpy()
train_ratings = dtrain['rating'].to_numpy()

# Test data: only the (user_id, movie_id) pairs remain.
test_pair = dtest[['user_id', 'movie_id']].to_numpy()

# Ids are used as 0-based indices, so the counts are the largest id seen in
# either split plus one.
n_user = max(train_pair[:, 0].max(), test_pair[:, 0].max()) + 1
n_item = max(train_pair[:, 1].max(), test_pair[:, 1].max()) + 1

## Define and train the predictive models based on `class`

In [6]:
## Baseline: predict every rating by the mean rating of its user
user_ave = user_mean(n_user=n_user)
user_ave.fit(
    train_pair=train_pair,
    train_ratings=train_ratings,
)
pred_user = user_ave.predict(test_pair)
# Report the test RMSE of the user-mean baseline.
print('RMSE for user_mean: %.3f' % (rmse(test_ratings, pred_user),))

RMSE for user_mean: 1.017


In [8]:
## Baseline: predict every rating by the mean rating of its item
item_ave = item_mean(n_item=n_item)
item_ave.fit(
    train_pair=train_pair,
    train_ratings=train_ratings,
)
pred_item = item_ave.predict(test_pair)
# Report the test RMSE of the item-mean baseline.
print('RMSE for item_mean: %.3f' % (rmse(test_ratings, pred_item),))

RMSE for item_mean: 1.052


In [9]:
## Correlation-based RS (user)
# Fit the user-based correlation recommender and score it on the test pairs.
cor_user = cor_rs_user(n_user=n_user)
cor_user.fit(train_pair=train_pair, train_ratings=train_ratings)
pred_cor_user = cor_user.predict(test_pair, train_ratings)
# The label previously read "item_mean" (copy-paste slip); this score belongs
# to the user-based correlation model.
print('RMSE for cor_rs(user): %.3f' % rmse(test_ratings, pred_cor_user))

RMSE for item_mean: 1.073


In [10]:
## Correlation-based RS (item)
# Fit the item-based correlation recommender and score it on the test pairs.
cor_item = cor_rs_item(n_item=n_item)
cor_item.fit(train_pair=train_pair, train_ratings=train_ratings)
pred_cor_item = cor_item.predict(test_pair, train_ratings)
# The label previously read "item_mean" (copy-paste slip); this score belongs
# to the item-based correlation model.
print('RMSE for cor_rs(item): %.3f' % rmse(test_ratings, pred_cor_item))

RMSE for item_mean: 1.074


In [11]:
## Baseline + Correlation-based RS
## Stacks three models: each stage is fitted on the ratings left over after
## subtracting the previous stage's predictions, and the test prediction is the
## sum of the stages' test predictions.
## NOTE(review): glb_mean, user_mean, cor_rs_user and rmse are defined outside
## this cell; their exact behavior is assumed from their names -- confirm.
# glb mean
glb_ave = glb_mean()
glb_ave.fit(train_ratings)
# start the stacked test prediction with the first stage's output
pred = glb_ave.predict(test_pair)
# user_mean
# training ratings minus the first stage's predictions on the training pairs
train_ratings_cm = train_ratings - glb_ave.predict(train_pair)
# rebinds `user_ave` from the earlier cell; this one is fitted on train_ratings_cm
user_ave = user_mean(n_user=n_user)
user_ave.fit(train_pair=train_pair, train_ratings=train_ratings_cm)
# what remains after removing the second stage's predictions as well
train_ratings_res = train_ratings_cm - user_ave.predict(train_pair)
pred = pred + user_ave.predict(test_pair)
# fit correlation-based RS by residual ratings 
# rebinds `cor_user` from the earlier cell; this one is fitted on train_ratings_res
cor_user = cor_rs_user(n_user=n_user)
cor_user.fit(train_pair=train_pair, train_ratings=train_ratings_res)
# `top=1000` is passed through to cor_rs_user.predict -- meaning not visible here
pred = pred + cor_user.predict(test_pair, train_ratings_res, top=1000)

# test RMSE of the summed (stacked) prediction
print('RMSE for glb + user_mean + cor_rs(user): %.3f' %rmse(test_ratings, pred) )

RMSE for item_mean: 1.005
