# CUHK [STAT3009](https://www.bendai.org/STAT3009/) Notebook4: ALS for Latent Factor Models II

## ALS for Latent factor model

- Parameters:
  - \#Users: `n`
  - \#Items: `m`
  - latent factors for users: `P` 
  - latent factors for items: `Q`
  - tuning parameter: `lam`
  - \#Latent factors: `K`



In [14]:
def rmse(true, pred):
    """Return the root-mean-squared error between `true` and `pred`.

    Both arguments may be any array-like (lists, tuples, numpy arrays); they
    are converted to float arrays first, so plain Python lists are accepted
    and unsigned-integer inputs cannot wrap around in the subtraction.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return np.sqrt(np.mean((pred - true) ** 2))

class LFM(object):
    """Regularized latent factor model fitted by alternating least squares (ALS).

    The rating of user ``u`` for item ``i`` is predicted as the inner product
    of row ``u`` of ``P`` and row ``i`` of ``Q``.

    Parameters
    ----------
    n_user : int
        Number of users; user ids are used as row indices into ``P``.
    n_item : int
        Number of items; item ids are used as row indices into ``Q``.
    lam : float
        Ridge penalty. In each update it is multiplied by the number of
        training observations before being added to the Gram matrix.
    K : int
        Number of latent factors.
    iterNum : int
        Maximum number of ALS sweeps (one item update plus one user update).
    tol : float
        Stop early once the relative change of the training RMSE between
        the start and the end of a sweep falls below this value.
    """

    def __init__(self, n_user, n_item, lam=.001, K=10, iterNum=10, tol=1e-4):
        # random Gaussian starting point for both factor matrices
        self.P = np.random.randn(n_user, K)
        self.Q = np.random.randn(n_item, K)
        self.n_user = n_user
        self.n_item = n_item
        self.lam = lam
        self.K = K
        self.iterNum = iterNum
        self.tol = tol

    @staticmethod
    def _als_half_step(target, fixed, index_sets, fixed_ids, ratings, ridge):
        """Overwrite each row of `target` with its ridge-regression solution.

        `index_sets[r]` holds the observation indices belonging to row `r` of
        `target`; `fixed_ids` maps every observation to its row in `fixed`.
        Rows without any observation are set to zero.
        """
        for row_id, obs_idx in enumerate(index_sets):
            if len(obs_idx) == 0:
                target[row_id, :] = 0.
                continue
            feats = fixed[fixed_ids[obs_idx]]
            # Gram matrix = sum of outer products of the fixed factors
            gram = np.dot(feats.T, feats) + ridge
            rhs = np.dot(feats.T, ratings[obs_idx])
            # solve the normal equations directly instead of forming an inverse
            target[row_id, :] = np.linalg.solve(gram, rhs)

    def fit(self, train_pair, train_rating):
        """Fit ``P`` and ``Q`` by ALS and print the training RMSE per sweep.

        Parameters
        ----------
        train_pair : array-like, shape (n_obs, 2)
            Rows of ``(user_id, item_id)``.
        train_rating : array-like, shape (n_obs,)
            Observed rating for each row of `train_pair`.
        """
        train_pair = np.asarray(train_pair)
        train_rating = np.asarray(train_rating, dtype=float)
        n_user, n_item, n_obs = self.n_user, self.n_item, len(train_pair)
        K, iterNum, lam, tol = self.K, self.iterNum, self.lam, self.tol
        user_ids, item_ids = train_pair[:, 0], train_pair[:, 1]
        ## store user/item index set
        self.index_item = [np.where(item_ids == i)[0] for i in range(n_item)]
        self.index_user = [np.where(user_ids == u)[0] for u in range(n_user)]
        # the penalty matrix is identical for every row update, so build it once
        ridge = lam * n_obs * np.identity(K)
        # floor for the denominator of the relative change (avoids 0/0 -> NaN)
        eps = np.finfo(float).eps
        print('Fitting Reg-LFM: K: %d, lam: %.5f' %(K, lam))
        for i in range(iterNum):
            score_old = self.rmse(test_pair=train_pair, test_rating=train_rating)
            ## item update (users held fixed)
            self._als_half_step(self.Q, self.P, self.index_item,
                                user_ids, train_rating, ridge)
            ## user update (items held fixed)
            self._als_half_step(self.P, self.Q, self.index_user,
                                item_ids, train_rating, ridge)
            # compute the new rmse score
            score_new = self.rmse(test_pair=train_pair, test_rating=train_rating)
            diff = abs(score_new - score_old) / max(score_old, eps)
            print("Reg-LFM: ite: %d; diff: %.3f RMSE: %.3f" %(i, diff, score_new))
            if diff < tol:
                break

    def predict(self, test_pair):
        """Return the predicted rating for every ``(user_id, item_id)`` row.

        An empty `test_pair` yields an empty array.
        """
        pairs = np.asarray(test_pair)
        if pairs.size == 0:
            return np.array([])
        users, items = pairs[:, 0], pairs[:, 1]
        # row-wise inner products P[u] . Q[i]
        return np.sum(self.P[users] * self.Q[items], axis=1)

    def rmse(self, test_pair, test_rating):
        """Return the RMSE of the current model on the given pairs/ratings."""
        pred_rating = self.predict(test_pair=test_pair)
        test_rating = np.asarray(test_rating, dtype=float)
        return np.sqrt(np.mean((pred_rating - test_rating) ** 2))

## Load and pre-process the dataset

In [15]:
import numpy as np
import pandas as pd

# read the raw train / test splits
dtrain = pd.read_csv('./dataset/train.csv')
dtest = pd.read_csv('./dataset/test.csv')

# training data: (user_id, movie_id) pairs and the matching ratings
train_rating = dtrain['rating'].values
train_pair = dtrain[['user_id', 'movie_id']].values

# test data: keep the true ratings aside for evaluation only, then drop the
# column from the frame so that prediction cannot see it
test_rating = np.array(dtest['rating'])
dtest = dtest.drop(columns='rating')
test_pair = dtest[['user_id', 'movie_id']].values

# sizes: one more than the largest user / movie id seen in either split
n_user, n_item = np.maximum(train_pair.max(axis=0), test_pair.max(axis=0)) + 1

## Fit and predict based on `LFM`

In [48]:
# fit a 3-factor LFM on the raw training ratings
shiing = LFM(n_user=n_user, n_item=n_item, lam=.0001, K=3)
shiing.fit(train_pair, train_rating)

Fitting Reg-LFM: K: 3, lam: 0.00010
Reg-LFM: ite: 0; diff: 0.225 RMSE: 3.206
Reg-LFM: ite: 1; diff: 0.673 RMSE: 1.048
Reg-LFM: ite: 2; diff: 0.088 RMSE: 0.956
Reg-LFM: ite: 3; diff: 0.023 RMSE: 0.934
Reg-LFM: ite: 4; diff: 0.011 RMSE: 0.924
Reg-LFM: ite: 5; diff: 0.006 RMSE: 0.918
Reg-LFM: ite: 6; diff: 0.004 RMSE: 0.915
Reg-LFM: ite: 7; diff: 0.002 RMSE: 0.913
Reg-LFM: ite: 8; diff: 0.002 RMSE: 0.912
Reg-LFM: ite: 9; diff: 0.001 RMSE: 0.911


In [49]:
# prediction on the held-out (user, item) pairs
pred_rating = shiing.predict(test_pair=test_pair)

In [50]:
# evaluation: test RMSE of the fitted model
# (the standalone helper gives the same number: rmse(test_rating, pred_rating))
shiing.rmse(test_pair=test_pair, test_rating=test_rating)

1.166430089014828

In [52]:
# baseline methods
class glb_mean(object):
    """Baseline that predicts the same value, the mean training rating, everywhere."""

    def __init__(self):
        # value returned for every pair until `fit` is called
        self.glb_mean = 0

    def fit(self, train_ratings):
        """Store the average of `train_ratings` as the global mean."""
        self.glb_mean = np.mean(train_ratings)

    def predict(self, test_pair):
        """Return one copy of the global mean per row of `test_pair`."""
        n_pred = len(test_pair)
        return np.full(n_pred, self.glb_mean, dtype=float)

class user_mean(object):
    """Baseline that predicts a rating by the mean training rating of its user.

    Users in ``range(n_user)`` without any training rating fall back to the
    mean of all training ratings.

    Parameters
    ----------
    n_user : int
        Number of users; user ids are used as indices into ``self.user_mean``.
    """

    def __init__(self, n_user):
        self.n_user = n_user
        self.glb_mean = 0.
        self.user_mean = np.zeros(n_user)

    def fit(self, train_pair, train_ratings):
        """Compute the global mean and one mean rating per user.

        Parameters
        ----------
        train_pair : array-like, shape (n_obs, 2)
            Rows of ``(user_id, item_id)``; only the first column is used.
        train_ratings : array-like, shape (n_obs,)
            Rating for each row of `train_pair`.
        """
        pairs = np.asarray(train_pair)
        train_ratings = np.asarray(train_ratings, dtype=float)
        self.glb_mean = train_ratings.mean()
        user_ids = pairs[:, 0] if pairs.size else np.zeros(0, dtype=int)
        # ids outside range(n_user) contribute to no per-user mean
        in_range = (user_ids >= 0) & (user_ids < self.n_user)
        user_ids = user_ids[in_range].astype(np.intp)
        # one pass over the data: per-user rating totals and counts
        totals = np.bincount(user_ids, weights=train_ratings[in_range],
                             minlength=self.n_user)
        counts = np.bincount(user_ids, minlength=self.n_user)
        rated = counts > 0
        self.user_mean[:] = self.glb_mean
        self.user_mean[rated] = totals[rated] / counts[rated]

    def predict(self, test_pair):
        """Return the stored user mean for the user of every row of `test_pair`.

        An empty `test_pair` yields an empty array.
        """
        pairs = np.asarray(test_pair)
        if pairs.size == 0:
            return np.zeros(0)
        return self.user_mean[pairs[:, 0]]

class item_mean(object):
    """Baseline that predicts a rating by the mean training rating of its item.

    Items in ``range(n_item)`` without any training rating fall back to the
    mean of all training ratings.

    Parameters
    ----------
    n_item : int
        Number of items; item ids are used as indices into ``self.item_mean``.
    """

    def __init__(self, n_item):
        self.n_item = n_item
        self.glb_mean = 0.
        self.item_mean = np.zeros(n_item)

    def fit(self, train_pair, train_ratings):
        """Compute the global mean and one mean rating per item.

        Parameters
        ----------
        train_pair : array-like, shape (n_obs, 2)
            Rows of ``(user_id, item_id)``; only the second column is used.
        train_ratings : array-like, shape (n_obs,)
            Rating for each row of `train_pair`.
        """
        pairs = np.asarray(train_pair)
        train_ratings = np.asarray(train_ratings, dtype=float)
        self.glb_mean = train_ratings.mean()
        item_ids = pairs[:, 1] if pairs.size else np.zeros(0, dtype=int)
        # ids outside range(n_item) contribute to no per-item mean
        in_range = (item_ids >= 0) & (item_ids < self.n_item)
        item_ids = item_ids[in_range].astype(np.intp)
        # one pass over the data: per-item rating totals and counts
        totals = np.bincount(item_ids, weights=train_ratings[in_range],
                             minlength=self.n_item)
        counts = np.bincount(item_ids, minlength=self.n_item)
        rated = counts > 0
        self.item_mean[:] = self.glb_mean
        self.item_mean[rated] = totals[rated] / counts[rated]

    def predict(self, test_pair):
        """Return the stored item mean for the item of every row of `test_pair`.

        An empty `test_pair` yields an empty array.
        """
        pairs = np.asarray(test_pair)
        if pairs.size == 0:
            return np.zeros(0)
        return self.item_mean[pairs[:, 1]]

In [65]:
## Baseline + LFM: each stage is fitted on the residuals left by the previous one
# stage 1: global mean
glb_ave = glb_mean()
glb_ave.fit(train_rating)
train_rating_cm = train_rating - glb_ave.predict(train_pair)

# stage 2: user mean of the globally-centered ratings
user_ave = user_mean(n_user=n_user)
user_ave.fit(train_pair=train_pair, train_ratings=train_rating_cm)
train_rating_res = train_rating_cm - user_ave.predict(train_pair)

# stage 3: latent factor model on the remaining residual ratings
shiing = LFM(n_user, n_item, K=3, lam=.0001)
shiing.fit(train_pair=train_pair, train_rating=train_rating_res)

# final prediction = global mean + user mean + LFM residual prediction
pred = glb_ave.predict(test_pair) + user_ave.predict(test_pair) + shiing.predict(test_pair)

print('RMSE for glb + user_mean + LFM: %.3f' %rmse(test_rating, pred) )

Fitting Reg-LFM: K: 3, lam: 0.00010
Reg-LFM: ite: 0; diff: 0.525 RMSE: 0.940
Reg-LFM: ite: 1; diff: 0.047 RMSE: 0.896
Reg-LFM: ite: 2; diff: 0.036 RMSE: 0.863
Reg-LFM: ite: 3; diff: 0.020 RMSE: 0.845
Reg-LFM: ite: 4; diff: 0.010 RMSE: 0.837
Reg-LFM: ite: 5; diff: 0.006 RMSE: 0.832
Reg-LFM: ite: 6; diff: 0.005 RMSE: 0.828
Reg-LFM: ite: 7; diff: 0.004 RMSE: 0.825
Reg-LFM: ite: 8; diff: 0.003 RMSE: 0.822
Reg-LFM: ite: 9; diff: 0.002 RMSE: 0.821
RMSE for glb + user_mean + LFM: 0.978
