# CUHK [STAT3009](https://www.bendai.org/STAT3009/) Notebook5: Latent Factor Models III: SGD

## We still focus on the `LFM` class, yet we will change the ALS in `fit` as the gradient descent

In [1]:
def rmse(true, pred):
    return np.sqrt(np.mean((pred - true)**2))

class LFM(object):

    def __init__(self, n_user, n_item, lam=.001, K=10, iterNum=1000, tol=1e-4):
        self.P = np.random.randn(n_user, K)
        self.Q = np.random.randn(n_item, K)
        # self.index_item = []
        # self.index_user = []
        self.n_user = n_user
        self.n_item = n_item
        self.lam = lam
        self.K = K
        self.iterNum = iterNum
        self.tol = tol

    def fit(self, train_pair, train_rating, learning_rate=0.0001):
        diff, tol = 1., self.tol
        n_user, n_item, n_obs = self.n_user, self.n_item, len(train_pair)
        K, iterNum, lam = self.K, self.iterNum, self.lam
        ## store user/item index set
        self.index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
        self.index_user = [np.where(train_pair[:,0] == u)[0] for u in range(n_user)]
        print('Fitting Reg-LFM: K: %d, lam: %.5f' %(K, lam))
        for i in range(iterNum):
            ## item update
            score_old = self.rmse(test_pair=train_pair, test_rating=train_rating)
            for item_id in range(n_item):
                index_item_tmp = self.index_item[item_id]
                if len(index_item_tmp) == 0:
                    self.Q[item_id,:] = 0.
                    continue
                sum_pu, sum_matrix = np.zeros((K)), np.zeros((K, K))
                for record_ind in index_item_tmp:
                    ## double-check
                    if item_id != train_pair[record_ind][1]:
                        raise ValueError('the item_id is wrong in updating Q!')
                    ## Gradient
                    user_id, rating_tmp = train_pair[record_ind][0], train_rating[record_ind]
                    # r_{ui} - p_u^T q_i
                    err_tmp = rating_tmp - np.dot(self.P[user_id], self.Q[item_id])
                    # (r_{ui} - p_u^T q_i) p_u
                    sum_pu = sum_pu + err_tmp * self.P[user_id,:] / n_obs
                self.Q[item_id,:] = self.Q[item_id,:] + 2*learning_rate * sum_pu - 2*learning_rate*lam*self.Q[item_id,:]
            
            for user_id in range(n_user):
                index_user_tmp = self.index_user[user_id]
                if len(index_user_tmp) == 0:
                    self.P[user_id,:] = 0.
                    continue
                sum_qi, sum_matrix = np.zeros((K)), np.zeros((K, K))
                for record_ind in index_user_tmp:
                    ## double-check
                    if user_id != train_pair[record_ind][0]:
                        raise ValueError('the user_id is waring in updating P!')
                    item_id, rating_tmp = train_pair[record_ind][1], train_rating[record_ind]
                    # r_{ui} - p_u^T q_i
                    err_tmp = rating_tmp - np.dot(self.P[user_id], self.Q[item_id])
                    # (r_{ui} - p_u^T q_i) q_i
                    sum_qi = sum_qi + err_tmp * self.Q[item_id,:] / n_obs
                self.P[user_id,:] = self.P[user_id,:] + 2*learning_rate*sum_qi - 2*learning_rate*lam*self.P[user_id,:]
            # compute the new rmse score
            score_new = self.rmse(test_pair=train_pair, test_rating=train_rating)
            diff = - score_new + score_old
            print("Reg-LFM: ite: %d; diff: %.3f RMSE: %.3f" %(i, diff, score_new))
            if (diff < tol):
                break

    def predict(self, test_pair):
        # predict ratings for user-item pairs
        pred_rating = [np.dot(self.P[line[0]], self.Q[line[1]]) for line in test_pair]
        return np.array(pred_rating)
    
    def rmse(self, test_pair, test_rating):
        # report the rmse for the fitted `LFM`
        pred_rating = self.predict(test_pair=test_pair)
        return np.sqrt( np.mean( (pred_rating - test_rating)**2) )

## Load and pro-processed dataset

In [2]:
import numpy as np
import pandas as pd

dtrain = pd.read_csv('./dataset/train.csv')
dtest = pd.read_csv('./dataset/test.csv')
## save real ratings for test set for evaluation.
test_rating = np.array(dtest['rating'])
## remove the ratings in the test set to simulate prediction
dtest = dtest.drop(columns='rating')

## convert string to user_id and item_id -> [user_id, item_id, rating]
# pre-process for training data
train_pair = dtrain[['user_id', 'movie_id']].values
train_rating = dtrain['rating'].values
# pre-process for testing set
test_pair = dtest[['user_id', 'movie_id']].values

n_user, n_item = max(train_pair[:,0].max(), test_pair[:,0].max())+1, max(train_pair[:,1].max(), test_pair[:,1].max())+1

## Fit and predict based on `LFM`

## Note that `learning_rate` is a tuning parameter. In practice, we will try a large one and gradually reduce to a small number until the termination condition.

In [8]:
# baseline methods
class glb_mean(object):
    def __init__(self):
        self.glb_mean = 0
    
    def fit(self, train_ratings):
        self.glb_mean = np.mean(train_ratings)
    
    def predict(self, test_pair):
        pred = np.ones(len(test_pair))
        pred = pred*self.glb_mean
        return pred

class user_mean(object):
    def __init__(self, n_user):
        self.n_user = n_user
        self.glb_mean = 0.
        self.user_mean = np.zeros(n_user)
    
    def fit(self, train_pair, train_ratings):
        self.glb_mean = train_ratings.mean()
        for u in range(self.n_user):
            ind_train = np.where(train_pair[:,0] == u)[0]
            if len(ind_train) == 0:
                self.user_mean[u] = self.glb_mean
            else:
                self.user_mean[u] = train_ratings[ind_train].mean()
    
    def predict(self, test_pair):
        pred = np.ones(len(test_pair))*self.glb_mean
        j = 0
        for row in test_pair:
            user_tmp, item_tmp = row[0], row[1]
            pred[j] = self.user_mean[user_tmp]
            j = j + 1
        return pred

class item_mean(object):
    def __init__(self, n_item):
        self.n_item = n_item
        self.glb_mean = 0.
        self.item_mean = np.zeros(n_item)
    
    def fit(self, train_pair, train_ratings):
        self.glb_mean = train_ratings.mean()
        for i in range(self.n_item):
            ind_train = np.where(train_pair[:,1] == i)[0]
            if len(ind_train) == 0:
                self.item_mean[i] = self.glb_mean
            else:
                self.item_mean[i] = train_ratings[ind_train].mean()
    
    def predict(self, test_pair):
        pred = np.ones(len(test_pair))*self.glb_mean
        j = 0
        for row in test_pair:
            user_tmp, item_tmp = row[0], row[1]
            pred[j] = self.item_mean[item_tmp]
            j = j + 1
        return pred

In [9]:
## Baseline + LFM
# glb mean
glb_ave = glb_mean()
glb_ave.fit(train_rating)
pred = glb_ave.predict(test_pair)
# user_mean
train_rating_cm = train_rating - glb_ave.predict(train_pair)
user_ave = user_mean(n_user=n_user)
user_ave.fit(train_pair=train_pair, train_ratings=train_rating_cm)
train_rating_res = train_rating_cm - user_ave.predict(train_pair)
pred = pred + user_ave.predict(test_pair)
# fit correlation-based RS by residual ratings 
shiing = LFM(n_user, n_item, K=3, lam=.0001)
shiing.fit(train_pair=train_pair, train_rating=train_rating_res, learning_rate=100.)

Fitting Reg-LFM: K: 3, lam: 0.00010
Reg-LFM: ite: 0; diff: 0.673 RMSE: 1.325
Reg-LFM: ite: 1; diff: 0.149 RMSE: 1.175
Reg-LFM: ite: 2; diff: 0.067 RMSE: 1.108
Reg-LFM: ite: 3; diff: 0.040 RMSE: 1.068
Reg-LFM: ite: 4; diff: 0.027 RMSE: 1.041
Reg-LFM: ite: 5; diff: 0.019 RMSE: 1.022
Reg-LFM: ite: 6; diff: 0.014 RMSE: 1.008
Reg-LFM: ite: 7; diff: 0.011 RMSE: 0.998
Reg-LFM: ite: 8; diff: 0.008 RMSE: 0.989
Reg-LFM: ite: 9; diff: 0.007 RMSE: 0.983
Reg-LFM: ite: 10; diff: 0.005 RMSE: 0.977
Reg-LFM: ite: 11; diff: 0.004 RMSE: 0.973
Reg-LFM: ite: 12; diff: 0.004 RMSE: 0.969
Reg-LFM: ite: 13; diff: 0.003 RMSE: 0.966
Reg-LFM: ite: 14; diff: 0.003 RMSE: 0.963
Reg-LFM: ite: 15; diff: 0.002 RMSE: 0.961
Reg-LFM: ite: 16; diff: 0.002 RMSE: 0.959
Reg-LFM: ite: 17; diff: 0.002 RMSE: 0.957
Reg-LFM: ite: 18; diff: 0.002 RMSE: 0.955
Reg-LFM: ite: 19; diff: 0.001 RMSE: 0.954
Reg-LFM: ite: 20; diff: 0.001 RMSE: 0.953
Reg-LFM: ite: 21; diff: 0.001 RMSE: 0.952
Reg-LFM: ite: 22; diff: 0.001 RMSE: 0.951
Reg-LFM:

## Note that in `ite: 7`, `diff` = - 0.124, the loss function is getting worse, which means that `learning_rate` = 100 is too large now.

In [11]:
shiing.fit(train_pair=train_pair, train_rating=train_rating_res, learning_rate=50.)

Fitting Reg-LFM: K: 3, lam: 0.00010
Reg-LFM: ite: 0; diff: 0.000 RMSE: 0.823


In [12]:
pred = pred + shiing.predict(test_pair)

print('RMSE for glb + user_mean + LFM: %.3f' %rmse(test_rating, pred) )

RMSE for glb + user_mean + LFM: 0.972


## Train the model with stochastic gradient descent (**SGD**)?

In [15]:
class LFM(object):

    def __init__(self, n_user, n_item, lam=.001, K=10, iterNum=1000, tol=1e-4):
        self.P = np.random.randn(n_user, K)
        self.Q = np.random.randn(n_item, K)
        # self.index_item = []
        # self.index_user = []
        self.n_user = n_user
        self.n_item = n_item
        self.lam = lam
        self.K = K
        self.iterNum = iterNum
        self.tol = tol

    def fit(self, train_pair, train_rating, learning_rate=0.0001):
        diff, tol = 1., self.tol
        n_user, n_item, n_obs = self.n_user, self.n_item, len(train_pair)
        K, iterNum, lam = self.K, self.iterNum, self.lam
        ## store user/item index set
        self.index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
        self.index_user = [np.where(train_pair[:,0] == u)[0] for u in range(n_user)]
        print('Fitting Reg-LFM: K: %d, lam: %.5f' %(K, lam))
        for i in range(iterNum):
            ## item update
            score_old = self.rmse(test_pair=train_pair, test_rating=train_rating)
            for j in range(n_obs):
                user_id, item_id, rating_tmp = train_pair[j,0], train_pair[j,1], train_rating[j]
                err_tmp = rating_tmp - np.dot(self.P[user_id,:], self.Q[item_id,:])
                self.Q[item_id,:] = self.Q[item_id,:] + 2*learning_rate*err_tmp*self.P[user_id,:] - 2*learning_rate*lam*self.Q[item_id,:]
                err_tmp = rating_tmp - np.dot(self.P[user_id,:], self.Q[item_id,:])
                self.P[user_id,:] = self.P[user_id,:] + 2*learning_rate*self.Q[item_id,:] - 2*learning_rate*lam*self.P[user_id,:]
            # compute the new rmse score
            score_new = self.rmse(test_pair=train_pair, test_rating=train_rating)
            diff = - score_new + score_old
            print("Reg-LFM: ite: %d; diff: %.3f RMSE: %.3f" %(i, diff, score_new))
            if (diff < tol):
                break

    def predict(self, test_pair):
        # predict ratings for user-item pairs
        pred_rating = [np.dot(self.P[line[0]], self.Q[line[1]]) for line in test_pair]
        return np.array(pred_rating)
    
    def rmse(self, test_pair, test_rating):
        # report the rmse for the fitted `LFM`
        pred_rating = self.predict(test_pair=test_pair)
        return np.sqrt( np.mean( (pred_rating - test_rating)**2) )

In [24]:
## Baseline + LFM
# glb mean
glb_ave = glb_mean()
glb_ave.fit(train_rating)
pred = glb_ave.predict(test_pair)
# user_mean
train_rating_cm = train_rating - glb_ave.predict(train_pair)
user_ave = user_mean(n_user=n_user)
user_ave.fit(train_pair=train_pair, train_ratings=train_rating_cm)
train_rating_res = train_rating_cm - user_ave.predict(train_pair)
pred = pred + user_ave.predict(test_pair)
# fit correlation-based RS by residual ratings 
shiing = LFM(n_user, n_item, K=3, lam=.0001)
shiing.fit(train_pair=train_pair, train_rating=train_rating_res, learning_rate=.01)

Fitting Reg-LFM: K: 3, lam: 0.00010
Reg-LFM: ite: 0; diff: 0.731 RMSE: 1.217
Reg-LFM: ite: 1; diff: 0.109 RMSE: 1.108
Reg-LFM: ite: 2; diff: 0.051 RMSE: 1.057
Reg-LFM: ite: 3; diff: 0.030 RMSE: 1.027
Reg-LFM: ite: 4; diff: 0.020 RMSE: 1.007
Reg-LFM: ite: 5; diff: 0.014 RMSE: 0.993
Reg-LFM: ite: 6; diff: 0.010 RMSE: 0.983
Reg-LFM: ite: 7; diff: 0.008 RMSE: 0.975
Reg-LFM: ite: 8; diff: 0.006 RMSE: 0.969
Reg-LFM: ite: 9; diff: 0.005 RMSE: 0.964
Reg-LFM: ite: 10; diff: 0.005 RMSE: 0.959
Reg-LFM: ite: 11; diff: 0.004 RMSE: 0.955
Reg-LFM: ite: 12; diff: 0.003 RMSE: 0.952
Reg-LFM: ite: 13; diff: 0.003 RMSE: 0.949
Reg-LFM: ite: 14; diff: 0.002 RMSE: 0.946
Reg-LFM: ite: 15; diff: 0.002 RMSE: 0.944
Reg-LFM: ite: 16; diff: 0.002 RMSE: 0.943
Reg-LFM: ite: 17; diff: 0.001 RMSE: 0.941
Reg-LFM: ite: 18; diff: 0.001 RMSE: 0.940
Reg-LFM: ite: 19; diff: 0.001 RMSE: 0.939
Reg-LFM: ite: 20; diff: 0.001 RMSE: 0.939
Reg-LFM: ite: 21; diff: 0.001 RMSE: 0.938
Reg-LFM: ite: 22; diff: 0.000 RMSE: 0.938
Reg-LFM:

In [25]:
shiing.fit(train_pair=train_pair, train_rating=train_rating_res, learning_rate=.001)

Fitting Reg-LFM: K: 3, lam: 0.00010
Reg-LFM: ite: 0; diff: 0.006 RMSE: 0.929
Reg-LFM: ite: 1; diff: 0.002 RMSE: 0.927
Reg-LFM: ite: 2; diff: 0.001 RMSE: 0.926
Reg-LFM: ite: 3; diff: 0.001 RMSE: 0.925
Reg-LFM: ite: 4; diff: 0.000 RMSE: 0.925
Reg-LFM: ite: 5; diff: 0.000 RMSE: 0.925
Reg-LFM: ite: 6; diff: 0.000 RMSE: 0.925
Reg-LFM: ite: 7; diff: 0.000 RMSE: 0.925


In [26]:
shiing.fit(train_pair=train_pair, train_rating=train_rating_res, learning_rate=.0001)

Fitting Reg-LFM: K: 3, lam: 0.00010
Reg-LFM: ite: 0; diff: 0.000 RMSE: 0.925


In [28]:
pred = pred + shiing.predict(test_pair)
print('RMSE for glb + user_mean + LFM: %.3f' %rmse(test_rating, pred))

RMSE for glb + user_mean + LFM: 1.590
