# CUHK [STAT3009](https://www.bendai.org/STAT3009/) Notebook4: ALS for Latent Factor Models II

## Object-Oriented Programming (OOP) in Python

- define a recommender system (RS) as a Python `class` that stores its `parameters` as attributes
- define its `fit` and `predict` methods

In [3]:
# warm-up with baseline methods

class glb_mean(object):
	"""Baseline model that predicts the same value, the global mean rating, everywhere."""

	def __init__(self):
		# placeholder value until `fit` has been called
		self.glb_mean = 0

	def fit(self, train_ratings):
		"""Store the average of `train_ratings` as the global mean."""
		self.glb_mean = np.mean(train_ratings)

	def predict(self, test_pair):
		"""Return one copy of the fitted global mean per row of `test_pair`."""
		n_pred = len(test_pair)
		return np.ones(n_pred) * self.glb_mean

class user_mean(object):
	"""Baseline model that predicts each user's average training rating.

	Users without any training rating fall back to the global mean.

	Attributes
	----------
	n_user : number of users; valid user ids are 0, ..., n_user-1
	glb_mean : mean of all training ratings (set by `fit`)
	user_mean : array of length `n_user` holding the per-user means (set by `fit`)
	"""

	def __init__(self, n_user):
		self.n_user = n_user
		self.glb_mean = 0.
		self.user_mean = np.zeros(n_user)

	def fit(self, train_pair, train_ratings):
		"""Estimate the global mean and the per-user means.

		Parameters
		----------
		train_pair : 2-d integer array whose column 0 holds the user id of each rating
		train_ratings : 1-d array of ratings, aligned row by row with `train_pair`
		"""
		train_pair = np.asarray(train_pair)
		train_ratings = np.asarray(train_ratings, dtype=float)
		self.glb_mean = train_ratings.mean()
		users = train_pair[:, 0]
		# ids outside [0, n_user) are ignored, exactly as a per-user scan would do
		keep = (users >= 0) & (users < self.n_user)
		users = users[keep].astype(np.intp)
		# one pass over the data instead of one full scan per user
		counts = np.bincount(users, minlength=self.n_user)
		sums = np.bincount(users, weights=train_ratings[keep], minlength=self.n_user)
		rated = counts > 0
		# users without ratings keep the global mean
		self.user_mean[:] = self.glb_mean
		self.user_mean[rated] = sums[rated] / counts[rated]

	def predict(self, test_pair):
		"""Return the fitted user mean for the user in column 0 of each row of `test_pair`."""
		test_pair = np.asarray(test_pair)
		if len(test_pair) == 0:
			return np.zeros(0)
		# fancy indexing returns a fresh array, so callers cannot alter the fitted means
		return self.user_mean[test_pair[:, 0]]

class item_mean(object):
	"""Baseline model that predicts each item's average training rating.

	Items without any training rating fall back to the global mean.

	Attributes
	----------
	n_item : number of items; valid item ids are 0, ..., n_item-1
	glb_mean : mean of all training ratings (set by `fit`)
	item_mean : array of length `n_item` holding the per-item means (set by `fit`)
	"""

	def __init__(self, n_item):
		self.n_item = n_item
		self.glb_mean = 0.
		self.item_mean = np.zeros(n_item)

	def fit(self, train_pair, train_ratings):
		"""Estimate the global mean and the per-item means.

		Parameters
		----------
		train_pair : 2-d integer array whose column 1 holds the item id of each rating
		train_ratings : 1-d array of ratings, aligned row by row with `train_pair`
		"""
		train_pair = np.asarray(train_pair)
		train_ratings = np.asarray(train_ratings, dtype=float)
		self.glb_mean = train_ratings.mean()
		items = train_pair[:, 1]
		# ids outside [0, n_item) are ignored, exactly as a per-item scan would do
		keep = (items >= 0) & (items < self.n_item)
		items = items[keep].astype(np.intp)
		# one pass over the data instead of one full scan per item
		counts = np.bincount(items, minlength=self.n_item)
		sums = np.bincount(items, weights=train_ratings[keep], minlength=self.n_item)
		rated = counts > 0
		# items without ratings keep the global mean
		self.item_mean[:] = self.glb_mean
		self.item_mean[rated] = sums[rated] / counts[rated]

	def predict(self, test_pair):
		"""Return the fitted item mean for the item in column 1 of each row of `test_pair`."""
		test_pair = np.asarray(test_pair)
		if len(test_pair) == 0:
			return np.zeros(0)
		# fancy indexing returns a fresh array, so callers cannot alter the fitted means
		return self.item_mean[test_pair[:, 1]]

In [4]:
# define class for correlation based RS
from numpy.linalg import norm
from scipy.sparse import lil_matrix

class cor_rs_user(object):
	"""User-based correlation recommender using cosine similarity.

	A rating for (user, item) is predicted as the similarity-weighted average
	of the ratings given to `item` by the users most similar to `user`. When
	no such neighbour exists, the user's mean rating is used instead.

	Parameters
	----------
	n_user : number of users; valid user ids are 0, ..., n_user-1
	n_item : optional number of items; when None it is inferred from the
		largest item id seen in the training pairs

	Attributes
	----------
	S : sparse (n_user, n_user) symmetric matrix of positive similarities
	index_item : index_item[i] lists the training rows that rate item i
	index_user : index_user[u] lists the training rows rated by user u
	train_pair : training pairs the two index lists refer to
	min_co : minimum number of co-rated items for a non-zero similarity
	"""

	def __init__(self, n_user, n_item=None):
		self.n_user = n_user
		self.n_item = n_item
		self.glb_mean = 0.
		self.user_mean = np.zeros(n_user)
		self.S = lil_matrix((n_user, n_user))
		self.index_item = []
		self.index_user = []
		self.train_pair = None
		self.min_co = 3

	def cossim(self, index_u, index_v, train_pair, train_ratings):
		"""Return the cosine similarity of two users over their co-rated items.

		`index_u` / `index_v` are the training rows of the two users. Returns
		0.0 when they share fewer than `self.min_co` items.
		"""
		item_u = train_pair[index_u][:,1]
		item_v = train_pair[index_v][:,1]
		# positions of the shared items inside each user's own rating list;
		# they are returned pairwise aligned, so entry k of both vectors below
		# refers to the same item whatever the row order of the training data
		item_co, pos_u, pos_v = np.intersect1d(item_u, item_v, return_indices=True)
		if len(item_co) < self.min_co:
			# a tuning parameter
			return 0.0
		vec_co_u = train_ratings[index_u][pos_u]
		vec_co_v = train_ratings[index_v][pos_v]
		# 1e-5 guards against a zero norm
		return np.dot(vec_co_u, vec_co_v) / (norm(vec_co_u)+1e-5) / (norm(vec_co_v)+1e-5)

	def sim_mat(self, train_pair, train_ratings):
		"""Index the training rows by user and by item and build the similarity matrix `S`."""
		n_item = self.n_item
		if n_item is None:
			# no item count supplied: cover every item present in the training pairs
			n_item = int(train_pair[:,1].max()) + 1 if len(train_pair) > 0 else 0
		# keep the pairs that the index lists point into; `predict` needs them
		self.train_pair = train_pair
		users, items = train_pair[:,0], train_pair[:,1]
		self.index_item = [np.where(items == i)[0] for i in range(n_item)]
		self.index_user = [np.where(users == u)[0] for u in range(self.n_user)]
		# start from an empty matrix so that re-fitting does not accumulate weights
		S = lil_matrix((self.n_user, self.n_user))
		for u in range(self.n_user):
			if len(self.index_user[u]) == 0:
				continue
			for v in range(u):
				if len(self.index_user[v]) == 0:
					continue
				weight_tmp = self.cossim(self.index_user[u],self.index_user[v],train_pair,train_ratings)
				if weight_tmp > 0:
					S[u,v] = weight_tmp
		# only the lower triangle was filled; mirror it
		self.S = (S + S.T).tocsr()

	def fit(self, train_pair, train_ratings):
		"""Fit the user means (fallback predictions) and the similarity matrix."""
		self.glb_mean = train_ratings.mean()
		# use another class to predict the user mean
		user_ave_method = user_mean(self.n_user)
		user_ave_method.fit(train_pair, train_ratings)
		self.user_mean = user_ave_method.user_mean
		self.sim_mat(train_pair, train_ratings)

	def predict(self, test_pair, train_ratings, top=10):
		"""Predict a rating for every (user, item) row of `test_pair`.

		Parameters
		----------
		test_pair : 2-d integer array of (user id, item id) rows
		train_ratings : ratings aligned with the training pairs given to
			`fit` / `sim_mat`
		top : number of most similar users kept per prediction
		"""
		pred = np.zeros(len(test_pair))
		n_indexed = len(self.index_item)
		for j in range(len(test_pair)):
			user_tmp, item_tmp = test_pair[j,0], test_pair[j,1]
			# items never indexed have no raters at all
			index_tmp = self.index_item[item_tmp] if 0 <= item_tmp < n_indexed else []
			if len(index_tmp) == 0:
				pred[j] = self.user_mean[user_tmp]
				continue
			rated_users = self.train_pair[index_tmp][:,0]
			rated_ratings = train_ratings[index_tmp]
			sim_weight = self.S[user_tmp, rated_users].toarray().ravel()
			## only keep the `top` closest users
			top_ind = sim_weight.argsort()[-top:]
			sim_weight_knn = np.zeros(len(sim_weight))
			sim_weight_knn[top_ind] = sim_weight[top_ind]
			if max(sim_weight_knn) == 0:
				# no similar user rated this item
				pred[j] = self.user_mean[user_tmp]
			else:
				pred[j] = np.sum(sim_weight_knn*rated_ratings) / np.sum(sim_weight_knn)
		return pred


class cor_rs_item(object):
	"""Item-based correlation recommender using cosine similarity.

	A rating for (user, item) is predicted as the similarity-weighted average
	of the ratings `user` gave to the items most similar to `item`. When no
	such neighbour exists, the item's mean rating is used instead.

	Parameters
	----------
	n_item : number of items; valid item ids are 0, ..., n_item-1
	n_user : optional number of users; when None it is inferred from the
		largest user id seen in the training pairs

	Attributes
	----------
	S : sparse (n_item, n_item) symmetric matrix of positive similarities
	index_item : index_item[i] lists the training rows that rate item i
	index_user : index_user[u] lists the training rows rated by user u
	train_pair : training pairs the two index lists refer to
	min_co : minimum number of co-rating users for a non-zero similarity
	"""

	def __init__(self, n_item, n_user=None):
		self.n_item = n_item
		self.n_user = n_user
		self.glb_mean = 0.
		self.item_mean = np.zeros(n_item)
		self.S = lil_matrix((n_item, n_item))
		self.index_item = []
		self.index_user = []
		self.train_pair = None
		self.min_co = 3

	def cossim(self, index_i, index_j, train_pair, train_ratings):
		"""Return the cosine similarity of two items over their co-rating users.

		`index_i` / `index_j` are the training rows of the two items. Returns
		0.0 when they share fewer than `self.min_co` users.
		"""
		user_i = train_pair[index_i][:,0]
		user_j = train_pair[index_j][:,0]
		# positions of the shared users inside each item's own rating list;
		# they are returned pairwise aligned, so entry k of both vectors below
		# refers to the same user whatever the row order of the training data
		user_co, pos_i, pos_j = np.intersect1d(user_i, user_j, return_indices=True)
		if len(user_co) < self.min_co:
			# a tuning parameter
			return 0.0
		vec_co_i = train_ratings[index_i][pos_i]
		vec_co_j = train_ratings[index_j][pos_j]
		# 1e-5 guards against a zero norm
		return np.dot(vec_co_i, vec_co_j) / (norm(vec_co_i)+1e-5) / (norm(vec_co_j)+1e-5)

	def sim_mat(self, train_pair, train_ratings):
		"""Index the training rows by user and by item and build the similarity matrix `S`."""
		n_user = self.n_user
		if n_user is None:
			# no user count supplied: cover every user present in the training pairs
			n_user = int(train_pair[:,0].max()) + 1 if len(train_pair) > 0 else 0
		# keep the pairs that the index lists point into; `predict` needs them
		self.train_pair = train_pair
		users, items = train_pair[:,0], train_pair[:,1]
		self.index_item = [np.where(items == i)[0] for i in range(self.n_item)]
		self.index_user = [np.where(users == u)[0] for u in range(n_user)]
		# start from an empty matrix so that re-fitting does not accumulate weights
		S = lil_matrix((self.n_item, self.n_item))
		for i in range(self.n_item):
			if len(self.index_item[i]) == 0:
				continue
			for j in range(i):
				if len(self.index_item[j]) == 0:
					continue
				weight_tmp = self.cossim(self.index_item[i],self.index_item[j],train_pair,train_ratings)
				if weight_tmp > 0:
					S[i,j] = weight_tmp
		# only the lower triangle was filled; mirror it
		self.S = (S + S.T).tocsr()

	def fit(self, train_pair, train_ratings):
		"""Fit the item means (fallback predictions) and the similarity matrix."""
		self.glb_mean = train_ratings.mean()
		# use another class to predict the item mean
		item_ave_method = item_mean(self.n_item)
		item_ave_method.fit(train_pair, train_ratings)
		self.item_mean = item_ave_method.item_mean
		self.sim_mat(train_pair, train_ratings)

	def predict(self, test_pair, train_ratings, top=10):
		"""Predict a rating for every (user, item) row of `test_pair`.

		Parameters
		----------
		test_pair : 2-d integer array of (user id, item id) rows
		train_ratings : ratings aligned with the training pairs given to
			`fit` / `sim_mat`
		top : number of most similar items kept per prediction
		"""
		pred = np.zeros(len(test_pair))
		n_indexed = len(self.index_user)
		for j in range(len(test_pair)):
			user_tmp, item_tmp = test_pair[j,0], test_pair[j,1]
			# users never indexed have rated nothing at all
			index_tmp = self.index_user[user_tmp] if 0 <= user_tmp < n_indexed else []
			if len(index_tmp) == 0:
				pred[j] = self.item_mean[item_tmp]
				continue
			rated_items = self.train_pair[index_tmp][:,1]
			rated_ratings = train_ratings[index_tmp]
			sim_weight = self.S[item_tmp, rated_items].toarray().ravel()
			## only keep the `top` closest items
			top_ind = sim_weight.argsort()[-top:]
			sim_weight_knn = np.zeros(len(sim_weight))
			sim_weight_knn[top_ind] = sim_weight[top_ind]
			if max(sim_weight_knn) == 0:
				# the user rated no similar item
				pred[j] = self.item_mean[item_tmp]
			else:
				pred[j] = np.sum(sim_weight_knn*rated_ratings) / np.sum(sim_weight_knn)
		return pred

def rmse(true, pred):
	"""Return the root-mean-squared error between `true` and `pred`.

	Both arguments may be any array-like of equal (or broadcastable) shape.
	They are converted to float arrays first, so plain lists are accepted and
	integer inputs cannot wrap around in the subtraction.
	"""
	true = np.asarray(true, dtype=float)
	pred = np.asarray(pred, dtype=float)
	return np.sqrt(np.mean((pred - true)**2))

## Load and pre-process the dataset

In [5]:
import numpy as np
import pandas as pd

# read the rating tables; the columns used below are user_id, movie_id and rating
dtrain = pd.read_csv('./dataset/train.csv')
dtest = pd.read_csv('./dataset/test.csv')

## keep the true test ratings aside for evaluation, then drop them from the
## test table so that prediction only sees the (user, movie) pairs
test_ratings = np.array(dtest['rating'])
dtest = dtest.drop(columns='rating')

## (user_id, movie_id) pairs: one row per rating
test_pair = dtest[['user_id', 'movie_id']].to_numpy()
train_pair = dtrain[['user_id', 'movie_id']].to_numpy()
train_ratings = dtrain['rating'].to_numpy()

## ids start at 0, so the counts are the largest id over both sets plus one
n_user = max(train_pair[:,0].max(), test_pair[:,0].max()) + 1
n_item = max(train_pair[:,1].max(), test_pair[:,1].max()) + 1

## Define and train the predictive models based on `class`

In [6]:
## baseline: predict every rating by the mean rating of its user
user_ave = user_mean(n_user)
user_ave.fit(train_pair, train_ratings)
# evaluate on the held-out test ratings
pred_user = user_ave.predict(test_pair)
print('RMSE for user_mean: %.3f' % rmse(test_ratings, pred_user))

RMSE for user_mean: 1.017


In [8]:
## baseline: predict every rating by the mean rating of its item
item_ave = item_mean(n_item)
item_ave.fit(train_pair, train_ratings)
# evaluate on the held-out test ratings
pred_item = item_ave.predict(test_pair)
print('RMSE for item_mean: %.3f' % rmse(test_ratings, pred_item))

RMSE for item_mean: 1.052


In [9]:
## Correlation-based RS (user)
cor_user = cor_rs_user(n_user=n_user)
cor_user.fit(train_pair=train_pair, train_ratings=train_ratings)
pred_cor_user = cor_user.predict(test_pair, train_ratings)
# label the score with the model actually evaluated (it used to say item_mean)
print('RMSE for cor_rs(user): %.3f' %rmse(test_ratings, pred_cor_user) )

RMSE for item_mean: 1.073


In [10]:
## Correlation-based RS (item)
cor_item = cor_rs_item(n_item=n_item)
cor_item.fit(train_pair=train_pair, train_ratings=train_ratings)
pred_cor_item = cor_item.predict(test_pair, train_ratings)
# label the score with the model actually evaluated (it used to say item_mean)
print('RMSE for cor_rs(item): %.3f' %rmse(test_ratings, pred_cor_item) )

RMSE for item_mean: 1.074


In [11]:
## Baseline + Correlation-based RS
## three models are stacked: each one is fitted on the residual ratings left
## by the previous one, and the test predictions are summed up
# step 1: global mean
glb_ave = glb_mean()
glb_ave.fit(train_ratings)
train_ratings_cm = train_ratings - glb_ave.predict(train_pair)
pred = glb_ave.predict(test_pair)
# step 2: user mean of the centred ratings
user_ave = user_mean(n_user)
user_ave.fit(train_pair, train_ratings_cm)
train_ratings_res = train_ratings_cm - user_ave.predict(train_pair)
pred = pred + user_ave.predict(test_pair)
# step 3: user-based correlation RS on the remaining residuals
cor_user = cor_rs_user(n_user)
cor_user.fit(train_pair, train_ratings_res)
pred = pred + cor_user.predict(test_pair, train_ratings_res, top=1000)

print('RMSE for glb + user_mean + cor_rs(user): %.3f' % rmse(test_ratings, pred))

RMSE for glb + user_mean + cor_rs(user): 1.005
