In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
import numpy as np
import pandas as pd
import time
from scipy.special import digamma
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import precision_recall_fscore_support as score

class matrix_factorization():
	
	def __init__(self,data,orig_data,features,R,alpha,beta):
		self.data = data
		self.orig_data = orig_data
		self.features = features
		self.R = R
		self.alpha = alpha
		self.beta = beta
		self.user_count = data.shape[0]
		self.item_count = data.shape[1]
		self.user_features = np.random.uniform(low=0.1,high=0.9,size=(self.user_count,self.features))
		self.item_features = np.random.uniform(low=0.1,high=0.9,size=(self.features,self.item_count))
		self.gamma = np.random.uniform(low=0.1,high=0.9,size=(self.user_count,self.features))
		self.e_plus = np.random.uniform(low=0.1,high=0.9,size=(self.item_count,self.features))
		self.e_minus = np.random.uniform(low=0.1,high=0.9,size=(self.item_count,self.features))
		self.lambda_ = np.zeros(shape=(self.user_count,self.item_count,self.features))
	
	def MSE(self):
		"""
		Mean squared error function comparing dot product of user-feature row and feature-item column to user-item cell
		"""
		# print("mse")
		# print("user features:\n", self.user_features)
		# print("item features:\n", self.item_features)
		matrix_product = np.matmul(self.user_features,self.item_features)
		print("predicted rating matrix\n", matrix_product)

		scaled_matrix = np.zeros_like(matrix_product)

		scaled_matrix[(matrix_product >= 0.0) & (matrix_product < 0.2)] = 1
		scaled_matrix[(matrix_product >= 0.2) & (matrix_product < 0.4)] = 2
		scaled_matrix[(matrix_product >= 0.4) & (matrix_product < 0.6)] = 3
		scaled_matrix[(matrix_product >= 0.6) & (matrix_product < 0.8)] = 4
		scaled_matrix[matrix_product >= 0.8 ] = 5

		print("predicted rating matrix after scaling\n", scaled_matrix)
		print("actual rating matrix \n", self.orig_data)
	  
		cnt = 0
		mae = 0
		cmae = 0
		rmse = 0
		cmae_cnt = 0
		for u in range(0,self.user_count):
			for i in range(0,self.item_count):
				if self.orig_data[u,i] != 0:
					cnt += 1
					mae += abs(self.orig_data[u,i] - scaled_matrix[u,i])
					rmse += (self.orig_data[u,i] - scaled_matrix[u,i])**2
					if self.orig_data[u,i] >=4 or scaled_matrix[u,i]>=4:
						cmae_cnt += 1
						cmae += abs(self.orig_data[u,i] - scaled_matrix[u,i])
		mae /= cnt
		cmae /= cmae_cnt
		rmse /= cnt
		rmse = rmse ** 0.5
		print("CMAE = ", cmae)
		print("MAE = ", mae)
		print("RMSE = ", rmse)

		# precision, recall, fscore, support = score(self.orig_data, scaled_matrix)
		# print('precision = {}'.format(precision))
		# print('recall = {}'.format(recall))
		# print('fscore = {}'.format(fscore))
		# print('support = {}'.format(support))

		return mae

		# return np.sum(abs(self.orig_data - scaled_matrix))/(self.user_count*self.item_count)
	
	def get_user_item_features(self):
		# print("features")
		for i in range(0,self.user_count):
			self.user_features[i] = self.gamma[i]/np.sum(self.gamma[i])
		for k in range(0,self.features):
			for i in range(0,self.item_count):
				self.item_features[k,i] = self.e_plus[i,k]/(self.e_plus[i,k]+self.e_minus[i,k])

	def update_epsilion(self):
		"""
		Updates every epsilion parameter according to supplied learning rate
		"""   
		# print("epsilion")   
		for i in range(0,self.item_count):
			for j in range(0,self.features):
				val1 = 0
				val2 = 0
				for k in range(0,self.user_count):
					if self.data[k,i] != 0:
						val1 += self.lambda_[k,i,j]*self.R*self.data[k,i]
						val2 += self.lambda_[k,i,j]*self.R*(1-self.data[k,i])
				self.e_plus[i,j] = self.beta + val1
				self.e_minus[i,j] = self.beta + val2

	def update_gamma(self):
		"""
		Updates every gamma parameter according to supplied learning rate
		"""      
		# print("gamma")
		for i in range(0,self.user_count):
			for j in range(0,self.features):
				val = 0
				for k in range(0,self.item_count):
					if self.data[i,k] != 0:
						val += self.lambda_[i,k,j]
				self.gamma[i,j] = self.alpha + val

	def update_lambda(self):
		"""
		Updates every lambda parameter according to supplied learning rate
		"""      
		# print("lambda")
		for u in range(0,self.user_count):
			for i in range(0,self.item_count):
				if self.data[u,i] != 0:
					lambda_dash = np.zeros(self.features)
					for k in range(0,self.features):
						lambda_dash[k] = np.exp(digamma(self.gamma[u,k]) + \
							self.R*self.data[u,i]*digamma(self.e_plus[i,k]) + \
							self.R*(1-self.data[u,i])*digamma(self.e_minus[i,k]) - \
							self.R*digamma(self.e_plus[i,k]+self.e_minus[i,k]))
					for k in range(0,self.features):
						self.lambda_[u,i,k] = lambda_dash[k]/np.sum(lambda_dash)
				
	def train_model(self,iterations=20):
		"""
		Trains model
		"""    
		avg_tim = 0        
		for i in range(iterations):
			start = time.process_time()
			self.update_lambda()
			self.update_gamma()
			self.update_epsilion()
			tim = (time.process_time() - start)
			avg_tim += tim
			print("Time taken by ",(i+1), " iteration = ", tim)
		self.get_user_item_features()
		mse = self.MSE()
		avg_tim /= iterations
		print("Average time taken = ", avg_tim)
	
# configure file path
data_path = 'gdrive/MyDrive/ml-latest-small/'
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'
# read data
df_movies = pd.read_csv(
    data_path + movies_filename,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'})

df_ratings = pd.read_csv(
    (data_path + ratings_filename),
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})


df_movies_cnt = df_ratings.groupby('movieId') \
       .agg({'userId':'size', 'rating':'mean'}) \
       .rename(columns={'userId':'count','rating':'average'}) \
       .reset_index()

print(df_ratings)
print(df_movies_cnt)
movie_rating_thres = 20

df_movies_cnt = df_movies_cnt.sort_values(by=['count']) 
popular_movies = df_movies_cnt.head(movie_rating_thres)
print(popular_movies)

movie_list = popular_movies['movieId'].tolist()
print(movie_list)

#improvement

df_users_cnt = df_ratings.groupby('userId') \
       .agg({'movieId':'size', 'rating':'mean'}) \
       .rename(columns={'movieId':'count','rating':'average'}) \
       .reset_index()

print(df_users_cnt)
user_rating_thres = 40

users_below_threshold = df_users_cnt[(df_users_cnt['count'] < user_rating_thres)]  
print("ans=", len(users_below_threshold))
print(users_below_threshold)

user_list = users_below_threshold['userId'].tolist()
print(user_list)


for u in user_list:
	for i in movie_list:
		if df_ratings[(df_ratings.userId == u) & (df_ratings.movieId == i)].empty:
			r = popular_movies.loc[popular_movies.movieId == i, 'average'].iloc[0]
			new_row = pd.Series({'userId': u, 'movieId': i, 'rating': r})
			df_ratings = df_ratings.append(new_row, ignore_index=True)

print(df_ratings)



# pivot ratings into movie features
df_movie_features = df_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

print(df_movie_features.head(10))
data = df_movie_features.to_numpy()
print(data)

# split data into train and test
train, test = train_test_split(data, test_size=0.2)	

min_max_scaler = preprocessing.MinMaxScaler()
train_n = min_max_scaler.fit_transform(train)
print(train_n)
test_n = min_max_scaler.fit_transform(test)

obj = matrix_factorization(train_n, train, 6, 4, 0.8, 5)
obj.train_model()	
				

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
100831     610   166534     4.0
100832     610   168248     5.0
100833     610   168250     5.0
100834     610   168252     5.0
100835     610   170875     3.0

[100836 rows x 3 columns]
      movieId  count   average
0           1    215  3.920930
1           2    110  3.431818
2           3     52  3.259615
3           4      7  2.357143
4           5     49  3.071429
...       ...    ...       ...
9719   193581      1  4.000000
9720   193583      1  3.500000
9721   193585      1  3.500000
9722   193587      1  3.500000
9723   193609      1  4.000000

[9724 rows x 3 columns]
      movieId  count  average
9723   193609      1      4.0
3010     4032      1      4.0
6653    57526      1      3.0
6652    57522      1      3.5
6650    57502      1      4.0
6649    5749