# Recommender Systems: Collaborative Filtering Model

In [None]:
# Importing the required libraries
import pandas as pd
import collections
import math
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
from operator import itemgetter

In [None]:
# Loading the test and training rating files into DataFrames
test_csv_path = '/content/drive/My Drive/test_dataset.csv'
train_csv_path = '/content/drive/My Drive/training_dataset.csv'
df_item_test = pd.read_csv(test_csv_path)
df_item_train = pd.read_csv(train_csv_path)

In [None]:
# Keeping only the (user_id, item_id, rating) columns of each DataFrame as a NumPy matrix
rating_columns = ['user_id', 'item_id', 'rating']
given_test_matrix = df_item_test[rating_columns].to_numpy()
given_train_matrix = df_item_train[rating_columns].to_numpy()

In [None]:
# Column-wise maxima (user_id, item_id, rating) of the test and training matrices
test_max = given_test_matrix.max(axis=0)
train_max = given_train_matrix.max(axis=0)

# The ids start from 1 and not 0, so one extra slot lets an id be used directly as an index
largest_user_id = max(test_max[0], train_max[0])
largest_movie_id = max(test_max[1], train_max[1])
total_users = largest_user_id + 1
total_movies = largest_movie_id + 1

print("Columns : ", total_users, " and Rows: ", total_movies, sep="")

Columns : 944 and Rows: 1683


In [None]:
# The utility matrix has one row per movie and one column per user; it starts out all zeroes
total_entries = len(df_item_train)  # number of rows in the training DataFrame
utility_matrix = np.zeros(shape=(total_movies, total_users))

In [None]:
# Reporting how many training rows are going to be placed into the utility matrix
print("Total Entries in the utility matrix: ", total_entries, sep="")

Total Entries in the utility matrix: 95000


In [None]:
# Extracting the users, movies and the corresponding ratings and transferring them to the utility matrix.
# The columns are picked by name (the same names used for the matrix conversion) rather than by
# position, and all ratings are written with one fancy-indexed assignment instead of one
# DataFrame.iloc lookup per row. Rows are movies and columns are users; if a (movie, user)
# pair occurs more than once, the last occurrence is kept, exactly as with a row-by-row loop.
train_user_ids = df_item_train['user_id'].to_numpy()
train_item_ids = df_item_train['item_id'].to_numpy()
train_ratings = df_item_train['rating'].to_numpy()
utility_matrix[train_item_ids, train_user_ids] = train_ratings

In [None]:
# Checking whether all the non-zero entries have been added to the utility matrix:
# the count displayed here is expected to match total_entries (this presumes every
# training row is a distinct (movie, user) pair with a non-zero rating)
np.count_nonzero(utility_matrix)

95000

In [None]:
# Both working matrices share the utility matrix layout (movies x users) and start as all zeroes
rating_matrix_shape = (total_movies, total_users)
adjusted_cosine_utility_matrix = np.zeros(rating_matrix_shape)
cosine_utility_matrix = np.zeros(rating_matrix_shape)

In [None]:
# Filling both working matrices one movie (row) at a time.
#  - adjusted_cosine_utility_matrix gets each given rating minus the movie's average rating
#  - cosine_utility_matrix gets the given ratings unchanged
# Row 0 and column 0 are never written because the loop and the mask both start at index 1.
for movie_id in range(1, total_movies):
  movie_ratings = utility_matrix[movie_id]
  rating_total = np.sum(movie_ratings)  # Sum of the row (not named `sum`, to keep the builtin usable)
  if rating_total == 0.0:
    continue  # Nothing to copy for this movie, and it avoids dividing by zero below
  mean_rating = rating_total / np.count_nonzero(movie_ratings)  # Average over the given ratings only

  rated = movie_ratings != 0  # True where a rating is present; a fresh array, not a view
  rated[0] = False            # Column 0 is skipped
  adjusted_cosine_utility_matrix[movie_id, rated] = movie_ratings[rated] - mean_rating
  cosine_utility_matrix[movie_id, rated] = movie_ratings[rated]

In [None]:
# Reporting how many cells of each working matrix ended up non-zero
print("Non zero values in adjusted_cosine_utility_matrix: ", np.count_nonzero(adjusted_cosine_utility_matrix), sep="")
print("Non zero values in cosine_utility_matrix: ", np.count_nonzero(cosine_utility_matrix), sep="")

Non zero values in adjusted_cosine_utility_matrix: 93797
Non zero values in cosine_utility_matrix: 95000


In [None]:
# This function will give the similarity value
def getSimilarityValue(a, b) -> float:
  """Return the cosine similarity of the vectors a and b.

  The value is dot(a, b) divided by the product of the Euclidean lengths of
  a and b. When that product is zero (at least one vector has zero length)
  0.0 is returned instead of dividing by zero.
  """
  numerator = np.dot(a, b)
  length_a = math.sqrt(np.sum(np.square(a)))
  length_b = math.sqrt(np.sum(np.square(b)))
  denominator = length_a * length_b
  return numerator / denominator if denominator != 0.0 else 0.0

In [None]:
# One movie-by-movie similarity matrix per metric, both starting as all zeroes
similarity_shape = (total_movies, total_movies)
similarity_matrix_adjusted_cosine = np.zeros(similarity_shape)
similarity_matrix_cosine = np.zeros(similarity_shape)

In [None]:
# Computing all the similarity values with Adjusted cosine similarity metric and cosine similarity metric.
# getSimilarityValue(a, b) equals getSimilarityValue(b, a), so each pair of movies is evaluated
# once (j >= i) and the value is mirrored into [j][i], halving the number of calls.
# Index 0 is skipped for both rows and columns, so it stays zero.
for i in range(1, total_movies):
  adjusted_row_i = adjusted_cosine_utility_matrix[i]  # Hoisted: does not change inside the inner loop
  cosine_row_i = cosine_utility_matrix[i]
  for j in range(i, total_movies):
    adjusted_sim = getSimilarityValue(adjusted_row_i, adjusted_cosine_utility_matrix[j])
    cosine_sim = getSimilarityValue(cosine_row_i, cosine_utility_matrix[j])
    similarity_matrix_adjusted_cosine[i][j] = adjusted_sim
    similarity_matrix_adjusted_cosine[j][i] = adjusted_sim
    similarity_matrix_cosine[i][j] = cosine_sim
    similarity_matrix_cosine[j][i] = cosine_sim

In [None]:
# This is the given RMSE function
def RMSE(y_actual, y_predicted):
  """Return the root mean squared error of y_predicted against y_actual, rounded to 4 decimal places."""
  mse = mean_squared_error(y_actual, y_predicted)
  return round(sqrt(mse), 4)

In [None]:
predicted_adjusted_cosine_ratings_list = list()
predicted_cosine_ratings_list = list()
actual_ratings_list = list()

# A movie only contributes to a prediction when its similarity to the target movie
# is strictly greater than the threshold of the metric in use
ADJUSTED_COSINE_THRESHOLD = 0.11
COSINE_THRESHOLD = 0.2


def _predict_rating(similarity_row, user_ratings, target_movie, threshold):
  """Return the similarity-weighted average of a user's ratings of similar movies.

  similarity_row: similarity of the target movie to every movie, indexed by movie id.
  user_ratings: the user's ratings indexed by movie id, where 0 means "not rated".
  target_movie: id of the movie being predicted; it is never used as its own neighbour.
  threshold: only movies whose similarity is strictly greater than this are used.

  Returns 0.0 when the weights of the contributing movies sum to zero, which
  includes the case where no movie qualifies.
  """
  eligible = (similarity_row > threshold) & (user_ratings != 0.0)
  eligible[0] = False             # Index 0 is not a movie id (ids start from 1)
  eligible[target_movie] = False  # The target movie cannot predict itself

  weight_sum = 0.0
  weighted_rating_sum = 0.0
  # Accumulating one movie at a time, in increasing movie id order
  for movie_id in np.flatnonzero(eligible):
    weight = similarity_row[movie_id]
    weight_sum = weight_sum + weight
    weighted_rating_sum = weighted_rating_sum + user_ratings[movie_id] * weight

  if weight_sum == 0.0:
    return 0.0
  return weighted_rating_sum / weight_sum


# Going through every row of the test set (columns picked by name, not by position)
test_rating_rows = df_item_test[['user_id', 'item_id', 'rating']].itertuples(index=False, name=None)
for predict_user, predict_movie, actual_rating in test_rating_rows:
  actual_ratings_list.append(actual_rating)
  user_ratings = utility_matrix[:, predict_user]  # This user's column of the utility matrix

  # Adjusted cosine similarity metric prediction
  predicted_adjusted_cosine_value = _predict_rating(
    similarity_matrix_adjusted_cosine[predict_movie], user_ratings, predict_movie, ADJUSTED_COSINE_THRESHOLD)
  predicted_adjusted_cosine_ratings_list.append(predicted_adjusted_cosine_value)  # Appending the values to the prediction list

  # Cosine similarity metric prediction
  predicted_cosine_value = _predict_rating(
    similarity_matrix_cosine[predict_movie], user_ratings, predict_movie, COSINE_THRESHOLD)
  predicted_cosine_ratings_list.append(predicted_cosine_value)  # Appending the values to the prediction list


In [None]:
# Prediction values with adjusted cosine similarity metric and the RMSE values associated with it.
# Only predictions greater than 0.0 are kept (0.0 is what the prediction step stores when it
# could not produce a value). The lists are paired directly, so no row count is hard-coded.
adjusted_cosine_lst_tuple = [
  (actual, predicted)
  for actual, predicted in zip(actual_ratings_list, predicted_adjusted_cosine_ratings_list)
  if predicted > 0.0
]
actual_list_for_rms = [actual for actual, _ in adjusted_cosine_lst_tuple]
predicted_adjusted_cosine_list_for_rms = [predicted for _, predicted in adjusted_cosine_lst_tuple]

print("Tuple of actual and predicted values are as follows:")
print(adjusted_cosine_lst_tuple)
print()
print("The RMSE value for adjusted cosine similarity is:")
print(RMSE(actual_list_for_rms, predicted_adjusted_cosine_list_for_rms))

Tuple of actual and predicted values are as follows:
[(3, 3.427939610681819), (1, 2.858981531210522), (4, 3.5433266123328826), (1, 4.308022685421942), (5, 3.592108592688681), (3, 2.6786679200745867), (5, 4.376430196124437), (5, 3.923507017649808), (2, 2.8239030614883442), (5, 3.718146605557934), (5, 3.5482562017655854), (1, 3.3561333286343005), (4, 3.7454057144483595), (1, 3.784531074646092), (4, 3.823069610679724), (4, 4.447787724542886), (4, 3.3528119358319537), (1, 2.671677800845066), (4, 3.3872843581009837), (3, 3.441354034299381), (2, 3.355091141142677), (1, 2.191240359087587), (3, 3.3908041971051697), (2, 3.2423109083692347), (5, 2.90502467488686), (4, 3.709816136506254), (4, 3.1882436297274275), (3, 3.8374084589233917), (3, 3.801587563590328), (5, 3.896462648363203), (5, 3.139537679223537), (2, 2.6389127063953004), (3, 3.9419809382359996), (3, 3.0939521387866113), (4, 4.43033830237781), (1, 1.2403121140265003), (5, 4.0913375660875335), (4, 3.7780123958985072), (5, 3.695985824960

In [None]:
# Prediction values with cosine similarity metric and the RMSE values associated with it.
# Only predictions greater than 0.0 are kept (0.0 is what the prediction step stores when it
# could not produce a value). The lists are paired directly, so no row count is hard-coded.
cosine_lst_tuple = [
  (actual, predicted)
  for actual, predicted in zip(actual_ratings_list, predicted_cosine_ratings_list)
  if predicted > 0.0
]
actual_list_for_rms = [actual for actual, _ in cosine_lst_tuple]
predicted_cosine_list_for_rms = [predicted for _, predicted in cosine_lst_tuple]

print("Tuple of actual and predicted values are as follows:")
print(cosine_lst_tuple)
print()
print("The RMSE value for cosine similarity is:")
print(RMSE(actual_list_for_rms, predicted_cosine_list_for_rms))

Tuple of actual and predicted values are as follows:
[(3, 3.6451648391309615), (1, 2.812386926839275), (4, 3.630326120297367), (1, 3.222889909304044), (5, 3.1752131865495463), (3, 2.8484140048835203), (5, 4.224248172985014), (5, 3.7678683466140237), (2, 3.010001422367565), (5, 3.387172963676764), (5, 3.600997779065676), (1, 3.6380416118047654), (1, 3.8629873091931795), (4, 3.8422990140801976), (4, 4.382473748394156), (4, 3.4556134907334886), (1, 2.3821493950325916), (4, 3.5751488774920404), (3, 3.648574202636765), (2, 3.730847069268693), (1, 2.400327991447024), (3, 3.5921768865049266), (2, 3.3070070188125977), (5, 3.1482127215471443), (4, 4.127761492334853), (4, 3.479310742546139), (3, 4.272085080714847), (3, 3.962286974600201), (5, 3.6408191699027648), (5, 3.3823576635993997), (2, 2.7615489352829585), (3, 4.100514102520887), (3, 3.0229591441319865), (4, 4.469211528841288), (1, 1.3220005667728845), (5, 3.425605497440274), (4, 3.88543510420732), (5, 3.9549738072858154), (2, 2.8357357258

# Comparison between adjusted cosine and cosine similarity
Adjusted cosine similarity performs better than cosine similarity, as is evident from the RMSE results. In adjusted cosine similarity, each movie's average rating is subtracted from its ratings, so the ratings are centered around 0 and the 0 assigned to a missing rating corresponds to an average (neutral) rating. In cosine similarity, the ratings are used as they are while the missing ratings are still given a rating of 0, so a missing rating looks like a rating lower than any rating a user can actually give; this does not capture the essence of the ratings that the users have given.