In [14]:
import numpy as np
import os

from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [15]:
import os
os.chdir("/content/drive/My Drive/Friends2Vec/baseline")

!ls CF_Yelp.ipynb

CF_Yelp.ipynb


In [16]:
ratings_list = np.load("cf_full_ratings.npy")

users = np.unique([x[0] for x in ratings_list])
items = np.unique([x[1] for x in ratings_list])

n_users = len(users)
n_items = len(items)

user_mapping = {}
item_mapping = {}

for u in range(len(users)):
  user_mapping[users[u]] = u

for u in range(len(items)):
  item_mapping[items[u]] = u

print(n_users, n_items, len(ratings_list))

train_ratings_list = np.load("cf_train_ratings.npy")
test_ratings_list = np.load("cf_test_ratings.npy")

def get_ratings_dict(ratings_list):
  ratings_dict = {}
  for row in ratings_list:
    u = user_mapping[row[0]]
    i = item_mapping[row[1]]
    
    if u not in ratings_dict:
      ratings_dict[u] = {}
    ratings_dict[u][i] = row[2]
  return ratings_dict

train_ratings = get_ratings_dict(train_ratings_list)
test_ratings = get_ratings_dict(test_ratings_list)

print(len(train_ratings), len(test_ratings))

users = np.arange(n_users)
items = np.arange(n_items)

12632 10982 703942
12632 12541


In [0]:
# import random

# ratings = np.zeros((n_users, n_items))
# for row in ratings_list:
#     ratings[user_mapping[row[0]], item_mapping[row[1]]] = row[2]
    
# ratings = np.array([[4., 4., 0., 3., 0.], [0., 5., 5., 3., 0.], [4., 4., 4., 4., 0.], [0., 5., 0., 4., 5.], [3., 0., 4., 3., 3.]])  

# train_ratings = {}
# test_ratings = {}
# for u in range(len(ratings)):
#   for v in range(len(ratings[u])):
#     if v != 0:
#       p = random.random()
#       if p > 0.2:
#         if u not in train_ratings:
#           train_ratings[u] = {}
#         train_ratings[u][v] = p
#       else:
#         if u not in test_ratings:
#           test_ratings[u] = {}
#         test_ratings[u][v] = p

# users = [0, 1, 2, 3, 4]
# movies = [0, 1, 2, 3, 4]

In [0]:
import math

# Constant
k = 0.05

# Functionalize everything. 
# `users` is treated as global variables. So, we don't see them as parameters anywhere.
def root_of_sum_of_squares(dic):
    ans = 0
    for key in dic:
        ans += dic[key] ** 2
    return math.sqrt(ans)

def compute_cosine_table(train_ratings):
    cosine = {}
    for i in range(0, len(users)):
        for j in range(i+1, len(users)):
            cos_val = 0
            for mov in train_ratings[users[i]]:
                if mov in train_ratings[users[j]]:
                    cos_val += train_ratings[users[i]][mov] * train_ratings[users[j]][mov]
            cos_val = cos_val / (root_of_sum_of_squares(train_ratings[users[i]]) *  root_of_sum_of_squares(train_ratings[users[j]]))

            if users[i] not in cosine:
                cosine[users[i]] = {}
            if users[j] not in cosine:
                cosine[users[j]] = {}

            cosine[users[i]][users[j]] = cos_val
            cosine[users[j]][users[i]] = cos_val
        cosine[users[i]][users[i]] = 1
    return cosine

def compute_average_ratings(train_ratings):
    avg_ratings = {}
    for user in users:
        avg_rating = 0
        for mov in train_ratings[user]:
            avg_rating += train_ratings[user][mov]
        avg_rating /= len(train_ratings[user])
        avg_ratings[user] = avg_rating
    return avg_ratings

def compute_predicted_ratings(train_ratings, test_ratings, avg_ratings, similarity_table):
    pred_ratings = {}
    for user in test_ratings:
        for mov in test_ratings[user]:
            pred_rating = 0
            for v in train_ratings:
                if mov in train_ratings[v]:
                    pred_rating += similarity_table[user][v] * (train_ratings[v][mov] - avg_ratings[v])

            if user not in pred_ratings:
                pred_ratings[user] = {}
            pred_ratings[user][mov] = avg_ratings[user] + k * pred_rating
    return pred_ratings

In [19]:
# Things start happening here.
# cosine = compute_cosine_table(train_ratings)

# print("table computed")

avg_ratings = compute_average_ratings(train_ratings)

# print("average done")

# pred_ratings = compute_predicted_ratings(train_ratings, test_ratings, avg_ratings, cosine)

global_avg = 0
count = 0

for rat in train_ratings_list:
  count += 1
  global_avg += float(rat[2])
  
global_avg /= count
print(global_avg)

3.816666434083646


In [0]:
def compute_errors(test_ratings, avg_ratings):  
    mean_abs_error = 0
    rms_error = 0
    total_count = 0

    for user in test_ratings:
        for mov in test_ratings[user]:

            mean_abs_error += abs(test_ratings[user][mov] - avg_ratings[user])
            rms_error += (test_ratings[user][mov] - global_avg) ** 2
            total_count += 1

    mean_abs_error /= total_count
    rms_error = math.sqrt(rms_error / total_count)
    
    return mean_abs_error, rms_error

In [21]:
mean_abs, rms = compute_errors(test_ratings, avg_ratings)

print("Mean Absolute Error is: ", mean_abs)
print("Root Mean Squared Error is: ", rms)

Mean Absolute Error is:  0.8090941001435721
Root Mean Squared Error is:  1.078552792338733
