In [None]:
%matplotlib inline

import pylab as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math

In [None]:
# Colab-only step: mount Google Drive so files become reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loading Dataset

In [None]:
# The ml-1m files must already be present in your Drive under this folder.
folder_path  = "/content/drive/MyDrive/DSA4212/Dataset/ml-1m/"
#folder_path = "C:/Users/ASUS/Desktop/aaa_UINVERSITY/Y3S2/DSA4212/Ass2/ml-1m/ml-1m/"

# Every .dat file is '::'-separated and has no header row, so column names are
# supplied explicitly; the multi-character separator requires the python parser.
movies = pd.read_csv(
    folder_path + "movies.dat",
    names=["MovieID", "Title", "Genres"],
    sep='::',
    header=None,
    encoding='latin-1',
    engine='python',
)
ratings = pd.read_csv(
    folder_path + "ratings.dat",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
    sep='::',
    header=None,
    engine='python',
)
users = pd.read_csv(
    folder_path + "users.dat",
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
    sep='::',
    header=None,
    engine='python',
)

In [None]:
# Attach every rating to its movie's metadata. The join keeps all movies, so a
# movie without any rating yields a row of NaNs, which dropna() then discards.
movie_ratings = movies.set_index("MovieID").join(ratings.set_index("MovieID"), on = 'MovieID').reset_index().dropna()

# Count the ratings of each MovieID once (instead of re-scanning the whole frame
# for every ID) and take the ID range from the data rather than hard-coding 3952.
max_movie_id = int(movies.MovieID.max())
ratings_per_movie = (
    movie_ratings.MovieID
    .value_counts()
    .reindex(range(1, max_movie_id + 1), fill_value=0)
)

# Movies with fewer than 2 ratings are dropped from both frames.
ids_to_remove = ratings_per_movie.index[ratings_per_movie < 2].tolist()
movie_ratings = movie_ratings[~movie_ratings.MovieID.isin(ids_to_remove)]
movies = movies[~movies.MovieID.isin(ids_to_remove)]

# 90/10 train/test split with a fixed seed so the split is reproducible.
movie_train, movie_test = train_test_split(movie_ratings, test_size = 0.1, random_state = 4212)

In [None]:
# creating matrix of (users X movies) containing their ratings

# Column positions follow the row order of `movies`.
movie_ids = movies.MovieID.tolist()
movie_ind = {movie_id: col for col, movie_id in enumerate(movie_ids)}  # movieID as key, index as val
ind_movie = dict(enumerate(movie_ids))                                 # index as key, movieID as val
pos = len(movie_ids)  # same final value the original counter ended with

# All-NaN (users X movies) template as a list of lists; `mat` and `testing_mat`
# no longer need it, it is kept in case other cells reference it.
mat_template = [[np.nan] * len(movies) for _ in range(len(users))]


def _ratings_to_matrix(rating_rows):
    """Scatter the ratings in `rating_rows` into a dense (users X movies) array.

    `rating_rows` needs the columns UserID, MovieID and Rating. Cells without a
    rating stay NaN. The row is UserID - 1 and the column comes from `movie_ind`.
    Raises KeyError if a MovieID is missing from `movie_ind`.
    """
    matrix = np.full((len(users), len(movies)), np.nan)
    rated = rating_rows[rating_rows.UserID > 0]  # also filters out NaN user ids
    col_idx = rated.MovieID.map(movie_ind)
    if col_idx.isna().any():
        raise KeyError("rating refers to a MovieID that is not present in movie_ind")
    row_idx = rated.UserID.to_numpy().astype(int) - 1
    # One vectorised assignment replaces the row-by-row iterrows() loop.
    matrix[row_idx, col_idx.to_numpy().astype(int)] = rated.Rating.to_numpy()
    return matrix


mat = _ratings_to_matrix(movie_train)
testing_mat = _ratings_to_matrix(movie_test)

In [None]:
# Training ratings with every NaN replaced by 0.
mat_na_is_0 = np.where(np.isnan(mat), 0., mat)

# Weights: 1 wherever the training matrix holds a finite rating, otherwise the
# zero-filled value (0 for NaN entries).
mat_weights = np.where(np.isfinite(mat), 1, mat_na_is_0)
mat_weights_long = mat_weights.reshape(-1)

# Flattened marker for the test set: 1 at held-out ratings, NaN everywhere else.
testing_mat_indices = np.where(np.isfinite(testing_mat), 1, testing_mat).reshape(-1)

In [None]:
# Rebind `mat` to a copy in which NaN entries are replaced by 0.
mat = np.nan_to_num(mat)

In [None]:
# get numerical indices of where we need to test

# flatnonzero always returns a 1-D index array. The previous squeeze(argwhere(...))
# collapsed to a 0-d array when exactly one test rating existed, which breaks
# len() and iteration.
testing_indices = np.flatnonzero(np.isfinite(testing_mat_indices))
print(testing_indices)
print(len(testing_indices))

[     566     1374     1721 ... 21694890 21695203 21695441]
100010


# Cosine Similarity

In [None]:
def cos_sim(vector1, vector2):
    """Return the cosine similarity of two vectors.

    A small constant (1e-6) is added to the product of the norms so that an
    all-zero vector gives 0 instead of a division by zero.
    """
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return np.dot(vector1, vector2) / (norm_product + 0.000001)

In [None]:
# generating movie similarity matrix
mat_T = mat.T  # one row per movie, one column per user

# All pairwise cosine similarities in one shot: the matrix of dot products divided
# by the outer product of the row norms, with the same 1e-6 guard cos_sim uses.
# This replaces a pure-Python double loop over every (movie, movie) pair.
movie_norms = np.linalg.norm(mat_T, axis=1)
movie_sim_matrix = (mat_T @ mat_T.T) / (np.outer(movie_norms, movie_norms) + 0.000001)

# Mirror the strict upper triangle so the matrix is exactly symmetric, then pin
# the self-similarity on the diagonal to 1.
upper = np.triu(movie_sim_matrix, k=1)
movie_sim_matrix = upper + upper.T
np.fill_diagonal(movie_sim_matrix, 1.)

1% done!
25% done!
50% done!
75% done!


In [None]:
fail_indices = []  # flat test indices for which no prediction could be made


def get_rmse_from_sim_mat(mov_sim_mat, train_mat=None, test_mat=None, test_indices=None):
    """Score item-based rating predictions against the held-out ratings (RMSE).

    For every test position the prediction is the similarity-weighted average of
    the ratings the user gave in the training matrix (0 = not rated).

    Parameters
    ----------
    mov_sim_mat : (movies X movies) similarity matrix.
    train_mat : (users X movies) training ratings with 0 for "not rated".
        Defaults to the notebook-level `mat`.
    test_mat : (users X movies) held-out ratings. Defaults to `testing_mat`.
    test_indices : flat (row-major) indices into `test_mat` to evaluate.
        Defaults to `testing_indices`.

    Returns
    -------
    The RMSE over the test cases that could actually be predicted, or NaN if
    none could. Unpredictable cases are counted, reported and appended to the
    module-level `fail_indices`; they no longer dilute the average.
    """
    # Fall back to the notebook-level arrays so one-argument calls keep working.
    if train_mat is None:
        train_mat = mat
    if test_mat is None:
        test_mat = testing_mat
    if test_indices is None:
        test_indices = testing_indices
    mov_sim_mat = np.asarray(mov_sim_mat)
    train_mat = np.asarray(train_mat)
    test_mat = np.asarray(test_mat)

    num_tests = len(test_indices)
    num_movies = train_mat.shape[1]
    # Integer arithmetic for the 80% mark (previously the float expression // 1.25).
    progress_marks = (
        (num_tests // 10, "10% done"),
        (num_tests // 2, "50% done"),
        (num_tests * 4 // 5, "80% done"),
    )

    sq_error = 0
    fail_count = 0
    for progress_count, i in enumerate(test_indices, start=1):
        for mark, message in progress_marks:
            if progress_count == mark:
                print(message)

        # getting position of rating to test
        row = i // num_movies  # aka which user
        col = i % num_movies   # aka which movie

        sims = mov_sim_mat[col]
        user_ratings = train_mat[row]
        sim_dot_ratings = np.dot(sims, user_ratings)
        # Normalise only by the similarities of movies this user actually rated.
        rated = (sims != 0) & (user_ratings != 0)
        total_sim_score = sims[rated].sum()

        # error cases: nothing to base a prediction on
        if sim_dot_ratings == 0 or total_sim_score == 0:
            fail_indices.append(i)
            fail_count += 1
            continue

        pred_val = sim_dot_ratings / total_sim_score
        real_val = test_mat[row][col]
        sq_error += (pred_val - real_val) ** 2

    # Average over the cases that were scored, not over all test cases.
    num_scored = num_tests - fail_count
    rmse = (sq_error / num_scored) ** 0.5 if num_scored else float("nan")
    print("Number of cases when cos sim doesn't work = " + str(fail_count))
    print("RMSE is "+ str(rmse))
    return rmse

# Evaluate the similarity matrix built above; the function prints its summary and returns the RMSE.
get_rmse_from_sim_mat(movie_sim_matrix)

10% done
50% done
80% done
Number of cases when cos sim doesn't work = 2
RMSE is 1.0003689019986788


1.0003689019986788

# Adjusted Cosine Similarity

In [None]:
# mean-centring helper used to normalize rating vectors

def rescale(x):
    """Subtract from `x` its own mean, computed while ignoring NaN entries."""
    centre = np.nanmean(x)
    return x - centre

In [None]:
# Mean-centre every user's ratings: each row of `mat` is one user, and that
# user's own average rating is subtracted from the movies they rated.

# 0 marks "not rated"; turn it back into NaN so it is ignored by the means.
# NOTE: this edits `mat` in place, so from here on `mat` holds NaN for unrated cells.
mat[mat == 0] = np.nan

# One average per user (NaN for a user without any rating in `mat`).
avg_ratings = np.nanmean(mat.T, axis=0)

# Broadcast subtraction replaces the DataFrame transpose/apply round trip, which
# recomputed these same per-user means one column at a time. Unrated cells
# (still NaN after the subtraction) become 0.
item_normed_mat = np.nan_to_num(mat - avg_ratings[:, np.newaxis])

In [None]:
# generating movie similarity matrix
print(len(movies))
item_normed_mat_T = item_normed_mat.T  # one row per movie, one column per user

# All pairwise cosine similarities of the mean-centred rating vectors at once:
# dot products divided by the outer product of the row norms, with the same
# 1e-6 guard cos_sim uses. This replaces a pure-Python loop over every pair.
movie_norms = np.linalg.norm(item_normed_mat_T, axis=1)
movie_sim_matrix = (item_normed_mat_T @ item_normed_mat_T.T) / (np.outer(movie_norms, movie_norms) + 0.000001)

# Mirror the strict upper triangle so the matrix is exactly symmetric, then pin
# the self-similarity on the diagonal to 1.
upper = np.triu(movie_sim_matrix, k=1)
movie_sim_matrix = upper + upper.T
np.fill_diagonal(movie_sim_matrix, 1.)

3592
25% done!
50% done!
75% done!


In [None]:
fail_indices = []  # flat test indices for which no prediction could be made


def get_rmse_from_sim_mat(mov_sim_mat, normed_mat=None, user_means=None, test_mat=None, test_indices=None):
    """Score adjusted-cosine rating predictions against the held-out ratings (RMSE).

    The prediction for a (user, movie) pair is the user's mean rating plus the
    similarity-weighted average of the user's mean-centred ratings, taken over
    the movies that are positively similar to the target and have a non-zero
    centred rating. Numerator and normaliser use that same neighbour set;
    previously the numerator also included negatively similar movies while the
    normaliser did not, so the result was not a weighted average.

    Parameters
    ----------
    mov_sim_mat : (movies X movies) similarity matrix.
    normed_mat : (users X movies) mean-centred training ratings, 0 where there
        is nothing to use. Defaults to the notebook-level `item_normed_mat`.
    user_means : per-user mean rating. Defaults to `avg_ratings`.
    test_mat : (users X movies) held-out ratings. Defaults to `testing_mat`.
    test_indices : flat (row-major) indices into `test_mat` to evaluate.
        Defaults to `testing_indices`.

    Returns
    -------
    The RMSE over the test cases that were actually scored, or NaN if none
    were. Unpredictable cases are counted, reported and appended to the
    module-level `fail_indices`; they no longer dilute the average.
    """
    # Fall back to the notebook-level arrays so one-argument calls keep working.
    if normed_mat is None:
        normed_mat = item_normed_mat
    if user_means is None:
        user_means = avg_ratings
    if test_mat is None:
        test_mat = testing_mat
    if test_indices is None:
        test_indices = testing_indices
    mov_sim_mat = np.asarray(mov_sim_mat)
    normed_mat = np.asarray(normed_mat)
    test_mat = np.asarray(test_mat)

    num_tests = len(test_indices)
    num_movies = normed_mat.shape[1]
    # Integer arithmetic for the 80% mark (previously the float expression // 1.25).
    progress_marks = (
        (num_tests // 10, "10% done"),
        (num_tests // 2, "50% done"),
        (num_tests * 4 // 5, "80% done"),
    )

    sq_error = 0
    fail_count = 0
    num_scored = 0
    for progress_count, i in enumerate(test_indices, start=1):
        for mark, message in progress_marks:
            if progress_count == mark:
                print(message)

        # getting position of rating to test
        row = i // num_movies  # aka which user
        col = i % num_movies   # aka which movie

        sims = mov_sim_mat[col]
        user_devs = normed_mat[row]
        # Neighbours: positively similar movies with a usable centred rating.
        neighbours = (sims > 0) & (user_devs != 0)
        sim_dot_ratings = np.dot(sims[neighbours], user_devs[neighbours])
        total_sim_score = sims[neighbours].sum()

        # error cases: nothing to base a prediction on
        if sim_dot_ratings == 0 or total_sim_score == 0:
            fail_indices.append(i)
            fail_count += 1
            continue
        # Stop early once the running error has been poisoned by a NaN.
        if np.isnan(sq_error):
            print("sq_error is nan")
            fail_indices.append(i)
            break

        pred_val = user_means[row] + sim_dot_ratings / total_sim_score
        real_val = test_mat[row][col]
        sq_error += (pred_val - real_val) ** 2
        num_scored += 1

    # Average over the cases that were scored, not over all test cases.
    rmse = (sq_error / num_scored) ** 0.5 if num_scored else float("nan")
    print("Number of cases when cos sim doesn't work = " + str(fail_count))
    print("RMSE is "+ str(rmse))

    return rmse

# Evaluate the similarity matrix built above; the function prints its summary and returns the RMSE.
get_rmse_from_sim_mat(movie_sim_matrix)

10% done
50% done
80% done
Number of cases when cos sim doesn't work = 11
RMSE is 1.048764674011312


1.048764674011312