#Numerical Analysis' project

Movie recommendation system

In [854]:
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr
from numpy.linalg import matrix_rank
from tqdm.notebook import tqdm
from enum import IntEnum
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
#import jax.numpy as jnp
#import jax
import time

Load the dataset using pandas

In [809]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [810]:
# Find all the possible user ids and movie ids-> do that after the deletion
userIds_available = set()
movieIds_available = set()
for id in np.array(ratings['userId'] , dtype = int):
    userIds_available.add(id)

for id in np.array(movies['movieId'] , dtype = int):
    movieIds_available.add(id)
    
userIds_available = list(userIds_available)
movieIds_available = list(movieIds_available)

userIds_available.sort()
movieIds_available.sort()

print(len(userIds_available) , len(movieIds_available))

668 10329


In [811]:
#ratings # 105339 users' ratings , 668 different users
#movies # 10329 movies
rows = np.array(ratings['userId'])
cols = np.array(ratings['movieId'])
vals = np.array(ratings['rating'])

n = rows.max() + 1 # Number of user
p = cols.max() + 1# Number of movies
N = len(vals) # Number of ratings


def binary_search(array , x):
    low = 0
    high = len(array) - 1
    while(high >= low):
        mid = int((high + low) / 2)
        
        if array[mid] == x:
            return mid
        
        elif array[mid] > x:
            high = mid - 1
            
        else:
            low = mid + 1

    print("Element %d not found" % x)
    return -1

# Update the arrays rows/cols with the true position instead of the ids 
for i_user in tqdm(range(len(rows))):
    rows[i_user] = binary_search(userIds_available ,  rows[i_user])

for i_movie in tqdm(range(len(cols))):
    cols[i_movie] = binary_search(movieIds_available , cols[i_movie])

n , p , N

  0%|          | 0/105339 [00:00<?, ?it/s]

  0%|          | 0/105339 [00:00<?, ?it/s]

(669, 149533, 105339)

In [None]:
# Command for analyse input data matrix 
movies.head()
ratings.head()
movies.info()
ratings.info()
movies.describe()
ratings.describe()
sns.distplot(ratings['rating'])
sns.distplot(ratings['movieId'])
sns.scatterplot(data = ratings , x = 'userId' , y = 'movieId' , hue = 'rating')
ratings.corr()

In [814]:
# Shuffle the data
indexes = np.arange(N)
np.random.seed(0) # for reproducibility
np.random.shuffle(indexes)
indexes
# Reordering the arrays
rows = rows[indexes]
cols = cols[indexes]
vals = vals[indexes]

Building the train set (80%) and the validation set (20%)

In [815]:
# Split data in training and testing
num_training = int(N * 0.8)

rows_train = rows[:num_training]
cols_train = cols[:num_training]
vals_train = vals[:num_training]
rows_test  = rows[num_training:]
cols_test  = cols[num_training:]
vals_test  = vals[num_training:]

print(len(rows_train) , len(cols_train) , len(vals_train))

84271 84271 84271


Building the matrix with the origina values

Building the 'Ratings matrix'
Users on the rows and Movies on the columns

Initializing all the elements to 0 and then updating position (i,j) with the rating of movie j by user i if it's present

In [818]:
# Initialize the matrix with all zeros
ratings_matrix = np.zeros((len(userIds_available) , len(movieIds_available)))
print(ratings_matrix)
print("=========================")
# Update the matrix with the known values (contained in vals_train array)
ratings_matrix[rows_train, cols_train] = vals_train
print(ratings_matrix) 

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [5. 0. 2. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [3. 3. 2. ... 0. 0. 0.]]


In [None]:
frame = pd.DataFrame(ratings_matrix, index = userIds_available , columns = movieIds_available)
print(frame)

Checking if there are users that haven't watched any movie

Deleting rows corresponding to user that hasn't watched any movie

In [819]:
print("Initial shape: " , ratings_matrix.shape)
count = []
for user in range(ratings_matrix.shape[0]):
    # Save the row to delete
    if sum(ratings_matrix[user]) == 0: count.append(user)

ratings_matrix = np.delete(ratings_matrix , count , axis = 0)
print("Deleted %d rows" % len(count))
print("Final shape: " , ratings_matrix.shape)

Initial shape:  (668, 10329)
Deleted 0 rows
Final shape:  (668, 10329)


In [820]:
# Count the number of missing values
def count_missing_values(matrix):
    missing_values = 0

    for i_user in tqdm(range(matrix.shape[0])):
        for j_movie in range(matrix.shape[1]):
            # If the movie in position j_movie hasn't a rating
            if matrix[i_user , j_movie] == 0:
                missing_values += 1

    print("There are %d missing valuess" % (missing_values))
    print("There are %d values inserted" % (matrix.shape[0] * matrix.shape[1] - missing_values))
    print("There are %d values" % (matrix.shape[0] * matrix.shape[1]))
    
count_missing_values(ratings_matrix)

  0%|          | 0/668 [00:00<?, ?it/s]

There are 6815501 missing valuess
There are 84271 values inserted
There are 6899772 values


Building movie-genre correlation matrix M

$$
M_{i,j} = 
\begin{cases}
1 & \text{if movie i is of genre j}\\
0 & \text{otherwise}
\end{cases}
$$

In [822]:
# Put in a set all the genres available
genre_available = set()

for i in range(movies.shape[0]):
    genres = movies['genres'][i].split('|')
    for g in genres: genre_available.add(g)

# print("All genres available are: " , id_available , genre_available)

In [823]:
# Build the matrix
num_movies = len(movieIds_available)
num_genres = len(genre_available)
print("Max movie id: " , max(movies['movieId']))
print("Number of movies is: " , num_movies)
print("Number of genres is: " , num_genres)
# Initialize the matrix with all zeros of int8 type
correlation_matrix = np.zeros((num_movies , num_genres) , dtype = np.int8)

Max movie id:  149532
Number of movies is:  10329
Number of genres is:  20


In [824]:
# Update the table with the correspondance
for i in tqdm(range(movies.shape[0])):
    id = movies['movieId'][i]
    # Take the right position in the matrix
    id = movieIds_available.index(id)

    genres = movies['genres'][i].split('|')
    for pos , g in enumerate(genre_available):
        if g in genres:
            correlation_matrix[id , pos] = 1

  0%|          | 0/10329 [00:00<?, ?it/s]

In [None]:
frame = pd.DataFrame(correlation_matrix, index = movieIds_available , columns = genre_available)
print(frame)

Next step:
create a movie-movie matrix to find similiar movies: movies which covers the same genres

In [825]:
def similar_movies(movie1 , movie2):
    """
    movie1 and movie2 are rows of correlation_matrix
    """
    intersection = np.bitwise_and(movie1 , movie2)
    union = np.bitwise_or(movie1 , movie2)
    return sum(intersection) / sum(union)

def cosine_similarity(vector1 , vector2):
    """
    vector1 and vector2 are rows of correlation_matrix or of ratings_matrix
    """
    return np.dot(vector1, vector2)/(np.linalg.norm(vector1) * np.linalg.norm(vector2))

def cosine_similarity_2(vector1 , vector2):
    '''
    Apply this similarity between users -> want to find similar behaviour in rating common movies and then
        use movies that one of the two hasn't watched yet, not use them here
    '''
    common_vector1 = []
    common_vector2 = []
    
    # Take just the movies rated in both the array to find a similarity between users
    for i in range(len(vector1)):
        if vector1[i] != 0 and vector2[i] != 0:
            common_vector1.append(vector1[i])
            common_vector2.append(vector2[i])
    # If the two vectors(users) has at least 3 common ratings
    if len(common_vector1) > 2:
        return np.dot(common_vector1, common_vector2)/(np.linalg.norm(common_vector1) * np.linalg.norm(common_vector2))
    else:
        return 0

print("similar_movies_coefficent: " , similar_movies([1, 0, 0, 1, 0, 1] , [1, 1, 0, 1, 0, 1]))
print("cosine_similary_coefficent: " , cosine_similarity([1, 0, 3, 1, 0, 5] , [1, 2, 3, 1, 5, 5]))
print("cosine_similary_coefficent 2.0: " , cosine_similarity_2([1, 0, 3, 1, 0, 5] , [1, 2, 3, 1, 5, 5]))

similar_movies_coefficent:  0.75
cosine_similary_coefficent:  0.7442084075352507
cosine_similary_coefficent 2.0:  1.0


Creating the dictionary containing the clusters

In [937]:
# Creating clusters for movies
movie_cluster = {}
threshold = 0.7
index_cluster = {}
movieIds_copy = movieIds_available.copy()

num_cluster = 0
while len(movieIds_copy)>0:
    for id_x in tqdm(movieIds_copy):
        list_movies = []
        index_cluster[id_x] = num_cluster
        list_movies.append(id_x)
        for id_y in movieIds_copy:
            if id_x != id_y:
                sim = cosine_similarity(correlation_matrix[movieIds_available.index(id_x)], correlation_matrix[movieIds_available.index(id_y)])
                if sim>= threshold:
                    index_cluster[id_y] = num_cluster
                    list_movies.append(id_y)
                    movieIds_copy.remove(id_y)
        movieIds_copy.remove(id_x)
        movie_cluster[num_cluster] = list_movies
        num_cluster += 1
    
print("Number of cluster is: " , num_cluster)

  0%|          | 0/10329 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Number of cluster is:  261


In [928]:
# Creating clusters for users
users_cluster = {}
threshold = 0.95
user_index_cluster = {}
userIds_copy = userIds_available.copy()

num_cluster = 0
while len(userIds_copy)>0:
    for id_x in tqdm(userIds_copy):
        list_users = []
        user_index_cluster[id_x] = num_cluster
        list_users.append(id_x)
        for id_y in userIds_copy:
            # If it's not the same user
            if id_x != id_y:
                # Calculate the cosine similarity
                sim = cosine_similarity_2(ratings_matrix[userIds_available.index(id_x)], ratings_matrix[userIds_available.index(id_y)])
                # If they are similar enough
                if sim >= threshold:
                    user_index_cluster[id_y] = num_cluster
                    list_users.append(id_y)
                    userIds_copy.remove(id_y)
        userIds_copy.remove(id_x)
        users_cluster[num_cluster] = list_users
        num_cluster += 1
    
print("Number of cluster is: " , num_cluster)

  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Number of cluster is:  75


In [529]:
#print(index_cluster)
#print(movie_cluster.get(2))
#print(movie_cluster.get(400))
#print(movieIds_copy)    #this is empty now because i check all movies if similar
#print(movieIds_available)   #varify all ids are untouched

In [938]:
# Order each sublist of the dictionaries
for key , value in movie_cluster.items():
    new_value = value
    new_value.sort()
    movie_cluster[key] = new_value
    
for key , value in users_cluster.items():
    new_value = value
    new_value.sort()
    users_cluster[key] = new_value

In [None]:
    
'''
for key , value in users_cluster.items():
    if len(value) > 1:
        print(value)
        break
print("Showing 2 users considered similar")

user1_array = ratings_matrix[userIds_available.index(1)]
user2_array = ratings_matrix[userIds_available.index(2)]

print("Similarity between user %d and user %d is %1.3f" % (3 , 245 , cosine_similarity(user1_array , user2_array)))

for i in range(len(user1_array)):
    if user1_array[i] != 0 and user2_array[i] != 0:
        print("Movie %d: %1.1f - %1.1f" % (i , user1_array[i] , user2_array[i]))
'''

In [939]:
# Array that contains the position of each ratings
ratings_position_array = list([0.5 , 1 , 1.5 , 2 , 2.5 , 3 , 3.5 , 4 , 4.5 , 5])

def get_rating_to_assign(array):
    # Find the max count
    max_rating_count = 0
    count_of_ratings = 0
    for i in range(len(array)):
        count_of_ratings += array[i]
        if array[i] > max_rating_count:
            max_rating_count = array[i]
        
    # If there aren't at least 3 ratings
    if count_of_ratings < 3:
        return 0
            
    # Fill the list with the maximum ratings
    list_of_max = set()
    for i in range(len(array)):
        if array[i] == max_rating_count:
            list_of_max.add(ratings_position_array[i])
    
    if len(list_of_max) == 0:
        return 0
            
    # Calculate the avg between the ratings that appear more
    rating = 0
    for r in list_of_max:
        rating += r
    
    return rating / len(list_of_max)

In [931]:
# Filling matrix with some ratings due to user similarities
partial_ratings_matrix = ratings_matrix.copy() # to maintain the original
num_of_predicted_value = 0

for i_user in tqdm(range(partial_ratings_matrix.shape[0])):
    # Take the cluster key for the user
    cluster = user_index_cluster[userIds_available[i_user]]
    # Take all the similar users
    sim_users_ids = users_cluster[cluster]
    
    # If there is at least a similar user
    if len(sim_users_ids) > 1:
        
        for j_movie in range(partial_ratings_matrix.shape[1]):
            # If the user hasn't watched the movie yet
            if ratings_matrix[i_user , j_movie] == 0:
                # For each movie calculate the most common rating and assing it
                # Array that will contains the sum of all the different ratings the movie received
                ratings_array = np.zeros(10)
                pos = 0
                for user_id in sim_users_ids: 
                    # Take the row corresponding to the user
                    pos = userIds_available.index(user_id , pos)

                    # If the similar user has watched it
                    if ratings_matrix[pos , j_movie] != 0:
                        position_in_array = ratings_position_array.index(ratings_matrix[pos , j_movie])
                        ratings_array[position_in_array] += 1
                
                rating = get_rating_to_assign(ratings_array)
                if rating > 0:
                    partial_ratings_matrix[i_user , j_movie] = rating
                    num_of_predicted_value += 1
            
print(num_of_predicted_value)

  0%|          | 0/668 [00:00<?, ?it/s]

991112


In [893]:
# Filling matrix with some ratings due to user similarities
partial_ratings_matrix = ratings_matrix.copy() # to maintain the original
num_of_predicted_value = 0

for i_user in tqdm(range(partial_ratings_matrix.shape[0])):
    # Take the cluster key for the user
    cluster = user_index_cluster[userIds_available[i_user]]
    # Take all the similar users
    sim_users_ids = users_cluster[cluster]
    
    # If there is at least a similar user
    if len(sim_users_ids) > 1:
        
        for j_movie in range(partial_ratings_matrix.shape[1]):
            # If the user hasn't watched the movie yet
            if ratings_matrix[i_user , j_movie] == 0:
                # For each movie calculate the avg rating given by similar users
                ratings_sum = 0
                total_contributions = 0

                pos = 0
                for user_id in sim_users_ids: 
                    # Take the row corresponding to the user
                    pos = userIds_available.index(user_id , pos)

                    # If the similar user has watched it
                    if ratings_matrix[pos , j_movie] != 0:
                        ratings_sum += ratings_matrix[pos , j_movie]
                        total_contributions += 1

                # If at least a similar user has watched the movie
                if total_contributions > 0:
                    average = ratings_sum / total_contributions
                    partial_ratings_matrix[i_user , j_movie] = average # toDo: find a better way
                    num_of_predicted_value += 1
            
print(num_of_predicted_value)

  0%|          | 0/668 [00:00<?, ?it/s]

2801063


In [940]:
# Count the number of missing values
count_missing_values(partial_ratings_matrix)

  0%|          | 0/668 [00:00<?, ?it/s]

There are 5824389 missing valuess
There are 1075383 values inserted
There are 6899772 values


In [941]:
# Filling matrix with some ratings due to content similarities
possible_ratings_matrix = partial_ratings_matrix.copy() # to maintain the original
num_of_predicted_value = 0

for i_user in tqdm(range(possible_ratings_matrix.shape[0])):
    for j_movie in range(possible_ratings_matrix.shape[1]):
        # If user i_user has whatched and rated movie j_movie
        if ratings_matrix[i_user , j_movie] >= 0.5: # toDo: use partial_ratings_matrix
            # Take movies similar to j_movie
            cluster = index_cluster[movieIds_available[j_movie]]
            sim_movies_ids = movie_cluster[cluster]

            pos = 0
            for id in sim_movies_ids:
                # Take the position in the matrix of that movie
                pos = movieIds_available.index(id , pos)
                # If the user hasn't watched that movie yet
                if ratings_matrix[i_user , pos] == 0:
                    if possible_ratings_matrix[i_user , pos] == 0:
                        # Save the similar rating
                        possible_ratings_matrix[i_user , pos] = ratings_matrix[i_user , j_movie]
                        num_of_predicted_value += 1
                    else:
                        # Save the minimum/or maximum between what I'm predicting and what I've alreadey predicted
                        possible_ratings_matrix[i_user , pos] = max(possible_ratings_matrix[i_user , pos] , 
                                                                     ratings_matrix[i_user , j_movie])
            
print(num_of_predicted_value)

  0%|          | 0/668 [00:00<?, ?it/s]

4096292


In [942]:
# Count the number of missing values
count_missing_values(possible_ratings_matrix)

  0%|          | 0/668 [00:00<?, ?it/s]

There are 1728097 missing valuess
There are 5171675 values inserted
There are 6899772 values


In [848]:
# Save the matrix "possible_ratings_matrix" as a CSV file
# Sofia's algorithm
#np.savetxt('possible_ratings_matrix1.csv', possible_ratings_matrix, delimiter=',' , fmt='%1.1f')
# Matteo's algorithm
#np.savetxt('possible_ratings_matrix2.csv', possible_ratings_matrix, delimiter=',' , fmt='%1.1f')
# Content + collaborative filtering
#np.savetxt('content_collaborative_filterting_matrix.csv' , possible_ratings_matrix , delimiter = ',' , fmt = '%1.1f')

In [364]:
# Load the matrix "possible_ratings_matrix" from the CSV file
# Sofia's algorithm
#possible_ratings_matrix = np.loadtxt('possible_ratings_matrix1.csv', delimiter=',')
# Matteo's algorithm
#possible_ratings_matrix = np.loadtxt('possible_ratings_matrix2.csv', delimiter=',')
# Content + collaborative filtering
#possible_ratings_matrix = np.loadtxt('content_collaborative_filterting_matrix.csv', delimiter=',')

In [935]:
print(ratings_matrix)
print("===============================")
print(possible_ratings_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [5. 0. 2. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [3. 3. 2. ... 0. 0. 0.]]
[[5.  3.  4.  ... 0.  1.5 0. ]
 [5.  3.  2.  ... 0.  5.  0. ]
 [5.  3.5 4.  ... 4.  5.  0. ]
 ...
 [5.  5.  5.  ... 5.  5.  0. ]
 [5.  3.  4.5 ... 0.  4.5 0. ]
 [3.  3.  2.  ... 4.5 5.  2.5]]


In [126]:
import json

dict = {
    int(movieIds_available[0]): [2 , 3 , 6],
    int(movieIds_available[1]): [1 , 5]
}

with open('sim_movies.json' , 'w') as fp:
    json.dump(dict , fp , indent = 4)

In [127]:
# Open the saved data
with open('sim_movies.json' , 'r') as fp:
    movie_sim_dict_loaded = json.load(fp)

print(movie_sim_dict_loaded)

{'1': [2, 3, 6], '2': [1, 5]}


# Singular value truncation (SVT) based recommender system

In [943]:
# Aanalyzing the errors/precision/recall/f1 score after the prediction of the ratings predicted only
rows_test_limited = list()
cols_test_limited = list()
vals_test_limited = list()

def analyze_starting_errors():
    for i in tqdm(range(len(rows_test))):
        # Add just the position filled with the algortithm
        if possible_ratings_matrix[rows_test[i]][cols_test[i]] != 0:
            rows_test_limited.append(rows_test[i])
            cols_test_limited.append(cols_test[i])
            vals_test_limited.append(vals_test[i])
            
    vals_pred_limited = possible_ratings_matrix[rows_test_limited, cols_test_limited]
    err = vals_test_limited - vals_pred_limited
    RMSE = np.sqrt(np.mean(err**2))
    rho = pearsonr(vals_test_limited, vals_pred_limited)[0]
    
    return RMSE , rho

# Perform some evaluations
def precision_and_recall_initial_state():
    total_recommended = 0 # true positive + false negative
    predicted_recommended_items = 0 # true positive + false positive
    predicted_true_recommended_items = 0 # true positive
    # A movie is recommended if it's rating is greater than this value
    recommendation_value = 3
    for i in range(len(rows_test_limited)):
        true_rating = vals_test_limited[i]
        predicted_value = possible_ratings_matrix[rows_test_limited[i]][cols_test_limited[i]]
        # Calculate true positive
        if true_rating >= recommendation_value: 
            total_recommended += 1
            if predicted_value >= recommendation_value:
                predicted_true_recommended_items += 1
        # Calculate true positive + false positive
        if predicted_value >= recommendation_value:
            predicted_recommended_items += 1
                
    print("True positive: " , predicted_true_recommended_items)
    print("True positive + false positive: " , predicted_recommended_items)
    print("True positive + false negative: " , total_recommended)
    precision = predicted_true_recommended_items / predicted_recommended_items
    recall = predicted_true_recommended_items / total_recommended
    print("Precision: " , precision)
    print("Recall: " , recall)
    return precision , recall

def F1_measure(precision_value , recall_value):
    return 2 * precision_value * recall_value / ( precision_value + recall_value)

print(analyze_starting_errors()) 
print("At this stage %d values have already been predicted" % len(rows_test_limited))
precision , recall = precision_and_recall_initial_state()
F1_measure(precision , recall)

  0%|          | 0/21068 [00:00<?, ?it/s]

(1.5237087625801642, 0.16821182253228176)
At this stage 20211 values have already been predicted
True positive:  16443
True positive + false positive:  19957
True positive + false negative:  16587
Precision:  0.8239214310768151
Recall:  0.9913185024416712


0.8999014886164622

In [None]:
''' 
With max: (movie 0.7 , user 0.95 , cosine2 with at least 3 similarity ratings)
    True positive: 16443
    True positivie + False positive: 19957
    True positive + fate negative: 16587
    Precision: 0.824
    Recall:    0.991
    F-1 score: 0.9
    
With max: (movie 0.8 , user 0.95 , cosine2 with at least 3 similarity ratings)
    True positive: 19699
    True positivie + False positive: 19300
    True positive + fate negative: 16168
    Precision: 0.826
    Recall:    0.986
    F-1 score: 0.899
    
With max: (movie 0.8 , user 0.95 , cosine2 with at least 4 similarity ratings)
    True positive: 19509
    True positivie + False positive: 19107
    True positive + fate negative: 16004
    Precision: 0.826
    Recall:    0.987
    F-1 score: 0.899
    
With max: (movie 0.8 , user 0.9 , cosine2 with at least 4 similarity ratings)
    True positive: 19637
    True positivie + False positive: 19255
    True positive + fate negative: 16081
    Precision: 0.824
    Recall:    0.986
    F-1 score: 0.899
    
With max: (movie 0.8 , user 0.9 , cosine2 with at least 3 similarity ratings)
    True positive: 16430
    True positivie + False positive: 19937
    True positive + fate negative: 16964
    Precision: 0.824
    Recall:    0.984
    F-1 score: 0.897
'''

In [944]:
# Reconstruct rows_train, cols_train, vals_train with all the value of the input + already predicted values
counter = 0
rows_train_updated = list()
cols_train_updated = list()
vals_train_updated = list()
for i_user in tqdm(range(possible_ratings_matrix.shape[0])):
    for j_movie in range(possible_ratings_matrix.shape[1]):
        # If it is a default or predicted value, save the position
        if possible_ratings_matrix[i_user][j_movie] != 0:
            rows_train_updated.append(i_user)
            cols_train_updated.append(j_movie)
            vals_train_updated.append(possible_ratings_matrix[i_user][j_movie])
            counter += 1
print("Saved %d values" % counter)

  0%|          | 0/668 [00:00<?, ?it/s]

Saved 5171675 values


In [945]:
def errors():
    vals_pred = X_hat[rows_test, cols_test]
    err = vals_test - vals_pred
    RMSE = np.sqrt(np.mean(err**2))
    rho = pearsonr(vals_test, vals_pred)[0]
    
    return RMSE , rho

In [None]:
# errors_jit = jax.jit(errors)

In [946]:
# SVT before with the empty matrix , now check the rank, maybe it's low
n_max_iter = 150
# Fifth possibility -> contant threshold
#threshold = 50
increment_tol = 1e-1

a = 0.01
b = 300

RMSE_list = list()
rho_list = list()

X_hat = possible_ratings_matrix

for k in tqdm(range(n_max_iter)):
    X_old = X_hat.copy()
    U,s,VT = np.linalg.svd(X_hat, full_matrices=False)

    # Tenth possibility 
    threshold = b * np.exp(-k * a)
    
    # Fourth possibility
    s[s > 0] = s[s > 0] - threshold
    s[s < 0] = 0

    X_hat = U @ np.diag(s) @ VT
    
    # Maintain the default values
    X_hat[rows_train_updated,cols_train_updated] = vals_train_updated
    
    # Some negative values could appear -> set to 0
    X_hat[X_hat < 0] = 0

    # Calculate the increment -> how much the new matrix is different from the previuos one
    increment = np.linalg.norm(X_hat - X_old) 

    if k % 10 == 9:
        # Calculate the errors
        RMSE , rho = errors()
        # Add the errors in the lists
        RMSE_list.append(RMSE)
        rho_list.append(rho)
        precision , recall = precision_and_recall()
        f1_score = F1_measure(precision , recall)
        # Show the errors
        print('================== iter %d - theshold %1.2f - increment %1.3e' % (k+1, threshold, increment))
        print('RMSE: %1.3f' % RMSE)
        print('rho : %1.3f' % rho)
        print('precision: %1.3f' % precision)
        print('recall: %1.3f' % recall)
        print('F1-score: %1.3f' % f1_score)

    # If the increment is low -> stop the algorithm
    if increment < increment_tol:
        break

  0%|          | 0/150 [00:00<?, ?it/s]

RMSE: 1.520
rho : 0.168
precision: 0.825
recall: 0.981
F1-score: 0.896
RMSE: 1.517
rho : 0.170
precision: 0.825
recall: 0.982
F1-score: 0.896
RMSE: 1.516
rho : 0.171
precision: 0.825
recall: 0.982
F1-score: 0.897
RMSE: 1.515
rho : 0.171
precision: 0.825
recall: 0.982
F1-score: 0.897
RMSE: 1.514
rho : 0.171
precision: 0.825
recall: 0.982
F1-score: 0.897
RMSE: 1.514
rho : 0.172
precision: 0.825
recall: 0.983
F1-score: 0.897
RMSE: 1.513
rho : 0.172
precision: 0.825
recall: 0.983
F1-score: 0.897
RMSE: 1.513
rho : 0.172
precision: 0.825
recall: 0.983
F1-score: 0.897
RMSE: 1.513
rho : 0.172
precision: 0.825
recall: 0.983
F1-score: 0.897
RMSE: 1.512
rho : 0.172
precision: 0.825
recall: 0.984
F1-score: 0.897
RMSE: 1.512
rho : 0.172
precision: 0.825
recall: 0.984
F1-score: 0.897
RMSE: 1.512
rho : 0.172
precision: 0.825
recall: 0.984
F1-score: 0.897
RMSE: 1.511
rho : 0.172
precision: 0.825
recall: 0.984
F1-score: 0.897
RMSE: 1.511
rho : 0.172
precision: 0.825
recall: 0.984
F1-score: 0.897
RMSE: 

In [512]:
#print(np.mean(s[s > 0]))
#print(s)

In [None]:
'''
Collaborative-filterting + Content-filtering (150 iterations) (movie 0.7,user 0.95, cosine2 with at least 3 similarity ratings)

    RMSE: 1.511
    rho:  0.172
    
    Precision: 0.825
    Recall:    0.984
    F-1 measure: 0.897


Collaborative-filterting + Content-filtering (200 iterations) (movie similarity threshold = 0.9, cosine2 with 0.9)

    RMSE: 1.292
    rho:  0.207
    
    Precision: 0.832
    Recall:    0.951
    F-1 measure: 0.888
    
Collaborative-filterting + Content-filtering (200 iterations) (movie similarity threshold = 0.8, cosine2 with 0.9)

    RMSE: 1.417
    rho:  0.194
    
    Precision: 0.824
    Recall:    0.974
    F-1 measure: 0.893
    
Collaborative-filterting + Content-filtering (300 iterations) (movie similarity threshold = 0.8)

    RMSE: 1.424
    rho:  0.181
    
    Precision: 0.826
    Recall:    0.952
    F-1 measure: 0.884
    
Collaborative-filterting + Content-filtering (300 iterations) (movie similarity threshold = 0.9)

    RMSE: 1.306
    rho:  0.220
    
    Precision: 0.835
    Recall:    0.911
    F-1 measure: 0.871

Collaborative-filterting only(500 iterations)
    
    RMSE: 1.280
    rho:  0.433
    
    Precision: 0.923
    Recall:    0.647
    F-1 measure: 0.761


Content-filtering only (300 iterations)
    
    After restarting all with max:
    
    RMSE: 1.305
    rho:  0.221
    
    Precision: 0.835
    Recall:    0.909
    F-1 measure: 0.871

    Adaptive threshold -> Threshold = b * np.exp(-k * a) , a = 0.01 , b = 300

                After 300 iterations:
                RMSE: 1.118
                rho: 0.222

                Precision: 0.822
                Recall:    0.975
                F-1 measure: 0.892

    Constant threshold -> Threshold = 50

                After 300 iterations: 
                RMSE: 1.125
                rho:  0.208

                Precision: 0.821
                Recall:    0.973
                F-1 measure: 0.890
'''

In [456]:
# Save the matrix resulting from SVT as a CSV file
# Sofia's algorithm
#np.savetxt('final_ratings_matrix1.csv', X_hat, delimiter=',' , fmt='%1.1f')
# Matteo's algorithm
#np.savetxt('final_ratings_matrix2.csv', X_hat, delimiter=',' , fmt='%1.1f')

In [None]:
# Load the matrix final_ratings_matrix from the CSV file
# Sofia's algorithm
#X_hat = np.loadtxt('final_ratings_matrix1.csv', delimiter=',')
# Matteo's algorithm
#X_hat = np.loadtxt('final_ratings_matrix2.csv', delimiter=',')

In [852]:
# Perform some evaluations
def precision_and_recall():
    total_recommended = 0 # true positive + false negative
    predicted_recommended_items = 0 # true positive + false positive
    predicted_true_recommended_items = 0 # true positive
    # A movie is recommended if it's rating is greater than this value
    recommendation_value = 3
    for i in range(len(rows_test)):
        true_rating = vals_test[i]
        predicted_value = X_hat[rows_test[i]][cols_test[i]]
        # Calculate true positive
        if true_rating >= recommendation_value: 
            total_recommended += 1
            if predicted_value >= recommendation_value:
                predicted_true_recommended_items += 1
        # Calculate true positive + false positive
        if predicted_value >= recommendation_value:
            predicted_recommended_items += 1
                
    #print("True positive: " , predicted_true_recommended_items)
    #print("True positive + false positive: " , predicted_recommended_items)
    #print("True positive + false negative: " , total_recommended)
    precision = predicted_true_recommended_items / predicted_recommended_items
    recall = predicted_true_recommended_items / total_recommended
    #print("Precision: " , precision)
    #print("Recall: " , recall)
    return precision , recall

def F1_measure(precision_value , recall_value):
    return 2 * precision_value * recall_value / ( precision_value + recall_value)

In [807]:
precision , recall = precision_and_recall()
F1_measure(precision , recall)

True positive:  16745
True positive + false positive:  20319
True positive + false negative:  17184
Precision:  0.8241055170037895
Recall:  0.9744529795158287


0.8929952270485028