#Numerical Analysis' project

Movie recommendation system

In [214]:
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr
from numpy.linalg import matrix_rank
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
#import jax.numpy as jnp
#import jax
import time

Load the dataset using pandas

In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [83]:
# Find all the possible user ids and movie ids-> do that after the deletion
userIds_available = set()
movieIds_available = set()
for id in np.array(ratings['userId'] , dtype = int):
  userIds_available.add(id)

for id in np.array(movies['movieId'] , dtype = int):
  movieIds_available.add(id)

print(len(userIds_available) , len(movieIds_available))

668 10329


In [176]:
#ratings # 105339 users' ratings , 668 different users
#movies # 10329 movies

rows = np.array(ratings['userId'])
cols = np.array(ratings['movieId'])
vals = np.array(ratings['rating'])

n = rows.max() + 1 # Number of user
p = cols.max() + 1# Number of movies
N = len(vals) # Number of ratings

def binary_search(array , low , high , x):
    if high >= low:
        mid = (high + low) // 2
        
        # If element is equal to mid
        if array[mid] == x:
            return mid
 
        # If element is smaller than mid, then it can only
        # be present in left subarray
        elif array[mid] > x:
            return binary_search(array, low, mid - 1, x)
 
        # Else the element can only be present in right subarray
        else:
            return binary_search(array, mid + 1, high, x)
 
    else:
        # Element is not present in the array
        return -1

# Update the arrays rows/cols with the true position instead of the ids 
for i_user in tqdm(range(len(rows))):
    #rows[i_user] = userIds_available.index(rows[i_user])
    rows[i_user] = binary_search(userIds_available , 0 , len(userIds_available) , rows[i_user])
for i_movie in tqdm(range(len(cols))):
    #cols[i_movie] = movieIds_available.index(cols[i_movie])
    cols[i_movie] = binary_search(movieIds_available , 0 , len(movieIds_available) , cols[i_movie])

n , p , N

  0%|          | 0/105339 [00:00<?, ?it/s]

  0%|          | 0/105339 [00:00<?, ?it/s]

(669, 149533, 105339)

In [None]:
# Command for analyse input data matrix 
movies.head()
ratings.head()
movies.info()
ratings.info()
movies.describe()
ratings.describe()
sns.distplot(ratings['rating'])
sns.distplot(ratings['movieId'])
sns.scatterplot(data = ratings , x = 'userId' , y = 'movieId' , hue = 'rating')
ratings.corr()

In [177]:
# Shuffle the data
indexes = np.arange(N)
np.random.seed(0) # for reproducibility
np.random.shuffle(indexes)
indexes
# Reordering the arrays
rows = rows[indexes]
cols = cols[indexes]
vals = vals[indexes]

Building the train set (80%) and the validation set (20%)

In [178]:
# Split data in training and testing
num_training = int(N * 0.8)

rows_train = rows[:num_training]
cols_train = cols[:num_training]
vals_train = vals[:num_training]
rows_test  = rows[num_training:]
cols_test  = cols[num_training:]
vals_test  = vals[num_training:]

print(len(rows_train) , len(cols_train) , len(vals_train))

84271 84271 84271


Building the matrix with the origina values

Building the 'Ratings matrix'
Users on the rows and Movies on the columns

Initializing all the elements to 0 and then updating position (i,j) with the rating of movie j by user i if it's present

In [180]:
def update_with_default_ratings():
    #default_ratings = csr_matrix((vals_train, (rows_train, cols_train)), shape=(len(userIds_available), len(movieIds_available)))
    #default_ratings = default_ratings.toarray()
    ratings_matrix[rows_train, cols_train] = vals_train
    
# Initialize the matrix with all zeros
ratings_matrix = np.zeros((len(userIds_available) , len(movieIds_available)))
print(ratings_matrix)
print("================================")
# Update the matrix with the known values (contained in vals_train array)
update_with_default_ratings()
print(ratings_matrix) 

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


  0%|          | 0/668 [00:00<?, ?it/s]

There are 6899772 missing valuess
There are 6899772 values


  0%|          | 0/668 [00:00<?, ?it/s]

There are 6815501 missing valuess
There are 6899772 values
[[0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  2.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  3.  0.  ... 0.  4.5 0. ]]


In [182]:
frame = pd.DataFrame(ratings_matrix, index = userIds_available , columns = movieIds_available)
print(frame)

     1       2       3       4       5       6       7       8       9       \
1       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2       0.0     0.0     2.0     0.0     3.0     0.0     0.0     0.0     0.0   
3       0.0     0.0     0.0     0.0     3.0     0.0     0.0     0.0     0.0   
4       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5       4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
..      ...     ...     ...     ...     ...     ...     ...     ...     ...   
664     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
665     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
666     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
667     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
668     0.0     3.0     0.0     0.0     0.0     5.0     0.0     0.0     0.0   

     10      ...  144482  144656  144976  146344  1

Checking if there are users that haven't watched any movie

Deleting rows corresponding to user that hasn't watched any movie

In [183]:
print("Initial shape: " , ratings_matrix.shape)
count = []
for user in range(ratings_matrix.shape[0]):
  # Save the row to delete
  if sum(ratings_matrix[user]) == 0: count.append(user)

ratings_matrix = np.delete(ratings_matrix , count , axis = 0)
print("Deleted %d rows" % len(count))
print("Final shape: " , ratings_matrix.shape)

Initial shape:  (668, 10329)
Deleted 0 rows
Final shape:  (668, 10329)


Building movie-genre correlation matrix M

$$
M_{i,j} = 
\begin{cases}
1 & \text{if movie i is of genre j}\\
0 & \text{otherwise}
\end{cases}
$$

In [184]:
# Put in a set all the genres available
genre_available = set()

for i in range(movies.shape[0]):
  genres = movies['genres'][i].split('|')
  for g in genres: genre_available.add(g)

# print("All genres available are: " , id_available , genre_available)

In [185]:
# Build the matrix
num_movies = len(movieIds_available)
num_genres = len(genre_available)
print("Max movie id: " , max(movies['movieId']))
print("Number of movies is: " , num_movies)
print("Number of genres is: " , num_genres)
# Initialize the matrix with all zeros of int8 type
correlation_matrix = np.zeros((num_movies , num_genres) , dtype = np.int8)

Max movie id:  149532
Number of movies is:  10329
Number of genres is:  20


In [186]:
# Update the table with the correspondance
for i in tqdm(range(movies.shape[0])):
  id = movies['movieId'][i]
  # Take the right position in the matrix
  id = movieIds_available.index(id)

  genres = movies['genres'][i].split('|')
  for pos , g in enumerate(genre_available):
    if g in genres:
      correlation_matrix[id , pos] = 1

  0%|          | 0/10329 [00:00<?, ?it/s]

In [187]:
frame = pd.DataFrame(correlation_matrix, index = movieIds_available , columns = genre_available)
print(frame)

        Western  Documentary  Children  Crime  Film-Noir  Comedy  Adventure  \
1             0            0         1      0          0       1          1   
2             0            0         1      0          0       0          1   
3             0            0         0      0          0       1          0   
4             0            0         0      0          0       1          0   
5             0            0         0      0          0       1          0   
...         ...          ...       ...    ...        ...     ...        ...   
146684        0            0         1      0          0       1          0   
146878        0            0         0      0          0       1          0   
148238        0            0         0      0          0       1          0   
148626        0            0         0      0          0       0          0   
149532        0            0         0      0          0       0          0   

        (no genres listed)  Fantasy  Horror  Thrill

Next step:
create a movie-movie matrix to find similiar movies: movies which covers the same genres

In [188]:
def similar_movies(movie1 , movie2):
  """
  movie1 and movie2 are rows of correlation_matrix
  """
  intersection = np.bitwise_and(movie1 , movie2)
  union = np.bitwise_or(movie1 , movie2)
  return sum(intersection) / sum(union)

def cosine_similarity(movie1 , movie2):
  """
  movie1 and movie2 are rows of correlation_matrix
  """
  return np.dot(movie1, movie2)/(np.linalg.norm(movie1) * np.linalg.norm(movie2))

print("similar_movies_coefficent: " , similar_movies([1, 0, 0, 1, 0, 1] , [1, 1, 0, 1, 0, 1]))
print("cosine_similary_coefficent: " , cosine_similarity([1, 0, 0, 1, 0, 1] , [0, 1, 0, 0, 0, 0]))


similar_movies_coefficent:  0.75
cosine_similary_coefficent:  0.0


In [222]:
# Matteo's version (don't look at it)

# Building a dictionary with key the movieid and value the number of genre it covers
movie_genre_dict = {}
# Filling the dictionary
for i in range(len(movieIds_available)):
    movie_genre_dict[movieIds_available[i]] = sum(correlation_matrix[i])
#print(movie_genre_dict)

# Ordering "movie_genre_dict" according to values (in DESC order)
sorted_tuples = sorted(movie_genre_dict.items(), key=lambda item: item[1] , reverse=True)
#print(sorted_tuples)

sorted_dict = {k: v for k, v in sorted_tuples}
#print(sorted_dict)

# Take just the movieId ordered
ordered_ids = list(sorted_dict.keys())
#print(ordered_ids)

# Create the dictionary containing the clusters

# key=integer , value: array of similar movie ids
movie_cluster = {}
# Over the threshold value 2 movies are considered similar
threshold = 0.8

ids_to_evaluate = list(sorted_dict.keys())

# key: movieId , value: cluster in which the movie is
index_cluster = {}

for i_id in tqdm(ids_to_evaluate):
    is_inserted = False
    for key , value in movie_cluster.items():
        # Check if movie with id=id is similar to the first movie in values array
        j_id = value[0]
        #sim = similar_movies(correlation_matrix[movieIds_available.index(i_id)] , correlation_matrix[movieIds_available.index(j_id)])
        sim = cosine_similarity(correlation_matrix[movieIds_available.index(i_id)] , correlation_matrix[movieIds_available.index(j_id)])
        if sim >= threshold:
            new_value = value
            new_value.append(i_id)
            movie_cluster[key] = new_value
            index_cluster[i_id] = key
            is_inserted = True
            break
    # If the movie is not been inserted
    if not is_inserted:
         # If len = 0 -> key = -1, else the key is the last of the orevious for (the last cluster present)
        if(len(movie_cluster) == 0):
            key = -1
        movie_cluster[key + 1] = [i_id]
        index_cluster[i_id] = key + 1

print("Number of cluster is: " , len(movie_cluster))

  0%|          | 0/10329 [00:00<?, ?it/s]

Number of cluster is:  402


Creating the dictionary containing the clusters

In [249]:
# Sofia's version
movie_cluster = {}
threshold = 0.8
index_cluster = {}
movieIds_copy = movieIds_available.copy()

num_cluster = 0
while len(movieIds_copy)>0:
  for id_x in tqdm(movieIds_copy):
    list_movies = []
    index_cluster[id_x] = num_cluster
    list_movies.append(id_x)
    for id_y in movieIds_copy:
      if id_x != id_y:
        sim = cosine_similarity(correlation_matrix[movieIds_available.index(id_x)], correlation_matrix[movieIds_available.index(id_y)])
        if sim>= threshold:
          index_cluster[id_y] = num_cluster
          list_movies.append(id_y)
          movieIds_copy.remove(id_y)
    movieIds_copy.remove(id_x)
    movie_cluster[num_cluster] = list_movies
    num_cluster += 1
    
print("Number of cluster is: " , num_cluster)

  0%|          | 0/10329 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Number of cluster is:  438


In [114]:
#print(index_cluster)
#print(movie_cluster.get(0))
#print(movie_cluster.get(400))
#print(movieIds_copy)    #this is empty now because i check all movies if similar
#print(movieIds_available)   #varify all ids are untouched

In [250]:
# Order each sublist of the dictionary
for key , value in movie_cluster.items():
  new_value = value
  new_value.sort()
  #print("New_value: " , new_value)
  movie_cluster[key] = new_value

In [224]:
# Count the number of missing values
def count_missing_values(matrix):
    missing_values = 0

    for i_user in tqdm(range(matrix.shape[0])):
        for j_movie in range(matrix.shape[1]):
            # If the movie in position j_movie hasn't a rating
            if matrix[i_user , j_movie] == 0:
                missing_values += 1

    print("There are %d missing valuess" % (missing_values))
    print("There are %d values" % (matrix.shape[0] * matrix.shape[1]))
    
count_missing_values(ratings_matrix)

  0%|          | 0/668 [00:00<?, ?it/s]

There are 6815501 missing valuess
There are 6899772 values


In [None]:
Filling the "ratings_matrix" with possible ratings related to the movie he has already watched ank liked (rating >= 3.5)

In [251]:
# Filling matrix with some ratings
possible_ratings_matrix1 = ratings_matrix.copy() # to maintain the original
np.around(possible_ratings_matrix1, 1)
num_of_predicted_value = 0
good_rating = 3.5

for i_user in tqdm(range(possible_ratings_matrix1.shape[0])):
    for j_movie in range(possible_ratings_matrix1.shape[1]):
        # If user i_user has whatched movie j_movie
        if possible_ratings_matrix1[i_user , j_movie] >= good_rating:
            # Take movies similar to j_movie
            cluster = index_cluster[movieIds_available[j_movie]]
            sim_movies_ids = movie_cluster[cluster]

            pos = 0
            for id in sim_movies_ids:
                # Take the position in the matrix of that movie
                pos = movieIds_available.index(id , pos)
                if possible_ratings_matrix1[i_user , pos] == 0:
                    # sim_j_pos = similar_movies(correlation_matrix[j_movie] , correlation_matrix[pos])
                    sim_j_pos = cosine_similarity(correlation_matrix[j_movie] , correlation_matrix[pos])
                    #possible_ratings_matrix1[i_user , pos] = sim_j_pos * possible_ratings_matrix[i_user , j_movie]
                    if sim_j_pos >= 0.9:
                        possible_ratings_matrix1[i_user , pos] = possible_ratings_matrix[i_user , j_movie]
                    else:
                         possible_ratings_matrix1[i_user , pos] = possible_ratings_matrix[i_user , j_movie] - 0.5
                    num_of_predicted_value += 1
            
print(num_of_predicted_value)

  0%|          | 0/668 [00:00<?, ?it/s]

3174273


In [252]:
# Count the number of missing values
count_missing_values(possible_ratings_matrix1)

  0%|          | 0/668 [00:00<?, ?it/s]

There are 3641228 missing valuess
There are 6899772 values


In [253]:
# Take just 1 decimal place
np.around(possible_ratings_matrix1 , 1)

array([[0. , 4. , 3.5, ..., 0. , 0. , 0. ],
       [0. , 0. , 2. , ..., 0. , 4. , 0. ],
       [0. , 0. , 3.5, ..., 4. , 4. , 0. ],
       ...,
       [4. , 5. , 4.5, ..., 4. , 4. , 0. ],
       [4. , 4.5, 3.5, ..., 0. , 4.5, 0. ],
       [3.5, 3. , 3.5, ..., 4. , 4.5, 0. ]])

In [258]:
# Save the matrix "possible_ratings_matrix" as a CSV file
# Sofia's algorithm
#np.savetxt('possible_ratings_matrix1.csv', possible_ratings_matrix1, delimiter=',' , fmt='%1.1f')
# Matteo's algorithm
#np.savetxt('possible_ratings_matrix2.csv', possible_ratings_matrix1, delimiter=',' , fmt='%1.1f')

In [305]:
# Load the matrix "possible_ratings_matrix" from the CSV file
# Sofia's algorithm
#possible_ratings_matrix1 = np.loadtxt('possible_ratings_matrix1.csv', delimiter=',')
# Matteo's algorithm
#possible_ratings_matrix1 = np.loadtxt('possible_ratings_matrix2.csv', delimiter=',')

In [306]:
# Count the number of missing values
count_missing_values(possible_ratings_matrix1)

  0%|          | 0/668 [00:00<?, ?it/s]

There are 3936310 missing valuess
There are 6899772 values


In [275]:
print(ratings_matrix)
print("===============================")
print(possible_ratings_matrix1)

[[0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  2.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  3.  0.  ... 0.  4.5 0. ]]
[[0.  0.  0.  ... 4.5 0.  0. ]
 [0.  0.  2.  ... 0.  4.  0. ]
 [0.  0.  4.  ... 4.  4.  0. ]
 ...
 [4.  5.  4.  ... 4.  4.  0. ]
 [4.  0.  4.  ... 3.5 4.5 0. ]
 [3.  3.  4.  ... 3.5 4.5 0. ]]


In [126]:
import json

dict = {
    int(movieIds_available[0]): [2 , 3 , 6],
    int(movieIds_available[1]): [1 , 5]
}

# Save the similar movies dictionary -> their calculation took 6h 50 min
with open('sim_movies.json' , 'w') as fp:
    json.dump(dict , fp , indent = 4)

In [127]:
# Open the saved data
with open('sim_movies.json' , 'r') as fp:
    movie_sim_dict_loaded = json.load(fp)

print(movie_sim_dict_loaded)

{'1': [2, 3, 6], '2': [1, 5]}


# Singular value truncation (SVT) based recommender system

In [260]:
# Reconstrict rows_train, cols_train, vals_train with all the value of the input + predicted by movie relationships
counter = 0
rows_train_updated = list()
cols_train_updated = list()
vals_train_updated = list()
for i_user in tqdm(range(possible_ratings_matrix1.shape[0])):
    for j_movie in range(possible_ratings_matrix1.shape[1]):
        # If it is a default or predicted value, save the position
        if possible_ratings_matrix1[i_user][j_movie] != 0:
            rows_train_updated.append(i_user)
            cols_train_updated.append(j_movie)
            vals_train_updated.append(possible_ratings_matrix1[i_user][j_movie])
            counter += 1
print("Saved %d values" % counter)

  0%|          | 0/668 [00:00<?, ?it/s]

Saved 3258544 values


In [276]:
def errors():
    vals_pred = X_hat[rows_test, cols_test]
    err = vals_test - vals_pred
    RMSE = np.sqrt(np.mean(err**2))
    rho = pearsonr(vals_test, vals_pred)[0]
    
    return RMSE , rho

In [None]:
# errors_jit = jax.jit(errors)

In [310]:
# SVT before with the empty matrix , now check the rank, maybe it's low
n_max_iter = 1000
threshold = 300.0
increment_tol = 1e-2

RMSE_list = list()
rho_list = list()

X_hat = possible_ratings_matrix1

for k in tqdm(range(n_max_iter)):
    X_old = X_hat.copy()
    U,s,VT = np.linalg.svd(X_hat, full_matrices=False)

    threshold = max(300 , np.mean(s[s > 0]))
    s[s < threshold] = 0
    X_hat = U @ np.diag(s) @ VT
    
    # Maintain the default values
    X_hat[rows_train_updated,cols_train_updated] = vals_train_updated 
    # Some negative values could appear -> set to 0
    X_hat[X_hat < 0] = 0

    # Calculate the increment -> how much the new matrix is different from the previuos one
    increment = np.linalg.norm(X_hat - X_old) 

    # Calculate the errors
    RMSE , rho = errors()
    
    RMSE_list.append(RMSE)
    rho_list.append(rho)
    print('================== iter %d - increment %1.3e' % (k+1, increment))
    print('RMSE: %1.3f' % RMSE)
    print('rho : %1.3f' % rho)

    if increment < increment_tol:
        break

  0%|          | 0/1000 [00:00<?, ?it/s]

RMSE: 1.697
rho : 0.097
RMSE: 1.578
rho : 0.107
RMSE: 1.521
rho : 0.114
RMSE: 1.485
rho : 0.119
RMSE: 1.459
rho : 0.123
RMSE: 1.440
rho : 0.126
RMSE: 1.424
rho : 0.129
RMSE: 1.412
rho : 0.131
RMSE: 1.402
rho : 0.133
RMSE: 1.393
rho : 0.134
RMSE: 1.385
rho : 0.136
RMSE: 1.378
rho : 0.137
RMSE: 1.372
rho : 0.138
RMSE: 1.367
rho : 0.139
RMSE: 1.362
rho : 0.140
RMSE: 1.358
rho : 0.141
RMSE: 1.354
rho : 0.141
RMSE: 1.351
rho : 0.142
RMSE: 1.347
rho : 0.143
RMSE: 1.344
rho : 0.143
RMSE: 1.341
rho : 0.144
RMSE: 1.339
rho : 0.144
RMSE: 1.336
rho : 0.145
RMSE: 1.334
rho : 0.145
RMSE: 1.332
rho : 0.146
RMSE: 1.329
rho : 0.146
RMSE: 1.328
rho : 0.146
RMSE: 1.326
rho : 0.147
RMSE: 1.324
rho : 0.147
RMSE: 1.322
rho : 0.147
RMSE: 1.320
rho : 0.148
RMSE: 1.319
rho : 0.148
RMSE: 1.317
rho : 0.148
RMSE: 1.316
rho : 0.149
RMSE: 1.314
rho : 0.149
RMSE: 1.313
rho : 0.149
RMSE: 1.312
rho : 0.149
RMSE: 1.311
rho : 0.150
RMSE: 1.309
rho : 0.150
RMSE: 1.308
rho : 0.150
RMSE: 1.307
rho : 0.150
RMSE: 1.306
rho 

In [279]:
#print(np.mean(s[s > 0]))
#print(s)

In [236]:
'''
Threshold = np.mean(s[s>0])

    Sofia's version:

        After 100 iterations
        RMSE: 1.797
        rho:  0.104

    Matteo's version:

        After 100 iterations
        RMSE: 1.985
        rho:  0.080 

Threshold = max(300 , np.mean(s[s>0]))

    Sofia's version:

        After 100 iterations
        RMSE: 1.301
        rho:  0.152

    Matteo's version:

        After 100 iterations
        RMSE: 1.272
        rho:  0.158
        
        After 200 iterations:
        RMSE: 1.248
        rho: 0.163
        
        After 1000 iterations:
        RMSE: 1.210
        rho:  0.180
'''

"\nSofia's version:\n\nAfter 100 iterations\nRMSE: 1.797\nrho: 0.104\n\nMatteo's version:\n\nAfter 100 iterations\nRMSE: 1.985\nrho: 0.080\n"

In [311]:
# Save the matrix resulting from SVT as a CSV file
# Sofia's algorithm
#np.savetxt('final_ratings_matrix1.csv', X_hat, delimiter=',' , fmt='%1.1f')
# Matteo's algorithm
#np.savetxt('final_ratings_matrix2.csv', X_hat, delimiter=',' , fmt='%1.1f')

In [316]:
# Perform some evaluations
def precision_and_recall():
    total_recommended = 0 # true positive + false negative
    predicted_recommended_items = 0 # true positive + false positive
    predicted_true_recommended_items = 0 # true positive
    # A movie is recommended if it's rating is greater than this value
    recommendation_value = 3.5
    for i in range(len(rows_test)):
        true_rating = vals_test[i]
        predicted_value = X_hat[rows_test[i]][cols_test[i]]
        # Calculate true positive
        if true_rating >= recommendation_value: 
            total_recommended += 1
            if predicted_value >= recommendation_value:
                predicted_true_recommended_items += 1
        # Calculate true positive + false positive
        if predicted_value >= recommendation_value:
            predicted_recommended_items += 1
                
    print("True positive: " , predicted_true_recommended_items)
    print("True positive + false positive: " , predicted_recommended_items)
    print("True positive + false negative: " , total_recommended)
    precision = predicted_true_recommended_items / predicted_recommended_items
    recall = predicted_true_recommended_items / total_recommended
    print("Precision: " , precision)
    print("Recall: " , recall)
    return precision , recall

def F1_measure(precision_value , recall_value):
    return 2 * precision_value * recall_value / ( precision_value + recall_value)

In [317]:
precision , recall = precision_and_recall()
F1_measure(precision , recall)

True positive:  10486
True positive + false positive:  16584
True positive + false negative:  12776
Precision:  0.6322961890979257
Recall:  0.8207576706324358


0.7143051771117166

In [None]:
'''
Threshold = max(300 , np.mean(s[s>0]))

> recommendation_value (3.5)

    Sofia's version:

        After 100 iterations
        RMSE: 1.301
        rho:  0.152
        
        Precision: 0.568
        Recall:    0.575
        F-1 measure: 0.574

    Matteo's version:

        After 100 iterations
        RMSE: 1.272
        rho:  0.158
        
        Precision: 0.569
        Recall:    0.592
        F-1 measure: 0.580
        
        After 200 iterations:
        RMSE: 1.248
        rho: 0.163
        
        Precision: 0.569
        Recall:    0.598
        F-1 measure: 0.584
        
        After 1000 iterations:
        RMSE: 1.210
        rho:  0.180
        
        Precision: 0.567
        Recall:    0.611
        F-1 measure: 0.588
        
>= recommendation_value (3.5)
        
        After 1000 iterations:
        RMSE: 1.210
        rho:  0.180
        
        Precision: 0.632
        Recall:    0.821
        F-1 measure: 0.714
'''