In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the (user, movie, rating) triples.
df = pd.read_csv('/home/supernova/Placement/Projects/Movie Recommender using Iterative RSVD/Processed_data_tmdb.csv')
user_ids = df['user'].values
movie_ids = df['movie'].values
user_ratings = df['rating'].values

# Distinct user / movie ids present in the data.
unique_user_ids = np.unique(user_ids)
unique_movie_ids = np.unique(movie_ids)
print(np.min(unique_movie_ids), np.max(unique_movie_ids))
print(np.min(unique_user_ids), np.max(unique_user_ids))
# NOTE(review): the matrix fill below treats the ids as 1-based (it indexes
# with id - 1), so this shift makes the unique-id arrays disagree with
# `user_ids` / `movie_ids`. Kept because later cells may read these arrays;
# only their lengths are used in this cell. Confirm and remove if unused.
unique_user_ids += 1
unique_movie_ids += 1
print('Number of unique users: ', len(unique_user_ids))
print('Number of unique movies: ', len(unique_movie_ids))

# Creating a matrix of users and movies (rows: users, columns: movies);
# entries with no rating stay 0.
n_users = len(unique_user_ids)
n_movies = len(unique_movie_ids)

# The id - 1 indexing is only valid when ids lie in 1..n; an id of 0 would
# otherwise wrap around silently to the last row/column.
assert user_ids.min() >= 1 and user_ids.max() <= n_users, "user ids must lie in 1..n_users"
assert movie_ids.min() >= 1 and movie_ids.max() <= n_movies, "movie ids must lie in 1..n_movies"

user_movie_matrix = np.zeros((n_users, n_movies))
# One vectorised assignment instead of a Python loop over every row of df.
# Assumes each (user, movie) pair occurs at most once - TODO confirm; with
# repeated pairs NumPy does not guarantee which rating is kept.
user_movie_matrix[user_ids - 1, movie_ids - 1] = user_ratings

# df_movie_name
df_movie_name = pd.read_csv("/home/supernova/Placement/Projects/Movie Recommender using Iterative RSVD/movie_names_tmdb.csv")

1 856
1 670
Number of unique users:  670
Number of unique movies:  856


In [3]:
def RSVD(X, reg, k, niter, v_rand, tol=1e-20):
    """Alternating ridge-regression factorisation X ~ U @ V.T.

    Starting from ``v_rand``, even steps recompute U with V held fixed and
    odd steps recompute V with U held fixed:

        U = X   @ V @ (V.T @ V + reg * I)^-1
        V = X.T @ U @ (U.T @ U + reg * I)^-1

    Parameters
    ----------
    X : 2-D ndarray, shape (n_rows, n_cols)
        Matrix to factorise.
    reg : float
        Ridge coefficient added to the diagonal of the Gram matrix.
    k : int
        Unused; the number of latent factors is ``v_rand.shape[1]``.
        Kept so existing callers keep working.
    niter : int
        Maximum number of half-steps (one U update or one V update each).
        Must be at least 1.
    v_rand : 2-D ndarray, shape (n_cols, n_factors)
        Initial value of V. It is not modified.
    tol : float, optional
        Iteration stops once the Frobenius norm of the last change in V is
        no longer greater than ``tol``.

    Returns
    -------
    U : ndarray, shape (n_rows, n_factors)
    V : ndarray, shape (n_cols, n_factors)
    error : list of float
        One entry per V update: ``||V_new - V_old||_F / sqrt(V.size)``.

    Raises
    ------
    ValueError
        If ``niter < 1`` (no U would ever be computed).
    numpy.linalg.LinAlgError
        If a regularised Gram matrix is singular.
    """
    if niter < 1:
        raise ValueError("niter must be at least 1 so that U is computed")

    def _ridge_factor(fixed, target):
        # Returns target @ fixed @ inv(gram). gram is symmetric, so this equals
        # solve(gram, (target @ fixed).T).T, which avoids forming the inverse.
        gram = fixed.T @ fixed + reg * np.identity(fixed.shape[1])
        return np.linalg.solve(gram, (target @ fixed).T).T

    V = v_rand
    error = []
    loss = 1e9  # sentinel so the loop is entered
    i = 0
    while i < niter and loss > tol:
        if i % 2 == 0:
            U = _ridge_factor(V, X)
        else:
            V_prev = V
            V = _ridge_factor(U, X.T)
            loss = np.linalg.norm(V - V_prev)
            error.append(loss / np.sqrt(V.shape[0] * V.shape[1]))
        i += 1
    return U, V, error

In [4]:
def well_defined_RSVD(X, reg, k):
    """Closed-form rank-k regularised factorisation of X from its SVD.

    With X = F @ diag(s) @ G.T, the factors are

        U_ = F[:, :k] @ diag(sqrt(max(s[:k] - reg, 0)))
        V_ = G[:, :k] @ diag(sqrt(max(s[:k] - reg, 0)))

    so U_ @ V_.T is the rank-k truncation of X with each singular value
    reduced by ``reg`` and floored at zero.

    Parameters
    ----------
    X : 2-D ndarray, shape (n_rows, n_cols)
    reg : float
        Amount subtracted from each retained singular value.
    k : int
        Number of components to keep; must satisfy 0 <= k <= min(X.shape).

    Returns
    -------
    U_ : ndarray, shape (n_rows, k)
    V_ : ndarray, shape (n_cols, k)

    Raises
    ------
    ValueError
        If ``k`` is outside ``0..min(X.shape)``.
    """
    max_rank = min(X.shape)
    if not 0 <= k <= max_rank:
        raise ValueError("k must be between 0 and min(X.shape) = %d, got %r" % (max_rank, k))

    # Thin SVD: only the first min(n_rows, n_cols) singular vectors are ever
    # needed, so skip building the full square bases.
    F, s, Gt = np.linalg.svd(X, full_matrices=False)

    # Singular values below reg are clipped to zero. Taking abs() here would
    # re-introduce those components with weight sqrt(reg - s).
    omega = np.sqrt(np.clip(s[:k] - reg, 0.0, None))

    # Scaling columns by omega is the same as right-multiplying by diag(omega).
    U_ = F[:, :k] * omega
    V_ = Gt[:k, :].T * omega
    return U_, V_

In [None]:
# Compare the iterative factorisation (RSVD) with the SVD-based one
# (well_defined_RSVD) for several ranks k and regularisation strengths lambda,
# and save one convergence plot per k.

# Number of latent features K that we are considering
K = [3, 5, 10, 100]
LAMBDAS = [0, 3, 5, 10]
N_ITER = 20

# Fixed seed so the random initial V (and therefore every number printed
# below) is reproducible across kernel restarts.
rng = np.random.default_rng(0)


def _rms(M):
    """Frobenius norm of M divided by sqrt(number of entries)."""
    return np.linalg.norm(M) / np.sqrt(M.shape[0] * M.shape[1])


for k in K:
    # Same initial V for every lambda at this k, so the curves are comparable.
    v_rand = rng.random((user_movie_matrix.shape[1], k))

    fig, ax = plt.subplots()
    for l in LAMBDAS:
        # NOTE: U, V, U_, V_ are left at module level on purpose; later cells
        # read the values from the last (k, lambda) pair.
        U, V, error = RSVD(user_movie_matrix, l, k, N_ITER, v_rand)
        U_, V_ = well_defined_RSVD(user_movie_matrix, l, k)
        print("Error in V and V_ for lambda = ", l, " is: ", _rms(V - V_))
        print("Error in U and U_ for lambda = ", l, " is: ", _rms(U - U_))
        # The two reconstruction errors get distinct labels so the output can be told apart.
        print("Overall mean error (iterative RSVD) for lambda = ", l, " is: ", _rms(user_movie_matrix - U @ V.T))
        print("Overall mean error (closed-form RSVD) for lambda = ", l, " is: ", _rms(user_movie_matrix - U_ @ V_.T))
        ax.plot(error, label="lambda = " + str(l))
    ax.legend()
    ax.set_xlabel("Iterations")
    ax.set_ylabel("$dV_t$")
    ax.set_title("Error vs Iterations for different values of lambda at K = " + str(k))
    fig.savefig("/home/supernova/Placement/Projects/Movie Recommender using Iterative RSVD/Images_tmdb/K=" + str(k) + ".png")
    # Close each figure so the loop does not accumulate open figures.
    plt.close(fig)

Error in V and V_ for lambda =  0  is:  1.5721600169368255
Error in U and U_ for lambda =  0  is:  0.7833203591411781
Overall mean error for lambda =  0  is:  1.0041528097579362
Overall mean error for lambda =  0  is:  1.0040864186136162
Error in V and V_ for lambda =  3  is:  1.1857989694692506
Error in U and U_ for lambda =  3  is:  0.8449629053963865
Overall mean error for lambda =  3  is:  1.0042837004916738
Overall mean error for lambda =  3  is:  1.004109861389655
Error in V and V_ for lambda =  5  is:  1.0999988166812844
Error in U and U_ for lambda =  5  is:  0.8772629974430816
Overall mean error for lambda =  5  is:  1.0043729965943065
Overall mean error for lambda =  5  is:  1.0041515360846087
Error in V and V_ for lambda =  10  is:  0.9937110450631802
Error in U and U_ for lambda =  10  is:  0.9279637339228172
Overall mean error for lambda =  10  is:  1.0046166529513902
Overall mean error for lambda =  10  is:  1.0043468631659893
Error in V and V_ for lambda =  0  is:  1.457

In [7]:
# Reconstructed rating matrices: E from the iterative factors (U, V) and
# E_ from the SVD-based factors (U_, V_). Both pairs are whatever the
# experiment loop left behind on its final iteration.
E = U @ V.T
E_ = U_ @ V_.T

# Largest and smallest predicted value of each reconstruction.
for recon in (E, E_):
    print(np.max(recon), np.min(recon))

14.54874116475603 -2.04043407423053
14.6418858511283 -2.731920019575378


In [8]:
# Predicting the top N recommendations for a user
def topNRecommendations(user_id, U, V, N):
    """Return the indices of the N highest-scoring items for one user.

    The score of item j for the user is ``U[user_id - 1, :] @ V[j, :]``.

    Parameters
    ----------
    user_id : int
        1-based user id; row ``user_id - 1`` of ``U`` is used.
    U : 2-D ndarray, shape (n_users, n_factors)
    V : 2-D ndarray, shape (n_items, n_factors)
    N : int
        Number of items to return.

    Returns
    -------
    ndarray of int
        0-based row indices into ``V``, ordered from highest to lowest score.
        Holds fewer than N entries when ``V`` has fewer than N rows.

    Raises
    ------
    IndexError
        If ``user_id`` is not in ``1..U.shape[0]``.
    """
    n_users = U.shape[0]
    # Without this check user_id = 0 (or a negative id) would wrap around and
    # silently return recommendations for a row counted from the end of U.
    if not 1 <= user_id <= n_users:
        raise IndexError("user_id must be in 1..%d, got %r" % (n_users, user_id))

    scores = U[user_id - 1, :] @ V.T
    topN = np.argsort(scores)[::-1][:N]
    return topN
# Top-10 items for user 1. The returned 0-based indices are used as row
# positions into df_movie_name to show the matching titles.
a = topNRecommendations(user_id=1, U=U, V=V, N=10)
print(df_movie_name.iloc[a, :])

     Unnamed: 0  movie      original_title
68        11519     69      Arlington Road
385        5144    386              Lolita
198       18391    199  Dogtown and Z-Boys
836       15404    837   Y tu mamá también
482       14732    483          Persepolis
703        4699    704     The Last Castle
437        8877    438  Mr. Holland's Opus
393        3271    394          Madagascar
439       10059    440      Mrs. Doubtfire
384       16950    385          Lola rennt
