# PREPROCESSING

### Importing libraries and loading the data into 3 separate categories: Movies, Ratings, and Users

In [None]:
# Importing relevant modules:
import math
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

In [None]:
# Loading Data
def _read_dat(path, columns):
    """Read a header-less, '::'-delimited file into a DataFrame with the given column names."""
    return pd.read_table(path, sep="::", names=columns, header=None, engine='python')


movies = _read_dat('movies.dat', ['MovieID', 'Title', 'Genres'])
print(movies.head())

ratings = _read_dat('ratings.dat', ['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(ratings.head())

users = _read_dat('users.dat', ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

# Plain array view of the ratings table (columns in the order listed above)
ratings_np = ratings.to_numpy()

# Cross-validation

# Naive Approach 1

In [None]:
# Making 5 folds; the same fold assignment is reused by the following approaches.
np.random.seed(120)
# np.repeat needs an integer repeat count: "/" yields a float in Python 3 and numpy
# refuses to cast it, so use floor division. One extra repeat per fold guarantees at
# least len(ratings) labels before trimming.
indices = np.repeat(range(5), len(ratings) // 5 + 1)[:len(ratings)]
np.random.shuffle(indices)

# Two data frames holding the per-fold errors: one for the test fold, one for the training folds
accuracy_per_fold_overall = pd.DataFrame(index=range(5), columns=['RMSE', 'MAE'], dtype=float)   # Test set
accuracy_per_fold_overall2 = pd.DataFrame(index=range(5), columns=['RMSE', 'MAE'], dtype=float)  # Training set


for k in range(5):  # Make every fold a test set once
    # Split data set into training set and testing set
    traindata = ratings.loc[indices != k]
    testdata = ratings.loc[indices == k, 'Rating']

    # Naive Approach 1: predict the mean rating of the training folds for every entry
    mean = traindata["Rating"].mean()

    # Numpy arrays simplify the error calculations (column 2 of the training array is 'Rating')
    testdata_np = testdata.to_numpy()
    traindata_np = traindata.to_numpy()

    test_residuals = testdata_np - mean
    train_residuals = traindata_np[:, 2] - mean

    # Errors on the testing set
    rmse = np.sqrt(np.mean(np.square(test_residuals)))
    mae = np.mean(np.abs(test_residuals))

    # Errors on the training set
    rmse2 = np.sqrt(np.mean(np.square(train_residuals)))
    mae2 = np.mean(np.abs(train_residuals))

    accuracy_per_fold_overall.loc[k, 'RMSE'] = rmse
    accuracy_per_fold_overall.loc[k, 'MAE'] = mae

    accuracy_per_fold_overall2.loc[k, 'RMSE'] = rmse2
    accuracy_per_fold_overall2.loc[k, 'MAE'] = mae2

# Report the errors averaged over the 5 folds
print("MAE on Testing set:\t", accuracy_per_fold_overall.loc[:,'MAE'].mean())
print("RMSE on Testing set:\t", accuracy_per_fold_overall.loc[:,'RMSE'].mean())
print()
print("MAE on Training set:\t", accuracy_per_fold_overall2.loc[:,'MAE'].mean())
print("RMSE on Training set:\t", accuracy_per_fold_overall2.loc[:,'RMSE'].mean())

# Naive Approach 2

In [None]:
# Turn off pandas' chained-assignment warning (the option's default is 'warn').
pd.set_option('mode.chained_assignment', None)
    
def movie_ratings_mean(np_ratings, item_id):
    """Return the mean rating of movie ``item_id`` within ``np_ratings``.

    Parameters
    ----------
    np_ratings : 2-D numpy array
        Column 1 is compared against ``item_id`` and column 2 is averaged.
    item_id : scalar
        Movie ID to look up.

    Returns
    -------
    float
        Mean of column 2 over the rows whose column 1 equals ``item_id``. When no
        row matches, the mean of column 2 over all rows of ``np_ratings``.
    """
    movie_rows = np_ratings[np_ratings[:, 1] == item_id, :]
    if len(movie_rows) == 0:
        # Fall back to the mean of the array we were given. Reading the global
        # `ratings_np` here would mix the held-out ratings into the prediction.
        return np_ratings[:, 2].mean()
    return np.mean(movie_rows[:, 2])

def overall_mean(ratings_arr):
    """Return the mean of the third column (index 2) of ``ratings_arr``.

    Parameters
    ----------
    ratings_arr : 2-D numpy array or pandas DataFrame
        Ratings table whose column at position 2 holds the rating values.
        DataFrames are accepted because some callers in this file pass one;
        ``np.asarray`` is a no-op for arrays, so array results are unchanged.

    Returns
    -------
    float
        Mean of the column at position 2.
    """
    return np.asarray(ratings_arr)[:, 2].mean()

# Per-fold error tables for the movie-mean predictor (one row per fold)
accuracy_per_fold_movie_mean = pd.DataFrame(columns=['RMSE', 'MAE'], index=range(5))   # held-out fold
accuracy_per_fold_movie_mean2 = pd.DataFrame(columns=['RMSE', 'MAE'], index=range(5))  # training folds


for k in range(5):  # every fold is the test set exactly once
    held_out = indices == k
    traindata = ratings.loc[~held_out, :]
    testdata = ratings.loc[held_out, :]

    # Placeholder column: it becomes column 4 of the arrays below and receives the predictions
    testdata.loc[:, 'Predicted_Rating'] = 999
    traindata.loc[:, 'Predicted_Rating'] = 999

    # Work on float arrays: column 1 = MovieID, column 2 = Rating, column 4 = prediction
    testdata_np = testdata.to_numpy(dtype="float64")
    traindata_np = traindata.to_numpy(dtype="float64")

    # Fill the prediction column of the test array first, then of the training array;
    # the means always come from the training array.
    for target_np, movie_ids in ((testdata_np, testdata.MovieID.unique()),
                                 (traindata_np, traindata.MovieID.unique())):
        for movie in movie_ids:
            target_np[target_np[:, 1] == movie, 4] = movie_ratings_mean(traindata_np, movie)

    # Errors on the testing set
    test_diff = testdata_np[:, 2] - testdata_np[:, 4]
    rmse = np.sqrt(np.mean(np.square(test_diff)))
    mae = np.mean(np.abs(test_diff))
    accuracy_per_fold_movie_mean.loc[k, 'RMSE'] = rmse
    accuracy_per_fold_movie_mean.loc[k, 'MAE'] = mae

    # Errors on the training set
    train_diff = traindata_np[:, 2] - traindata_np[:, 4]
    rmse = np.sqrt(np.mean(np.square(train_diff)))
    mae = np.mean(np.abs(train_diff))
    accuracy_per_fold_movie_mean2.loc[k, 'RMSE'] = rmse
    accuracy_per_fold_movie_mean2.loc[k, 'MAE'] = mae


# Report the errors averaged over the folds
print("MAE on Testing set:\t", accuracy_per_fold_movie_mean.loc[:,'MAE'].mean())
print("RMSE on Testing set:\t", accuracy_per_fold_movie_mean.loc[:,'RMSE'].mean())
print()
print("MAE on Training set:\t", accuracy_per_fold_movie_mean2.loc[:,'MAE'].mean())
print("RMSE on Training set:\t", accuracy_per_fold_movie_mean2.loc[:,'RMSE'].mean())



# Naive Approach 3

In [None]:
# Turn off pandas' chained-assignment warning (the option's default is 'warn').
pd.set_option('mode.chained_assignment', None)
    
def user_ratings_mean(np_ratings, item_id):
    """Return the mean rating given by user ``item_id`` within ``np_ratings``.

    Parameters
    ----------
    np_ratings : 2-D numpy array
        Column 0 is compared against ``item_id`` and column 2 is averaged.
    item_id : scalar
        User ID to look up (the parameter keeps its original name).

    Returns
    -------
    float
        Mean of column 2 over the rows whose column 0 equals ``item_id``. When no
        row matches, the mean of column 2 over all rows of ``np_ratings``.
    """
    user_rows = np_ratings[np_ratings[:, 0] == item_id, :]
    if len(user_rows) == 0:
        # Fall back to the mean of the array we were given. Reading the global
        # `ratings_np` here would mix the held-out ratings into the prediction.
        return np_ratings[:, 2].mean()
    return np.mean(user_rows[:, 2])
    
def overall_mean(ratings_arr):
    """Return the mean of the third column (index 2) of ``ratings_arr``.

    Parameters
    ----------
    ratings_arr : 2-D numpy array or pandas DataFrame
        Ratings table whose column at position 2 holds the rating values.
        DataFrames are accepted because some callers in this file pass one;
        ``np.asarray`` is a no-op for arrays, so array results are unchanged.

    Returns
    -------
    float
        Mean of the column at position 2.
    """
    return np.asarray(ratings_arr)[:, 2].mean()

# Per-fold error tables for the user-mean predictor (one row per fold)
accuracy_per_fold_user_mean = pd.DataFrame(columns=['RMSE', 'MAE'], index=range(5))   # held-out fold
accuracy_per_fold_user_mean2 = pd.DataFrame(columns=['RMSE', 'MAE'], index=range(5))  # training folds


for k in range(5):  # every fold is the test set exactly once
    held_out = indices == k
    traindata = ratings.loc[~held_out, :]
    testdata = ratings.loc[held_out, :]

    # Placeholder column: it becomes column 4 of the arrays below and receives the predictions
    testdata.loc[:, 'Predicted_Rating'] = 999
    traindata.loc[:, 'Predicted_Rating'] = 999

    # Work on float arrays: column 0 = UserID, column 2 = Rating, column 4 = prediction
    testdata_np = testdata.to_numpy(dtype="float64")
    traindata_np = traindata.to_numpy(dtype="float64")

    # Fill the prediction column of the test array first, then of the training array;
    # the means always come from the training array.
    for target_np, user_ids in ((testdata_np, testdata.UserID.unique()),
                                (traindata_np, traindata.UserID.unique())):
        for user in user_ids:
            target_np[target_np[:, 0] == user, 4] = user_ratings_mean(traindata_np, user)

    # Errors on the testing set
    test_diff = testdata_np[:, 2] - testdata_np[:, 4]
    rmse = np.sqrt(np.mean(np.square(test_diff)))
    mae = np.mean(np.abs(test_diff))
    accuracy_per_fold_user_mean.loc[k, 'RMSE'] = rmse
    accuracy_per_fold_user_mean.loc[k, 'MAE'] = mae

    # Errors on the training set
    train_diff = traindata_np[:, 2] - traindata_np[:, 4]
    rmse = np.sqrt(np.mean(np.square(train_diff)))
    mae = np.mean(np.abs(train_diff))
    accuracy_per_fold_user_mean2.loc[k, 'RMSE'] = rmse
    accuracy_per_fold_user_mean2.loc[k, 'MAE'] = mae


# Report the errors averaged over the folds
print("MAE on Testing set:\t", accuracy_per_fold_user_mean.loc[:,'MAE'].mean())
print("RMSE on Testing set:\t", accuracy_per_fold_user_mean.loc[:,'RMSE'].mean())
print()
print("MAE on Training set:\t", accuracy_per_fold_user_mean2.loc[:,'MAE'].mean())
print("RMSE on Training set:\t", accuracy_per_fold_user_mean2.loc[:,'RMSE'].mean())


# Naive Approach 4: Linear regression without intercept

In [None]:
import pandas as pd
# Turn off pandas' chained-assignment warning (the option's default is 'warn').
pd.set_option('mode.chained_assignment', None)


#Naive approach 4: Linear regression without intercept
def get_ratings_per_movie(train):
    """Return the mean rating of every movie that occurs in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'MovieID' and 'Rating'.

    Returns
    -------
    pandas.DataFrame
        Columns 'MovieID' and 'Rating' (the per-movie mean), one row per movie,
        in order of first appearance in ``train``, with a default integer index.
    """
    # A single grouped pass replaces re-filtering the whole frame once per movie.
    # sort=False keeps the first-appearance order that unique() produced.
    return train.groupby('MovieID', sort=False)['Rating'].mean().reset_index()


def get_ratings_per_user(train):
    """Return the mean rating of every user that occurs in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'UserID' and 'Rating'.

    Returns
    -------
    pandas.DataFrame
        Columns 'UserID' and 'Rating' (the per-user mean), one row per user,
        in order of first appearance in ``train``, with a default integer index.
    """
    # A single grouped pass replaces re-filtering the whole frame once per user.
    # sort=False keeps the first-appearance order that unique() produced.
    return train.groupby('UserID', sort=False)['Rating'].mean().reset_index()

def user_ratings_mean(train, user_id):
    """Return the mean rating given by ``user_id`` in the DataFrame ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'UserID' and 'Rating'.
    user_id : scalar
        User to look up.

    Returns
    -------
    float
        Mean 'Rating' of the user's rows; if the user has no rows in ``train``,
        the mean 'Rating' of all rows of ``train``.
    """
    selected_ratings = train.loc[train.UserID == user_id, 'Rating']
    if selected_ratings.empty:
        # The fallback used to reference an undefined name (`ratings_df`) and would
        # have raised NameError; use the overall mean of the frame that was passed in.
        return train['Rating'].mean()
    return selected_ratings.mean()
    
def movie_ratings_mean(train, item_id):
    """Return the mean rating of movie ``item_id`` in the DataFrame ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'MovieID' and 'Rating'.
    item_id : scalar
        Movie to look up.

    Returns
    -------
    float
        Mean 'Rating' of the movie's rows; if the movie has no rows in ``train``,
        the mean 'Rating' of all rows of ``train``.
    """
    selected_movies = train.loc[train.MovieID == item_id, 'Rating']
    if selected_movies.empty:
        # Compute the fallback directly on the 'Rating' column: positional
        # `[:, 2]` indexing is not valid on a DataFrame.
        return train['Rating'].mean()
    return selected_movies.mean()

def get_ratings_extended(user_mean, movie_mean, train):
    """Return a copy of ``train`` with per-user and per-movie mean columns attached.

    Parameters
    ----------
    user_mean : pandas.DataFrame
        Lookup table with columns 'UserID' and 'Rating' (one row per user).
    movie_mean : pandas.DataFrame
        Lookup table with columns 'MovieID' and 'Rating' (one row per movie).
    train : pandas.DataFrame
        Ratings with at least the columns 'UserID' and 'MovieID'; not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` with two extra float columns, 'usermean' and 'itemmean',
        looked up by the row's UserID / MovieID. IDs that are missing from a
        lookup table get NaN in the corresponding column.
    """
    temp = train.copy()

    # Series.map does the lookup for all rows at once. This avoids one full-frame
    # scan per ID and the deprecated float(<one-element Series>) conversion.
    user_lookup = user_mean.set_index('UserID')['Rating']
    movie_lookup = movie_mean.set_index('MovieID')['Rating']

    temp['usermean'] = temp['UserID'].map(user_lookup).astype(float)
    temp['itemmean'] = temp['MovieID'].map(movie_lookup).astype(float)
    return temp

# Two data frames holding the per-fold errors for LR without intercept.
# The training-set frame is named accuracy_per_fold_reg1_2 because that is the name
# the loop and the final prints use (it was previously never created -> NameError).
accuracy_per_fold_reg1 = pd.DataFrame(index=range(5), columns=['RMSE', 'MAE'], dtype=float)    # Test set
accuracy_per_fold_reg1_2 = pd.DataFrame(index=range(5), columns=['RMSE', 'MAE'], dtype=float)  # Training set

feature_columns = ['usermean', 'itemmean']

for k in range(5):
    # Split data set into training set and testing set (copies, so columns can be added freely)
    traindata = ratings.loc[indices != k, :].copy()
    testdata = ratings.loc[indices == k, :].copy()

    # Mean rating per user / per movie, estimated on the training folds only
    user_mean_ratings = get_ratings_per_user(traindata)
    movie_mean_ratings = get_ratings_per_movie(traindata)

    # Training features
    ratings_extended = get_ratings_extended(user_mean_ratings, movie_mean_ratings, traindata)

    # Test features must also come from the training folds: averaging the test
    # ratings themselves would leak the target into the predictors. Users or movies
    # that never occur in the training folds fall back to the overall training mean.
    train_mean = traindata['Rating'].mean()
    user_lookup = user_mean_ratings.set_index('UserID')['Rating']
    movie_lookup = movie_mean_ratings.set_index('MovieID')['Rating']
    ratings_extended_t = testdata.copy()
    ratings_extended_t['usermean'] = testdata['UserID'].map(user_lookup).fillna(train_mean)
    ratings_extended_t['itemmean'] = testdata['MovieID'].map(movie_lookup).fillna(train_mean)

    # Linear regression without intercept on (user mean, movie mean)
    regression_1 = LinearRegression(fit_intercept = False).fit(X = ratings_extended[feature_columns],
                                                               y = ratings_extended['Rating'])

    # Predict, then clip the PREDICTIONS to the valid rating range [1, 5].
    # (Previously the true 'Rating' column was overwritten instead.)
    testdata['Predicted_Rating_reg1'] = np.clip(
        regression_1.predict(X = ratings_extended_t[feature_columns]), 1, 5)
    traindata['Predicted_Rating_reg1'] = np.clip(
        regression_1.predict(X = ratings_extended[feature_columns]), 1, 5)

    # Errors on the testing set
    test_error = testdata['Rating'] - testdata['Predicted_Rating_reg1']
    accuracy_per_fold_reg1.loc[k, 'RMSE'] = np.sqrt(np.mean(test_error ** 2))
    accuracy_per_fold_reg1.loc[k, 'MAE'] = np.mean(np.abs(test_error))

    # Errors on the training set
    train_error = traindata['Rating'] - traindata['Predicted_Rating_reg1']
    accuracy_per_fold_reg1_2.loc[k, 'RMSE'] = np.sqrt(np.mean(train_error ** 2))
    accuracy_per_fold_reg1_2.loc[k, 'MAE'] = np.mean(np.abs(train_error))


# Report the errors averaged over the folds
print("Testing Set")
print("MAE on Testing set for Regression 1:\t", accuracy_per_fold_reg1.loc[:,'MAE'].mean())
print("RMSE on Testing set for Regression 1:\t", accuracy_per_fold_reg1.loc[:,'RMSE'].mean())

print("Training set")
print("MAE on Training set for Regression 1:\t", accuracy_per_fold_reg1_2.loc[:,'MAE'].mean())
print("RMSE on Training set for Regression 1:\t", accuracy_per_fold_reg1_2.loc[:,'RMSE'].mean())

# Naive Approach 5: with intercept

In [None]:
def get_ratings_per_movie(train):
    """Return the mean rating of every movie that occurs in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'MovieID' and 'Rating'.

    Returns
    -------
    pandas.DataFrame
        Columns 'MovieID' and 'Rating' (the per-movie mean), one row per movie,
        in order of first appearance in ``train``, with a default integer index.
    """
    # A single grouped pass replaces re-filtering the whole frame once per movie.
    # sort=False keeps the first-appearance order that unique() produced.
    return train.groupby('MovieID', sort=False)['Rating'].mean().reset_index()


def get_ratings_per_user(train):
    """Return the mean rating of every user that occurs in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'UserID' and 'Rating'.

    Returns
    -------
    pandas.DataFrame
        Columns 'UserID' and 'Rating' (the per-user mean), one row per user,
        in order of first appearance in ``train``, with a default integer index.
    """
    # A single grouped pass replaces re-filtering the whole frame once per user.
    # sort=False keeps the first-appearance order that unique() produced.
    return train.groupby('UserID', sort=False)['Rating'].mean().reset_index()

def user_ratings_mean(train, user_id):
    """Return the mean rating given by ``user_id`` in the DataFrame ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'UserID' and 'Rating'.
    user_id : scalar
        User to look up.

    Returns
    -------
    float
        Mean 'Rating' of the user's rows; if the user has no rows in ``train``,
        the mean 'Rating' of all rows of ``train``.
    """
    selected_ratings = train.loc[train.UserID == user_id, 'Rating']
    if selected_ratings.empty:
        # The fallback used to reference an undefined name (`ratings_df`) and would
        # have raised NameError; use the overall mean of the frame that was passed in.
        return train['Rating'].mean()
    return selected_ratings.mean()
    
def movie_ratings_mean(train, item_id):
    """Return the mean rating of movie ``item_id`` in the DataFrame ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'MovieID' and 'Rating'.
    item_id : scalar
        Movie to look up.

    Returns
    -------
    float
        Mean 'Rating' of the movie's rows; if the movie has no rows in ``train``,
        the mean 'Rating' of all rows of ``train``.
    """
    selected_movies = train.loc[train.MovieID == item_id, 'Rating']
    if selected_movies.empty:
        # Compute the fallback directly on the 'Rating' column: positional
        # `[:, 2]` indexing is not valid on a DataFrame.
        return train['Rating'].mean()
    return selected_movies.mean()

def get_ratings_extended(user_mean, movie_mean, train):
    """Return a copy of ``train`` with per-user and per-movie mean columns attached.

    Parameters
    ----------
    user_mean : pandas.DataFrame
        Lookup table with columns 'UserID' and 'Rating' (one row per user).
    movie_mean : pandas.DataFrame
        Lookup table with columns 'MovieID' and 'Rating' (one row per movie).
    train : pandas.DataFrame
        Ratings with at least the columns 'UserID' and 'MovieID'; not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` with two extra float columns, 'usermean' and 'itemmean',
        looked up by the row's UserID / MovieID. IDs that are missing from a
        lookup table get NaN in the corresponding column.
    """
    temp = train.copy()

    # Series.map does the lookup for all rows at once. This avoids one full-frame
    # scan per ID and the deprecated float(<one-element Series>) conversion.
    user_lookup = user_mean.set_index('UserID')['Rating']
    movie_lookup = movie_mean.set_index('MovieID')['Rating']

    temp['usermean'] = temp['UserID'].map(user_lookup).astype(float)
    temp['itemmean'] = temp['MovieID'].map(movie_lookup).astype(float)
    return temp
    
# Two data frames holding the per-fold errors for LR with intercept.
# Test and training errors get separate frames; previously both were written to
# accuracy_per_fold_reg2, so the training values overwrote the test values.
# accuracy_per_fold_reg1 is deliberately not re-created here, since that would
# discard the results of the no-intercept regression.
accuracy_per_fold_reg2 = pd.DataFrame(index=range(5), columns=['RMSE', 'MAE'], dtype=float)    # Test set
accuracy_per_fold_reg2_2 = pd.DataFrame(index=range(5), columns=['RMSE', 'MAE'], dtype=float)  # Training set

feature_columns = ['usermean', 'itemmean']

for k in range(5):
    # Split data set into training set and testing set (copies, so columns can be added freely)
    traindata = ratings.loc[indices != k, :].copy()
    testdata = ratings.loc[indices == k, :].copy()

    # Mean rating per user / per movie, estimated on the training folds only
    user_mean_ratings = get_ratings_per_user(traindata)
    movie_mean_ratings = get_ratings_per_movie(traindata)

    # Training features
    ratings_extended = get_ratings_extended(user_mean_ratings, movie_mean_ratings, traindata)

    # Test features must also come from the training folds: averaging the test
    # ratings themselves would leak the target into the predictors. Users or movies
    # that never occur in the training folds fall back to the overall training mean.
    train_mean = traindata['Rating'].mean()
    user_lookup = user_mean_ratings.set_index('UserID')['Rating']
    movie_lookup = movie_mean_ratings.set_index('MovieID')['Rating']
    ratings_extended_t = testdata.copy()
    ratings_extended_t['usermean'] = testdata['UserID'].map(user_lookup).fillna(train_mean)
    ratings_extended_t['itemmean'] = testdata['MovieID'].map(movie_lookup).fillna(train_mean)

    # Linear regression with intercept on (user mean, movie mean)
    regression_2 = LinearRegression(fit_intercept = True).fit(X = ratings_extended[feature_columns],
                                                              y = ratings_extended['Rating'])

    # Predict, then clip the PREDICTIONS to the valid rating range [1, 5].
    # (Previously the true 'Rating' column was overwritten instead.)
    testdata['Predicted_Rating_reg2'] = np.clip(
        regression_2.predict(X = ratings_extended_t[feature_columns]), 1, 5)
    traindata['Predicted_Rating_reg2'] = np.clip(
        regression_2.predict(X = ratings_extended[feature_columns]), 1, 5)

    # Errors on the testing set
    test_error = testdata['Rating'] - testdata['Predicted_Rating_reg2']
    accuracy_per_fold_reg2.loc[k, 'RMSE'] = np.sqrt(np.mean(test_error ** 2))
    accuracy_per_fold_reg2.loc[k, 'MAE'] = np.mean(np.abs(test_error))

    # Errors on the training set
    train_error = traindata['Rating'] - traindata['Predicted_Rating_reg2']
    accuracy_per_fold_reg2_2.loc[k, 'RMSE'] = np.sqrt(np.mean(train_error ** 2))
    accuracy_per_fold_reg2_2.loc[k, 'MAE'] = np.mean(np.abs(train_error))

# Report the errors averaged over the folds
print("MAE on Testing set for Regression:\t", accuracy_per_fold_reg2.loc[:,'MAE'].mean())
print("RMSE on Testing set for Regression:\t", accuracy_per_fold_reg2.loc[:,'RMSE'].mean())
print()
print("MAE on Training set for Regression:\t", accuracy_per_fold_reg2_2.loc[:,'MAE'].mean())
print("RMSE on Training set for Regression:\t", accuracy_per_fold_reg2_2.loc[:,'RMSE'].mean())

# UV Matrix Decomposition

In [None]:
ratings_sub = ratings


np.random.seed(120)
# np.repeat needs an integer repeat count ("/" yields a float in Python 3)
indices = np.repeat(range(5), len(ratings_sub) // 5 + 1)[:len(ratings_sub)]
np.random.shuffle(indices)


# Per-fold accuracies; after the loop each list holds one value per CV fold
rmse_train = []
rmse_test = []
mae_train = []
mae_test = []


def _nanmean_or(values, axis, fallback):
    """Mean along `axis` ignoring NaNs (dims kept); slices without any value get `fallback`."""
    counts = np.sum(~np.isnan(values), axis=axis, keepdims=True)
    totals = np.nansum(values, axis=axis, keepdims=True)
    return np.where(counts > 0, totals / np.maximum(counts, 1), fallback)


for k in range(5):  # Cross-validation
    # Training utility matrix:
    #  - all rows are kept (also those of the test fold) so that U and V get the
    #    dimensionality needed to predict every (user, movie) pair
    #  - the ratings of the test fold are blanked out (NaN)
    print(k)
    train_dat = ratings_sub.copy()
    # Cast first so NaN can be stored without writing a string into a numeric column
    train_dat['Rating'] = train_dat['Rating'].astype(float)
    train_dat.loc[indices == k, 'Rating'] = np.nan
    M_train = train_dat.pivot(index = 'UserID', columns = 'MovieID', values = 'Rating').to_numpy(dtype=float)

    # Preprocessing: normalize the utility matrix.
    # Step 1: subtract from each element its row (user) mean.
    # Step 2: subtract from each element the column (movie) mean of the row-centred matrix.
    # Both offsets are kept so that exactly the same values can be added back later.
    # Users without any training rating fall back to the global mean, and movies
    # without any training rating get a zero column offset.
    global_mean = np.nanmean(M_train)
    row_means = _nanmean_or(M_train, 1, global_mean)   # shape (n, 1)
    temp_M = M_train - row_means
    col_means = _nanmean_or(temp_M, 0, 0.0)            # shape (1, m)
    temp_M = temp_M - col_means
    observed = ~np.isnan(temp_M)

    # Initialization
    n, m = temp_M.shape  # number of users, number of movies
    d = 10               # length of the short sides of U and V, i.e. number of features

    # After normalization the mean of M is ~0, so the common sqrt(a/d) starting value
    # would be ~0 as well; start from small random noise instead.
    U = np.random.normal(0, 0.1, (n, d))
    V = np.random.normal(0, 0.1, (d, m))

    rmse_new = 998  # Initial values chosen so that the loop runs at least once
    rmse_old = 999

    # Optimization: keep sweeping while the RMSE and its improvement exceed the thresholds
    while rmse_new > 1e-3 and (rmse_old - rmse_new) > 1e-4:
        rmse_old = rmse_new

        # Update every entry U[r, s] to the value that zeroes the derivative of the
        # squared error over the observed entries of row r, holding everything else fixed.
        for r in range(n):
            obs = observed[r]
            if not obs.any():
                continue  # user has no training ratings: nothing to fit
            target = temp_M[r, obs]
            V_obs = V[:, obs]  # V is constant during the U sweep
            for s in range(d):
                denom = np.sum(V_obs[s] ** 2)
                if denom == 0:
                    continue
                # Residual of the prediction without feature s; algebraically the same
                # as deleting column s of U / row s of V, without copying the matrices
                partial = target - np.dot(U[r], V_obs) + U[r, s] * V_obs[s]
                U[r, s] = np.sum(V_obs[s] * partial) / denom

        # Same closed-form update for every entry V[r, s]. Columns of V are independent
        # of each other, so iterating movie by movie yields the same per-column update order.
        for s in range(m):
            obs = observed[:, s]
            if not obs.any():
                continue  # movie has no training ratings: nothing to fit
            target = temp_M[obs, s]
            U_obs = U[obs]  # U is constant during the V sweep
            for r in range(d):
                denom = np.sum(U_obs[:, r] ** 2)
                if denom == 0:
                    continue
                partial = target - np.dot(U_obs, V[:, s]) + U_obs[:, r] * V[r, s]
                V[r, s] = np.sum(U_obs[:, r] * partial) / denom

        P = U.dot(V)  # current reconstruction of the normalized matrix
        rmse_new = np.sqrt(np.nanmean((temp_M - P) ** 2))  # RMSE on the observed (training) entries

        print(rmse_new)

    # Reverse the normalization by adding back exactly the offsets that were removed,
    # so predictions are comparable with the original ratings.
    P = P + row_means + col_means

    P[P < 1] = 1
    P[P > 5] = 5

    # Accuracy on the training data
    rmse_train_fold = np.sqrt(np.nanmean((M_train - P) ** 2))
    mae_train_fold = np.nanmean(abs(M_train - P))
    rmse_train.append(rmse_train_fold)
    mae_train.append(mae_train_fold)

    # Test utility matrix: same shape as the training one, but now the training
    # ratings are blanked out so that only test entries contribute to the error.
    test_dat = ratings_sub.copy()
    test_dat['Rating'] = test_dat['Rating'].astype(float)
    test_dat.loc[indices != k, 'Rating'] = np.nan
    M_test = test_dat.pivot(index = 'UserID', columns = 'MovieID', values = 'Rating').to_numpy(dtype=float)
    rmse_test_fold = np.sqrt(np.nanmean((M_test - P) ** 2))  # RMSE on the test set of the kth fold
    mae_test_fold = np.nanmean(abs(M_test - P))              # MAE on the test set of the kth fold
    rmse_test.append(rmse_test_fold)
    mae_test.append(mae_test_fold)

# Matrix Factorisation

In [None]:
# NOTE(review): the comment used to say "testing on a subset", but the full ratings table is used
ratings_sub = ratings
np.random.seed(120)
# np.repeat needs an integer repeat count ("/" yields a float in Python 3)
indices = np.repeat(range(5), len(ratings_sub) // 5 + 1)[:len(ratings_sub)]
np.random.shuffle(indices)

# Per-fold accuracies; after the loop each list holds one value per CV fold
all_rmse_train = []
all_mae_train = []
all_rmse_test = []
all_mae_test = []

learning_rate = 0.005
regularization = 0.05
n_epochs = 75

# Procedure according to p.24 of gravity_Tikk article:
for k in range(5):
    mse_old = 100
    mse_new = 99
    counter = 0

    # Training utility matrix: all rows are kept, ratings of the test fold are blanked out
    train_dat = ratings_sub.copy()
    # Cast first so NaN can be stored without writing a string into a numeric column
    train_dat['Rating'] = train_dat['Rating'].astype(float)
    train_dat.loc[indices == k, 'Rating'] = np.nan
    M = train_dat.pivot(index = 'UserID', columns = 'MovieID', values = 'Rating').to_numpy(dtype=float)
    inds = np.where(~np.isnan(M))  # (row indices, column indices) of the observed entries
    rows, cols = inds

    d = 10
    n, m = M.shape  # number of users, number of movies

    # Random (non-constant) initialization of U and V
    U = np.random.normal(0, 1, (n, d))
    V = np.random.normal(0, 1, (d, m))

    while counter < n_epochs:
        mse_old = mse_new
        for i, j in zip(*inds):
            err = M[i, j] - np.dot(U[i, :], V[:, j])
            # Keep the pre-update user vector so that U and V are both updated from the
            # same state for this rating. Only the row is copied, not the whole matrix.
            u_prev = U[i, :].copy()
            U[i, :] = U[i, :] + learning_rate * ((2 * err * V[:, j]) - regularization * U[i, :])
            V[:, j] = V[:, j] + learning_rate * ((2 * err * u_prev) - regularization * V[:, j])

        # Training MSE over all observed entries after this epoch (tracked for
        # monitoring; the loop itself always runs for n_epochs epochs)
        residuals = M[rows, cols] - np.sum(U[rows] * V[:, cols].T, axis=1)
        mse_new = np.mean(residuals ** 2)
        counter += 1

    P = np.dot(U, V)

    # Clip predictions to the valid rating range
    P[P < 1] = 1
    P[P > 5] = 5

    # Accuracy on the training set
    rmse_train_fold = np.sqrt(np.nanmean((M - P) ** 2))
    mae_train_fold = np.nanmean(abs(M - P))
    all_rmse_train.append(rmse_train_fold)
    all_mae_train.append(mae_train_fold)

    # Accuracy on the test set: blank out the training ratings so that only the
    # test entries contribute to the error
    test_dat = ratings_sub.copy()
    test_dat['Rating'] = test_dat['Rating'].astype(float)
    test_dat.loc[indices != k, 'Rating'] = np.nan
    M_test = test_dat.pivot(index = 'UserID', columns = 'MovieID', values = 'Rating').to_numpy(dtype=float)
    rmse_test_fold = np.sqrt(np.nanmean((M_test - P) ** 2))
    mae_test_fold = np.nanmean(abs(M_test - P))
    all_rmse_test.append(rmse_test_fold)
    all_mae_test.append(mae_test_fold)
