# Install and load necesary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Please don't change this cell
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset
## Random Train and Test Split

In [3]:
# please do not change this cell

from sklearn.model_selection import train_test_split

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
item_popularity = np.zeros(n_items)
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
    item_popularity[row[2]-1] =  item_popularity[row[2]-1] + 1
#train_ds = pd.DataFrame(train_ds)

# Testing Dataset
testsize = 0
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    if item_popularity[row[2]-1] > 30:
        test_ds[row[1]-1, row[2]-1] = row[3]
        testsize = testsize + 1
#test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

print("Testsize = " + str(testsize))

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Testsize = 17678


# Utils

In [4]:
# Please don't change this cell
# you can use this devaluate Utils here, and you can also implement your own MAE and RMSE calculation. 

EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Your Solution

In [5]:
# Write your code here
# You are required to implement the required solution here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

#--------------------------------------------
# DEFINING VARIABLES
#--------------------------------------------

MAE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.
RMSE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.

# Convert numpy arrays into panda dataframes
train_ds = pd.DataFrame(train_ds)
test_ds = pd.DataFrame(test_ds) 

# Set GAMMA and Epsilon values for weighted calculations and avoiding divison by 0
GAMMA = 30
EPSILON = 1e-9

# Min and Max ratings for movies that users can give (1-5)
V_MAX = 5 
V_MIN = 1

#-----------------------------------------------------------------------------------
# CALCULATING ADJUSTED EUCLIDEAN DISTANCE FOR EACH PAIR OF USERS IN TRAINING DATASET
#-----------------------------------------------------------------------------------

# Code has been modified from KNN_based_CF_final from lecture 10 for adjusted euclidean distance

# Creating matrix with size of n_users by n_users to hold adjusted euclidean distance
np_user_aed = np.zeros((n_users, n_users))

for i, user_i_vec in enumerate(train_ds.values):
    for j, user_j_vec in enumerate(train_ds.values):
        
        # Skip if user i and user j equal each other, as we are not comparing user with themselves
        if i == j:
            continue

        # Ratings co-rated by the current pair of users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # Co-rated item index, skip if there are no co-rated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # Calculate adjusted euclidean distance
        squared_difference_sum = np.sum(np.square(user_i_vec[corrated_index] - user_j_vec[corrated_index]))

        aed = np.sqrt(squared_difference_sum) / np.sqrt((V_MAX - V_MIN)**2 * len(corrated_index))
        similarity = 1 - aed

        # Significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * similarity

        # Put the weighted sim into the np_user_aed matrix
        np_user_aed[i][j] = weighted_sim

#--------------------------------------------
# PREDICT RATINGS
#--------------------------------------------

# Create matrix to hold predictions of size n_users by n_items
np_predictions = np.zeros((n_users, n_items))

# K value is 100
K = 100
EPSILON = 1e-9

# Enumerate through the test values
for (i, j), rating in np.ndenumerate(test_ds.values):
    if rating > 0:
        # Find top-K most similar users as the current user, remove itself
        sim_user_ids = np.argsort(np_user_aed[i])[-(K + 1):-1]

        # The aed values of similar users
        sim_val = np_user_aed[i][sim_user_ids]

        # The average value of the current user's ratings
        sim_users = train_ds.values[sim_user_ids]
        user_mean = np.sum(train_ds.values[i]) / (np.sum(np.clip(train_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # Select the users who rated movie j
        mask_rated_j = sim_users[:, j] > 0
        
        if np.sum(mask_rated_j) == 0:
            # If none of the similar users rated movie j, use the average rating of the current user
            np_predictions[i][j] = user_mean
        else:
            # weighted aed
            sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
            np_predictions[i][j] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        
        # Clip ratings so that the range is only between 1 and 5
        np_predictions[i][j] = np.clip(np_predictions[i][j], 1, 5)


MAE, RMSE = evaluate(test_ds.values, np_predictions)

In [6]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7338597942106887, RMSE: 0.9419625815142503
