# Install and load necesary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

## Load the dataset using pandas

In [2]:
# Please don't change this cell

df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

# obtain top 500 users and top 500 items
user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(500).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(500).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1,186,302,3,891717742
3,244,51,2,880606923
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467


# Split dataset

## Randomly select one rating from each user as test set

In [3]:
# Please don't change this cell

# remap user and item ID
df['user_id'] = df.groupby('user_id').ngroup()
df['item_id'] = df.groupby('item_id').ngroup()

test_df = df.groupby('user_id').sample(1, random_state=1024)
train_df = df[~df.index.isin(test_df.index)]

In [4]:
# Please don't change this cell

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
avg_num = df.groupby('user_id').size().mean()
density = df.shape[0] / (n_users * n_items)
min_ratings = df.rating.min()
max_ratings = df.rating.max()

print("The number of users: {}" .format(n_users))
print("The number of items: {}" .format(n_items))
print("Avg. # of rated Items/User: {}" .format(avg_num))
print("Density of data: {}" .format(density))
print("Ratings Range: {} - {}" .format(min_ratings, max_ratings))

The number of users: 500
The number of items: 500
Avg. # of rated Items/User: 129.914
Density of data: 0.259828
Ratings Range: 1 - 5


In [5]:
# Please don't change this cell

# Convert the format of datasets to matrices
# Train dataset
df_zeros = pd.DataFrame({
    'user_id': np.tile(np.arange(0, n_users), n_items), 
    'item_id': np.repeat(np.arange(0, n_items), n_users), 
    'rating': 0})
train_ds = df_zeros.merge(train_df, 
                          how='left', 
                          on=['user_id', 'item_id']).fillna(0.).pivot_table(
                              values='rating_y', 
                              index='user_id', 
                              columns='item_id').values
                           
# Test dataset
test_ds = df_zeros.merge(test_df, 
                         how='left', 
                         on=['user_id', 'item_id']).fillna(0.).pivot_table(
                             values='rating_y', 
                             index='user_id', 
                             columns='item_id').values

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

Construct the rating matrix based on train_df:
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [4. 3. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 4. 0.]]
Construct the rating matrix based on test_df:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Utils

In [6]:
# Please don't change this cell
EPSILON = 1e-9

def user_corr(imputed_train_ds):
    '''
    Function for calculating user's similarity
    '''
    active_user_pearson_corr = np.zeros((imputed_train_ds.shape[0], imputed_train_ds.shape[0]))

    # Compute Pearson Correlation Coefficient of All Pairs of Users between active set and training dataset
    for i, user_i_vec in enumerate(imputed_train_ds):
        for j, user_j_vec in enumerate(imputed_train_ds):

            # ratings corated by the current pair od users
            mask_i = user_i_vec > 0
            mask_j = user_j_vec > 0

            # corrated item index, skip if there are no corrated ratings
            corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
            if len(corrated_index) == 0:
                continue

            # average value of user_i_vec and user_j_vec
            mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
            mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

            # compute pearson corr
            user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
            user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

            r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
            r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

            r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
            r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

            sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)
            active_user_pearson_corr[i][j] = sim

    return active_user_pearson_corr

def predict(test_ds, imputed_train_ds, user_corr, k=20):
    '''
    Function for predicting ratings in test_ds
    '''

    # Predicting ratings of test set
    predicted_ds = np.zeros_like(test_ds)

    for (i, j), rating in np.ndenumerate(test_ds):

        if rating > 0:

            # only predict ratings on test set
            sim_user_ids = np.argsort(user_corr[i])[-1:-(k + 1):-1]

            #==================user-based==================#
            # the coefficient values of similar users
            sim_val = user_corr[i][sim_user_ids]

            # the average value of the current user's ratings
            sim_users = imputed_train_ds[sim_user_ids]
            
            mask_rateditem_user = imputed_train_ds[i] != 0
            num_rated_items = mask_rateditem_user.astype(np.float32)
            user_mean = np.sum(imputed_train_ds[i, mask_rateditem_user]) / (num_rated_items.sum() + EPSILON)

            mask_nei_rated_items = sim_users != 0
            num_rated_per_user = mask_nei_rated_items.astype(np.float32)
            num_per_user = num_rated_per_user.sum(axis=1)

            sum_per_user = sim_users.sum(axis=1)
            sim_user_mean = sum_per_user / (num_per_user + EPSILON)
            
            mask_rated_j = sim_users[:, j] > 0
                            
            # sim(u, v) * (r_vj - mean_v)
            sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
            
            user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)

            predicted_ds[i, j] = np.clip(user_based_pred, 0, 5)
            
    return predicted_ds

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Baseline - KNN based recommendation (Similarity Metric: Pearson Correlation Coefficient)

In [7]:
# Please don't change this cell

user_pearson_corr = user_corr(train_ds)
predicted_ds = predict(test_ds, train_ds, user_pearson_corr, k=20)

In [8]:
# Please don't change this cell

MAE, RMSE = evaluate(test_ds, predicted_ds)

print("===================== Baseline Result =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8471711011333851, RMSE: 1.092846045041526


# Your Solution
(Put all your implementation for your solution in the following cell only)

In [9]:
#Firstly, creating a new dataframe which consists of the train_ds and test_ds- the sets I have to work on. 

train_ds =pd.DataFrame (train_ds)
test_ds =pd.DataFrame (test_ds)

#First the important step in the implementation is to calculate the P(t) value- which will give us the rating$ of what 
# every user gave for item t 

# I create a list for P(t)
# POPULARITY = P(t)
item_t_pop = []

for i in train_ds.columns:
    pop= train_ds[i].value_counts().sum()
    item_t_pop.append(pop)
    
# A very important and new implementation step is to compute the weight 

# the weight supports that the less popular items contribute to a higher significance
# n_users= 500 is the total number of users in database and i is the item_t_popularity
# W(t) = weight
weight_item_t= []
for i in item_t_pop:
    weight= np.log (n_users/i)
    weight_item_t.append(weight) 
    
    
# Using epsilon to avoid errors and gamma =30 as based on research it is a good number,
# this is even more reliable as if lesser than this it will reduce its significance.

EPSILON = 1e-9
GAMMA = 30

pearson_correlation = np.zeros((n_users, n_users))

for i, user_a_v in enumerate(train_ds.values):
    for j, user_u_v in enumerate(train_ds.values):

        # calculating the ratings of the pairs of users who corated 
        user_a_val = user_a_v > 0
        user_u_val = user_u_v > 0

        # calculating the union for user a and u
        # the if condition is used when the corrated items are larger than 0 then only I can calculate 
        # the similarity, if it is lower than 0 those ratings should not be taken forward as no similarity 
        # between the two
        items_union_a_u= np.union1d(np.where(user_a_val),np.where(user_u_val))
        if len (items_union_a_u) == 0: 
            continue
            
        # calculating the mean for users a and u
        # using np.clip to keep all values below 0 as o and above 1 as 1
        mean_user_a = np.sum(user_a_v) / (np.sum(np.clip(user_a_v, 0, 1)) + EPSILON)
        mean_user_u = np.sum(user_u_v) / (np.sum(np.clip(user_u_v, 0, 1)) + EPSILON)
        
        # adding missing data
        # here for adding the missing data created a for loop in which finding the item set the user has not rated
        # and making it == the mean for that user
        for ind in train_ds.index:
            if(ind == i): 
                for index in items_union_a_u:
                    if(train_ds[index][ind] == 0) :
                        train_ds[index][ind] = mean_user_a
        
        for ind in train_ds.index:
            if(ind == j): 
                for index in items_union_a_u:
                    if(train_ds[index][ind] == 0) :
                        train_ds[index][ind] = mean_user_u
                    
        # Introducing the weight element here to find the new pcc
        # add each item's weight
        weight_square_a_u = []
        for i in items_union_a_u:
            weight_square_a_u.append(weight_item_t[i]* weight_item_t[i])
        weight_square_a_u = pd.Series(weight_square_a_u) 
        # Changed the weight square into a series as in an array an error will show as it will not be able to 
        #compute the values. Therefore, converted it into series.

        #Calculating the user a and u sub mean
        user_a_sub_mean = user_a_v[items_union_a_u] - mean_user_a
        user_u_sub_mean = user_u_v[items_union_a_u] - mean_user_u

        #squaring the means
        sq_a_mean = np.square(user_a_sub_mean)
        sq_u_mean = np.square(user_u_sub_mean)
        
        #denominator calculation for PCC
        # multiplying the weight square
        a_sqrt = np.sqrt(np.sum(weight_square_a_u*sq_a_mean))
        u_sqrt = np.sqrt(np.sum(weight_square_a_u*sq_u_mean))

        #numerator calculation for PCC
        new_user_au_sub_mean= weight_square_a_u*user_a_sub_mean*user_u_sub_mean
        sim_au = np.sum(new_user_au_sub_mean) / (a_sqrt * u_sqrt + EPSILON)
        
        #calculating the significance weighting
        sig_wt = (min(len(items_union_a_u), GAMMA) / GAMMA) * sim_au

        pearson_correlation[i][j] = sig_wt 


#after finding the similarities between pairs I can move on to prediction

#Predict the ratings

prediction_matrix = np.zeros((n_users, n_items))

K = 20
EPSILON = 1e-9

# I use denumerate here as I want to do the prediction for every rating in the test set by checking the values 
# in a and u
# here i is the active user and it can be anyone in the test set, and t is the item that user is rating

for (i, t), rating in np.ndenumerate(test_ds.values):
    if rating > 0:
        # i is the active user
        #argsort is used to sort all the current users based on similarity in an ascending order
        # and -1 is used here as I need to access from the bottom i.e., the last value. Because argsort sorts it
        #in a descending order. 
        
        similar_users = np.argsort(pearson_correlation[i])[-(K + 1):-1]

        # similarity values
        coeff_val = pearson_correlation[i][similar_users]

        # value of similar users.'similar_users' from the k nearest neighbours
        #this matrix is smaller which includes all the k-nearest neighbours from all rows
        #each value is taken from the train_ds
        k_sim_users = train_ds.values[similar_users]
        
        #mean values for the current user
        #this is a calculation just for one row [i]
        current_user_mean = np.sum(train_ds.values[i]) / (np.sum(np.clip(train_ds.values[i], 0, 1)) + EPSILON)
        
        # this gives mean values for all selected k neighbours 
        # while this calculates for k rows for the k -nearest neighbours
        #axis=1 is for every row
        k_neighbours_mean = np.sum(k_sim_users, axis=1) / (np.sum(np.clip(k_sim_users, 0, 1), axis=1) + EPSILON)

        # all the users in k-nearest neighbours who rated item t
        rating_t = k_sim_users[:, t] > 0
        
        final_value = coeff_val[rating_t] * (k_sim_users[rating_t, t] - k_neighbours_mean[rating_t])

        
        prediction_matrix[i][t] = current_user_mean + np.sum(final_value) / (np.sum(coeff_val[rating_t]) + EPSILON)
        # using a clip, just incase to change the rating values which are out of range to within the range of 0 to 5
        # as mentioned in the readme file for the given dataset
        
        prediction_matrix[i][t] = np.clip(prediction_matrix[i][t], 0, 5)
   




 ##Finally, calculating the MAE and RMSE values   

#==================MAE on Testing set===================#
actual_ratings = test_ds.values

# calculating the differences and absolute errors
error_abs = np.abs(prediction_matrix - actual_ratings)

# calculating all the ratings in the test set 
# this is the weight
wt = np.clip(actual_ratings, 0, 1)

# calculating the absoulte error only on the rated ratings
error_abs_rated = error_abs * wt

# MAE
MAE = np.sum(error_abs_rated) / np.sum(wt) 

#==================RMSE on Testing set===================#
actual_ratings = test_ds.values

# calculating the squared error
sqr_error = np.square(prediction_matrix - actual_ratings)
wt = np.clip(actual_ratings, 0, 1)

# calculating the squared error only on the rated ratings
sqr_error = sqr_error * wt

# RMSE
RMSE = np.sqrt(np.sum(sqr_error) / np.sum(wt))

#RMSE tries to panelize the larger errors more hence, it will always be greater than the MAE



# RECOMMENDATION- ComRV calculation
#ComRV= P(t)*R(t)[mean]

sum_ratings  = train_ds.sum(axis=0)

Popularity = pd.Series(item_t_pop)

#to find out R(T)[mean], calculting the sum of ratings divided by the item popularity P(t)
mean = sum_ratings/Popularity

#using np.clip to keep it within range
mean = np.clip(mean, 0, 5)

#finally, multiply the Popularity with the mean value R(t)
Com_RV = Popularity*mean

# calculating the top n highest prediction value- the top 10 

Top_n_most_pop= Com_RV.nlargest(n=10, keep='first').sort_values(ascending=False)

MAE = 0.8381396662804781 
RMSE = 1.0384051910547503

# In my implementation as you can notice, my MAE and RMSE values are lower than the baseline recommendation which is
# MAE= 0.8471711011333851, RMSE: 1.092846045041526

## Print the MAE and RMSE of Your Implementation

In [10]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8381396662804781, RMSE: 1.0384051910547503
