In [1]:
# Please don't change this cell
import pandas as pd
import numpy as np  
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Please don't change this cell
# Load the tab-separated ratings file; it has no header row, so column names are supplied here.
df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
# Preview the first rows of the raw ratings table.
df.head(25)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [3]:
from sklearn.model_selection import train_test_split

def build_rating_matrix(ratings_df, n_users, n_items):
    """Build a dense (n_users, n_items) rating matrix from a ratings table.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Must contain the columns 'user_id', 'item_id' and 'rating'.
    n_users, n_items : int
        Shape of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Float matrix whose cell [user_id - 1, item_id - 1] holds the rating;
        every cell without a rating stays 0.
    """
    # NOTE(review): assumes ids are 1-based and no larger than n_users / n_items,
    # and that each (user_id, item_id) pair appears at most once — confirm for
    # any dataset other than the one loaded above.
    matrix = np.zeros((n_users, n_items))
    user_idx = ratings_df['user_id'].values - 1
    item_idx = ratings_df['item_id'].values - 1
    matrix[user_idx, item_idx] = ratings_df['rating'].values
    return matrix


n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

# Fixed random_state keeps the 80/20 split reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=10)

# Training Dataset
train_ds = build_rating_matrix(train_df, n_users, n_items)

# Testing Dataset
test_ds = build_rating_matrix(test_df, n_users, n_items)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
# Please don't change this cell
# Small constant added to denominators elsewhere in this notebook to avoid division by zero.
EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Score predictions against the held-out ratings.

    Only cells where test_ds > 0 take part in the metrics.
    Returns the tuple (MAE, RMSE).
    '''
    observed = test_ds > 0
    # Number of scored cells, counted the same way for both metrics.
    n_observed = np.sum(observed.astype(np.float32))
    residuals = test_ds[observed] - predicted_ds[observed]

    # Mean absolute error over the observed cells.
    MAE = np.sum(np.abs(residuals)) / n_observed

    # Root of the mean squared error over the same cells.
    RMSE = np.sqrt(np.sum(np.square(residuals)) / n_observed)

    return MAE, RMSE

In [8]:
import sklearn.metrics
from sklearn.model_selection import train_test_split

In [5]:
# Please don't change this cell
import pandas as pd
import numpy as np
import warnings
import sklearn.metrics
from sklearn.model_selection import train_test_split

# Write your code here
# You are required to implement the required solution here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

def deviation_matrix(train_ds, lambda_val=0.5):
    """Compute average pairwise item rating deviations and co-rating counts.

    For every ordered item pair (i, j), ``freq[i, j]`` is the number of users
    whose ratings for both items are > 0, and ``dev[i, j]`` is the sum of
    ``train_ds[u, j] - train_ds[u, i]`` over those users divided by
    ``freq[i, j] + EPSILON``.

    Parameters
    ----------
    train_ds : array-like of shape (n_users, n_items)
        Rating matrix; entries that are not > 0 are treated as "not rated".
    lambda_val : float, optional
        NOTE: currently not used by the computation. It is kept so existing
        calls keep working; changing it does not change the returned values.

    Returns
    -------
    dev : numpy.ndarray of shape (n_items, n_items)
        Average deviations; antisymmetric with a zero diagonal.
    freq : numpy.ndarray of shape (n_items, n_items)
        Co-rating counts; the diagonal holds the number of ratings per item.
    """
    ratings = np.asarray(train_ds, dtype=float)
    rated = ratings > 0
    rated_f = rated.astype(float)
    # Ratings with every "not rated" cell forced to 0 so it adds nothing to the sums.
    observed = np.where(rated, ratings, 0.0)

    # freq[i, j] = number of users who rated both i and j.
    freq = rated_f.T @ rated_f

    # sums[i, j] = sum of item j's ratings over users who rated both i and j,
    # so sums - sums.T is the summed (rating_j - rating_i) per pair. This is the
    # matrix-product form of looping over every user and every item pair.
    sums = rated_f.T @ observed
    dev = sums - sums.T

    # EPSILON (module-level constant) keeps pairs with no co-raters at 0 instead of NaN.
    dev /= freq + EPSILON

    return dev, freq

def predict(train_ds, dev, freq):
    """Predict a rating for every cell of ``train_ds`` that equals 0.

    For user ``u`` and item ``i`` with ``train_ds[u, i] == 0`` the prediction is

        sum_j (dev[j, i] + train_ds[u, j]) * freq[j, i]
        -----------------------------------------------
                  sum_j freq[j, i] + EPSILON

    where ``j`` runs over the items user ``u`` has rated (rating > 0).
    Cells that are not 0 in ``train_ds`` are left at 0 in the result.

    Parameters
    ----------
    train_ds : array-like of shape (n_users, n_items)
        Rating matrix; 0 marks a cell to predict.
    dev, freq : array-like of shape (n_items, n_items)
        Average deviations and co-rating counts for item pairs.

    Returns
    -------
    numpy.ndarray of shape (n_users, n_items)
    """
    ratings = np.asarray(train_ds, dtype=float)
    dev = np.asarray(dev, dtype=float)
    freq = np.asarray(freq, dtype=float)

    rated = ratings > 0
    rated_f = rated.astype(float)
    # Ratings with every "not rated" cell forced to 0 so it adds nothing to the sums.
    observed = np.where(rated, ratings, 0.0)

    # The sum over rated items j splits into a dev*freq part and a rating*freq
    # part; each is one matrix product covering all (user, item) pairs at once.
    numerator = rated_f @ (dev * freq) + observed @ freq
    denominator = rated_f @ freq

    # EPSILON (module-level constant) avoids 0/0 when a user has no usable neighbours.
    return np.where(ratings == 0, numerator / (denominator + EPSILON), 0.0)

# Step 1: item-to-item deviations and co-rating counts for this run (lambda_val=0.5).
dev, freq = deviation_matrix(train_ds, lambda_val=0.5)

# Step 2: produce a prediction matrix from the training ratings.
predicted_ds = predict(train_ds=train_ds, dev=dev, freq=freq)

# Step 3: score the predictions against the held-out ratings and report both metrics.
MAE, RMSE = evaluate(test_ds=test_ds, predicted_ds=predicted_ds)
print(f"MAE: {MAE}, RMSE: {RMSE}")

MAE: 0.7446695207901444, RMSE: 0.9533358060356125


In [7]:
# Please don't change this cell
import pandas as pd
import numpy as np
import warnings
import sklearn.metrics
from sklearn.model_selection import train_test_split

# Write your code here
# You are required to implement the required solution here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

def deviation_matrix(train_ds, lambda_val=0.9):
    """Compute average pairwise item rating deviations and co-rating counts.

    For every ordered item pair (i, j), ``freq[i, j]`` is the number of users
    whose ratings for both items are > 0, and ``dev[i, j]`` is the sum of
    ``train_ds[u, j] - train_ds[u, i]`` over those users divided by
    ``freq[i, j] + EPSILON``.

    Parameters
    ----------
    train_ds : array-like of shape (n_users, n_items)
        Rating matrix; entries that are not > 0 are treated as "not rated".
    lambda_val : float, optional
        NOTE: currently not used by the computation. It is kept so existing
        calls keep working; changing it does not change the returned values.

    Returns
    -------
    dev : numpy.ndarray of shape (n_items, n_items)
        Average deviations; antisymmetric with a zero diagonal.
    freq : numpy.ndarray of shape (n_items, n_items)
        Co-rating counts; the diagonal holds the number of ratings per item.
    """
    ratings = np.asarray(train_ds, dtype=float)
    rated = ratings > 0
    rated_f = rated.astype(float)
    # Ratings with every "not rated" cell forced to 0 so it adds nothing to the sums.
    observed = np.where(rated, ratings, 0.0)

    # freq[i, j] = number of users who rated both i and j.
    freq = rated_f.T @ rated_f

    # sums[i, j] = sum of item j's ratings over users who rated both i and j,
    # so sums - sums.T is the summed (rating_j - rating_i) per pair. This is the
    # matrix-product form of looping over every user and every item pair.
    sums = rated_f.T @ observed
    dev = sums - sums.T

    # EPSILON (module-level constant) keeps pairs with no co-raters at 0 instead of NaN.
    dev /= freq + EPSILON

    return dev, freq

def predict(train_ds, dev, freq):
    """Predict a rating for every cell of ``train_ds`` that equals 0.

    For user ``u`` and item ``i`` with ``train_ds[u, i] == 0`` the prediction is

        sum_j (dev[j, i] + train_ds[u, j]) * freq[j, i]
        -----------------------------------------------
                  sum_j freq[j, i] + EPSILON

    where ``j`` runs over the items user ``u`` has rated (rating > 0).
    Cells that are not 0 in ``train_ds`` are left at 0 in the result.

    Parameters
    ----------
    train_ds : array-like of shape (n_users, n_items)
        Rating matrix; 0 marks a cell to predict.
    dev, freq : array-like of shape (n_items, n_items)
        Average deviations and co-rating counts for item pairs.

    Returns
    -------
    numpy.ndarray of shape (n_users, n_items)
    """
    ratings = np.asarray(train_ds, dtype=float)
    dev = np.asarray(dev, dtype=float)
    freq = np.asarray(freq, dtype=float)

    rated = ratings > 0
    rated_f = rated.astype(float)
    # Ratings with every "not rated" cell forced to 0 so it adds nothing to the sums.
    observed = np.where(rated, ratings, 0.0)

    # The sum over rated items j splits into a dev*freq part and a rating*freq
    # part; each is one matrix product covering all (user, item) pairs at once.
    numerator = rated_f @ (dev * freq) + observed @ freq
    denominator = rated_f @ freq

    # EPSILON (module-level constant) avoids 0/0 when a user has no usable neighbours.
    return np.where(ratings == 0, numerator / (denominator + EPSILON), 0.0)

# Step 1: item-to-item deviations and co-rating counts for this run (lambda_val=0.9).
dev, freq = deviation_matrix(train_ds, lambda_val=0.9)

# Step 2: produce a prediction matrix from the training ratings.
predicted_ds = predict(train_ds=train_ds, dev=dev, freq=freq)

# Step 3: score the predictions against the held-out ratings and report both metrics.
MAE, RMSE = evaluate(test_ds=test_ds, predicted_ds=predicted_ds)
print(f"MAE: {MAE}, RMSE: {RMSE}")

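# Note: the MAE (~0.74) and RMSE (~0.95) are identical to the lambda_val=0.5 run. This is not what
# I expected, as a higher lambda should ideally balance the ratings and reduce the prediction errors.
# The cause is that deviation_matrix() never reads lambda_val, so changing it cannot affect the result.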

MAE: 0.7446695207901444, RMSE: 0.9533358060356125


In [8]:
# Please don't change this cell
import pandas as pd
import numpy as np
import warnings
import sklearn.metrics
from sklearn.model_selection import train_test_split

# Write your code here
# You are required to implement the required solution here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

def deviation_matrix(train_ds, lambda_val=0.1):
    """Compute average pairwise item rating deviations and co-rating counts.

    For every ordered item pair (i, j), ``freq[i, j]`` is the number of users
    whose ratings for both items are > 0, and ``dev[i, j]`` is the sum of
    ``train_ds[u, j] - train_ds[u, i]`` over those users divided by
    ``freq[i, j] + EPSILON``.

    Parameters
    ----------
    train_ds : array-like of shape (n_users, n_items)
        Rating matrix; entries that are not > 0 are treated as "not rated".
    lambda_val : float, optional
        NOTE: currently not used by the computation. It is kept so existing
        calls keep working; changing it does not change the returned values.

    Returns
    -------
    dev : numpy.ndarray of shape (n_items, n_items)
        Average deviations; antisymmetric with a zero diagonal.
    freq : numpy.ndarray of shape (n_items, n_items)
        Co-rating counts; the diagonal holds the number of ratings per item.
    """
    ratings = np.asarray(train_ds, dtype=float)
    rated = ratings > 0
    rated_f = rated.astype(float)
    # Ratings with every "not rated" cell forced to 0 so it adds nothing to the sums.
    observed = np.where(rated, ratings, 0.0)

    # freq[i, j] = number of users who rated both i and j.
    freq = rated_f.T @ rated_f

    # sums[i, j] = sum of item j's ratings over users who rated both i and j,
    # so sums - sums.T is the summed (rating_j - rating_i) per pair. This is the
    # matrix-product form of looping over every user and every item pair.
    sums = rated_f.T @ observed
    dev = sums - sums.T

    # EPSILON (module-level constant) keeps pairs with no co-raters at 0 instead of NaN.
    dev /= freq + EPSILON

    return dev, freq

def predict(train_ds, dev, freq):
    """Predict a rating for every cell of ``train_ds`` that equals 0.

    For user ``u`` and item ``i`` with ``train_ds[u, i] == 0`` the prediction is

        sum_j (dev[j, i] + train_ds[u, j]) * freq[j, i]
        -----------------------------------------------
                  sum_j freq[j, i] + EPSILON

    where ``j`` runs over the items user ``u`` has rated (rating > 0).
    Cells that are not 0 in ``train_ds`` are left at 0 in the result.

    Parameters
    ----------
    train_ds : array-like of shape (n_users, n_items)
        Rating matrix; 0 marks a cell to predict.
    dev, freq : array-like of shape (n_items, n_items)
        Average deviations and co-rating counts for item pairs.

    Returns
    -------
    numpy.ndarray of shape (n_users, n_items)
    """
    ratings = np.asarray(train_ds, dtype=float)
    dev = np.asarray(dev, dtype=float)
    freq = np.asarray(freq, dtype=float)

    rated = ratings > 0
    rated_f = rated.astype(float)
    # Ratings with every "not rated" cell forced to 0 so it adds nothing to the sums.
    observed = np.where(rated, ratings, 0.0)

    # The sum over rated items j splits into a dev*freq part and a rating*freq
    # part; each is one matrix product covering all (user, item) pairs at once.
    numerator = rated_f @ (dev * freq) + observed @ freq
    denominator = rated_f @ freq

    # EPSILON (module-level constant) avoids 0/0 when a user has no usable neighbours.
    return np.where(ratings == 0, numerator / (denominator + EPSILON), 0.0)

# Step 1: item-to-item deviations and co-rating counts for this run (lambda_val=0.1),
# to see the impact on MAE and RMSE.
dev, freq = deviation_matrix(train_ds, lambda_val=0.1)

# Step 2: produce a prediction matrix from the training ratings.
predicted_ds = predict(train_ds=train_ds, dev=dev, freq=freq)

# Step 3: score the predictions against the held-out ratings and report both metrics.
MAE, RMSE = evaluate(test_ds=test_ds, predicted_ds=predicted_ds)
print(f"MAE: {MAE}, RMSE: {RMSE}")

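# Note: the MAE (~0.74) and RMSE (~0.95) are identical to the lambda_val=0.5 and 0.9 runs. This is not
# what I expected, as a lower lambda should have put the ratings off-balance and increased the
# prediction errors.
# The cause is that deviation_matrix() never reads lambda_val, so changing it cannot affect the result.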

MAE: 0.7446695207901444, RMSE: 0.9533358060356125


In [9]:
# Write your code here
# You are required to implement the required solution here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

MAE = 0.7446695207901444 # MAE printed by the evaluation cells above; replaces the template's initial value of 0.
RMSE = 0.9533358060356125 # RMSE printed by the evaluation cells above; replaces the template's initial value of 0.

In [10]:
# Please don't change this cell

# Final report of the values currently stored in MAE and RMSE.
print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}".format(MAE, RMSE))

MAE: 0.7446695207901444, RMSE: 0.9533358060356125
