In [1]:
# Imports:

import numpy as np
import pandas as pd

In [2]:
# Empty ratings table: one row per existing user (ids 1 to 200) and one column per
# available movie (ids 1 to 1000). Every cell starts out as missing and is filled in later.

train_user_ids = range(1, 201)
movie_ids = range(1, 1001)

user_movie_matrix = pd.DataFrame(index=train_user_ids, columns=movie_ids)

In [3]:
# Filling the ratings table from 'train.txt': the n-th line of the file holds the
# whitespace-separated ratings of user n, and is stored (still as strings) in the row labelled n.

with open("../train.txt", "r") as train_file_handle:
    for user_id, ratings_line in enumerate(train_file_handle, start=1):
        user_movie_matrix.loc[user_id] = ratings_line.split()

In [4]:
# Displaying the Pandas DataFrame as populated above. The rendering below is abbreviated:
# elided rows and columns are shown as '...'.

user_movie_matrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
1,5,3,0,3,3,5,0,1,5,3,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,3,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,4,0,0,3,0,2,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199,1,0,0,0,0,0,4,0,5,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Conversion of 'user_movie_matrix' from a Pandas DataFrame to a Numpy Array.
# NOTE: the same name is rebound from the DataFrame to the array, so this cell is not idempotent:
# running it a second time without re-running the cells above fails, because a Numpy Array has
# no 'to_numpy' method.

user_movie_matrix = user_movie_matrix.to_numpy()

In [6]:
# Casting every element of the Numpy Array from its parsed string form to an unsigned
# 8-bit integer.

user_movie_matrix = user_movie_matrix.astype(np.uint8, copy=False)

In [7]:
# Displaying the final version of 'user_movie_matrix': a Numpy Array of unsigned 8-bit integers
# (one row per existing user, one column per movie).

user_movie_matrix

array([[5, 3, 0, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [4, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [5, 4, 0, ..., 0, 0, 0]], dtype=uint8)

In [8]:
# Zero-based column indices of the movies for which no existing user has provided a rating,
# i.e. the columns of 'user_movie_matrix' that consist entirely of zeros.

train_unrated_movies = [
    movie_index
    for movie_index, movie_ratings in enumerate(user_movie_matrix.T)
    if not movie_ratings.any()
]

In [9]:
# Two dictionaries describing 'test5.txt', both keyed by the zero-based position (0 to 99) of
# the active user and both starting out with an empty list per user.

# Ratings already provided by each active user for the movies they have watched:
test5_observed_ratings = {active_user: [] for active_user in range(100)}

# Target movie(s) of each active user, i.e. the movie(s) whose rating is to be predicted:
test5_required_predictions = {active_user: [] for active_user in range(100)}

In [10]:
# Reading 'test5.txt' to fill the two dictionaries declared above. Each line is parsed as
# three integers: user id, movie id and rating. User ids are shifted by 201 and movie ids by 1
# to obtain zero-based positions. A rating of 0 marks a movie whose rating must be predicted;
# any other rating is recorded as an observed (movie, rating) pair.

with open("../test5.txt", "r") as test5_file_handle:
    for line in test5_file_handle:
        fields = [int(token) for token in line.split()]
        active_user = fields[0] - 201
        movie_index = fields[1] - 1
        rating = fields[2]
        if rating == 0:
            test5_required_predictions[active_user].append(movie_index)
        else:
            test5_observed_ratings[active_user].append((movie_index, rating))

In [11]:
# Defining a function to compute the similarity between two arbitrary users (represented as vectors),
# based on their movie ratings.

def cosine_similarity(vector_1, vector_2):
    """Return the cosine of the angle between two equally long rating vectors.

    Both inputs are converted to floating point before any arithmetic. Without this, the dot
    product of two small-integer arrays (e.g. dtype uint8) is computed in that integer type and
    silently wraps around once it exceeds the type's range.

    Parameters:
        vector_1: one-dimensional array-like of numbers.
        vector_2: one-dimensional array-like of numbers, same length as 'vector_1'.

    Returns:
        dot(vector_1, vector_2) / (||vector_1|| * ||vector_2||), or 0.0 when either vector has
        zero length (norm), in which case the cosine is undefined. Returning 0.0 instead of NaN
        keeps an undefined similarity from being treated as a non-zero one by callers.
    """
    vector_1 = np.asarray(vector_1, dtype=float)
    vector_2 = np.asarray(vector_2, dtype=float)
    norm_product = np.linalg.norm(vector_1) * np.linalg.norm(vector_2)
    # Guard against 0/0, which would otherwise produce NaN together with a RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return np.dot(vector_1, vector_2) / norm_product

In [12]:
# Initialization of a file handler to operate on a text file in which to store results (i.e., predicted
# ratings for target movie(s) corresponding to each active user).
# The file is opened in write mode ("w") so that every run starts from an empty file; in append
# mode, re-running the notebook would stack a duplicate set of predictions onto the results of
# the previous run.

result5_file_handle = open("Task1-1-CosineSimilarity-Test05Result.txt", "w")

In [13]:
# COLLABORATIVE FILTERING ALGORITHM:
#
# For every active user and every one of their target movies, one line of the form
# "<user id> <movie id> <predicted rating>" is written to 'result5_file_handle'.

# Upper bound on how many of the most similar existing users (neighbors) may contribute to one
# prediction.
NEIGHBOR_COUNT = 100
# Rating assumed for a target movie that no existing user has rated: the average value of the
# rating scale (1 to 5, with unit increments).
DEFAULT_RATING = 3

# A set gives constant-time membership tests inside the per-movie loop below.
unrated_movie_lookup = set(train_unrated_movies)
train_user_total = user_movie_matrix.shape[0]
# A partition cannot select more neighbors than there are existing users.
neighbor_total = min(NEIGHBOR_COUNT, train_user_total)

# Iteration through each active user:
for test5_count, target_movies in test5_required_predictions.items():
    available_ratings = test5_observed_ratings[test5_count]
    if not target_movies:
        continue

    # The similarity between the active user and an existing user is computed only from the
    # movies the active user has already rated; it does not depend on the target movie. It is
    # therefore computed once per active user and reused for all of that user's target movies.
    active_user_similarity = np.zeros(shape=(train_user_total,))
    # Iteration through each existing user:
    for train_count in range(train_user_total):
        train_ratings = user_movie_matrix[train_count]
        # Two lists of equal length, restricted to the movies rated by BOTH users:
        # one holds the active user's ratings, the other the existing user's ratings.
        common_dimensions_test5 = []
        common_dimensions_train = []
        for observed_movie, observed_rating in available_ratings:
            if train_ratings[observed_movie] != 0:
                common_dimensions_test5.append(observed_rating)
                common_dimensions_train.append(train_ratings[observed_movie])
        # With fewer than two movies in common the similarity is left at zero (a cosine over a
        # single dimension cannot account for the actual similarity between both users).
        if len(common_dimensions_train) > 1:
            active_user_similarity[train_count] = cosine_similarity(
                np.array(common_dimensions_test5), np.array(common_dimensions_train)
            )

    # Iteration through each target movie:
    for movie in target_movies:
        # If the target movie has not been rated by any existing user, the default rating is
        # predicted.
        if movie in unrated_movie_lookup:
            result5_file_handle.write(f"{test5_count + 201} {movie + 1} {DEFAULT_RATING}\n")
            continue

        # Ratings given to the target movie by every existing user (0 = not rated).
        movie_ratings = user_movie_matrix[:, movie]
        rated_mask = movie_ratings != 0
        # Only existing users who rated the target movie may act as neighbors; the similarity
        # of everybody else is forced to zero.
        test5_similarity = active_user_similarity.copy()
        test5_similarity[~rated_mask] = 0

        if test5_similarity.any():
            # Indices of the (at most NEIGHBOR_COUNT) existing users with the largest similarity
            # values. Users with zero similarity may be selected as well, but they add nothing
            # to either sum below.
            test5_neighbors = np.argpartition(test5_similarity, -neighbor_total)[-neighbor_total:]
            # Prediction = similarity-weighted average of the neighbors' ratings for the target
            # movie: 'numerator' accumulates similarity * rating, 'denominator' accumulates the
            # similarities themselves.
            numerator = 0
            denominator = 0
            for neighbor in test5_neighbors:
                numerator += test5_similarity[neighbor] * movie_ratings[neighbor]
                denominator += test5_similarity[neighbor]
            predicted_rating = round(float(numerator / denominator))
        else:
            # No existing user is similar to the active user: fall back to the mean of all
            # ratings that existing users gave to the target movie.
            predicted_rating = round(float(movie_ratings[rated_mask].mean()))

        result5_file_handle.write(f"{test5_count + 201} {movie + 1} {predicted_rating}\n")

In [14]:
# Closing the text file after having written all outputs.
# NOTE: buffered predictions are only guaranteed to reach the file once it has been closed, so
# this cell must be run before the results are read.

result5_file_handle.close()