In [1]:
# Imports:

import numpy as np
import pandas as pd

In [2]:
# Empty user-by-movie ratings table: one row per existing (training) user and one column
# per movie. Both axes are labelled starting from 1, i.e. users 1-200 and movies 1-1000.

n_train_users, n_movies = 200, 1000
user_movie_matrix = pd.DataFrame(
    index=range(1, n_train_users + 1),
    columns=range(1, n_movies + 1),
)

In [3]:
# Filling the DataFrame from 'train.txt': the n-th line of the file holds the
# whitespace-separated ratings of user n and is stored, still as strings, in the row
# labelled n.

with open("../train.txt", "r") as train_file_handle:
    for user_label, ratings_line in enumerate(train_file_handle, start=1):
        user_movie_matrix.loc[user_label] = ratings_line.split()

In [4]:
# Displaying the Pandas DataFrame populated above. At this point every cell still holds the
# string token read from 'train.txt'; the numeric conversion happens in a later cell.

user_movie_matrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
1,5,3,0,3,3,5,0,1,5,3,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,3,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,4,0,0,3,0,2,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199,1,0,0,0,0,0,4,0,5,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Conversion of 'user_movie_matrix' from a Pandas DataFrame to a Numpy Array.
# NOTE(review): the same name is rebound to the array, so from here on row/column positions
# are zero-based (row 0 is the row that was labelled 1), and this cell presumably cannot be
# re-run on its own without first re-running the cells that build the DataFrame.

user_movie_matrix = user_movie_matrix.to_numpy()

In [6]:
# Changing the data type of each element in the above Numpy Array, from a string to a
# 64-bit float (not an integer), so that the ratings can be averaged and mean-centred later.

user_movie_matrix = user_movie_matrix.astype("float64", copy=False)

In [7]:
# Displaying the final version of user_movie_matrix as a Numpy Array of floats
# (a rating of 0 marks a movie the user has not rated).

user_movie_matrix

array([[5., 3., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [4., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [5., 4., 0., ..., 0., 0., 0.]])

In [8]:
# Zero-filled vector with one slot per existing user; it will hold each user's average
# over the ratings that user actually gave.

train_average_ratings = np.zeros(200)

In [9]:
# Filling the vector of averages: for every existing user, only the non-zero entries
# (i.e. the movies that user has rated) take part in the mean.

for user_index, user_ratings in enumerate(user_movie_matrix):
    observed_ratings = user_ratings[user_ratings != 0]
    train_average_ratings[user_index] = observed_ratings.mean()

In [10]:
# List of the (zero-based) column indices of movies that no existing user has rated,
# i.e. columns of the ratings matrix that contain nothing but zeros.

train_unrated_movies = [
    movie_index
    for movie_index, movie_ratings in enumerate(user_movie_matrix.T)
    if not movie_ratings.any()
]

In [11]:
# Declaration and initialization of two dictionaries describing the 100 active users.
# Both are keyed by a zero-based active-user index (0-99) and start out with an empty list
# per user. Despite the 'test5_' prefix in their names, this notebook fills them from
# 'test20.txt'.

# One holds, for each active user, the (movie, rating) pairs of the movies already rated,
test5_observed_ratings = {active_user: [] for active_user in range(100)}

# and the other holds, for each active user, the target movie(s) whose rating(s) must be
# predicted.
test5_required_predictions = {active_user: [] for active_user in range(100)}

In [12]:
# Parsing 'test20.txt' to fill the two dictionaries above. Every line carries three
# integers: user id, movie id and rating. User ids are shifted by 401 and movie ids by 1 to
# obtain zero-based indices. A rating of 0 marks a movie whose rating has to be predicted;
# any other value is an observed rating.

with open("../test20.txt", "r") as test5_file_handle:
    for line in test5_file_handle:
        fields = [int(token) for token in line.split()]
        rating = fields[2]
        user_index, movie_index = fields[0] - 401, fields[1] - 1
        if rating == 0:
            test5_required_predictions[user_index].append(movie_index)
        else:
            test5_observed_ratings[user_index].append((movie_index, rating))

In [13]:
# Zero-filled vector with one slot per active user; it will hold each active user's
# average over the ratings observed for that user.

test5_average_ratings = np.zeros(100)

In [14]:
# Filling the vector of averages: the dictionary of observed ratings is walked in
# insertion order, and the mean of each active user's observed rating values is stored at
# the matching position.

for test5_count, test5_user in enumerate(test5_observed_ratings.values()):
    observed_values = np.array([rating for _, rating in test5_user])
    test5_average_ratings[test5_count] = np.mean(observed_values)

In [15]:
# Declaration and initialization of a list to contain the (zero-based) indices of active
# users who have given every movie they rated the same value.
# NOTE(review): no later cell in this notebook appears to read this list — confirm whether
# it is still needed.

test5_users_with_same_ratings = []

In [16]:
# Filling the list: an active user qualifies when exactly one distinct value occurs among
# that user's observed ratings.

for test5_count in range(100):
    distinct_ratings = {rating for _, rating in test5_observed_ratings[test5_count]}
    if len(distinct_ratings) == 1:
        test5_users_with_same_ratings.append(test5_count)

In [17]:
# Defining a function to compute the similarity between two arbitrary users (represented as vectors),
# based on their movie ratings. 

def cosine_similarity(vector_1, vector_2):
    """Return the cosine of the angle between two equal-length numeric vectors.

    Parameters:
        vector_1, vector_2: one-dimensional array-likes of the same length.

    Returns:
        The dot product divided by the product of the Euclidean norms. When either
        vector has zero norm the cosine is undefined; 0.0 (no similarity) is returned
        instead of dividing by zero and producing NaN.
    """
    norm_product = np.linalg.norm(vector_1) * np.linalg.norm(vector_2)
    if norm_product == 0:
        return 0.0
    return np.dot(vector_1, vector_2) / norm_product

In [18]:
# Initialization of a file handler to operate on a text file in which to store results (i.e., predicted
# ratings for target movie(s) corresponding to each active user).
# The file is opened in write mode ("w") so that every run starts from an empty file; in append
# mode a re-run of the notebook would stack a second copy of all predictions below the first.

result5_file_handle = open("Task2-ItemBasedCF-Test20.txt", "w")

In [19]:
# ITEM-BASED COLLABORATIVE FILTERING ALGORITHM:
#
# For every (active user, target movie) pair, one line "<user id> <movie id> <rating>" is
# written to the results file. The rating is the active user's average rating plus a
# similarity-weighted average of that user's mean-centred observed ratings.

# Iteration through each active user:
for test5_count in range(100):
    available_ratings = test5_observed_ratings[test5_count]
    target_movies = test5_required_predictions[test5_count]
    active_user_average = test5_average_ratings[test5_count]
    # Iteration through each target movie:
    for movie in target_movies:
        # A target movie that no existing user has rated gets the midpoint of the 1-5
        # rating scale, i.e. 3.
        if movie in train_unrated_movies:
            result5_file_handle.write(f"{test5_count + 401} {movie + 1} 3\n")
            continue

        target_column = user_movie_matrix[:, movie]
        train_users_rated_target_movie = np.nonzero(target_column)[0]

        # Similarity between the target movie and each movie rated by the active user,
        # measured over the existing users who rated both movies.
        item_item_similarity = {}
        for rated_movie, rating in available_ratings:
            observed_column = user_movie_matrix[:, rated_movie]
            common_users = np.intersect1d(train_users_rated_target_movie,
                                          np.nonzero(observed_column)[0])
            if common_users.size == 0:
                item_item_similarity[rated_movie] = 0
                continue
            # Each common user's ratings are centred on that user's own average. Indexing
            # with an index array yields new arrays, so user_movie_matrix is not modified.
            common_users_averages = train_average_ratings[common_users]
            target_movie_ratings = target_column[common_users] - common_users_averages
            observed_movie_ratings = observed_column[common_users] - common_users_averages
            # An all-zero vector has no direction; such a pair gets no similarity entry.
            if target_movie_ratings.any() and observed_movie_ratings.any():
                item_item_similarity[rated_movie] = cosine_similarity(target_movie_ratings, observed_movie_ratings)

        # Without a single non-zero similarity there is nothing to weight by, so the
        # midpoint rating 3 is written here as well.
        if not any(item_item_similarity.values()):
            result5_file_handle.write(f"{test5_count + 401} {movie + 1} 3\n")
            continue

        numerator, denominator = 0, 0
        for rated_movie, rating in available_ratings:
            similarity = item_item_similarity.get(rated_movie)
            if similarity is None:
                continue
            numerator += similarity * (rating - active_user_average)
            denominator += abs(similarity)
        predicted_rating = active_user_average + (numerator/denominator)

        # Predictions that round to a value outside 1-5 are clamped to the nearest end of
        # the scale.
        if round(predicted_rating) not in range(1, 6):
            predicted_rating = 1 if predicted_rating < 1 else 5
        result5_file_handle.write(f"{test5_count + 401} {movie + 1} {str(round(predicted_rating))[0]}\n")

In [20]:
# Closing the text file after having written all outputs, so that any buffered
# predictions are flushed to disk and the file handle is released.

result5_file_handle.close()