# User based collaborative filtering

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Get the data into pandas dataframes

In [2]:
FOLDER_PATH = 'Data/'

# Read the four competition CSV files from the data folder in a single pass;
# the targets on the left line up positionally with the file names below.
books, sample_submission, test, train = (
    pd.read_csv(FOLDER_PATH + file_name)
    for file_name in ('books.csv', 'sample_submission.csv', 'test.csv', 'train.csv')
)

Create the user-item matrix

In [3]:
# Build the user-item matrix: one row per user_id, one column per book_id,
# each cell holding the rating that user gave that book.
# Pairs without a rating are filled with 0.
user_item_matrix = (
    train
    .pivot(index='user_id', columns='book_id', values='rating')
    .fillna(0)
)

Calculate the User Similarity matrix using cosine similarity

In [4]:
# Pairwise cosine similarity between the rating rows of every two users (sklearn),
# wrapped in a DataFrame labelled by user_id on both axes.
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

Mean-center each user's ratings (subtract the user's average rating from the ratings they gave)

In [None]:
def mean_center_ignore_zeros(user_item_matrix):
    """Subtract each user's mean rating from the ratings that user gave.

    Zero entries are treated as "not rated": they are left out of the count
    used for the mean and remain zero in the result. Centering compensates
    for users who rate systematically harsher or more generously than others.

    Args:
        user_item_matrix: DataFrame with one row per user and one column per
            item, where 0 marks a missing rating.

    Returns:
        Tuple ``(centered_df, user_means)``. ``centered_df`` has the same
        index and columns as the input; ``user_means`` is a 1-D array with
        one mean per row (0.0 for rows without any positive entry).
    """
    ratings = user_item_matrix.values

    # Per-user total and number of positive (i.e. present) ratings
    totals = ratings.sum(axis=1)
    n_rated = (ratings > 0).sum(axis=1)

    # Divide only where a user has at least one rating; the rest keep mean 0.0
    user_means = np.zeros_like(totals, dtype=float)
    np.divide(totals, n_rated, out=user_means, where=n_rated != 0)

    # Shift the rated cells by the user's mean and leave unrated cells at zero
    centered = np.where(ratings == 0, 0.0, ratings - user_means[:, np.newaxis])

    centered_df = pd.DataFrame(
        centered,
        index=user_item_matrix.index,
        columns=user_item_matrix.columns,
    )
    return centered_df, user_means

Create a matrix with all the predicted ratings

In [None]:
def predict_ratings(user_item_matrix, user_similarity_df, k=10):
    """Predict a rating for every (user, book) pair from each user's k nearest neighbours.

    A prediction is the user's own mean rating plus the similarity-weighted
    average of the mean-centered ratings that the user's k most similar
    *other* users gave to the book. Neighbours who did not rate the book are
    left out of the normalisation; if none of them rated it, the prediction
    falls back to the user's mean. Results are clipped to the 1-5 range.

    Args:
        user_item_matrix: DataFrame of ratings (rows: users, columns: books),
            with 0 marking a missing rating.
        user_similarity_df: Square DataFrame of user-user similarities whose
            row/column order matches the rows of ``user_item_matrix``.
        k: Number of neighbours kept per user (capped at the number of other
            users).

    Returns:
        DataFrame of predicted ratings with the same index and columns as
        ``user_item_matrix``.
    """
    user_item_matrix_np = user_item_matrix.values
    normalized_matrix, user_means = mean_center_ignore_zeros(user_item_matrix)
    normalized_matrix_np = normalized_matrix.values

    # Rank neighbours on a float copy so the caller's similarity frame is untouched.
    # Every user is trivially their own best match, yet contributes nothing for
    # books they have not rated, so keeping the diagonal would waste one of the
    # k slots (and leak the user's own rating into predictions for rated books).
    # Pushing it to -inf sorts it last, and capping the neighbour count at
    # n_users - 1 guarantees it is never selected.
    user_similarity_np = np.array(user_similarity_df.values, dtype=float)
    np.fill_diagonal(user_similarity_np, -np.inf)
    n_users = user_similarity_np.shape[0]
    n_neighbours = max(0, min(k, n_users - 1))

    # Apply top-k filtering to the similarity matrix
    top_k_similarities = np.zeros_like(user_similarity_np)
    for i in range(n_users):
        top_k_indices = np.argsort(-user_similarity_np[i])[:n_neighbours]
        top_k_similarities[i, top_k_indices] = user_similarity_np[i, top_k_indices]

    # Numerator: similarity-weighted sum of the neighbours' centered ratings
    weighted_sum = top_k_similarities.dot(normalized_matrix_np)
    # Denominator: total |similarity| of the neighbours that actually rated the book
    sum_of_similarities = np.abs(top_k_similarities).dot((user_item_matrix_np > 0).astype(float))
    # Avoid division by zero
    sum_of_similarities[sum_of_similarities == 0] = 1e-8
    predicted_ratings_np = (weighted_sum / sum_of_similarities) + user_means[:, np.newaxis]
    # In case any ratings are outside the 1-5 range, readjust them
    predicted_ratings_np = np.clip(predicted_ratings_np, 1, 5)
    predicted_ratings = pd.DataFrame(
        predicted_ratings_np,
        index=user_item_matrix.index,
        columns=user_item_matrix.columns,
    )

    return predicted_ratings

# Score every (user, book) pair using the function's default neighbourhood size
predicted_ratings = predict_ratings(
    user_item_matrix=user_item_matrix,
    user_similarity_df=user_similarity_df,
)

Create a function that builds the submission table, then write it to `submission_user_cf.csv`

In [None]:
def make_submission(predicted_ratings, test_df=None, default_rating=None):
    """Look up the predicted rating for every (user_id, book_id) pair of the test set.

    The lookup is done for all rows at once through positional indexing
    instead of one ``.loc`` call per row.

    Args:
        predicted_ratings: DataFrame of predictions with user_ids as index
            and book_ids as columns (both are expected to be unique).
        test_df: DataFrame with 'id', 'user_id' and 'book_id' columns.
            Defaults to the notebook-level ``test`` frame.
        default_rating: Rating assigned to pairs whose user or book is not
            present in ``predicted_ratings``. When None (the default) such
            pairs raise ``KeyError``, as a direct ``.loc`` lookup would.

    Returns:
        DataFrame with an integer 'id' column and a 'rating' column, one row
        per test row, in the order of ``test_df``.

    Raises:
        KeyError: If a test pair is missing from ``predicted_ratings`` and no
            ``default_rating`` was given.
    """
    if test_df is None:
        test_df = test

    # Translate labels to positions; get_indexer returns -1 for unknown labels
    row_pos = predicted_ratings.index.get_indexer(test_df['user_id'])
    col_pos = predicted_ratings.columns.get_indexer(test_df['book_id'])
    known = (row_pos >= 0) & (col_pos >= 0)

    if default_rating is None and not known.all():
        examples = test_df.loc[~known, ['user_id', 'book_id']].head().to_dict('records')
        raise KeyError(
            f"{int((~known).sum())} test pair(s) have a user_id or book_id that is "
            f"missing from predicted_ratings, e.g. {examples}; "
            "pass default_rating to fill them instead"
        )

    # Start from the fallback value and overwrite every pair that can be looked up
    fill_value = np.nan if default_rating is None else default_rating
    ratings = np.full(len(test_df), fill_value, dtype=float)
    ratings[known] = predicted_ratings.to_numpy()[row_pos[known], col_pos[known]]

    submission_df = pd.DataFrame({
        'id': test_df['id'].to_numpy().astype('int64'),
        'rating': ratings,
    })
    return submission_df

# Build the submission table from the predictions and save it without the index column
submissions_df = make_submission(predicted_ratings)
submissions_df.to_csv(path_or_buf='submission_user_cf.csv', index=False)