# Item-based collaborative filtering

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Load the data into pandas DataFrames

In [10]:
FOLDER_PATH = 'Data/'

# Read the four CSV files from FOLDER_PATH, in the order listed, and bind
# each resulting DataFrame to the name matching its file.
books, sample_submission, test, train = (
    pd.read_csv(FOLDER_PATH + csv_name)
    for csv_name in ('books.csv', 'sample_submission.csv', 'test.csv', 'train.csv')
)

Create the user-item matrix

In [11]:
# Pivot the training ratings into a user-item matrix: one row per user_id,
# one column per book_id, each cell holding the rating for that pair.
# Pairs without a rating are filled with 0.
user_item_matrix = (
    train
    .pivot(index='user_id', columns='book_id', values='rating')
    .fillna(0)
)

Create the item similarity matrix

In [12]:
# Transpose so that each row is one item's vector of user ratings; the
# result is a square item-by-item cosine similarity matrix.
item_similarity = cosine_similarity(user_item_matrix.transpose())

Mean-center the ratings for each item

In [None]:
def mean_center_ignore_zeros(user_item_matrix):
    """Subtract each column's mean rating from its non-zero entries.

    A value of 0 is treated as "no rating": it is left out of the count the
    mean is divided by, and it stays 0 in the centered result.

    Args:
        user_item_matrix: DataFrame of ratings with one column per item,
            where 0 marks a missing rating.

    Returns:
        A ``(centered_df, item_means)`` tuple. ``centered_df`` is a DataFrame
        with the same index and columns as the input. ``item_means`` is a
        1-D array holding one mean per column (0.0 for a column that has no
        positive entries).
    """
    ratings = user_item_matrix.values

    # Per-column mean over rated entries only. Zeros contribute nothing to
    # the total, so only the denominator has to skip them.
    rating_totals = ratings.sum(axis=0)
    rating_counts = (ratings > 0).sum(axis=0)
    item_means = np.divide(
        rating_totals,
        rating_counts,
        out=np.zeros_like(rating_totals, dtype=float),
        where=rating_counts != 0,  # columns without ratings keep a mean of 0
    )

    # Shift every column by its mean, but leave the zero entries at zero so
    # that missing ratings stay missing.
    centered = np.where(ratings == 0, 0, ratings - item_means[np.newaxis, :])

    return (
        pd.DataFrame(
            centered,
            index=user_item_matrix.index,
            columns=user_item_matrix.columns,
        ),
        item_means,
    )

Create a matrix with all the predictions in it

In [None]:
def item_based_predict(user_item_matrix_df, item_similarity, k=10):
    """Predict every user-item rating with item-based collaborative filtering.

    For a user ``u`` and a target item ``i`` the prediction is::

        mean_i + sum_j(s_ij * (r_uj - mean_j)) / sum_j(|s_ij|)

    where ``j`` runs over the ``k`` items most similar to ``i`` (excluding
    ``i`` itself) that ``u`` has rated. When none of those neighbours has
    been rated by ``u`` the prediction falls back to ``mean_i``.

    Args:
        user_item_matrix_df: DataFrame of ratings with one row per user and
            one column per item, where 0 marks a missing rating.
        item_similarity: Square array whose entry ``[i, j]`` is the
            similarity between items ``i`` and ``j``, in the column order of
            ``user_item_matrix_df``.
        k: Number of nearest neighbours kept for each target item.

    Returns:
        DataFrame with the same index and columns as ``user_item_matrix_df``
        holding the predicted ratings, clipped to the range [1, 5].
    """
    normalized_matrix, item_means = mean_center_ignore_zeros(user_item_matrix_df)
    centered = normalized_matrix.values

    # Take the "has rated" mask from the raw matrix, not the centered one:
    # a rating equal to the item mean centres to 0 but is still a rating.
    rated = (user_item_matrix_df.values != 0).astype(float)

    item_similarity = np.asarray(item_similarity, dtype=float)

    # Row i of neighbour_weights keeps only the similarities between target
    # item i and its k nearest neighbours; everything else is 0.
    neighbour_weights = np.zeros_like(item_similarity)
    for item in range(item_similarity.shape[0]):
        ranking = -item_similarity[item]  # fresh array; the input is untouched
        # An item is always most similar to itself; rank it last so it does
        # not use up one of the k neighbour slots.
        ranking[item] = np.inf
        nearest = np.argsort(ranking)[:k]
        nearest = nearest[nearest != item]  # only reachable when k >= n_items
        neighbour_weights[item, nearest] = item_similarity[item, nearest]

    # Rows of neighbour_weights are indexed by the target item, so the
    # matrix is transposed to line the neighbours up with the rating columns.
    weighted_sum = centered.dot(neighbour_weights.T)
    # Normalise by the total similarity of the neighbours the user rated.
    sum_of_similarities = rated.dot(np.abs(neighbour_weights).T)

    # Where no rated neighbour exists the deviation stays 0 (no division).
    deviation = np.divide(
        weighted_sum,
        sum_of_similarities,
        out=np.zeros_like(weighted_sum, dtype=float),
        where=sum_of_similarities != 0,
    )

    # In case the ratings are outside the 1-5 range
    predicted_ratings = np.clip(deviation + item_means[np.newaxis, :], 1, 5)

    return pd.DataFrame(
        predicted_ratings,
        index=user_item_matrix_df.index,
        columns=user_item_matrix_df.columns,
    )

# Predict a rating for every (user, book) pair, using the default k.
predicted_ratings = item_based_predict(
    user_item_matrix_df=user_item_matrix,
    item_similarity=item_similarity,
)

Create a function that generates the submission.csv file

In [None]:
def make_submission(predicted_ratings, test_df=None):
    """Look up the predicted rating for every row of the test set.

    Args:
        predicted_ratings: DataFrame of predictions indexed by user_id with
            one column per book_id.
        test_df: DataFrame with ``id``, ``user_id`` and ``book_id`` columns.
            Defaults to the module-level ``test`` DataFrame.

    Returns:
        DataFrame with an ``id`` column and a ``rating`` column, one row per
        row of the test set and in the same order. An empty test set yields
        an empty frame that still has both columns.

    Raises:
        KeyError: If a (user_id, book_id) pair from the test set is not
            present in ``predicted_ratings``.
    """
    if test_df is None:
        test_df = test

    # Iterate the three columns directly instead of using iterrows(), which
    # builds a Series per row and upcasts its values to a common dtype.
    submission = [
        {'id': int(unique_id), 'rating': predicted_ratings.loc[user_id, book_id]}
        for unique_id, user_id, book_id in zip(
            test_df['id'], test_df['user_id'], test_df['book_id']
        )
    ]

    # Naming the columns keeps the header in place when there are no rows.
    return pd.DataFrame(submission, columns=['id', 'rating'])

# Build the submission table from the predictions and write it to CSV
# without the DataFrame index.
submissions_df = make_submission(predicted_ratings)
submissions_df.to_csv(path_or_buf='submission_item_cf.csv', index=False)