Import Libraries

In [1]:
import pandas as pd
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

Load Data

In [2]:
# Load the data
ratings = pd.read_csv('ratings.csv')

# Display the first few rows of the dataset
print(ratings.head())  

# Define desired user ID
user_id = 126

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


Part a

Pearson

In [15]:
# Create a pivot table
ratings_pivot = ratings.pivot(index='userId', columns='movieId', values='rating')

# Function to compute Pearson correlation coefficient
def pearson_similarity(user1, user2):
    common_ratings = ratings_pivot.loc[user1].dropna().index.intersection(ratings_pivot.loc[user2].dropna().index)
    if len(common_ratings) < 2:
        return float('nan')
    return pearsonr(ratings_pivot.loc[user1, common_ratings], ratings_pivot.loc[user2, common_ratings])[0]

# Compute similarity of user 126 with all other users
pearson_similarities = ratings_pivot.index.to_series().apply(lambda x: pearson_similarity(user_id, x)).drop(user_id)

# Filter out NaN values
pearson_similarities = pearson_similarities.dropna()

# Top 10 similar users by Pearson correlation
top_10_pearson = pearson_similarities.nlargest(10)
similar_users = top_10_pearson.index
print("Pearson:")
print(top_10_pearson)

  return pearsonr(ratings_pivot.loc[user1, common_ratings], ratings_pivot.loc[user2, common_ratings])[0]


Pearson:
userId
61     1.0
146    1.0
237    1.0
252    1.0
259    1.0
272    1.0
293    1.0
309    1.0
487    1.0
511    1.0
Name: userId, dtype: float64


Cosine Similarity

In [16]:
# Fill NaN with 0s for cosine similarity computation
ratings_filled = ratings_pivot.fillna(0)

# Compute cosine similarity
cosine_similarities = cosine_similarity(ratings_filled)
cosine_sim_df = pd.DataFrame(cosine_similarities, index=ratings_filled.index, columns=ratings_filled.index)

# Top 10 similar users by cosine similarity
top_10_cosine = cosine_sim_df.loc[user_id].nlargest(11).iloc[1:]
print("Cosine sim:")
print(top_10_cosine)

Cosine sim:
userId
379    0.813373
94     0.706430
507    0.690587
512    0.684872
81     0.679452
485    0.679053
179    0.674915
498    0.671849
470    0.665665
340    0.652847
Name: 126, dtype: float64


Part b

USER-USER Collaborative filtering

In [5]:
# Function to predict rating using the user-user collaborative filtering approach
def predict_user_rating(user_id, movie_id, similar_users):
    numerator = 0
    denominator = 0
    
    for similar_user in similar_users:
        similarity = pearson_similarities[similar_user]
        rating = ratings_pivot.loc[similar_user, movie_id]
        
        if not pd.isna(rating):
            numerator += similarity * rating
            denominator += abs(similarity)
    
    if denominator == 0:
        return float('nan')
    
    return numerator / denominator

In [6]:
def user_user_cf(user_id, similar_users, top_n=6):
    user_ratings = ratings_pivot.loc[user_id].dropna()
    predictions = pd.Series(dtype=float)
    
    for movie_id in ratings_pivot.columns:
        if movie_id not in user_ratings.index:
            predicted_rating = predict_user_rating(user_id, movie_id, similar_users)
            predictions.loc[movie_id] = predicted_rating
    
    recommendations = predictions.dropna().nlargest(top_n)
    full_recommendations = predictions.dropna().nlargest(150)
    return recommendations, full_recommendations

In [7]:
user_user_recommendations, user_for_combined = user_user_cf(user_id, similar_users)
print("Top 6 reccomended movies using user-user CF:")
# print(user_user_recommendations)
for movie_id, rating in user_user_recommendations.items():
    print(f"Movie {movie_id} ({rating})")

Top 6 reccomended movies using user-user CF:
Movie 912 (5.0)
Movie 1172 (5.0)
Movie 1250 (5.0)
Movie 1283 (5.0)
Movie 1304 (5.0)
Movie 1584 (5.0)


Part c & d functions

In [8]:
# Compute item-item similarity matrix using Pearson correlation
item_sim_df = ratings_pivot.corr(method='pearson')

# Function to predict rating using the item-item collaborative filtering formula
def predict_item_rating(user_id, item_id, item_sim_df, ratings_pivot, top_n=10):
    user_ratings = ratings_pivot.loc[user_id].dropna()
    
    # Similar items to the target item
    similar_items = item_sim_df[item_id].sort_values(ascending=False).index
    
    numerator = 0
    denominator = 0
    
    for similar_item in similar_items:
        if similar_item in user_ratings.index:
            similarity = item_sim_df.loc[item_id, similar_item]
            rating = user_ratings[similar_item]
            numerator += similarity * rating
            denominator += abs(similarity)
            
    if denominator == 0:
        return float('nan')
    
    return numerator / denominator

In [9]:
# Overall average rating
mu = ratings['rating'].mean()
# User biases
user_biases = ratings.groupby('userId')['rating'].mean() - mu
# Item biases
item_biases = ratings.groupby('movieId')['rating'].mean() - mu

In [10]:
# Function to predict rating using the item-item collaborative filtering formula with baseline estimate
def predict_item_rating_with_baseline(user_id, item_id, item_sim_df, ratings_pivot, top_n=10):
    user_ratings = ratings_pivot.loc[user_id].dropna()
    
    # Baseline estimate
    bx = user_biases[user_id] if user_id in user_biases else 0
    bi = item_biases[item_id] if item_id in item_biases else 0
    bxi = mu + bx + bi
    
    # Similar items to the target item
    similar_items = item_sim_df[item_id].sort_values(ascending=False).index
    
    numerator = 0
    denominator = 0
    
    for similar_item in similar_items:
        if similar_item in user_ratings.index:
            similarity = item_sim_df.loc[item_id, similar_item]
            rating = user_ratings[similar_item]
            bxi_similar = mu + bx + (item_biases[similar_item] if similar_item in item_biases else 0)
            numerator += similarity * (rating - bxi_similar)
            denominator += abs(similarity)
            
    if denominator == 0:
        return bxi
    
    return bxi + (numerator / denominator)

In [11]:
# Function to recommend movies based on item-item collaborative filtering
def item_item_cf(user_id, mode, top_n=6):
    user_ratings = ratings_pivot.loc[user_id].dropna()
    predictions = pd.Series(dtype=float)
    
    for item_id in ratings_pivot.columns:
        if item_id not in user_ratings.index:
            if mode == 0:
                predicted_rating = predict_item_rating(user_id, item_id, item_sim_df, ratings_pivot)
            else:
                predicted_rating = predict_item_rating_with_baseline(user_id, item_id, item_sim_df, ratings_pivot)
            predictions[item_id] = predicted_rating
    
    recommendations = predictions.dropna().nlargest(top_n)
    full_recommendations = predictions.dropna().nlargest(150)
    return recommendations, full_recommendations

Part c output

In [12]:
item_item_recommendations, item_for_combined = item_item_cf(user_id, 0)
# print(item_item_recommendations)
print("Top 6 reccomended movies using item-item CF:")
for movie_id, rating in item_item_recommendations.items():
    print(f"Movie {movie_id} ({rating})")

Top 6 reccomended movies using item-item CF:
Movie 2268 (3.59011168088767)
Movie 2 (3.5243121063177916)
Movie 1101 (3.502890351669596)
Movie 3578 (3.5008311549944136)
Movie 2706 (3.500130255277038)
Movie 2571 (3.4877977492085033)


Part d output

In [13]:
item_item_recommendations_with_baseline, item_with_b_for_combined = item_item_cf(user_id, 1)
# print(item_item_recommendations)
print("Top 6 reccomended movies using item-item CF (using baaseline):")
for movie_id, rating in item_item_recommendations_with_baseline.items():
    print(f"Movie {movie_id} ({rating})")

Top 6 reccomended movies using item-item CF (using baaseline):
Movie 741 (4.425681217089712)
Movie 44195 (4.41897691784027)
Movie 1272 (4.355954629885956)
Movie 904 (4.346099139826445)
Movie 1252 (4.333022965431732)
Movie 2761 (4.298575343505807)


Part e

In [14]:
# Define weights
w_user = 0.7
w_item = 0.3

# Find common movie IDs
common_movie_ids = user_for_combined.index.intersection(item_for_combined.index)

if len(common_movie_ids) == 0:
    print("Error: No common movie IDs found.")
else:
    # Combine recommendations using weighted average for common movie IDs
    combined_recommendations = {}
    for movie_id in common_movie_ids:
        rating_user = user_user_recommendations.get(movie_id, 0) 
        rating_item = item_item_recommendations.get(movie_id, 0)
        
        combined_rating = (w_user * rating_user) + (w_item * rating_item)
        combined_recommendations[movie_id] = combined_rating
    
    # Add ratings from user-user for IDs not in item-item
    for movie_id in user_user_recommendations.index.difference(item_item_recommendations.index):
        rating_user = user_user_recommendations[movie_id]
        combined_recommendations[movie_id] = w_user * rating_user
    
    # Convert combined recommendations to a pandas Series
    combined_series = pd.Series(combined_recommendations)
    final_recommendations = combined_series.dropna().nlargest(6)
    print("Top 6 reccomended movies using combination of user-user and item-item CF:")
    for movie_id, rating in final_recommendations.items():
        print(f"Movie {movie_id} ({rating})")

Top 6 reccomended movies using combination of user-user and item-item CF:
Movie 1584 (3.5)
Movie 912 (3.5)
Movie 1172 (3.5)
Movie 1250 (3.5)
Movie 1283 (3.5)
Movie 1304 (3.5)
