In [104]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np
import dask as dd

In [105]:
df = pd.read_csv("../cleaned_data/movies_main.csv")
ratings_df = pd.read_csv("../../source_data/ratings_small.csv")

In [106]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [107]:
# Filter out movies with very few ratings (e.g., less than 50)
filtered_movies = ratings_df['movieId'].value_counts()
filtered_movies = filtered_movies[filtered_movies >= 50].index
ratings_df = ratings_df[ratings_df['movieId'].isin(filtered_movies)]

# Filter out users with very few ratings (e.g., less than 10)
filtered_users = ratings_df['userId'].value_counts()
filtered_users = filtered_users[filtered_users >= 10].index
ratings_df = ratings_df[ratings_df['userId'].isin(filtered_users)]

# Create a user-item matrix
user_item_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')

# Fill NaN with 0
user_item_matrix = user_item_matrix.fillna(0)

# Convert to sparse matrix
sparse_matrix = csr_matrix(user_item_matrix.values)

# Compute item-item similarity
item_similarity = cosine_similarity(sparse_matrix.T)  # Transpose for item-item similarity

# Multiply user-item matrix by item-item similarity matrix to predict ratings
predicted_ratings = sparse_matrix.dot(item_similarity)

# Convert predictions back to DataFrame
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)

# Get top N recommendations for a given user (example for userId=220)
user_id = 2
top_n = 5

# Ensure user_id is in the index
if user_id in predicted_ratings_df.index:
    user_ratings = predicted_ratings_df.loc[user_id]
    user_recommendations = user_ratings.sort_values(ascending=False).head(top_n)

    print("Top N Recommendations for User:", user_id)
    print(user_recommendations)
else:
    print(f"User ID {user_id} not found in the dataset.")

Top N Recommendations for User: 2
movieId
457    99.764902
590    97.332955
480    97.156764
377    96.853636
380    96.682308
Name: 2, dtype: float64


In [108]:
predicted_ratings_df

movieId,1,2,3,5,6,7,10,11,16,17,...,59315,60069,63082,68157,68358,68954,70286,72998,74458,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,70.683136,72.978475,36.696637,44.092567,58.314750,42.772665,82.532509,71.770229,54.655690,62.879285,...,29.794536,26.380053,19.453053,22.436596,24.101412,23.297054,22.217573,24.121771,21.565688,27.200440
3,47.655393,33.076350,15.615708,19.498468,31.751630,16.121116,34.052120,28.565509,34.965707,24.584896,...,33.656082,30.527187,24.422351,28.258879,28.345624,28.267987,26.981509,28.731150,26.403666,32.449955
4,179.504074,135.936134,77.949012,85.888614,125.085551,89.758550,145.050871,142.007496,144.550988,120.864749,...,95.164626,82.507173,61.510685,79.241280,82.624818,80.518761,73.107371,76.181970,72.196906,90.113019
5,109.202024,84.714920,51.818053,59.220081,71.168718,51.505519,78.376252,80.536889,85.263160,66.293078,...,66.675548,62.903937,51.647729,53.998071,57.168972,59.898304,50.765279,58.442997,51.317723,61.492360
6,35.145572,23.381578,11.192168,14.942729,24.710911,12.670663,23.127756,21.824055,29.783820,19.976048,...,24.416430,22.631518,18.178282,22.202764,21.826205,22.118845,22.808288,21.287485,20.346791,24.066625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,60.205763,54.996295,32.728058,36.486859,56.706727,38.895307,61.521451,60.745363,49.997252,52.837982,...,23.177674,20.766245,15.579329,19.068177,19.273748,18.509300,18.958010,18.789878,18.204993,22.042026
668,19.837499,11.551990,6.971026,7.481606,15.915400,7.428968,12.575172,13.146968,18.380480,13.079321,...,10.475539,11.551124,9.824055,11.599470,8.416337,9.696327,10.161619,9.429064,9.904699,12.454959
669,21.723745,13.031875,9.272572,9.333907,15.445434,9.130620,14.041318,15.336923,18.686224,12.570603,...,12.619016,10.133360,8.233710,10.669780,10.848378,10.608438,9.836827,10.066004,9.388761,12.077559
670,45.289350,29.660828,17.551383,20.672397,34.876554,19.348268,32.373067,31.059764,35.405965,29.812302,...,21.958265,20.821370,16.353656,18.815281,18.841243,18.043196,18.574351,18.671852,17.679476,22.429060
