In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
def load_data(ratings_file_path, features_file_path):
    df_ratings = pd.read_csv(ratings_file_path)
    df_features = pd.read_csv(features_file_path)
    return df_ratings, df_features


In [3]:
traindf, df_movies = load_data('./datasets/training_data.csv', './datasets/movies.csv')

In [4]:
user_ids = traindf['userId'].astype("category").cat.codes
item_ids = traindf['movieId'].astype("category").cat.codes
rating_matrix = np.zeros((user_ids.max()+1, item_ids.max()+1))
rating_matrix[user_ids, item_ids] = traindf['rating']

In [5]:
scaler = MinMaxScaler(feature_range=(0.5, 5))
rating_matrix_scaled = scaler.fit_transform(rating_matrix)

In [6]:
df_movies['movieId'] = pd.Categorical(df_movies['movieId'], categories=traindf['movieId'].unique())
df_movies_aligned = df_movies.dropna(subset=['genres'])
df_movies_aligned = df_movies_aligned[df_movies_aligned['movieId'].isin(traindf['movieId'].unique())]

In [7]:
vectorizer = TfidfVectorizer(max_features=100)
tags_features = vectorizer.fit_transform(df_movies_aligned['genres'].fillna(''))

In [8]:
item_features_array = tags_features.toarray()
full_features_matrix = np.hstack([rating_matrix_scaled.T, item_features_array]).T

In [9]:
n_components = 15
model = NMF(n_components=n_components, init='random', random_state=0, max_iter=200)
W = model.fit_transform(full_features_matrix)
H = model.components_



In [10]:
user_categories = pd.Categorical(traindf['userId'])
item_categories = pd.Categorical(traindf['movieId'])

In [11]:
user_ids = user_categories.codes
item_ids = item_categories.codes
rating_matrix = np.zeros((user_categories.categories.size, item_categories.categories.size))
rating_matrix[user_ids, item_ids] = traindf['rating']

In [12]:
user_category_mapping = pd.Series(index=user_categories.categories, data=np.arange(user_categories.categories.size))
item_category_mapping = pd.Series(index=item_categories.categories, data=np.arange(item_categories.categories.size))

In [13]:
def get_top_n_recommendations(user_id, n):
    if user_id not in user_category_mapping:
        return pd.DataFrame()  # Return an empty DataFrame if user_id is not in the mapping

    user_idx = user_category_mapping[user_id]  # Convert user_id to user index using the mapping
    predicted_ratings = np.dot(W[user_idx, :], H)
    top_n_indices = np.argsort(predicted_ratings)[-n:]  # Indices of top N recommendations
    top_n_movie_ids = item_categories.categories[top_n_indices]  # Map indices to movie IDs

    return df_movies[df_movies['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]

In [30]:
recommended_movies = get_top_n_recommendations(55, 20)  # Example: user_id 10
print(recommended_movies)

     movieId                                              title  \
224      260          Star Wars: Episode IV - A New Hope (1977)   
257      296                                Pulp Fiction (1994)   
277      318                   Shawshank Redemption, The (1994)   
314      356                                Forrest Gump (1994)   
461      527                            Schindler's List (1993)   
510      593                   Silence of the Lambs, The (1991)   
659      858                              Godfather, The (1972)   
1503    2028                         Saving Private Ryan (1998)   
1939    2571                                 Matrix, The (1999)   
2145    2858                             American Beauty (1999)   
2226    2959                                  Fight Club (1999)   
2674    3578                                   Gladiator (2000)   
3141    4226                                     Memento (2000)   
3194    4306                                       Shrek (2001

In [15]:
reconstructed_ratings = np.dot(W, H)
original_mask = rating_matrix > 0  # Only consider originally rated items
rmse = np.sqrt(mean_squared_error(rating_matrix_scaled[original_mask], reconstructed_ratings.T[original_mask]))
print(f"RMSE of the hybrid model: {rmse}")

IndexError: boolean index did not match indexed array along dimension 0; dimension is 8931 but corresponding boolean dimension is 610

In [31]:
plt.figure(figsize=(10, 6))
plt.hist(reconstructed_ratings.T[original_mask], bins=50, alpha=0.75, label='Predicted Ratings')
plt.title('Histogram of Predicted Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.legend()
plt.show()

IndexError: boolean index did not match indexed array along dimension 0; dimension is 8931 but corresponding boolean dimension is 610

<Figure size 1000x600 with 0 Axes>