In [None]:
import pandas as pd
import numpy as np
from math import sqrt

from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler, normalize
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

In [None]:
def load_data(ratings_file_path, features_file_path):
    """
    Read the ratings CSV and the movie-features CSV into DataFrames.

    Identifier columns are parsed as 32-bit integers and ratings as 32-bit
    floats; any other column in either file keeps the dtype pandas infers.

    Parameters
    ----------
    ratings_file_path : str
        Location of the ratings CSV (columns: userId, movieId, rating).
    features_file_path : str
        Location of the movie-features CSV (must contain a movieId column).

    Returns
    -------
    tuple of pd.DataFrame
        ``(ratings, features)`` in that order.

    """
    ratings_schema = dict(userId='int32', movieId='int32', rating='float32')
    features_schema = dict(movieId='int32')

    # Tuple elements are evaluated left to right, so ratings are read first.
    return (
        pd.read_csv(ratings_file_path, dtype=ratings_schema),
        pd.read_csv(features_file_path, dtype=features_schema),
    )

In [None]:
# Read the training ratings and the movie metadata from the local datasets folder.
traindf, df_movies = load_data(
    ratings_file_path='./datasets/training_data.csv',
    features_file_path='./datasets/movies.csv',
)

In [None]:
# Cast the 'movieId' key of both frames to the same integer dtype so that
# membership tests and lookups between them compare like with like.
for frame in (traindf, df_movies):
    frame['movieId'] = frame['movieId'].astype('int')

In [None]:
# Keep only movies that appear in the training ratings and have genre metadata.
rated_movie_ids = traindf['movieId'].unique()
has_training_ratings = df_movies['movieId'].isin(rated_movie_ids)
df_movies_aligned = df_movies.loc[has_training_ratings].dropna(subset=['genres'])

In [None]:
# Build a categorical encoding for each ID column; the position of an ID in
# `.categories` is the row/column index used for it in the rating matrix.
user_categories, item_categories = (
    pd.Categorical(traindf[id_column]) for id_column in ('userId', 'movieId')
)

In [None]:
# Integer code of every rating's user and movie (one entry per row of traindf).
user_ids, item_ids = user_categories.codes, item_categories.codes

In [None]:
# Dense users x movies matrix; cells without a rating stay at 0.
n_users = user_categories.categories.size
n_items = item_categories.categories.size
rating_matrix = np.zeros((n_users, n_items))
rating_matrix[user_ids, item_ids] = traindf['rating']

In [None]:
# Rescale the rating matrix into the 0.5-5 range.
# NOTE(review): MinMaxScaler works column by column (i.e. per movie here), and the
# zeros that stand for "not rated" are rescaled too, so they presumably end up at
# the lower bound 0.5 rather than staying 0 -- confirm this is intended.
rating_bounds = (0.5, 5)
scaler = MinMaxScaler(feature_range=rating_bounds)
rating_matrix_scaled = scaler.fit_transform(rating_matrix)

In [None]:
# Align movie features to the rating matrix: exactly one row per rated movie,
# in the same order as the rating-matrix columns (item_categories.categories).
# Filtering df_movies with isin() keeps the file's row order and drops movies
# with missing genres, so the later np.hstack with rating_matrix_scaled.T could
# fail on a row-count mismatch or pair genre vectors with the wrong movie.
df_movies_aligned = (
    df_movies
    .drop_duplicates(subset='movieId')      # reindex needs a unique index
    .set_index('movieId')
    .reindex(item_categories.categories)    # one row per matrix column, same order
    .rename_axis('movieId')
    .reset_index()
)
# Movies without genre metadata keep their row; an empty string yields an
# all-zero TF-IDF vector instead of shifting every following row.
df_movies_aligned['genres'] = df_movies_aligned['genres'].fillna('')

In [None]:
# Turn each movie's genre string into a TF-IDF vector (vocabulary capped at 100 terms).
genre_corpus = df_movies_aligned['genres'].fillna('')
vectorizer = TfidfVectorizer(max_features=100)
tags_features = vectorizer.fit_transform(genre_corpus)

In [None]:
# Calculate user weights based on rating counts: each user's number of ratings
# divided by the count of the most active user, so weights lie in (0, 1].
user_counts = traindf['userId'].value_counts()
user_weights = user_counts / user_counts.max()

# Adjust the scaled rating matrix by user weights, one multiplication per rating.
# `user_ids` / `item_ids` already hold each rating's matrix coordinates, so no
# per-row lookup is needed. np.multiply.at is unbuffered: a (user, movie) pair
# that occurs several times in traindf is multiplied once per occurrence,
# exactly as a row-by-row loop would do.
# NOTE: this updates rating_matrix_scaled in place, so re-running the cell
# applies the weights a second time; re-run the scaling cell first.
rating_weights = traindf['userId'].map(user_weights).to_numpy()
np.multiply.at(rating_matrix_scaled, (user_ids, item_ids), rating_weights)

In [None]:
# Relative weight of the genre (content) features versus the rating features.
importance_of_genre = 0.5
content_weighted_features = tags_features.multiply(importance_of_genre).toarray()

# Put the per-movie rating columns and the per-movie genre vectors side by side
# (one row per movie), then transpose so that movies become the columns.
per_movie_ratings = rating_matrix_scaled.T
per_movie_blocks = [per_movie_ratings, content_weighted_features]
full_features_matrix = np.concatenate(per_movie_blocks, axis=1).T

In [None]:
# Factorise the combined matrix (built with importance_of_genre = 0.5) with NMF:
# W has one row per row of full_features_matrix, H one column per movie.
nmf_settings = dict(n_components=15, init='nndsvd', max_iter=100, random_state=42)
model = NMF(**nmf_settings)
W = model.fit_transform(full_features_matrix)
H = model.components_

In [None]:
def recommend_movies_by_user_genre_choice(user_id, selected_genre, df_movies, df_ratings, n=10):
    """
    Recommends top N movies for a given user based on a chosen genre and predicted ratings from an NMF model.

    Relies on the notebook-level objects ``user_categories``, ``item_categories``,
    ``W`` and ``H`` produced by the earlier cells.

    Parameters
    ----------
    user_id : int
        User ID for whom recommendations are to be made.
    selected_genre : str
        The genre chosen by the user to filter the recommendations. Matching is a
        case-insensitive substring test that ignores spaces.
    df_movies : pd.DataFrame
        DataFrame containing movie details with columns movieId, title and genres.
        It is not modified.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings. Currently unused; kept so existing
        callers do not break.
    n : int
        Number of top recommendations to generate. Values <= 0 yield no rows.

    Returns
    -------
    pd.DataFrame
        Columns movieId, title and genres (genres lower-cased with whitespace
        removed) for at most N movies of the chosen genre, sorted by predicted
        rating from highest to lowest. An empty DataFrame without columns is
        returned when the user is unknown to the model.
    """
    output_columns = ['movieId', 'title', 'genres']

    if user_id not in user_categories.categories:
        return pd.DataFrame()

    # Predict ratings for the user: one value per movie known to the model.
    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)

    # Normalize genres to prevent case and spacing issues. The normalized column
    # lives on a derived frame so the caller's DataFrame is left untouched.
    normalized_genres = df_movies['genres'].str.lower().str.replace(r'\s+', '', regex=True)
    movies = df_movies.assign(genres=normalized_genres)

    # Filter movies by the chosen genre; rows with missing genres never match.
    genre_key = selected_genre.lower().replace(' ', '')
    in_genre = normalized_genres.str.contains(genre_key, regex=False, na=False)
    candidates = movies[in_genre]

    # get_indexer marks movies the model has never seen with -1; indexing the
    # predictions with -1 would silently read the last movie's score, so those
    # candidates are dropped instead.
    item_positions = item_categories.categories.get_indexer(candidates['movieId'])
    known_to_model = item_positions >= 0
    candidates = candidates[known_to_model]
    candidate_ratings = predicted_ratings[item_positions[known_to_model]]

    if n <= 0 or candidates.empty:
        return candidates.iloc[0:0][output_columns]

    # Highest predicted rating first; the stable sort keeps df_movies order on ties.
    best_first = np.argsort(-candidate_ratings, kind='stable')[:n]
    return candidates.iloc[best_first][output_columns]

In [None]:
# Example: 20 fantasy recommendations for user 25.
recommended_movies = recommend_movies_by_user_genre_choice(
    user_id=25,
    selected_genre='fantasy',
    df_movies=df_movies,
    df_ratings=traindf,
    n=20,
)
recommended_movies