In [None]:
import ast
import warnings
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, normalize

warnings.filterwarnings('ignore')

In [None]:
def load_data(ratings_file_path, features_file_path):
    """
    Read the ratings table and the movie-features table from CSV.

    Parameters
    ----------
    ratings_file_path : str
        Location of the ratings CSV. The columns userId, movieId and rating
        are parsed as int32, int32 and float32 respectively.
    features_file_path : str
        Location of the movie-features CSV. The movieId column is parsed
        as int32; every other column keeps the dtype pandas infers.

    Returns
    -------
    tuple of pd.DataFrame
        ``(ratings, features)`` in that order.

    """
    ratings = pd.read_csv(
        ratings_file_path,
        dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
    )
    features = pd.read_csv(features_file_path, dtype={'movieId': 'int32'})
    return ratings, features

In [None]:
# Input locations, relative to the notebook's working directory
ratings_csv_path = './datasets/training_data.csv'
movies_csv_path = './datasets/movies.csv'
traindf, df_movies = load_data(ratings_csv_path, movies_csv_path)

In [None]:
# Give both tables the same integer dtype for 'movieId' so they compare and join cleanly
traindf = traindf.astype({'movieId': 'int'})
df_movies = df_movies.astype({'movieId': 'int'})

In [None]:
# Keep only the movies that occur in the training ratings and that carry genre information
rated_movie_ids = traindf['movieId'].unique()
df_movies_aligned = (
    df_movies
    .loc[df_movies['movieId'].isin(rated_movie_ids)]
    .dropna(subset=['genres'])
)

In [None]:
# Categorical views of the ID columns: each distinct ID becomes one category
user_categories, item_categories = (
    pd.Categorical(traindf[id_column]) for id_column in ('userId', 'movieId')
)

In [None]:
# Integer code of each rating row's user / movie, i.e. its position in `.categories`
user_ids, item_ids = user_categories.codes, item_categories.codes

In [None]:
# Dense (users x items) matrix; cells without a rating stay at 0
matrix_shape = (len(user_categories.categories), len(item_categories.categories))
rating_matrix = np.zeros(matrix_shape)
rating_matrix[user_ids, item_ids] = traindf['rating']

In [None]:
# Min-max rescale the rating matrix into the [0.5, 5] range
scaler = MinMaxScaler(feature_range=(0.5, 5))
scaler.fit(rating_matrix)
rating_matrix_scaled = scaler.transform(rating_matrix)

In [None]:
# Recompute the aligned movie table: movies that were rated in training and have genres
was_rated = df_movies['movieId'].isin(traindf['movieId'].unique())
has_genres = df_movies['genres'].notna()
df_movies_aligned = df_movies[was_rated & has_genres]

In [None]:
# Vectorize movie genres using TF-IDF, producing exactly one row per training item.
# Rows follow item_categories.categories, so row i describes the same movie as
# column i of the rating matrix. A movie that is missing from df_movies or has no
# genres gets an empty document (an all-zero row) instead of being dropped, which
# would otherwise shift every later row and break the stacking with the ratings.
vectorizer = TfidfVectorizer(max_features=100)
genres_by_item = (
    df_movies
    .drop_duplicates(subset='movieId')
    .set_index('movieId')['genres']
    .reindex(item_categories.categories)
    .fillna('')
)
tags_features = vectorizer.fit_transform(genres_by_item)

In [None]:
# Calculate user weights based on rating counts: the most active user gets 1.0,
# everyone else a proportionally smaller weight.
user_counts = traindf['userId'].value_counts()
user_weights = user_counts / user_counts.max()

# Adjust scaled rating matrix by user weights in a single vectorized step.
# user_ids / item_ids hold the category code of every traindf row, i.e. the same
# positions a per-row categories.get_loc(...) lookup would return, so no Python
# loop over the ratings is needed. Each rated cell is multiplied exactly once,
# even if a (user, movie) pair appears more than once in traindf.
# NOTE: this updates rating_matrix_scaled in place; re-run the scaling cell
# before re-running this one, otherwise the weights are applied twice.
rating_weights = traindf['userId'].map(user_weights).to_numpy()
rating_matrix_scaled[user_ids, item_ids] *= rating_weights

In [None]:
# Relative weight of the genre TF-IDF columns next to the rating columns
importance_of_genre = 0.5
content_weighted_features = tags_features.multiply(importance_of_genre).toarray()

# Per item: its column of user ratings followed by its genre features, then
# transposed so that items end up as the columns of the final matrix.
item_feature_rows = np.concatenate(
    [rating_matrix_scaled.T, content_weighted_features], axis=1
)
full_features_matrix = item_feature_rows.T

In [None]:
# Non-negative factorization of the combined matrix (built with importance_of_genre = 0.5)
nmf_settings = {'n_components': 15, 'init': 'nndsvd', 'max_iter': 100, 'random_state': 42}
model = NMF(**nmf_settings)
W = model.fit_transform(full_features_matrix)  # one row of factors per row of full_features_matrix
H = model.components_  # one column of factors per column of full_features_matrix

In [None]:
def _parse_genres(raw_genres):
    """Return the genre names stored in one ``genres`` cell as a list of strings."""
    if isinstance(raw_genres, (list, tuple, set)):
        return [str(genre) for genre in raw_genres]
    if not isinstance(raw_genres, str):
        # NaN / None / anything else that is not text carries no genres
        return []

    text = raw_genres.strip()
    if not text:
        return []

    try:
        # Safe replacement for eval(): accepts only Python literals such as "['Action', 'Comedy']"
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal: treat as plain text, optionally pipe-delimited ("Action|Comedy")
        return [part.strip() for part in text.split('|') if part.strip()]

    if isinstance(parsed, str):
        return [parsed]
    if isinstance(parsed, (list, tuple, set)):
        return [str(genre) for genre in parsed]
    return []


def get_favorite_genres(user_id, df_ratings, df_movies, top_n=3):
    """
    Calculates and returns the user's favorite genres based on their historical ratings.

    Each genre is scored as ``mean rating * tanh(count / max_count)``, where
    ``count`` is how many of the user's rated movies carry the genre and
    ``max_count`` is the largest such count over all of the user's genres.
    Neither input DataFrame is modified.

    Parameters
    ----------
    user_id : int
        The user ID whose favorite genres are to be determined.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings (columns userId, movieId, rating).
    df_movies : pd.DataFrame
        DataFrame containing movie details (columns movieId, genres). A genres
        cell may be a list, the string form of a Python list, or a
        '|'-delimited string; missing values are skipped.
    top_n : int, optional
        Number of top genres to return, default is 3.

    Returns
    -------
    dict
        A dictionary containing:
        - 'top_genres': list of the user's top_n genres, best first.
        - 'average_ratings': dictionary mapping each genre to its average rating by the user.
        - 'genre_counts': dictionary mapping each genre to the number of times it has been rated by the user.
        - 'genre_preferences': dictionary mapping each genre to its preference score.
        All collections are empty when the user has no ratings with known genres.
    """
    # astype() returns new frames, so the caller's data keeps its original dtypes
    user_ratings = (
        df_ratings.loc[df_ratings['userId'] == user_id, ['movieId', 'rating']]
        .astype({'movieId': int})
    )
    movie_genres = df_movies[['movieId', 'genres']].astype({'movieId': int})

    # Merge user ratings with movie genres (the genres column must be part of the join)
    user_genres = user_ratings.merge(movie_genres, on='movieId', how='left')

    # Collect every rating the user gave, grouped by genre
    genre_ratings = {}
    for raw_genres, rating in zip(user_genres['genres'], user_genres['rating']):
        for genre in _parse_genres(raw_genres):
            genre_ratings.setdefault(genre, []).append(rating)

    genre_counts = {genre: len(ratings) for genre, ratings in genre_ratings.items()}
    max_count = max(genre_counts.values(), default=0)

    # Calculate preference scores for genres
    genre_preferences = {}
    genre_avg_ratings = {}
    for genre, ratings in genre_ratings.items():
        average_rating = float(np.mean(ratings))
        normalized_count = genre_counts[genre] / max_count if max_count > 0 else 0
        # tanh dampens the count so frequency alone cannot dominate the score
        count_weight = float(np.tanh(normalized_count))
        genre_preferences[genre] = average_rating * count_weight
        genre_avg_ratings[genre] = average_rating

    # Sort genres by preference score, highest first
    sorted_genres = sorted(genre_preferences.items(), key=lambda item: item[1], reverse=True)

    return {
        'top_genres': [genre for genre, _ in sorted_genres[:top_n]],
        'average_ratings': genre_avg_ratings,
        'genre_counts': genre_counts,
        'genre_preferences': genre_preferences,
    }

In [None]:
def get_top_n_genre_based_recommendations(user_id, n, df_ratings, df_movies):
    """
    Generates top N genre-based movie recommendations for a given user.

    Predicted scores come from the notebook-level NMF factors ``W`` and ``H``;
    user and item positions are looked up in the notebook-level
    ``user_categories`` and ``item_categories``. Only movies that are part of
    the factorization (i.e. present in ``item_categories``) can be recommended.

    Parameters
    ----------
    user_id : int
        The user ID for whom genre-based recommendations are to be made.
    n : int
        Number of top recommendations to generate based on the user's favorite genres.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings.
    df_movies : pd.DataFrame
        DataFrame containing movie details including genres.

    Returns
    -------
    pd.DataFrame
        Up to N recommended movies filtered by the user's top genres, ordered
        from highest to lowest predicted score, with columns: movieId, title,
        and genres. Empty (with the same columns) when the user is unknown
        to the model or ``n`` is not positive.
    """
    output_columns = ['movieId', 'title', 'genres']

    # Guard first: no point computing genre preferences for a user the model has never seen,
    # and a non-positive n must not fall through to a "[-0:]" slice that returns everything.
    if n <= 0 or user_id not in user_categories.categories:
        return pd.DataFrame(columns=output_columns)

    genre_data = get_favorite_genres(user_id, df_ratings, df_movies, 5)
    top_genres = genre_data['top_genres']

    # Predict ratings for the user (one score per item position)
    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)

    # Filter movies by user's favorite genres; rows with missing genres never match
    genre_mask = df_movies['genres'].apply(
        lambda genres: isinstance(genres, str) and any(genre in genres for genre in top_genres)
    ).astype(bool)
    genre_filtered_movies = df_movies[genre_mask]

    # Map movies to item positions. get_indexer marks unknown movies with -1, which
    # would otherwise silently read the score of the last item, so drop those rows.
    item_positions = item_categories.categories.get_indexer(genre_filtered_movies['movieId'])
    is_known_item = item_positions >= 0
    candidates = genre_filtered_movies[is_known_item]
    candidate_scores = predicted_ratings[item_positions[is_known_item]]

    # Best score first; stable sort keeps ties in their df_movies order
    ranked_positions = np.argsort(-candidate_scores, kind='stable')[:n]

    return candidates.iloc[ranked_positions][output_columns]

In [None]:
# Example: 20 genre-based recommendations for user 45
recommended_movies = get_top_n_genre_based_recommendations(
    user_id=45,
    n=20,
    df_ratings=traindf,
    df_movies=df_movies,
)
recommended_movies