In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt

from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler, normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_data(ratings_file_path, features_file_path):
    """
    Loads and returns datasets for ratings and movie features from specified file paths.

    Parameters
    ----------
    ratings_file_path : str
        Path to the CSV file containing user ratings with columns: userId, movieId, and rating.
    features_file_path : str
        Path to the CSV file containing movie features.

    Returns
    -------
    tuple of pd.DataFrame
        Returns two pandas DataFrames: one for ratings and another for movie features.

    """
    dtype_ratings = {
        'userId': 'int32',
        'movieId': 'int32',
        'rating': 'float32'
    }
    dtype_features = {
        'movieId': 'int32'
    }

    df_ratings = pd.read_csv(ratings_file_path, dtype=dtype_ratings)
    df_features = pd.read_csv(features_file_path, dtype=dtype_features)
    
    return df_ratings, df_features

In [3]:
# Example of function usage
traindf, df_movies = load_data('./datasets/training_data.csv', './datasets/movies.csv')

In [4]:
traindf['movieId'] = traindf['movieId'].astype('int')
df_movies['movieId'] = df_movies['movieId'].astype('int')

In [5]:
df_movies_aligned = df_movies[df_movies['movieId'].isin(traindf['movieId'].unique())]

In [6]:
df_movies_aligned = df_movies_aligned.dropna(subset=['genres'])

In [7]:
user_categories = pd.Categorical(traindf['userId'])
item_categories = pd.Categorical(traindf['movieId'])

In [8]:
user_ids = user_categories.codes
item_ids = item_categories.codes

In [9]:
rating_matrix = np.zeros((user_categories.categories.size, item_categories.categories.size))
rating_matrix[user_ids, item_ids] = traindf['rating']

In [10]:
scaler = MinMaxScaler(feature_range=(0.5, 5))
rating_matrix_scaled = scaler.fit_transform(rating_matrix)

In [11]:
df_movies_aligned = df_movies[df_movies['movieId'].isin(traindf['movieId'].unique())]

df_movies_aligned = df_movies_aligned.dropna(subset=['genres'])

In [12]:
vectorizer = TfidfVectorizer(max_features=100)
tags_features = vectorizer.fit_transform(df_movies_aligned['genres'].fillna(''))

In [13]:
user_counts = traindf['userId'].value_counts()
user_weights = user_counts / user_counts.max()

for idx, row in traindf.iterrows():
    user_idx = user_categories.categories.get_loc(row['userId'])
    item_idx = item_categories.categories.get_loc(row['movieId'])
    user_id = row['userId']
    rating_matrix_scaled[user_idx, item_idx] *= user_weights.loc[user_id]

In [14]:
importance_of_genre = 0.5
content_weighted_features = tags_features.multiply(importance_of_genre).toarray()

full_features_matrix = np.hstack([rating_matrix_scaled.T, content_weighted_features]).T

In [15]:
"""
Best parameters: {'init': 'random', 'l1_ratio': 0.0, 'n_components': 15}
Best cross-validation score: nan
"""

"\nBest parameters: {'init': 'random', 'l1_ratio': 0.0, 'n_components': 15}\nBest cross-validation score: nan\n"

In [16]:
model = NMF(n_components=15, init='nndsvd', max_iter=100, random_state=42)
W = model.fit_transform(full_features_matrix)
H = model.components_

In [17]:
def get_top_n_recommendations(user_id, n):
    """
    Generates top N movie recommendations for a given user based on NMF model predictions.

    Parameters
    ----------
    user_id : int
        The user ID for whom recommendations are to be made.
    n : int
        Number of top recommendations to generate.

    Returns
    -------
    pd.DataFrame
        DataFrame containing top N recommended movies with columns: movieId, title, and genres.

    """
    if user_id not in user_categories.categories:
        return pd.DataFrame()
    
    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)
    top_n_indices = np.argsort(predicted_ratings)[-n:]
    top_n_movie_ids = item_categories.categories[top_n_indices]
    
    return df_movies[df_movies['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]

In [18]:
recommended_movies = get_top_n_recommendations(610, 20)
recommended_movies

Unnamed: 0,movieId,title,genres
257,296,Pulp Fiction (1994),"['Comedy', 'Crime', 'Drama', 'Thriller']"
474,541,Blade Runner (1982),"['Action', 'Sci-Fi', 'Thriller']"
613,778,Trainspotting (1996),"['Comedy', 'Crime', 'Drama']"
828,1089,Reservoir Dogs (1992),"['Crime', 'Mystery', 'Thriller']"
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,"['Action', 'Adventure']"
922,1221,"Godfather: Part II, The (1974)","['Crime', 'Drama']"
939,1240,"Terminator, The (1984)","['Action', 'Sci-Fi', 'Thriller']"
1939,2571,"Matrix, The (1999)","['Action', 'Sci-Fi', 'Thriller']"
2674,3578,Gladiator (2000),"['Action', 'Adventure', 'Drama']"
2983,3996,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...","['Action', 'Drama', 'Romance']"


In [19]:
def get_favorite_genres(user_id, df_ratings, df_movies, top_n=3):
    """
    Calculates and returns the user's favorite genres based on their historical ratings.

    Parameters
    ----------
    user_id : int
        The user ID whose favorite genres are to be determined.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings.
    df_movies : pd.DataFrame
        DataFrame containing movie details including genres.
    top_n : int, optional
        Number of top genres to return, default is 3.

    Returns
    -------
    dict
        A dictionary containing:
        - 'top_genres': list of the user's top_n favorite genres.
        - 'average_ratings': dictionary mapping each genre to its average rating by the user.
        - 'genre_counts': dictionary mapping each genre to the number of times it has been rated by the user.
    """
    user_ratings = df_ratings[df_ratings['userId'] == user_id]

    user_ratings.loc[:, 'movieId'] = user_ratings['movieId'].astype(int)
    df_movies.loc[:, 'movieId'] = df_movies['movieId'].astype(int)

    user_genres = user_ratings.merge(df_movies[['movieId']], on='movieId', how='left')

    genre_ratings = {}
    genre_counts = {}
    for index, row in user_genres.iterrows():
        if pd.isnull(row['genres']):
            continue
        genres = eval(row['genres'])
        rating = row['rating']
        for genre in genres:
            if genre in genre_ratings:
                genre_ratings[genre].append(rating)
                genre_counts[genre] += 1
            else:
                genre_ratings[genre] = [rating]
                genre_counts[genre] = 1

    max_count = max(genre_counts.values(), default=0)

    genre_preferences = {}
    genre_avg_ratings = {}
    for genre, ratings in genre_ratings.items():
        average_rating = np.mean(ratings)
        normalized_count = genre_counts[genre] / max_count if max_count > 0 else 0
        count_weight = np.tanh(normalized_count)
        preference_score = average_rating * count_weight
        genre_preferences[genre] = preference_score
        genre_avg_ratings[genre] = average_rating

    sorted_genres = sorted(genre_preferences.items(), key=lambda x: x[1], reverse=True)
    
    detailed_output = {
        'top_genres': [genre for genre, _ in sorted_genres[:top_n]],
        'average_ratings': genre_avg_ratings,
        'genre_counts': genre_counts
    }
    print(detailed_output) 

    return detailed_output

In [20]:
def get_top_n_genre_based_recommendations(user_id, n, df_ratings, df_movies):
    """
    Generates top N genre-based movie recommendations for a given user.

    Parameters
    ----------
    user_id : int
        The user ID for whom genre-based recommendations are to be made.
    n : int
        Number of top recommendations to generate based on the user's favorite genres.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings.
    df_movies : pd.DataFrame
        DataFrame containing movie details including genres.

    Returns
    -------
    pd.DataFrame
        DataFrame containing top N recommended movies filtered by user's top genres,
        with columns: movieId, title, and genres.
    """
    genre_data = get_favorite_genres(user_id, df_ratings, df_movies)
    top_genres = genre_data['top_genres']

    if user_id not in user_categories.categories:
        return pd.DataFrame()

    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)

    genre_filtered_movies = df_movies[df_movies['genres'].apply(lambda x: any(genre in x for genre in top_genres))]

    genre_filtered_indices = item_categories.categories.get_indexer(genre_filtered_movies['movieId'])

    genre_filtered_ratings = predicted_ratings[genre_filtered_indices]
    top_n_indices = np.argsort(genre_filtered_ratings)[-n:]

    top_n_movie_ids = genre_filtered_movies.iloc[top_n_indices]['movieId']
    return df_movies[df_movies['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]

In [21]:
# Example of generating genre-based recommendations
recommended_movies = get_top_n_genre_based_recommendations(610, 20, traindf, df_movies)
recommended_movies

{'top_genres': ['Drama', 'Action', 'Thriller'], 'average_ratings': {'Adventure': 3.6940639269406392, 'Animation': 3.9098360655737703, 'Children': 3.66, 'Comedy': 3.68562874251497, 'Fantasy': 3.5381679389312977, 'Mystery': 3.75, 'Sci-Fi': 3.658974358974359, 'Thriller': 3.5561097256857854, 'Crime': 3.738425925925926, 'Action': 3.559610705596107, 'Horror': 3.5040983606557377, 'Drama': 3.8324468085106385, 'War': 3.782051282051282, 'Romance': 3.6907216494845363, 'Western': 3.76, 'Film-Noir': 4.357142857142857, 'Musical': 3.9615384615384617, 'Documentary': 4.2, 'IMAX': 3.6}, 'genre_counts': {'Adventure': 219, 'Animation': 61, 'Children': 50, 'Comedy': 334, 'Fantasy': 131, 'Mystery': 96, 'Sci-Fi': 195, 'Thriller': 401, 'Crime': 216, 'Action': 411, 'Horror': 244, 'Drama': 376, 'War': 39, 'Romance': 97, 'Western': 25, 'Film-Noir': 7, 'Musical': 13, 'Documentary': 5, 'IMAX': 70}}


Unnamed: 0,movieId,title,genres
43,47,Seven (a.k.a. Se7en) (1995),"['Mystery', 'Thriller']"
257,296,Pulp Fiction (1994),"['Comedy', 'Crime', 'Drama', 'Thriller']"
474,541,Blade Runner (1982),"['Action', 'Sci-Fi', 'Thriller']"
613,778,Trainspotting (1996),"['Comedy', 'Crime', 'Drama']"
828,1089,Reservoir Dogs (1992),"['Crime', 'Mystery', 'Thriller']"
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,"['Action', 'Adventure']"
922,1221,"Godfather: Part II, The (1974)","['Crime', 'Drama']"
939,1240,"Terminator, The (1984)","['Action', 'Sci-Fi', 'Thriller']"
1939,2571,"Matrix, The (1999)","['Action', 'Sci-Fi', 'Thriller']"
2674,3578,Gladiator (2000),"['Action', 'Adventure', 'Drama']"


In [22]:
def load_test_data(test_file_path):
    """
    Loads test dataset and aligns user and movie IDs with the training dataset categories.

    Parameters
    ----------
    test_file_path : str
        Path to the CSV file containing the test dataset.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the test dataset with user and movie IDs aligned to training categories.

    """
    df_test = pd.read_csv(test_file_path)
    
    df_test['userId'] = pd.Categorical(df_test['userId'], categories=user_categories.categories)
    df_test['movieId'] = pd.Categorical(df_test['movieId'], categories=item_categories.categories)
    
    return df_test

In [23]:
testdf = load_test_data('./datasets/testing_data.csv')
test_rating_matrix = np.zeros((user_categories.categories.size, item_categories.categories.size))

test_user_ids = testdf['userId'].cat.codes
test_item_ids = testdf['movieId'].cat.codes

test_rating_matrix[test_user_ids, test_item_ids] = testdf['rating']

predicted_test_ratings = np.dot(W, H)
test_predicted_ratings = predicted_test_ratings[test_user_ids, test_item_ids]

In [24]:
mse_test = mean_squared_error(testdf['rating'], test_predicted_ratings)
rmse_test = sqrt(mse_test)
print("Test RMSE:", rmse_test)

Test RMSE: 3.1593882771567148
