In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt

from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler, normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_data(ratings_file_path, features_file_path):
    """
    Loads and returns datasets for ratings and movie features from specified file paths.

    Parameters
    ----------
    ratings_file_path : str
        Path to the CSV file containing user ratings with columns: userId, movieId, and rating.
    features_file_path : str
        Path to the CSV file containing movie features.

    Returns
    -------
    tuple of pd.DataFrame
        Returns two pandas DataFrames: one for ratings and another for movie features.

    """
    dtype_ratings = {
        'userId': 'int32',
        'movieId': 'int32',
        'rating': 'float32'
    }
    dtype_features = {
        'movieId': 'int32'
    }

    df_ratings = pd.read_csv(ratings_file_path, dtype=dtype_ratings)
    df_features = pd.read_csv(features_file_path, dtype=dtype_features)
    
    return df_ratings, df_features

In [3]:
traindf, df_movies = load_data('./datasets/training_data.csv', './datasets/movies.csv')

In [4]:
# Ensure 'movieId' columns are of type int
traindf['movieId'] = traindf['movieId'].astype('int')
df_movies['movieId'] = df_movies['movieId'].astype('int')

In [5]:
# Align movie features to training data
df_movies_aligned = df_movies[df_movies['movieId'].isin(traindf['movieId'].unique())]
df_movies_aligned = df_movies_aligned.dropna(subset=['genres'])

In [6]:
# Convert user and movie IDs to categorical types
user_categories = pd.Categorical(traindf['userId'])
item_categories = pd.Categorical(traindf['movieId'])

In [7]:
# Convert user and item IDs to numerical codes
user_ids = user_categories.codes
item_ids = item_categories.codes

In [8]:
# Create rating matrix
rating_matrix = np.zeros((user_categories.categories.size, item_categories.categories.size))
rating_matrix[user_ids, item_ids] = traindf['rating']

In [9]:
# Scale the rating matrix
scaler = MinMaxScaler(feature_range=(0.5, 5))
rating_matrix_scaled = scaler.fit_transform(rating_matrix)

In [10]:
# Repeat alignment of movie features to training data
df_movies_aligned = df_movies[df_movies['movieId'].isin(traindf['movieId'].unique())]
df_movies_aligned = df_movies_aligned.dropna(subset=['genres'])

In [11]:
# Vectorize movie genres using TF-IDF
vectorizer = TfidfVectorizer(max_features=100)
tags_features = vectorizer.fit_transform(df_movies_aligned['genres'].fillna(''))

In [12]:
# Calculate user weights based on rating counts
user_counts = traindf['userId'].value_counts()
user_weights = user_counts / user_counts.max()

# Adjust scaled rating matrix by user weights
for idx, row in traindf.iterrows():
    user_idx = user_categories.categories.get_loc(row['userId'])
    item_idx = item_categories.categories.get_loc(row['movieId'])
    user_id = row['userId']
    rating_matrix_scaled[user_idx, item_idx] *= user_weights.loc[user_id]

In [13]:
importance_of_genre = 0.5
content_weighted_features = tags_features.multiply(importance_of_genre).toarray()
full_features_matrix = np.hstack([rating_matrix_scaled.T, content_weighted_features]).T

In [14]:
# NMF with importance_of_genre = 0.5
model = NMF(n_components=15, init='nndsvd', max_iter=100, random_state=42)
W = model.fit_transform(full_features_matrix)
H = model.components_

In [15]:
def get_top_n_recommendations(user_id, n, W, H):
    """
    Generates top N movie recommendations for a given user based on NMF model predictions.

    Parameters
    ----------
    user_id : int
        The user ID for whom recommendations are to be made.
    n : int
        Number of top recommendations to generate.
    W : np.array
        User feature matrix from NMF.
    H : np.array
        Item feature matrix from NMF.

    Returns
    -------
    pd.DataFrame
        DataFrame containing top N recommended movies with columns: movieId, title, and genres.
    """
    if user_id not in user_categories.categories:
        return pd.DataFrame()

    # Predict ratings for the user
    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)

    # Get top N movie recommendations
    top_n_indices = np.argsort(predicted_ratings)[-n:]
    top_n_movie_ids = item_categories.categories[top_n_indices]

    return df_movies[df_movies['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]

In [None]:
recommended_movies = get_top_n_recommendations(45, 20, W, H)
recommended_movies

In [None]:
def get_favorite_genres(user_id, df_ratings, df_movies, top_n=3):
    """
    Calculates and returns the user's favorite genres based on their historical ratings.

    Parameters
    ----------
    user_id : int
        The user ID whose favorite genres are to be determined.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings.
    df_movies : pd.DataFrame
        DataFrame containing movie details including genres.
    top_n : int, optional
        Number of top genres to return, default is 3.

    Returns
    -------
    dict
        A dictionary containing:
        - 'top_genres': list of the user's top_n favorite genres.
        - 'average_ratings': dictionary mapping each genre to its average rating by the user.
        - 'genre_counts': dictionary mapping each genre to the number of times it has been rated by the user.
    """
    user_ratings = df_ratings[df_ratings['userId'] == user_id]
    user_ratings.loc[:, 'movieId'] = user_ratings['movieId'].astype(int)
    df_movies.loc[:, 'movieId'] = df_movies['movieId'].astype(int)

    # Merge user ratings with movie genres
    user_genres = user_ratings.merge(df_movies[['movieId']], on='movieId', how='left')
    # Calculate genre ratings and counts
    genre_ratings = {}
    genre_counts = {}
    for index, row in user_genres.iterrows():
        if pd.isnull(row['genres']):
            continue
        genres = eval(row['genres'])
        rating = row['rating']
        for genre in genres:
            if genre in genre_ratings:
                genre_ratings[genre].append(rating)
                genre_counts[genre] += 1
            else:
                genre_ratings[genre] = [rating]
                genre_counts[genre] = 1

    max_count = max(genre_counts.values(), default=0)
    
    # Calculate preference scores for genres
    genre_preferences = {}
    genre_avg_ratings = {}
    for genre, ratings in genre_ratings.items():
        average_rating = np.mean(ratings)
        normalized_count = genre_counts[genre] / max_count if max_count > 0 else 0
        count_weight = np.tanh(normalized_count)
        preference_score = average_rating * count_weight
        genre_preferences[genre] = preference_score
        genre_avg_ratings[genre] = average_rating
    
    # Sort genres by preference score
    sorted_genres = sorted(genre_preferences.items(), key=lambda x: x[1], reverse=True)
    
    detailed_output = {
        'top_genres': [genre for genre, _ in sorted_genres[:top_n]],
        #'average_ratings': genre_avg_ratings,
        #'genre_counts': genre_counts,
        'genre_preferences': genre_preferences
    }
    print(detailed_output) 

    return detailed_output

In [None]:
def get_top_n_genre_based_recommendations(user_id, n, df_ratings, df_movies):
    """
    Generates top N genre-based movie recommendations for a given user.

    Parameters
    ----------
    user_id : int
        The user ID for whom genre-based recommendations are to be made.
    n : int
        Number of top recommendations to generate based on the user's favorite genres.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings.
    df_movies : pd.DataFrame
        DataFrame containing movie details including genres.

    Returns
    -------
    pd.DataFrame
        DataFrame containing top N recommended movies filtered by user's top genres,
        with columns: movieId, title, and genres.
    """
    genre_data = get_favorite_genres(user_id, df_ratings, df_movies, 5)
    top_genres = genre_data['top_genres']
    
    if user_id not in user_categories.categories:
        return pd.DataFrame()
    
    # Predict ratings for the user
    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)

    # Filter movies by user's favorite genres
    genre_filtered_movies = df_movies[df_movies['genres'].apply(
        lambda x: any(genre in x for genre in top_genres))]

    # Get indices of genre-filtered movies
    genre_filtered_indices = item_categories.categories.get_indexer(genre_filtered_movies['movieId'])
    genre_filtered_ratings = predicted_ratings[genre_filtered_indices]
    
    # Get top N genre-based movie recommendations
    top_n_indices = np.argsort(genre_filtered_ratings)[-n:]
    top_n_movie_ids = genre_filtered_movies.iloc[top_n_indices]['movieId']
    
    return df_movies[df_movies['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]

In [None]:
recommended_movies = get_top_n_genre_based_recommendations(45, 20, traindf, df_movies)
recommended_movies

In [None]:
def recommend_movies_by_user_genre_choice(user_id, selected_genre, df_movies, df_ratings, n=10):
    """
    Recommends top N movies for a given user based on a chosen genre and predicted ratings from an NMF model.

    Parameters
    ----------
    user_id : int
        User ID for whom recommendations are to be made.
    selected_genre : str
        The genre chosen by the user to filter the recommendations.
    df_movies : pd.DataFrame
        DataFrame containing movie details including genres.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings.
    n : int
        Number of top recommendations to generate.

    Returns
    -------
    pd.DataFrame
        DataFrame containing top N recommended movies filtered by the chosen genre, sorted by predicted rating.
    """
    if user_id not in user_categories.categories:
        return pd.DataFrame()

    # Predict ratings for the user
    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)

    # Normalize genres to prevent case and spacing issues
    df_movies['genres'] = df_movies['genres'].str.lower().str.replace(r'\s+', '', regex=True)

    # Filter movies by the chosen genre using normalized genre strings
    genre_filtered_movies = df_movies[df_movies['genres'].apply(
        lambda x: selected_genre.lower().replace(' ', '') in x)]

    # Merge predicted ratings with movie details
    genre_filtered_indices = item_categories.categories.get_indexer(genre_filtered_movies['movieId'])
    genre_filtered_ratings = predicted_ratings[genre_filtered_indices]
    
    # Get top N genre-based movie recommendations
    top_n_indices = np.argsort(genre_filtered_ratings)[-n:]                                  
    top_n_movie_ids = genre_filtered_movies.iloc[top_n_indices]['movieId']
    
    return df_movies[df_movies['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]

In [None]:
recommended_movies = recommend_movies_by_user_genre_choice(
    user_id=45, selected_genre='documentary', df_movies=df_movies, df_ratings=traindf,n=20)
recommended_movies

In [16]:
def get_least_favorite_genres(user_id, df_ratings, df_movies, bottom_n=3):
    """
    Calculates and returns the user's least favorite genres based on their historical ratings.

    Parameters
    ----------
    user_id : int
        The user ID whose least favorite genres are to be determined.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings.
    df_movies : pd.DataFrame
        DataFrame containing movie details including genres.
    bottom_n : int, optional
        Number of least favorite genres to return, default is 3.

    Returns
    -------
    dict
        A dictionary containing:
        - 'bottom_genres': list of the user's bottom_n least favorite genres.
        - 'genre_preferences': dictionary mapping each genre to its preference score.
    """
    user_ratings = df_ratings[df_ratings['userId'] == user_id]
    user_ratings.loc[:, 'movieId'] = user_ratings['movieId'].astype(int)
    df_movies.loc[:, 'movieId'] = df_movies['movieId'].astype(int)

    # Merge user ratings with movie genres
    user_genres = user_ratings.merge(df_movies[['movieId']], on='movieId', how='left')

    # Calculate genre ratings and counts
    genre_ratings = {}
    genre_counts = {}
    for index, row in user_genres.iterrows():
        if pd.isnull(row['genres']):
            continue
        genres = eval(row['genres'])
        rating = row['rating']
        for genre in genres:
            if genre in genre_ratings:
                genre_ratings[genre].append(rating)
                genre_counts[genre] += 1
            else:
                genre_ratings[genre] = [rating]
                genre_counts[genre] = 1

    max_count = max(genre_counts.values(), default=0)

    # Calculate preference scores for genres
    genre_preferences = {}
    genre_avg_ratings = {}
    for genre, ratings in genre_ratings.items():
        average_rating = np.mean(ratings)
        normalized_count = genre_counts[genre] / max_count if max_count > 0 else 0
        count_weight = np.tanh(normalized_count)
        preference_score = average_rating * count_weight
        genre_preferences[genre] = preference_score
        genre_avg_ratings[genre] = average_rating

    # Sort genres by preference score
    sorted_genres = sorted(genre_preferences.items(), key=lambda x: x[1])

    detailed_output = {
        'bottom_genres': [genre for genre, _ in sorted_genres[:bottom_n]],
        'genre_preferences': genre_preferences
    }
    print(detailed_output)

    return detailed_output

In [21]:
def get_bottom_n_genre_based_recommendations(user_id, n, df_ratings, df_movies):
    """
    Generates top N genre-based movie recommendations for a given user from their least liked genres.

    Parameters
    ----------
    user_id : int
        The user ID for whom genre-based recommendations are to be made.
    n : int
        Number of top recommendations to generate based on the user's least favorite genres.
    df_ratings : pd.DataFrame
        DataFrame containing user ratings.
    df_movies : pd.DataFrame
        DataFrame containing movie details including genres.

    Returns
    -------
    pd.DataFrame
        DataFrame containing top N recommended movies filtered by user's least favorite genres,
        with columns: movieId, title, and genres.
    """
    genre_data = get_least_favorite_genres(user_id, df_ratings, df_movies, 5)
    bottom_genres = genre_data['bottom_genres']
    
    if user_id not in user_categories.categories:
        return pd.DataFrame()

    # Predict ratings for the user
    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)
    
    # Filter movies by user's least favorite genres
    genre_filtered_movies = df_movies[df_movies['genres'].apply(
        lambda x: any(genre in x for genre in bottom_genres))]
    
    # Get indices of genre-filtered movies
    genre_filtered_indices = item_categories.categories.get_indexer(genre_filtered_movies['movieId'])
    genre_filtered_ratings = predicted_ratings[genre_filtered_indices]

    # Get top N genre-based movie recommendations
    top_n_indices = np.argsort(genre_filtered_ratings)[-n:]
    top_n_movie_ids = genre_filtered_movies.iloc[top_n_indices]['movieId']
    
    return df_movies[df_movies['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]

In [22]:
recommended_movies_least_liked_genres = get_bottom_n_genre_based_recommendations(45, 20, traindf, df_movies)
recommended_movies_least_liked_genres

{'bottom_genres': ['Documentary', 'Western', 'IMAX', 'Horror', 'Mystery'], 'genre_preferences': {'Adventure': 1.7835369232562608, 'Animation': 0.6588345031240023, 'Children': 0.9426685403185072, 'Comedy': 2.8355782413710173, 'Fantasy': 0.8025422347473031, 'Action': 1.9198085515923424, 'Crime': 0.9961284207513251, 'Thriller': 1.639181929970816, 'Romance': 1.166758131358318, 'Mystery': 0.35928774780234174, 'Sci-Fi': 1.3004876507225789, 'Horror': 0.2792870881994441, 'Drama': 2.1923677650677327, 'War': 0.36819570849165606, 'Musical': 0.5622658486653132, 'IMAX': 0.16659584112392364, 'Western': 0.06546923173469396, 'Documentary': 0.023809242616374033}}


Unnamed: 0,movieId,title,genres
686,904,Rear Window (1954),"['Mystery', 'Thriller']"
705,923,Citizen Kane (1941),"['Drama', 'Mystery']"
828,1089,Reservoir Dogs (1992),"['Crime', 'Mystery', 'Thriller']"
915,1214,Alien (1979),"['Horror', 'Sci-Fi']"
920,1219,Psycho (1960),"['Crime', 'Horror']"
957,1258,"Shining, The (1980)",['Horror']
1218,1617,L.A. Confidential (1997),"['Crime', 'Film-Noir', 'Mystery', 'Thriller']"
2326,3081,Sleepy Hollow (1999),"['Fantasy', 'Horror', 'Mystery', 'Romance']"
2641,3535,American Psycho (2000),"['Crime', 'Horror', 'Mystery', 'Thriller']"
3141,4226,Memento (2000),"['Mystery', 'Thriller']"


In [None]:
def load_test_data(test_file_path):
    """
    Loads test dataset and aligns user and movie IDs with the training dataset categories.

    Parameters
    ----------
    test_file_path : str
        Path to the CSV file containing the test dataset.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the test dataset with user and movie IDs aligned to training categories.
    """
    df_test = pd.read_csv(test_file_path)
    
    df_test['userId'] = pd.Categorical(df_test['userId'], categories=user_categories.categories)
    df_test['movieId'] = pd.Categorical(df_test['movieId'], categories=item_categories.categories)
    
    return df_test

In [None]:
testdf = load_test_data('./datasets/testing_data.csv')
test_rating_matrix = np.zeros((user_categories.categories.size, item_categories.categories.size))

test_user_ids = testdf['userId'].cat.codes
test_item_ids = testdf['movieId'].cat.codes

test_rating_matrix[test_user_ids, test_item_ids] = testdf['rating']

predicted_test_ratings = np.dot(W, H)
test_predicted_ratings = predicted_test_ratings[test_user_ids, test_item_ids]

In [None]:
mse_test = mean_squared_error(testdf['rating'], test_predicted_ratings)
rmse_test = sqrt(mse_test)
print("Test MSE:", mse_test)
print("Test RMSE:", rmse_test)

In [None]:
def calculate_precision_recall_f1(testdf, predicted_ratings, k=10):
    """
    Calculates precision, recall, and F1-score for top-k recommendations.

    Parameters
    ----------
    testdf : pd.DataFrame
        DataFrame containing the test dataset.
    predicted_ratings : np.array
        Array of predicted ratings.
    k : int, optional
        Number of top recommendations to consider, default is 10.

    Returns
    -------
    tuple
        Precision, recall, and F1-score.
    """
    testdf['predicted_rating'] = predicted_ratings
    testdf_sorted = testdf.sort_values(by='predicted_rating', ascending=False)
    top_k_recommendations = testdf_sorted.groupby('userId').head(k)

    true_positives = (top_k_recommendations['rating'] >= 4).sum()
    precision = true_positives / (k * testdf['userId'].nunique())
    recall = true_positives / (testdf['rating'] >= 4).sum()

    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

precision, recall, f1 = calculate_precision_recall_f1(testdf, test_predicted_ratings)
print(f"Precision: {precision}, Recall: {recall}, F1-Score: {f1}")

In [None]:
mae_test = mean_absolute_error(testdf['rating'], test_predicted_ratings)
print("Test MAE:", mae_test)

In [None]:
def calculate_coverage(top_n_recommendations, all_items):
    """
    Calculates the coverage of recommendations.

    Parameters
    ----------
    top_n_recommendations : pd.DataFrame
        DataFrame containing the top N recommended movies.
    all_items : pd.DataFrame
        DataFrame containing all items.

    Returns
    -------
    float
        Coverage value.
    """
    recommended_items = set(top_n_recommendations['movieId'].unique())
    all_items_set = set(all_items['movieId'].unique())
    coverage = len(recommended_items) / len(all_items_set)
    return coverage

coverage = calculate_coverage(recommended_movies, df_movies)
print("Coverage:", coverage)

In [None]:
def calculate_hit_rate(testdf, predicted_ratings, k=10):
    """
    Calculates the hit rate of top-k recommendations.

    Parameters
    ----------
    testdf : pd.DataFrame
        DataFrame containing the test dataset.
    predicted_ratings : np.array
        Array of predicted ratings.
    k : int, optional
        Number of top recommendations to consider, default is 10.

    Returns
    -------
    float
        Hit rate value.
    """
    testdf['predicted_rating'] = predicted_ratings
    testdf_sorted = testdf.sort_values(by='predicted_rating', ascending=False)
    top_k_recommendations = testdf_sorted.groupby('userId').head(k)

    hits = top_k_recommendations[top_k_recommendations['rating'] >= 4].groupby('userId').size()
    hit_rate = hits.sum() / testdf['userId'].nunique()
    return hit_rate

hit_rate = calculate_hit_rate(testdf, test_predicted_ratings)
print("Hit Rate:", hit_rate)

In [None]:
def precision_at_k(testdf, predicted_ratings, k=10):
    """
    Calculates precision at top-k recommendations.

    Parameters
    ----------
    testdf : pd.DataFrame
        DataFrame containing the test dataset.
    predicted_ratings : np.array
        Array of predicted ratings.
    k : int, optional
        Number of top recommendations to consider, default is 10.

    Returns
    -------
    float
        Precision at top-k value.
    """
    testdf['predicted_rating'] = predicted_ratings
    testdf_sorted = testdf.sort_values(by='predicted_rating', ascending=False)
    top_k_recommendations = testdf_sorted.groupby('userId').head(k)

    precision_at_k = (top_k_recommendations['rating'] >= 4).sum() / (k * testdf['userId'].nunique())
    return precision_at_k

def recall_at_k(testdf, predicted_ratings, k=10):
    """
    Calculates recall at top-k recommendations.

    Parameters
    ----------
    testdf : pd.DataFrame
        DataFrame containing the test dataset.
    predicted_ratings : np.array
        Array of predicted ratings.
    k : int, optional
        Number of top recommendations to consider, default is 10.

    Returns
    -------
    float
        Recall at top-k value.
    """
    testdf['predicted_rating'] = predicted_ratings
    testdf_sorted = testdf.sort_values(by='predicted_rating', ascending=False)
    top_k_recommendations = testdf_sorted.groupby('userId').head(k)

    recall_at_k = (top_k_recommendations['rating'] >= 4).sum() / (testdf['rating'] >= 4).sum()
    return recall_at_k

precision_k = precision_at_k(testdf, test_predicted_ratings, k=10)
recall_k = recall_at_k(testdf, test_predicted_ratings, k=10)
print(f"Precision@10: {precision_k}, Recall@10: {recall_k}")