In [25]:
#Import necessary libraries

import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt

from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler, normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error

In [26]:
# Function to load data from CSV files

def load_data(ratings_file_path, features_file_path):
    # Specify data types for efficient memory usage
    dtype_ratings = {
        'userId': 'int32',
        'movieId': 'int32',
        'rating': 'float32'
    }
    dtype_features = {
        'movieId': 'int32'  # Same type as in ratings for consistency
    }

    # Load data with specified types
    df_ratings = pd.read_csv(ratings_file_path, dtype=dtype_ratings)
    df_features = pd.read_csv(features_file_path, dtype=dtype_features)
    
    return df_ratings, df_features

In [27]:
# Load training data and movie features

traindf, df_movies = load_data('./datasets/training_data.csv', './datasets/movies.csv')

In [28]:
# Ensure both movieId columns are of the same type
traindf['movieId'] = traindf['movieId'].astype('int')
df_movies['movieId'] = df_movies['movieId'].astype('int')

In [29]:
# Filter df_movies to only include movieIds that are present in traindf
df_movies_aligned = df_movies[df_movies['movieId'].isin(traindf['movieId'].unique())]

In [30]:
# Optionally, drop rows in df_movies_aligned where genres are NaN
df_movies_aligned = df_movies_aligned.dropna(subset=['genres'])

In [31]:
# Convert userId and movieId to categorical data types

user_categories = pd.Categorical(traindf['userId'])
item_categories = pd.Categorical(traindf['movieId'])

In [32]:
# Get numeric codes for users and items from categorical types

user_ids = user_categories.codes
item_ids = item_categories.codes

In [33]:
# Initialize the user-item rating matrix

rating_matrix = np.zeros((user_categories.categories.size, item_categories.categories.size))
rating_matrix[user_ids, item_ids] = traindf['rating']

In [34]:
# Scale ratings to be between 0.5 and 5 using MinMaxScaler

scaler = MinMaxScaler(feature_range=(0.5, 5))
rating_matrix_scaled = scaler.fit_transform(rating_matrix)

In [35]:
# Filter df_movies to only include movieIds that are present in traindf
df_movies_aligned = df_movies[df_movies['movieId'].isin(traindf['movieId'].unique())]

# Optionally, drop rows in df_movies_aligned where genres are NaN if needed (seems you were already doing this)
df_movies_aligned = df_movies_aligned.dropna(subset=['genres'])

In [36]:
# Convert movie genres to a TF-IDF matrix

vectorizer = TfidfVectorizer(max_features=100)
tags_features = vectorizer.fit_transform(df_movies_aligned['genres'].fillna(''))

In [37]:
# Adjust user ratings based on user activity

user_counts = traindf['userId'].value_counts()
user_weights = user_counts / user_counts.max()

for idx, row in traindf.iterrows():
    user_idx = user_categories.categories.get_loc(row['userId'])
    item_idx = item_categories.categories.get_loc(row['movieId'])
    user_id = row['userId']
    rating_matrix_scaled[user_idx, item_idx] *= user_weights.loc[user_id]

In [38]:
# Combine user-item interactions with content features and weight them

importance_of_genre = 0.5
content_weighted_features = tags_features.multiply(importance_of_genre).toarray()

full_features_matrix = np.hstack([rating_matrix_scaled.T, content_weighted_features]).T

In [39]:
# Set up and train the NMF model

model = NMF(n_components=15, init='nndsvd', max_iter=100, random_state=42)
W = model.fit_transform(full_features_matrix)
H = model.components_



In [40]:
# Function to get top N movie recommendations for a user

def get_top_n_recommendations(user_id, n):
    if user_id not in user_categories.categories:
        return pd.DataFrame()
    
    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)
    top_n_indices = np.argsort(predicted_ratings)[-n:]
    top_n_movie_ids = item_categories.categories[top_n_indices]
    
    return df_movies[df_movies['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]

In [41]:
# Example usage of the recommendation function

recommended_movies = get_top_n_recommendations(45, 20)
recommended_movies

Unnamed: 0,movieId,title,genres
706,924,2001: A Space Odyssey (1968),"['Adventure', 'Drama', 'Sci-Fi']"
828,1089,Reservoir Dogs (1992),"['Crime', 'Mystery', 'Thriller']"
863,1136,Monty Python and the Holy Grail (1975),"['Adventure', 'Comedy', 'Fantasy']"
909,1208,Apocalypse Now (1979),"['Action', 'Drama', 'War']"
914,1213,Goodfellas (1990),"['Crime', 'Drama']"
920,1219,Psycho (1960),"['Crime', 'Horror']"
957,1258,"Shining, The (1980)",['Horror']
964,1265,Groundhog Day (1993),"['Comedy', 'Fantasy', 'Romance']"
1267,1682,"Truman Show, The (1998)","['Comedy', 'Drama', 'Sci-Fi']"
1298,1732,"Big Lebowski, The (1998)","['Comedy', 'Crime']"


In [42]:
########

In [43]:
def get_favorite_genres(user_id, df_ratings, df_movies, top_n=3):
    user_ratings = df_ratings[df_ratings['userId'] == user_id]

    user_ratings.loc[:, 'movieId'] = user_ratings['movieId'].astype(int)
    df_movies.loc[:, 'movieId'] = df_movies['movieId'].astype(int)

    user_genres = user_ratings.merge(df_movies[['movieId']], on='movieId', how='left')

    # Calculate the average rating for each genre and count occurrences
    genre_ratings = {}
    genre_counts = {}
    for index, row in user_genres.iterrows():
        if pd.isnull(row['genres']):
            continue  # Skip rows where genres are NaN
        genres = eval(row['genres'])  # Assuming 'genres' are stored as string representation of lists
        rating = row['rating']
        for genre in genres:
            if genre in genre_ratings:
                genre_ratings[genre].append(rating)
                genre_counts[genre] += 1
            else:
                genre_ratings[genre] = [rating]
                genre_counts[genre] = 1

    max_count = max(genre_counts.values(), default=0)  # Find the maximum count to normalize counts

    # Calculate preference scores using tanh function
    genre_preferences = {}
    genre_avg_ratings = {}
    for genre, ratings in genre_ratings.items():
        average_rating = np.mean(ratings)
        normalized_count = genre_counts[genre] / max_count if max_count > 0 else 0
        count_weight = np.tanh(normalized_count)  # Apply tanh to normalized counts
        preference_score = average_rating * count_weight
        genre_preferences[genre] = preference_score
        genre_avg_ratings[genre] = average_rating  # Store the average rating for each genre

    sorted_genres = sorted(genre_preferences.items(), key=lambda x: x[1], reverse=True)
    
    # Prepare output with detailed information
    detailed_output = {
        'top_genres': [genre for genre, _ in sorted_genres[:top_n]],
        'average_ratings': genre_avg_ratings,
        'genre_counts': genre_counts
    }
    print(detailed_output)  # Print the detailed results for debugging

    return detailed_output  # Return the detailed genre preferences, counts, and ratings


In [44]:
get_favorite_genres(45, traindf, df_movies)

{'top_genres': ['Comedy', 'Drama', 'Action'], 'average_ratings': {'Adventure': 3.9814814814814814, 'Animation': 4.46, 'Children': 3.9390243902439024, 'Comedy': 3.7232142857142856, 'Fantasy': 3.7027027027027026, 'Action': 3.9943181818181817, 'Crime': 4.2625, 'Thriller': 4.006849315068493, 'Romance': 3.9607843137254903, 'Mystery': 4.321428571428571, 'Sci-Fi': 3.737704918032787, 'Horror': 3.9166666666666665, 'Drama': 4.044117647058823, 'War': 4.428571428571429, 'Musical': 4.318181818181818, 'IMAX': 4.666666666666667, 'Western': 3.6666666666666665, 'Documentary': 4.0}, 'genre_counts': {'Adventure': 81, 'Animation': 25, 'Children': 41, 'Comedy': 168, 'Fantasy': 37, 'Action': 88, 'Crime': 40, 'Thriller': 73, 'Romance': 51, 'Mystery': 14, 'Sci-Fi': 61, 'Horror': 12, 'Drama': 102, 'War': 14, 'Musical': 22, 'IMAX': 6, 'Western': 3, 'Documentary': 1}}


{'top_genres': ['Comedy', 'Drama', 'Action'],
 'average_ratings': {'Adventure': 3.9814814814814814,
  'Animation': 4.46,
  'Children': 3.9390243902439024,
  'Comedy': 3.7232142857142856,
  'Fantasy': 3.7027027027027026,
  'Action': 3.9943181818181817,
  'Crime': 4.2625,
  'Thriller': 4.006849315068493,
  'Romance': 3.9607843137254903,
  'Mystery': 4.321428571428571,
  'Sci-Fi': 3.737704918032787,
  'Horror': 3.9166666666666665,
  'Drama': 4.044117647058823,
  'War': 4.428571428571429,
  'Musical': 4.318181818181818,
  'IMAX': 4.666666666666667,
  'Western': 3.6666666666666665,
  'Documentary': 4.0},
 'genre_counts': {'Adventure': 81,
  'Animation': 25,
  'Children': 41,
  'Comedy': 168,
  'Fantasy': 37,
  'Action': 88,
  'Crime': 40,
  'Thriller': 73,
  'Romance': 51,
  'Mystery': 14,
  'Sci-Fi': 61,
  'Horror': 12,
  'Drama': 102,
  'War': 14,
  'Musical': 22,
  'IMAX': 6,
  'Western': 3,
  'Documentary': 1}}

In [50]:
def get_top_n_genre_based_recommendations(user_id, n, df_ratings, df_movies):
    # Get the top genres for the user
    genre_data = get_favorite_genres(user_id, df_ratings, df_movies)
    top_genres = genre_data['top_genres']

    if user_id not in user_categories.categories:
        return pd.DataFrame()

    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)

    # Filter movies by top genres
    genre_filtered_movies = df_movies[df_movies['genres'].apply(lambda x: any(genre in x for genre in top_genres))]

    # Get indices of genre-filtered movies in the full movie list
    genre_filtered_indices = item_categories.categories.get_indexer(genre_filtered_movies['movieId'])

    # Select predictions for genre-filtered movies and get top N recommendations
    genre_filtered_ratings = predicted_ratings[genre_filtered_indices]
    top_n_indices = np.argsort(genre_filtered_ratings)[-n:]

    top_n_movie_ids = genre_filtered_movies.iloc[top_n_indices]['movieId']
    return df_movies[df_movies['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]

recommended_movies = get_top_n_genre_based_recommendations(45, 20, traindf, df_movies)
recommended_movies

{'top_genres': ['Comedy', 'Drama', 'Action'], 'average_ratings': {'Adventure': 3.9814814814814814, 'Animation': 4.46, 'Children': 3.9390243902439024, 'Comedy': 3.7232142857142856, 'Fantasy': 3.7027027027027026, 'Action': 3.9943181818181817, 'Crime': 4.2625, 'Thriller': 4.006849315068493, 'Romance': 3.9607843137254903, 'Mystery': 4.321428571428571, 'Sci-Fi': 3.737704918032787, 'Horror': 3.9166666666666665, 'Drama': 4.044117647058823, 'War': 4.428571428571429, 'Musical': 4.318181818181818, 'IMAX': 4.666666666666667, 'Western': 3.6666666666666665, 'Documentary': 4.0}, 'genre_counts': {'Adventure': 81, 'Animation': 25, 'Children': 41, 'Comedy': 168, 'Fantasy': 37, 'Action': 88, 'Crime': 40, 'Thriller': 73, 'Romance': 51, 'Mystery': 14, 'Sci-Fi': 61, 'Horror': 12, 'Drama': 102, 'War': 14, 'Musical': 22, 'IMAX': 6, 'Western': 3, 'Documentary': 1}}


Unnamed: 0,movieId,title,genres
706,924,2001: A Space Odyssey (1968),"['Adventure', 'Drama', 'Sci-Fi']"
863,1136,Monty Python and the Holy Grail (1975),"['Adventure', 'Comedy', 'Fantasy']"
909,1208,Apocalypse Now (1979),"['Action', 'Drama', 'War']"
914,1213,Goodfellas (1990),"['Crime', 'Drama']"
964,1265,Groundhog Day (1993),"['Comedy', 'Fantasy', 'Romance']"
1267,1682,"Truman Show, The (1998)","['Comedy', 'Drama', 'Sci-Fi']"
1298,1732,"Big Lebowski, The (1998)","['Comedy', 'Crime']"
1905,2529,Planet of the Apes (1968),"['Action', 'Drama', 'Sci-Fi']"
2462,3275,"Boondock Saints, The (2000)","['Action', 'Crime', 'Drama', 'Thriller']"
2907,3897,Almost Famous (2000),['Drama']


In [46]:
########

In [47]:
# Function to load and prepare test data

def load_test_data(test_file_path):
    df_test = pd.read_csv(test_file_path)
    
    df_test['userId'] = pd.Categorical(df_test['userId'], categories=user_categories.categories)
    df_test['movieId'] = pd.Categorical(df_test['movieId'], categories=item_categories.categories)
    
    return df_test

In [48]:
# Load and predict test dataset ratings

testdf = load_test_data('./datasets/testing_data.csv')
test_rating_matrix = np.zeros((user_categories.categories.size, item_categories.categories.size))

test_user_ids = testdf['userId'].cat.codes
test_item_ids = testdf['movieId'].cat.codes

test_rating_matrix[test_user_ids, test_item_ids] = testdf['rating']

predicted_test_ratings = np.dot(W, H)
test_predicted_ratings = predicted_test_ratings[test_user_ids, test_item_ids]

In [49]:
# Calculate and print the test RMSE

mse_test = mean_squared_error(testdf['rating'], test_predicted_ratings)
rmse_test = sqrt(mse_test)
print("Test RMSE:", rmse_test)

Test RMSE: 3.1593882771567134
