In [None]:
#Import necessary libraries

import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt

from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler, normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error

In [None]:
# Function to load data from CSV files

def load_data(ratings_file_path, features_file_path):
    """Read the ratings table and the movie-features table from CSV.

    Parameters
    ----------
    ratings_file_path : path or file-like
        CSV with at least the columns ``userId``, ``movieId`` and ``rating``.
    features_file_path : path or file-like
        CSV with at least a ``movieId`` column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The ratings frame followed by the features frame.
    """
    # Both tables share the same compact ID dtype so the movieId columns
    # compare cleanly and use less memory than the default int64.
    id_dtype = 'int32'
    ratings_schema = {'userId': id_dtype, 'movieId': id_dtype, 'rating': 'float32'}
    features_schema = {'movieId': id_dtype}

    sources = (
        (ratings_file_path, ratings_schema),
        (features_file_path, features_schema),
    )
    df_ratings, df_features = (
        pd.read_csv(csv_source, dtype=schema) for csv_source, schema in sources
    )
    return df_ratings, df_features

In [None]:
# Read the training ratings and the movie metadata from the datasets folder

traindf, df_movies = load_data(
    ratings_file_path='./datasets/training_data.csv',
    features_file_path='./datasets/movies.csv',
)

In [None]:
# Wrap the training user and movie IDs in Categoricals; the distinct IDs in
# `.categories` are later used to size and index the rating matrix

user_categories, item_categories = (
    pd.Categorical(traindf[id_column]) for id_column in ('userId', 'movieId')
)

In [None]:
# Integer position of each training row's user / movie within the
# corresponding `.categories` index

user_ids, item_ids = user_categories.codes, item_categories.codes

In [None]:
# Dense user x movie matrix: one row per known user, one column per known
# movie; cells without a training rating stay at 0

rating_matrix = np.zeros(
    (len(user_categories.categories), len(item_categories.categories))
)
rating_matrix[user_ids, item_ids] = traindf['rating'].to_numpy()

In [None]:
# Rescale the rating matrix into the range [0.5, 5] with MinMaxScaler.
# MinMaxScaler works column by column (i.e. per movie here), and the zero
# placeholders of unrated cells take part in the rescaling as well.
# NOTE(review): confirm that treating unrated cells this way is intended.

scaler = MinMaxScaler(feature_range=(0.5, 5))
rating_matrix_scaled = scaler.fit(rating_matrix).transform(rating_matrix)

In [None]:
# Align movie metadata with the columns of the rating matrix.
#
# The result has exactly one row per training movie, in the same order as
# `item_categories.categories`, so that row i of any feature matrix built from
# it describes column i of the rating matrix. Training movies that are missing
# from the metadata (or have no genres) are kept with an empty genre string
# instead of being dropped, which would shift every following row.
# `df_movies` itself is left untouched so this cell can be re-run safely.

df_movies_aligned = (
    df_movies
    .drop_duplicates(subset='movieId')        # reindex needs unique labels
    .set_index('movieId')
    .reindex(item_categories.categories)      # training movies, matrix order
    .rename_axis('movieId')
    .reset_index()
    .assign(genres=lambda frame: frame['genres'].fillna(''))
)

assert len(df_movies_aligned) == len(item_categories.categories), \
    "movie features must have one row per rating-matrix column"

In [None]:
# Turn each movie's genre string into a TF-IDF row (at most 100 terms).
# NOTE(review): the default tokenizer splits on non-word characters, so
# hyphenated genre names would become several terms - verify against the data.

vectorizer = TfidfVectorizer(max_features=100)
tags_features = vectorizer.fit_transform(
    df_movies_aligned.loc[:, 'genres'].fillna(value='')
)

In [None]:
# Down-weight the ratings of less active users.
#
# Each user's weight is their number of training ratings divided by the
# largest such count, so the most active user has weight 1.
# NOTE: this updates `rating_matrix_scaled` in place; re-running the cell
# without re-running the scaling cell applies the weights a second time.

user_counts = traindf['userId'].value_counts()
user_weights = user_counts / user_counts.max()

# One weight per training row, looked up by userId in a single vectorised pass
# (replaces a Python-level iterrows loop with per-row index lookups).
per_rating_weights = traindf['userId'].map(user_weights).to_numpy()

# `np.multiply.at` is unbuffered: every training row contributes exactly one
# multiplication, even if the same (user, movie) pair appears more than once.
np.multiply.at(
    rating_matrix_scaled,
    (user_ids.astype(np.intp), item_ids.astype(np.intp)),
    per_rating_weights,
)

In [None]:
# Stack the weighted genre features underneath the user rows.
#
# Resulting layout: rows = users followed by genre terms, columns = movies.
# This presumes the rows of `tags_features` follow the same movie order as the
# columns of `rating_matrix_scaled` - verify against the alignment cell.

importance_of_genre = 0.5
content_weighted_features = (tags_features * importance_of_genre).toarray()

full_features_matrix = np.concatenate(
    [rating_matrix_scaled.T, content_weighted_features],
    axis=1,
).T

In [None]:
# Factorise the combined matrix with NMF into 15 latent components.
# W holds one latent row per row of `full_features_matrix`,
# H holds one latent column per movie.

model = NMF(
    n_components=15,
    init='nndsvd',
    max_iter=100,
    random_state=42,
)
W = model.fit_transform(X=full_features_matrix)
H = model.components_

In [None]:
# Function to get top N movie recommendations for a user

def get_top_n_recommendations(user_id, n):
    """Return the ``n`` movies with the highest predicted score for a user.

    Reads the notebook-level names ``user_categories``, ``item_categories``,
    ``W``, ``H`` and ``df_movies``.

    Parameters
    ----------
    user_id : int
        ID of the user as it appears in the training data.
    n : int
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``title`` and ``genres``, ordered from the
        highest to the lowest predicted score. An empty DataFrame is returned
        when ``n`` is not positive or the user is unknown. Movies without a
        row in ``df_movies`` are omitted, so fewer than ``n`` rows may come
        back.
    """
    # A non-positive n must yield nothing; slicing with [-0:] would otherwise
    # select every movie.
    if n <= 0 or user_id not in user_categories.categories:
        return pd.DataFrame()

    user_idx = user_categories.categories.get_loc(user_id)
    predicted_ratings = np.dot(W[user_idx, :], H)

    # Best-first positions of the n highest scores.
    top_n_indices = np.argsort(predicted_ratings)[::-1][:n]
    top_n_movie_ids = item_categories.categories[top_n_indices]

    movie_info = df_movies[df_movies['movieId'].isin(top_n_movie_ids)][
        ['movieId', 'title', 'genres']
    ]

    # `isin` returns rows in df_movies order; restore the ranking order.
    rank_of = {movie_id: rank for rank, movie_id in enumerate(top_n_movie_ids)}
    ranks = [rank_of[movie_id] for movie_id in movie_info['movieId']]
    return movie_info.iloc[np.argsort(ranks, kind='stable')]

In [None]:
# Demo: the 20 recommendations computed for user 1

recommended_movies = get_top_n_recommendations(user_id=1, n=20)
recommended_movies

In [None]:
# Function to load and prepare test data

def load_test_data(test_file_path):
    """Read the test ratings and encode their IDs with the training categories.

    ``userId`` and ``movieId`` are converted to categoricals that reuse the
    categories of the notebook-level ``user_categories`` / ``item_categories``.
    IDs that are not among those categories become missing values (category
    code -1).

    Parameters
    ----------
    test_file_path : path or file-like
        CSV containing ``userId`` and ``movieId`` columns.

    Returns
    -------
    pandas.DataFrame
        The test frame with both ID columns re-encoded.
    """
    df_test = pd.read_csv(test_file_path)

    training_encodings = (
        ('userId', user_categories),
        ('movieId', item_categories),
    )
    for id_column, training_categorical in training_encodings:
        df_test[id_column] = pd.Categorical(
            df_test[id_column], categories=training_categorical.categories
        )

    return df_test

In [None]:
# Load the test ratings and look up the model's prediction for each of them.
#
# Users or movies that never occurred in training have no row/column in the
# factorisation; their categorical code is -1, which would silently index the
# LAST row/column of the prediction matrix. Those pairs are removed from
# `testdf` up front so every remaining rating has a genuine prediction.

testdf = load_test_data('./datasets/testing_data.csv')

known_pairs = testdf['userId'].notna() & testdf['movieId'].notna()
n_unseen_pairs = int((~known_pairs).sum())
if n_unseen_pairs:
    print(f"Skipping {n_unseen_pairs} test ratings with a user or movie "
          "that is absent from the training data")
testdf = testdf.loc[known_pairs].reset_index(drop=True)

test_user_ids = testdf['userId'].cat.codes
test_item_ids = testdf['movieId'].cat.codes

# Test ratings laid out on the same user x movie grid as the training matrix.
test_rating_matrix = np.zeros((user_categories.categories.size, item_categories.categories.size))
test_rating_matrix[test_user_ids, test_item_ids] = testdf['rating']

# Full reconstruction; the first `len(user_categories.categories)` rows are users.
predicted_test_ratings = np.dot(W, H)
test_predicted_ratings = predicted_test_ratings[test_user_ids, test_item_ids]

In [None]:
# Root-mean-squared error between the true test ratings and the predictions.
# NOTE(review): the model was fitted on rescaled, activity-weighted ratings
# while `testdf['rating']` is on the raw scale - confirm the two are comparable.

mse_test = mean_squared_error(
    y_true=testdf['rating'],
    y_pred=test_predicted_ratings,
)
rmse_test = sqrt(mse_test)
print("Test RMSE:", rmse_test)