In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import NMF
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler

from utils.ScikitBasedNMF import UserDefinedNMF

In [2]:
def load_data(file_path):
    df = pd.read_csv(file_path, usecols=['userId', 'movieId', 'rating'])
    return df

In [3]:
traindf = load_data('./datasets/training_data.csv')

In [4]:
df_movies = pd.read_csv('./datasets/Movies.csv')

In [5]:
user_ids = traindf['userId'].astype("category").cat.codes
item_ids = traindf['movieId'].astype("category").cat.codes

In [6]:
rating_matrix = np.zeros((user_ids.max()+1, item_ids.max()+1))
rating_matrix[user_ids, item_ids] = traindf['rating']

In [7]:
scaler = MinMaxScaler(feature_range = (0.5, 5))

rating_matrix_scaled = scaler.fit_transform(rating_matrix)

In [8]:
model = UserDefinedNMF(n_components=15, max_iter=200)
model.fit(rating_matrix_scaled)



<utils.ScikitBasedNMF.UserDefinedNMF at 0x1c73cc01bd0>

In [9]:
W = model.transform(rating_matrix_scaled)
H = model.model.components_

In [10]:
def get_top_n(user_id, n):
    predicted_ratings = np.dot(W[user_id, :], H)
    recommended_item_ids = np.argsort(predicted_ratings)[::-1][:n]
    
    return recommended_item_ids

In [11]:
def show_user_recommended_movies_in_df(user_id,n):
    item_ids = get_top_n(user_id, n)
    
    recommended_movies_df = df_movies.loc[df_movies['movieId'].isin(item_ids)]

    return recommended_movies_df

In [21]:
show_user_recommended_movies_in_df(45,20)

Unnamed: 0,movieId,title,genres
84,95,Broken Arrow (1996),"['Action', 'Adventure', 'Thriller']"
211,247,Heavenly Creatures (1994),"['Crime', 'Drama']"
215,251,"Hunted, The (1995)",['Action']
232,270,Love Affair (1994),"['Drama', 'Romance']"
260,300,Quiz Show (1994),['Drama']
267,307,Three Colors: Blue (Trois couleurs: Bleu) (1993),['Drama']
274,315,"Specialist, The (1994)","['Action', 'Drama', 'Thriller']"
285,327,Tank Girl (1995),"['Action', 'Comedy', 'Sci-Fi']"
288,330,Tales from the Hood (1995),"['Action', 'Crime', 'Horror']"
342,385,"Man of No Importance, A (1994)",['Drama']


In [13]:
predicted_ratings_train = np.dot(W, H)
actual_ratings_train = rating_matrix_scaled

In [14]:
rmse_train = np.sqrt(mean_squared_error(actual_ratings_train[actual_ratings_train > 0], predicted_ratings_train[actual_ratings_train > 0]))
print(f"RMSE: {rmse_train}")

RMSE: 0.3424286049615392


In [15]:
threshold = 4

actual_binary = (actual_ratings_train > threshold).astype(int)
predicted_binary = (predicted_ratings_train > threshold).astype(int)

accuracy = accuracy_score(actual_binary[actual_ratings_train > 0], predicted_binary[actual_ratings_train > 0])
precision = precision_score(actual_binary[actual_ratings_train > 0], predicted_binary[actual_ratings_train > 0])
recall = recall_score(actual_binary[actual_ratings_train > 0], predicted_binary[actual_ratings_train > 0])

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9927607100704674
Precision: 0.9135640478924061
Recall: 0.12521918978463198


In [16]:
actual_ratings_train

array([[4.1 , 0.5 , 4.1 , ..., 0.5 , 0.5 , 0.5 ],
       [0.5 , 0.5 , 0.5 , ..., 0.5 , 0.5 , 0.5 ],
       [0.5 , 0.5 , 0.5 , ..., 0.5 , 0.5 , 0.5 ],
       ...,
       [2.75, 2.3 , 2.3 , ..., 0.5 , 0.5 , 0.5 ],
       [3.2 , 0.5 , 0.5 , ..., 0.5 , 0.5 , 0.5 ],
       [5.  , 0.5 , 0.5 , ..., 0.5 , 0.5 , 0.5 ]])

In [17]:
predicted_ratings_train

array([[2.65909838, 1.22553842, 1.09715395, ..., 0.47340324, 0.47340324,
        0.46674387],
       [0.75693616, 0.52261114, 0.47516351, ..., 0.51958623, 0.51958623,
        0.5133159 ],
       [0.7745614 , 0.50236775, 0.54121626, ..., 0.52106163, 0.52106163,
        0.50276108],
       ...,
       [3.12874249, 1.88587282, 1.56406789, ..., 0.45966028, 0.45966028,
        0.46101854],
       [1.20852296, 1.0641241 , 0.69586582, ..., 0.51278906, 0.51278906,
        0.50611615],
       [4.60592838, 0.20659066, 0.3502578 , ..., 0.49806215, 0.49806215,
        0.5058655 ]])

In [18]:
"""plt.figure(figsize=(10, 6))
plt.hist(predicted_ratings_train[actual_ratings_train > 0], bins=50, alpha=0.75)
plt.title('Histogram of Predicted Ratings')
plt.xlabel('Predicted Rating')
plt.ylabel('Frequency')
plt.show()"""

"plt.figure(figsize=(10, 6))\nplt.hist(predicted_ratings_train[actual_ratings_train > 0], bins=50, alpha=0.75)\nplt.title('Histogram of Predicted Ratings')\nplt.xlabel('Predicted Rating')\nplt.ylabel('Frequency')\nplt.show()"