In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import NMF
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler

from utils.ScikitBasedNMF import UserDefinedNMF

In [2]:
def load_data(file_path):
    df = pd.read_csv(file_path, usecols=['userId', 'movieId', 'rating'])
    return df

In [3]:
traindf = load_data('./datasets/training_data.csv')

In [4]:
df_movies = pd.read_csv('./datasets/Movies.csv')

In [5]:
user_ids = traindf['userId'].astype("category").cat.codes
item_ids = traindf['movieId'].astype("category").cat.codes

In [6]:
rating_matrix = np.zeros((user_ids.max()+1, item_ids.max()+1))
rating_matrix[user_ids, item_ids] = traindf['rating']

In [7]:
scaler = MinMaxScaler(feature_range = (0.5, 5))

rating_matrix_scaled = scaler.fit_transform(rating_matrix)

In [8]:
model = UserDefinedNMF(n_components=15, max_iter=200)
model.fit(rating_matrix_scaled)



<utils.ScikitBasedNMF.UserDefinedNMF at 0x1763d1b4d90>

In [9]:
W = model.transform(rating_matrix_scaled)
H = model.model.components_

In [10]:
def get_top_n(user_id, n):
    predicted_ratings = np.dot(W[user_id, :], H)
    recommended_item_ids = np.argsort(predicted_ratings)[::-1][:n]
    
    return recommended_item_ids

In [11]:
def show_user_recommended_movies_in_df(user_id,n):
    item_ids = get_top_n(user_id, n)
    
    recommended_movies_df = df_movies.loc[df_movies['movieId'].isin(item_ids)]

    return recommended_movies_df

In [12]:
show_user_recommended_movies_in_df(325,5)

Unnamed: 0,movieId,title,genres
215,251,"Hunted, The (1995)",['Action']
232,270,Love Affair (1994),"['Drama', 'Romance']"
1610,2152,Air Bud: Golden Receiver (1998),"['Children', 'Comedy']"
4607,6862,Out of Time (2003),"['Crime', 'Drama', 'Thriller']"


In [13]:
predicted_ratings_train = np.dot(W, H)
actual_ratings_train = rating_matrix_scaled

In [14]:
rmse_train = np.sqrt(mean_squared_error(actual_ratings_train[actual_ratings_train > 0], predicted_ratings_train[actual_ratings_train > 0]))
print(f"RMSE: {rmse_train}")

RMSE: 0.3423225992330588


In [15]:
threshold = 4

actual_binary = (actual_ratings_train > threshold).astype(int)
predicted_binary = (predicted_ratings_train > threshold).astype(int)

accuracy = accuracy_score(actual_binary[actual_ratings_train > 0], predicted_binary[actual_ratings_train > 0])
precision = precision_score(actual_binary[actual_ratings_train > 0], predicted_binary[actual_ratings_train > 0])
recall = recall_score(actual_binary[actual_ratings_train > 0], predicted_binary[actual_ratings_train > 0])

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9927922818108229
Precision: 0.915273132664437
Recall: 0.12919832741333573


In [16]:
actual_ratings_train

array([[4.1 , 0.5 , 4.1 , ..., 0.5 , 0.5 , 0.5 ],
       [0.5 , 0.5 , 0.5 , ..., 0.5 , 0.5 , 0.5 ],
       [0.5 , 0.5 , 0.5 , ..., 0.5 , 0.5 , 0.5 ],
       ...,
       [2.75, 2.3 , 2.3 , ..., 0.5 , 0.5 , 0.5 ],
       [3.2 , 0.5 , 0.5 , ..., 0.5 , 0.5 , 0.5 ],
       [5.  , 0.5 , 0.5 , ..., 0.5 , 0.5 , 0.5 ]])

In [17]:
predicted_ratings_train

array([[2.79541108, 1.11253998, 1.27233415, ..., 0.4876446 , 0.4876446 ,
        0.47292164],
       [0.73888853, 0.50927933, 0.47792821, ..., 0.52002485, 0.52002485,
        0.51390113],
       [0.8084493 , 0.477497  , 0.54790545, ..., 0.52252198, 0.52252198,
        0.50389189],
       ...,
       [3.7369163 , 2.29103677, 1.92476157, ..., 0.45005279, 0.45005279,
        0.47348523],
       [1.18129086, 1.0370738 , 0.69229464, ..., 0.5122413 , 0.5122413 ,
        0.50590669],
       [4.91817104, 0.14726562, 0.33242331, ..., 0.49181717, 0.49181717,
        0.49498035]])

In [18]:
"""plt.figure(figsize=(10, 6))
plt.hist(predicted_ratings_train[actual_ratings_train > 0], bins=50, alpha=0.75)
plt.title('Histogram of Predicted Ratings')
plt.xlabel('Predicted Rating')
plt.ylabel('Frequency')
plt.show()"""

"plt.figure(figsize=(10, 6))\nplt.hist(predicted_ratings_train[actual_ratings_train > 0], bins=50, alpha=0.75)\nplt.title('Histogram of Predicted Ratings')\nplt.xlabel('Predicted Rating')\nplt.ylabel('Frequency')\nplt.show()"