In [1]:
from joblib import dump, load
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.impute import KNNImputer
from sqlalchemy import create_engine

from credentials import PG_USER, PG_PASSWORD, PG_URL

In [2]:
DATABASE_USER = PG_USER
DATABASE_PASSWORD = PG_PASSWORD
DATABASE_HOST = PG_URL
DATABASE_PORT = "5432"
DATABASE_DB_NAME = "movielens"

In [3]:
conn = f"postgres://{DATABASE_USER}:{DATABASE_PASSWORD}@{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_DB_NAME}"

In [4]:
# get ratings for 2019 for each movie with ratings >= 5 (implemented as a view in database)
filtered_ratings_2019 = pd.read_sql_table('filtered_ratings_2019', conn)
filtered_ratings_2019

Unnamed: 0,user_id,movie_id,rating
0,149873,1,4.0
1,152759,1,4.0
2,150751,1,3.5
3,151326,1,4.5
4,152303,1,3.5
...,...,...,...
1150904,151152,208939,4.0
1150905,127451,208939,3.0
1150906,3013,208939,4.0
1150907,117863,208939,5.0


In [5]:
movies = pd.read_sql_table('movies', conn)
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Combine Data for NMF

Create input table for NMF

In [6]:
rtrue = filtered_ratings_2019

In [7]:
rtrue = rtrue.pivot(index='user_id', columns='movie_id')

In [8]:
rtrue.shape

(10595, 13285)

Fill missing values

In [9]:
n = 1000  # chunk row size
list_df = [rtrue[i:i+n] for i in range(0,rtrue.shape[0],n)]  # create chunks to make computation easier

In [10]:
### Alternative: fill na's with imputed values
# from sklearn.impute import KNNImputer


# imputer = KNNImputer(n_neighbors=5)
# for i in range(len(list_df)):
#     list_df[i] = pd.DataFrame(imputer.fit_transform(list_df[i]), columns=list_df[i].columns, index=list_df[i].index)

In [11]:
for i in range(len(list_df)):
    list_df[i] = list_df[i].fillna(2.5)

In [12]:
rtrue_fill = pd.concat(list_df)

In [13]:
rtrue_fill.shape

(10595, 13285)

In [14]:
# Remove unnecessary column index from Rtrue
rtrue_fill.columns = rtrue_fill.columns.droplevel(0)

In [15]:
# Save vector for faster execution in web app
# dump(rtrue_fill, "../data/rtrue_fillna_25.joblib")

In [16]:
movie_ids = filtered_ratings_2019.groupby('movie_id')['rating'].mean().index

In [17]:
mean_rating_vector = filtered_ratings_2019.groupby('movie_id')[['rating']].mean()

In [18]:
mean_rating_vector

Unnamed: 0_level_0,rating
movie_id,Unnamed: 1_level_1
1,3.963053
2,3.510084
3,3.008475
4,2.000000
5,2.971591
...,...
208385,2.722222
208715,2.933333
208737,3.333333
208747,3.750000


In [19]:
# Save vector for faster execution in web app
# dump(mean_rating_vector, "../data/mean_rating_vector.joblib")

In [20]:
mean_rating_vector = load("../data/mean_rating_vector.joblib")

In [21]:
mean_rating_vector.transpose()

movie_id,1,2,3,4,5,6,7,8,9,10,...,207890,207930,208096,208104,208295,208385,208715,208737,208747,208939
rating,3.963053,3.510084,3.008475,2.0,2.971591,3.964789,3.391304,2.5,2.585714,3.421703,...,2.8125,1.916667,3.4,1.1,2.083333,2.722222,2.933333,3.333333,3.75,4.3125


### Train model

In [22]:
m = NMF(n_components=20, max_iter=100)

In [23]:
m.fit(rtrue_fill)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=100,
    n_components=20, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [24]:
# Save model for faster execution in web app
# dump(m, "../data/nmf_model.joblib")

In [25]:
# Get the two matrices P, Q out
P = m.components_
Q = m.transform(rtrue_fill)

In [26]:
P.shape, Q.shape

((20, 13285), (10595, 20))

In [27]:
# Reconstruct
np.dot(Q, P).round(1)

array([[2.5, 2.6, 2.5, ..., 2.5, 2.5, 2.5],
       [3.4, 2.6, 2.5, ..., 2.5, 2.5, 2.5],
       [2.6, 2.5, 2.5, ..., 2.5, 2.5, 2.5],
       ...,
       [2.6, 2.5, 2.5, ..., 2.5, 2.5, 2.5],
       [2.5, 2.5, 2.5, ..., 2.5, 2.5, 2.5],
       [2.4, 2.4, 2.5, ..., 2.5, 2.5, 2.5]])

### Prediction

In [28]:
new_user = [2.5] * len(movie_ids)

In [29]:
# Take movie id's from landing page
movies_seen = [
    858,  # The Godfather
    63992,  # Twilight
    58559,  # The Dark Knight
    1924,  # Plan 9 from Outer Space
    2324,  # Life is Beautiful
    171011,  # Planet Earth II
    177765,  # Coco
    296,  # Pulp Fiction
    5618,  # Spirited Away
    1136,  # Monty Python and the Holy Grail
]

In [30]:
# Get column indices from Rtrue
indices = []
for movie_seen in movies_seen:
    indices.append(np.where(movie_ids == movie_seen)[0][0])

In [31]:
indices

[510, 6538, 6296, 1115, 1398, 10990, 11408, 196, 3293, 680]

In [32]:
for i in indices:
    new_user[i] = 5

In [33]:
# Prepare ratings
new_user_final = np.array([new_user])

In [34]:
user_profile = m.transform(new_user_final)

In [35]:
user_profile.shape

(1, 20)

In [36]:
result = np.dot(user_profile[0], P)

In [37]:
result.shape

(13285,)

In [38]:
result

array([2.6013711 , 2.50200649, 2.50387237, ..., 2.50163624, 2.50101292,
       2.50227815])

In [39]:
new_result = pd.DataFrame(result)

In [40]:
new_result = new_result.transpose().copy()

In [41]:
new_result.columns = rtrue_fill.columns

In [42]:
# Create dict with 20 highest scoring recommendations
recommendations = new_result.iloc[0].sort_values(ascending=False)[:20].to_dict()

In [43]:
# Remove already seen movies
clean_recommendations = {}
for key, value in recommendations.items():
    if key not in movies_seen:
        clean_recommendations[key] = value

In [44]:
# Print out recommendations for new user
print('-' * 20 + '\nRECOMMENDATIONS FOR NEW USER\n' + '-' * 20)
for index, score in clean_recommendations.items():
    print(movies[movies.movie_id == index].title.values[0], f"- ({score:.2f})")

--------------------
RECOMMENDATIONS FOR NEW USER
--------------------
Shawshank Redemption, The (1994) - (3.08)
Fight Club (1999) - (3.00)
Inception (2010) - (2.93)
Godfather: Part II, The (1974) - (2.92)
Howl's Moving Castle (Hauru no ugoku shiro) (2004) - (2.90)
Goodfellas (1990) - (2.90)
Usual Suspects, The (1995) - (2.87)
Princess Mononoke (Mononoke-hime) (1997) - (2.86)
Reservoir Dogs (1992) - (2.86)
Spider-Man: Into the Spider-Verse (2018) - (2.84)
My Neighbor Totoro (Tonari no Totoro) (1988) - (2.83)
Fargo (1996) - (2.83)
Silence of the Lambs, The (1991) - (2.82)
Big Lebowski, The (1998) - (2.80)
Inglourious Basterds (2009) - (2.80)
Grand Budapest Hotel, The (2014) - (2.79)
