In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF

In [2]:
# Print the name of every entry in the dataset directory, one per line.
for entry in os.listdir('data/ml-latest-small'):
    print(entry)

tags_datetime.csv
tags.csv
links.csv
ratings_datetime.csv
movies.csv
README.txt
ratings.csv


In [3]:
# Load the four dataset tables; the paths are listed in the same order as the
# names they are unpacked into.
links, movies, ratings, tags = map(
    pd.read_csv,
    (
        'data/ml-latest-small/links.csv',
        'data/ml-latest-small/movies.csv',
        'data/ml-latest-small/ratings.csv',
        'data/ml-latest-small/tags.csv',
    ),
)

In [4]:
# Preview the first five rows of the movies table (movieId, title, genres).
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Convert the `timestamp` columns from Unix epoch seconds to datetime

In [5]:
# Both frames store their timestamp as Unix epoch seconds; replace the column
# on each frame with the parsed pandas datetimes.
for frame in (ratings, tags):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], unit='s')

In [6]:
# Save changed dataframe as csv for Postgres
# ratings.set_index('movieId').to_csv('data/ml-latest-small/ratings_datetime.csv')
# tags.set_index('userId').to_csv('data/ml-latest-small/tags_datetime.csv')

In [7]:
# Length (in characters) of the longest tag, used to pick the column type in Postgres.
tags['tag'].str.len().max()

85

### Combine Data for NMF

Create the user × movie rating matrix that serves as input for NMF

In [8]:
# Attach movie metadata to every rating; the left join keeps all rating rows.
df = ratings.merge(movies, how='left', on='movieId')

In [9]:
# Keep only movieId and rating as columns, indexed by the rating user.
rtrue = df.set_index('userId')[['movieId', 'rating']]

In [10]:
# Reshape the long ratings table into a user x movie matrix.
# `pivot` expects column *labels* for `index`, so userId is moved back from the
# index into a regular column first; passing the Index object itself (as the
# previous version did) is rejected by current pandas releases.
# With 'rating' as the only remaining value column the result keeps the
# two-level ('rating', movieId) column index that later cells rely on.
# pivot already returns a new frame, so no extra .copy() is needed.
rtrue = rtrue.reset_index().pivot(index='userId', columns='movieId')

In [11]:
# Replace missing ratings with 2.5 (alternatives considered: 0 or 3).
# fillna returns a new frame, so `rtrue` itself keeps its NaNs.
rtrue_fill = rtrue.fillna(value=2.5)

In [12]:
# Display the filled rating matrix (pandas shortens the rendering of large frames).
rtrue_fill

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,2.5,4.0,2.5,2.5,4.0,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
2,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
3,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
4,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
5,4.0,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
607,4.0,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
608,2.5,2.0,2.0,2.5,2.5,2.5,2.5,2.5,2.5,4.0,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
609,3.0,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,4.0,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5


### Train model

In [13]:
# Factorize into 12 latent components.
# A fixed random_state makes the factorization, and therefore every
# recommendation derived from it, reproducible across kernel restarts.
m = NMF(n_components=12, random_state=42)

In [14]:
# Learn the components from the filled user x movie matrix.
m.fit(X=rtrue_fill)



NMF(n_components=12)

In [15]:
# Pull both factors out of the fitted model:
#   Q - the rating matrix projected onto the components (one row per user)
#   P - the learned components (one column per movie)
Q = m.transform(X=rtrue_fill)
P = m.components_

In [16]:
# Show the shapes of both factor matrices.
P.shape, Q.shape

((12, 9724), (610, 12))

In [17]:
# Rebuild the (approximate) rating matrix from its two factors, rounded to one decimal.
np.matmul(Q, P).round(decimals=1)

array([[3.8, 2.8, 2.8, ..., 2.5, 2.5, 2.5],
       [2.6, 2.5, 2.5, ..., 2.5, 2.5, 2.5],
       [2.6, 2.5, 2.5, ..., 2.5, 2.5, 2.5],
       ...,
       [2.9, 2.2, 2.5, ..., 2.5, 2.5, 2.5],
       [2.8, 2.6, 2.6, ..., 2.5, 2.5, 2.5],
       [3.8, 3.4, 2.4, ..., 2.5, 2.5, 2.6]])

### Prediction

In [18]:
# Start the new user with the fill value 2.5 for every movie.
# The length is taken from the training matrix instead of a hard-coded 9724 so
# the vector always has exactly as many entries as the model has input columns.
new_user = [2.5] * rtrue_fill.shape[1]

In [19]:
# movieIds of the worst and best rated movies, used as the new user's
# initial ratings.
worst_best = [
    442, 508, 153, 567, 311,
    53, 251, 515, 25, 30,
]

In [20]:
# Remove the outer 'rating' column level so the columns are plain movieIds.
# Guarded so the cell can be re-run safely: droplevel raises once only a
# single level is left.
if rtrue_fill.columns.nlevels > 1:
    rtrue_fill.columns = rtrue_fill.columns.droplevel(0)

In [21]:
# Translate every seed movieId into its positional column index in the rating matrix.
indices = [rtrue_fill.columns.get_loc(film) for film in worst_best]

In [22]:
# Inspect the positional column indices found for the seed movies.
indices

[385, 443, 126, 492, 270, 48, 215, 450, 24, 29]

In [23]:
# Overwrite the placeholder with a rating of 5 at every seed movie's position.
for position in indices:
    new_user[position] = 5

In [24]:
# Turn the flat rating list into a 2-D array with a single row (one user).
new_user_final = np.array(new_user, ndmin=2)

In [25]:
# Project the new user's ratings onto the learned components.
user_profile = m.transform(X=new_user_final)

In [26]:
# Inspect the shape of the new user's latent profile.
user_profile.shape

(1, 12)

In [27]:
# Multiply the user's latent profile with the components to get one score per movie.
result = user_profile[0] @ P

In [28]:
# Inspect the shape of the score vector.
result.shape

(9724,)

In [29]:
# Display the predicted scores.
result

array([2.69707422, 2.52687982, 2.51653931, ..., 2.50529013, 2.50529013,
       2.50350016])

In [30]:
# Wrap the score vector in a DataFrame (a single column, one row per score).
new_result = pd.DataFrame(data=result)

In [31]:
# Flip the frame so the scores lie along the columns of a single row.
new_result = new_result.T.copy()

In [32]:
# Reuse the rating matrix's column labels so every score can be looked up by
# the same label as the corresponding rating column.
new_result.columns = rtrue_fill.columns

In [33]:
# Create a dict {movieId: score} with the 20 highest scoring recommendations.
# The movies the user already rated are dropped *before* truncating; filtering
# only afterwards could leave fewer than 20 entries whenever one of them
# ranks in the top 20. errors='ignore' keeps this safe if a label is absent.
scores = new_result.iloc[0].drop(labels=worst_best, errors='ignore')
recommendations = scores.sort_values(ascending=False).head(20).to_dict()

In [34]:
# Keep only recommendations for movies the user has not already rated.
clean_recommendations = {
    movie_id: score
    for movie_id, score in recommendations.items()
    if movie_id not in worst_best
}

In [43]:
# Print a header followed by one "<title> - (<score>)" line per recommendation.
separator = '-' * 20
print(separator + '\nRECOMMENDATIONS FOR NEW USER\n' + separator)
for movie_id, score in clean_recommendations.items():
    # Look the title up by movieId in the movies table.
    title = movies.loc[movies['movieId'] == movie_id, 'title'].iloc[0]
    print(title, f"- ({score:.2f})")

--------------------
RECOMMENDATIONS FOR NEW USER
--------------------
Shawshank Redemption, The (1994) - (2.90)
Forrest Gump (1994) - (2.81)
Pulp Fiction (1994) - (2.74)
Silence of the Lambs, The (1991) - (2.71)
Matrix, The (1999) - (2.71)
Star Wars: Episode IV - A New Hope (1977) - (2.70)
Toy Story (1995) - (2.70)
Fugitive, The (1993) - (2.68)
Apollo 13 (1995) - (2.68)
Braveheart (1995) - (2.64)
Jurassic Park (1993) - (2.64)
Dark Knight, The (2008) - (2.63)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) - (2.62)
Independence Day (a.k.a. ID4) (1996) - (2.62)
Lion King, The (1994) - (2.62)
True Lies (1994) - (2.61)
Fargo (1996) - (2.61)
Beauty and the Beast (1991) - (2.61)
Babe (1995) - (2.61)
Speed (1994) - (2.61)
