In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF 

In [2]:
movie_df = pd.read_csv('./data/ml-latest-small/movies.csv')
rating_df = pd.read_csv('./data/ml-latest-small/ratings.csv')
link_df = pd.read_csv('./data/ml-latest-small/links.csv')
tag_df = pd.read_csv('./data/ml-latest-small/tags.csv')

In [3]:
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
rating_df = rating_df.rename(columns={'movieId': 'movie_id'})
movie_df = movie_df.rename(columns={'movieId': 'movie_id'})
rating_df = rating_df.rename(columns={'userId': 'user_id'})


In [5]:
# calculate the number of ratings per movie
rating_count = rating_df.groupby('movie_id')[['rating']].count()
rating_count

Unnamed: 0_level_0,rating
movie_id,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


In [6]:
# filter for movies with more than 20 ratings and extract the index
popular_movies = rating_count[rating_count['rating']>20].index
popular_movies


Int64Index([     1,      2,      3,      5,      6,      7,     10,     11,
                16,     17,
            ...
            122920, 122922, 134130, 134853, 139385, 148626, 152081, 164179,
            166528, 168252],
           dtype='int64', name='movie_id', length=1235)

In [7]:
# filter the ratings matrix and only keep the popular movies
df = rating_df[rating_df['movie_id'].isin(popular_movies)].copy()
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100803,610,148626,4.0,1493847175
100808,610,152081,4.0,1493846503
100829,610,164179,5.0,1493845631
100830,610,166528,4.0,1493879365


In [8]:
rating_df.shape, df.shape

((100836, 4), (66658, 4))

In [9]:
# need to remake user ids and movie ids since they are not sequential
user_ids = df['user_id'].unique()
user_ids

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [10]:
user_id_map = {v:k for k,v in enumerate(user_ids)}
df['user_id'] = df['user_id'].map(user_id_map)
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1,4.0,964982703
1,0,3,4.0,964981247
2,0,6,4.0,964982224
3,0,47,5.0,964983815
4,0,50,5.0,964982931
...,...,...,...,...
100803,609,148626,4.0,1493847175
100808,609,152081,4.0,1493846503
100829,609,164179,5.0,1493845631
100830,609,166528,4.0,1493879365


In [11]:
movie_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [12]:
#movie_id_map = {}
#for key, value in enumerate(movie_ids):
#    movie_id_map[value] = key

In [13]:
#similarly for the movie_id:
movie_ids = df['movie_id'].unique()
movie_ids


array([   1,    3,    6, ..., 4247, 2986, 2013])

In [14]:
movie_id_map = {v:k for k,v in enumerate(movie_ids)}
df['movie_id'] = df['movie_id'].map(movie_id_map)
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931
...,...,...,...,...
100803,609,808,4.0,1493847175
100808,609,643,4.0,1493846503
100829,609,809,5.0,1493845631
100830,609,644,4.0,1493879365


In [15]:
#movie_id_map = {v:k for k,v in enumerate(movie_ids)}
#df['movie_id'] = df['movie_id'].map(movie_id_map)

#movie_id_map = {v:k for k,v in enumerate(movie_df['movie_id'].unique())}
#movie_df['movie_id'] =movie_df['movie_id'].map(movie_id_map)
#movie_df

In [16]:
# filter out unpopular movies
movies = movie_df[movie_df['movie_id'].isin(movie_ids)]
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
...,...,...,...
9162,148626,"Big Short, The (2015)",Drama
9223,152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
9392,164179,Arrival (2016),Sci-Fi
9433,166528,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi


In [17]:
# redefine movie ids
movies['movie_id'] = movies['movie_id'].map(movie_id_map)
movies

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['movie_id'] = movies['movie_id'].map(movie_id_map)


Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,325,Jumanji (1995),Adventure|Children|Fantasy
2,1,Grumpier Old Men (1995),Comedy|Romance
4,326,Father of the Bride Part II (1995),Comedy
5,2,Heat (1995),Action|Crime|Thriller
...,...,...,...
9162,808,"Big Short, The (2015)",Drama
9223,643,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
9392,809,Arrival (2016),Sci-Fi
9433,644,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi


In [18]:
movie_title = movies.sort_values('movie_id')['title']
movie_title

0                     Toy Story (1995)
2              Grumpier Old Men (1995)
5                          Heat (1995)
43         Seven (a.k.a. Se7en) (1995)
46          Usual Suspects, The (1995)
                     ...              
1435        Terms of Endearment (1983)
2968               Little Nicky (2000)
3158                   Joe Dirt (2001)
2249                  RoboCop 2 (1990)
1488    Poseidon Adventure, The (1972)
Name: title, Length: 1235, dtype: object

In [19]:
from scipy.sparse import csr_matrix
R = csr_matrix((df['rating'], (df['user_id'], df['movie_id'])))

In [20]:
R.shape

(610, 1235)

In [21]:
R.todense()

matrix([[4. , 4. , 4. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        ...,
        [2.5, 2. , 0. , ..., 1. , 1.5, 0. ],
        [3. , 0. , 0. , ..., 0. , 0. , 0. ],
        [5. , 0. , 5. , ..., 3. , 0. , 0. ]])

In [22]:
Rating = pd.DataFrame(R.todense(), columns = movie_title)
Rating

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
0,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,0.0,0.0,3.0,4.5,4.0,0.0,3.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
606,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,2.5,2.0,0.0,4.5,4.5,3.0,0.0,4.0,0.0,3.0,...,0.0,0.5,0.0,0.0,4.5,0.0,2.5,1.0,1.5,0.0
608,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Model**

#### Step 1. Create a movie-features matrix $Q$

1. *Instatiate the NMF model*

In [23]:
nmf_model = NMF(n_components=1000, max_iter=1000, verbose=2)

2. *Fit the model on the full imputed user/movie dataframe/matrix*

In [24]:
nmf_model.fit(Rating)

violation: 1.0
violation: 0.1954350368326224
violation: 0.07516724131741903
violation: 0.03776883596104708
violation: 0.022896793430109443
violation: 0.015189322927695186
violation: 0.010710549439917718
violation: 0.007893159813339231
violation: 0.0061144073625449295
violation: 0.004871777561731761
violation: 0.003915859897269129
violation: 0.003202978208306037
violation: 0.0026139746888256318
violation: 0.0022079752469471355
violation: 0.0018691740721860166
violation: 0.001625571337400939
violation: 0.0014335990386835301
violation: 0.001273836763230075
violation: 0.0011510475065458345
violation: 0.0010679255874334334
violation: 0.0009940062539825275
violation: 0.0009166795657618177
violation: 0.0008441549937082787
violation: 0.0007686613745573583
violation: 0.0007079461115206908
violation: 0.0006555600118010658
violation: 0.000609112042625558
violation: 0.0005610047352092242
violation: 0.0005259622062912608
violation: 0.0004978854917859634
violation: 0.0004714026941628841
violation: 0

3. *Get the movie-feature dataframe/matrix $Q$ from the model components*

In [25]:
Q_matrix = nmf_model.components_
Q_matrix

array([[1.24953157e-02, 9.68371418e-04, 3.86311251e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.59898402e-02, 1.28964580e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.31433082e-02, 1.41271361e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [3.98743200e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.00028222e-05, 2.48992429e-06, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [26]:
Q_matrix.shape

(1000, 1235)

In [30]:
# to dataframe for better visualization
Q = pd.DataFrame(Q_matrix, columns= movie_title,index=nmf_model.get_feature_names_out())
Q

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
nmf0,0.012495,0.000968,0.038631,0.038256,0.000609,0.0,0.0,0.020094,0.000004,0.023964,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
nmf1,0.025990,0.012896,0.000000,0.009572,0.039255,0.0,0.0,0.031295,0.000288,0.005903,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
nmf2,0.013143,0.001413,0.000000,0.014111,0.043685,0.0,0.0,0.000318,0.000076,0.018565,...,0.0,0.000013,0.0,0.000128,0.0,0.0,0.000000,0.0,0.0,0.0
nmf3,0.000000,0.000000,0.000000,0.003250,0.006940,0.0,0.0,0.000184,0.000000,0.000000,...,0.0,0.000108,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
nmf4,0.199994,0.000000,0.000000,0.319700,0.114232,0.0,0.0,0.000051,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nmf995,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
nmf996,0.019154,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
nmf997,0.398743,0.000000,0.000000,0.000000,0.066434,0.0,0.0,0.177484,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
nmf998,0.000030,0.000002,0.000000,0.000033,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0


#### Step 2. Create user-features matrix $P$

In [31]:
P_matrix = nmf_model.transform(Rating)
P_matrix

violation: 1.0
violation: 1.0659718111542724
violation: 0.5672158036089501
violation: 0.24861753870072062
violation: 0.12401622971521555
violation: 0.06386528012078167
violation: 0.03491888513730523
violation: 0.021505377172216004
violation: 0.013322828106994066
violation: 0.008860754095537033
violation: 0.006380145066487109
violation: 0.0046975674917030304
violation: 0.003966344681035731
violation: 0.002787571998901417
violation: 0.0022578240857151566
violation: 0.0015671832564683843
violation: 0.001458893603226078
violation: 0.001425997214038964
violation: 0.001236722958712338
violation: 0.0009493909029303981
violation: 0.0008027639092098682
violation: 0.0006764202036912985
violation: 0.0006616988481857669
violation: 0.0006287907610416959
violation: 0.0005322258799046942
violation: 0.00048259805986357166
violation: 0.0003985255727618304
violation: 0.0003526249353384295
violation: 0.0002945047838774697
violation: 0.00027273075745663875
violation: 0.0002666191945739785
violation: 0.000

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.21726319e-06],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [7.14925862e-03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [32]:
P_matrix.shape

(610, 1000)

#### Step 3. Reconstruct the ratings matrix 
$\hat{R} := P\cdot Q \sim R$

1. *Get the dot product of $P$ and $Q$*

In [33]:
R_hat_matrix = np.dot(P_matrix,Q_matrix)
R_hat_matrix

array([[3.99997626e+00, 4.00000530e+00, 4.00000141e+00, ...,
        9.94665527e-07, 2.19403001e-05, 0.00000000e+00],
       [2.27666368e-06, 1.04825874e-09, 4.24717808e-09, ...,
        1.33031945e-10, 1.02310859e-08, 0.00000000e+00],
       [1.89231294e-06, 0.00000000e+00, 0.00000000e+00, ...,
        1.22604115e-11, 1.18441256e-06, 0.00000000e+00],
       ...,
       [2.49999359e+00, 1.99958290e+00, 5.47868230e-03, ...,
        9.99973805e-01, 1.49998082e+00, 2.50200867e-04],
       [2.99992876e+00, 8.07181160e-09, 2.88402829e-04, ...,
        1.82771993e-10, 1.18711128e-13, 3.20558396e-05],
       [4.99998398e+00, 2.58552932e-03, 4.99907429e+00, ...,
        3.00006096e+00, 1.16238742e-03, 1.31859606e-03]])

Get the reconstructed error $R$ and $\hat{R}$:

In [34]:
nmf_model.reconstruction_err_

6.5266166432464985

### Save a Model with pickle
The pickle module dumps an object into a binary strings

In [36]:
import pickle

with open('nmf_1000.pkl',mode='wb') as file:
    pickle.dump(nmf_model,file)

### Model deployment: Make recommendations for a new user

In [37]:
#load the model
with open('nmf_1000.pkl','rb') as file:
    model_5000 = pickle.load(file)

#### Step 1. Receive a user query

1. *Create a new_user-item query (e.g. dictionary having movie title as key and rating as value)*

In [38]:
new_user_query = {'Toy Story (1995)': 1, 
                 'Joe Dirt (2001)':2, 
                 "Heat (1995)": 3.5 ,
                 "Little Nicky (2000)":5}

2. *Get new_user-item dataframe with the previous dictionary*

In [39]:
new_user_dataframe =  pd.DataFrame(new_user_query,columns=movie_title, index=['new_user_query'] )
new_user_dataframe

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user_query,1,,3.5,,,,,,,,...,,,,,,,5,2,,


3. *Fill the missing value*<br>

In [40]:
# using the same imputation as training data

new_user_dataframe_imputed = new_user_dataframe.fillna(0)
new_user_dataframe_imputed

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user_query,1,0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,2,0,0


#### Step 3. Create user-feature matrix $P$ for new user

1. *Get the user-feature matrix/dataframe $P$ by using the model method transform on the full imputed user/movie dataframe/matrix*

In [41]:
P_new_user_matrix = model_5000.transform(new_user_dataframe_imputed)
P_new_user_matrix

violation: 1.0
violation: 3.198142214450967
violation: 0.7452106996835066
violation: 0.30213506220649394
violation: 0.046739063736439226
violation: 0.05878043619234075
violation: 0.03745563629634637
violation: 0.015131812814334273
violation: 0.003368643258022193
violation: 0.001759466555509926
violation: 0.0015675835905209596
violation: 0.000841155050245798
violation: 0.0003265589025138131
violation: 7.878891282237604e-05
Converged at iteration 15


array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        8.20021959e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.000000

In [42]:
P_new_user_matrix.shape

(1, 1000)

In [43]:
# get as dataframe for a better visualizarion
P_new_user = pd.DataFrame(P_new_user_matrix, 
                         columns = model_5000.get_feature_names_out(),
                         index = ['new_user'])

In [44]:
P_new_user

Unnamed: 0,nmf0,nmf1,nmf2,nmf3,nmf4,nmf5,nmf6,nmf7,nmf8,nmf9,...,nmf990,nmf991,nmf992,nmf993,nmf994,nmf995,nmf996,nmf997,nmf998,nmf999
new_user,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007653


#### Step 3. Reconstruct the user-movie(item) matrix/dataframe for the new user
$\hat{R}_{new-user} = P_{new-user} \cdot Q \sim R_{new-user}$

In [45]:
R_hat_new_user_matrix = np.dot(P_new_user, Q)
R_hat_new_user_matrix

array([[9.99504083e-01, 2.84101260e-01, 8.77698483e-01, ...,
        7.07131510e-02, 6.49362158e-04, 7.92779117e-03]])

In [46]:
# get as dataframe for a better visualizarion
R_hat_new_user = pd.DataFrame(data=R_hat_new_user_matrix,
                         columns=movie_title,
                         index = ['new_user'])
R_hat_new_user

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,0.999504,0.284101,0.877698,0.049801,0.130293,0.078863,0.03238,0.131012,0.049782,0.128969,...,0.027578,0.0,0.044629,0.010346,1.618528e-08,2.4e-05,0.17988,0.070713,0.000649,0.007928


#### Step 4. Get a list of k-top rated movie to recommend to the new user

In [47]:
new_user_query.keys()

dict_keys(['Toy Story (1995)', 'Joe Dirt (2001)', 'Heat (1995)', 'Little Nicky (2000)'])

In [48]:
R_hat_new_user_filtered =  R_hat_new_user.drop(new_user_query.keys(), axis=1)
R_hat_new_user_filtered

title,Grumpier Old Men (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),Billy Madison (1995),Clerks (1994),...,Metropolis (1927),Babe: Pig in the City (1998),Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,0.284101,0.049801,0.130293,0.078863,0.03238,0.131012,0.049782,0.128969,0.000133,0.191237,...,0.005476,0.0,0.027578,0.0,0.044629,0.010346,1.618528e-08,2.4e-05,0.000649,0.007928


In [49]:
R_hat_new_user_filtered.T

Unnamed: 0_level_0,new_user
title,Unnamed: 1_level_1
Grumpier Old Men (1995),2.841013e-01
Seven (a.k.a. Se7en) (1995),4.980053e-02
"Usual Suspects, The (1995)",1.302929e-01
From Dusk Till Dawn (1996),7.886277e-02
Bottle Rocket (1996),3.238013e-02
...,...
"American Werewolf in London, An (1981)",1.034567e-02
eXistenZ (1999),1.618528e-08
Terms of Endearment (1983),2.415666e-05
RoboCop 2 (1990),6.493622e-04


In [52]:
ranked =  R_hat_new_user_filtered.T.sort_values(by =['new_user'],ascending=False)
ranked

Unnamed: 0_level_0,new_user
title,Unnamed: 1_level_1
Leaving Las Vegas (1995),0.397024
Fargo (1996),0.329227
Independence Day (a.k.a. ID4) (1996),0.290510
Grumpier Old Men (1995),0.284101
Pulp Fiction (1994),0.253073
...,...
Stripes (1981),0.000000
"League of Their Own, A (1992)",0.000000
"To Wong Foo, Thanks for Everything! Julie Newmar (1995)",0.000000
Spawn (1997),0.000000


In [53]:
recommendtions = ranked[:3]
recommendtions

Unnamed: 0_level_0,new_user
title,Unnamed: 1_level_1
Leaving Las Vegas (1995),0.397024
Fargo (1996),0.329227
Independence Day (a.k.a. ID4) (1996),0.29051
