In [1]:
import numpy as np
import pandas as pd
# from scipy.linalg import clarkson_woodruff_transform
from scipy.sparse.linalg import svds
from sklearn.decomposition import NMF

In [2]:
# Load the three '::'-delimited MovieLens-1M tables from the local `ml-1m/` folder.
# A multi-character separator is only supported by the (slower) python parser
# engine, hence engine='python'. The files have no header row, so column names
# are supplied explicitly through `names`.
_dat_read_opts = dict(sep='::', encoding='latin1', engine='python')

ratings = pd.read_table(
    'ml-1m/ratings.dat',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **_dat_read_opts
)
movies = pd.read_table(
    'ml-1m/movies.dat',
    names=['MovieID', 'Title', 'Genres'],
    **_dat_read_opts
)
users = pd.read_table(
    'ml-1m/users.dat',
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip'],
    **_dat_read_opts
)

FileNotFoundError: [Errno 2] No such file or directory: 'ml-1m/ratings.dat'

In [None]:
# Preview the first rows of the movies table (MovieID, Title, Genres) as a sanity check on the load.
movies.head()

# Films with sufficient numbers of reviews
Keep only the titles that have at least N reviews.

The output is **ratings_topN**

In [None]:
# Minimum number of ratings a movie must have to be kept.
N = 1000

# Number of ratings per movie, indexed by MovieID.
ratings_count = ratings.groupby('MovieID').size()

# Keep the movies with at least N ratings.
# (Alternative that was considered and left disabled: take the N most-rated
#  movies instead, i.e. ratings_count.sort_values(ascending=False)[:N].)
top_ratings = ratings_count.loc[ratings_count >= N]
top_ratings.head(10)

In [None]:
# Restrict the ratings table to the movies selected in `top_ratings`
# (whose index holds the retained MovieIDs).
# The same filter could be applied to `movies` to get a `movies_topN` table;
# that variant is currently disabled:
#   movies_topN = movies[movies.MovieID.isin(top_ratings.index)]
ratings_topN = ratings.loc[ratings['MovieID'].isin(top_ratings.index)]
print('Shape: {}'.format(ratings_topN.shape))
ratings_topN.head(10)

In [None]:
# Count the distinct users and movies that remain after filtering.
n_users = len(ratings_topN['UserID'].unique())
n_movies = len(ratings_topN['MovieID'].unique())
print('Number of users = {} | Number of movies = {}'.format(n_users, n_movies))

# Low Rank Matrix Factorization
We model the rating that user $i$ assigns to movie $j$ as entry $(i, j)$ of a user-movie matrix $M$, and factor that matrix using a low-rank matrix factorization: $M \approx UV'$.

In [None]:
# Reshape the long ratings table into a wide user x movie table:
# one row per UserID, one column per MovieID, cell value = Rating.
# User/movie pairs without a rating come out of the pivot as NaN and are
# replaced by 0.
R_df = (
    ratings_topN
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)
R_df.head()

In [None]:
# Convert the user x movie table to a plain NumPy array.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` is the documented replacement and works on old and new pandas alike.
M = R_df.values

# Sparsity = fraction of cells equal to zero. The zeros come from the
# fillna(0) applied to the pivot table, i.e. user/movie pairs with no rating
# (this assumes the data contains no genuine 0 ratings -- TODO confirm).
sparsity = round(1.0 - np.count_nonzero(M) / float(n_users * n_movies), 3)
print('Number of users = {} | Number of movies = {}'.format(n_users, n_movies))
# Format explicitly to one decimal: multiplying the rounded float by 100 can
# otherwise print artefacts such as 58.699999999999996%.
print('The sparsity level is {:.1f}%'.format(sparsity * 100))

We should define how many components we want for the low-rank matrix factorization.

In [None]:
# Number of components (rank) requested from both factorizations below:
# passed as `k` to svds and as `n_components` to NMF.
K = 30

### Sparse SVD

In [None]:
# Truncated SVD of the rating matrix: keep K singular triplets.
U, s, Vt = svds(M, k=K)

# Turn the vector of singular values into a K x K diagonal matrix and fold it
# into the left factor, so that the product U . Vt is the rank-K approximation
# of M.
s = np.diag(s)
U = U.dot(s)

print('U: {}'.format(U.shape))
print('Vt: {}'.format(Vt.shape))

### Non-negative matrix factorization (NMF)
Find two non-negative matrices (W, H) whose product approximates the non-negative matrix X (here, the rating matrix M).

In [None]:
# Factor M into W . H with K non-negative components.
# The factors are randomly initialised; random_state=0 fixes the seed so the
# result is reproducible across runs.
model = NMF(
    random_state=0,
    init='random',
    n_components=K,
)
W = model.fit_transform(M)   # left factor returned by the fit
H = model.components_        # right factor stored on the fitted model
print('W: {}'.format(W.shape))
print('H: {}'.format(H.shape))

In [None]:
# Persist the two factor matrices as comma-separated text files.
# NOTE(review): the files are named U / Vt, but what is written here are the
# NMF factors W and H; the SVD factors computed earlier are not saved.
# Confirm this is intended before relying on the file names.
np.savetxt(fname='U.csv', X=W, delimiter=',')
np.savetxt(fname='Vt.csv', X=H, delimiter=',')