In [1]:
import numpy as np
import pandas as pd
# from scipy.linalg import clarkson_woodruff_transform
from scipy.sparse.linalg import svds
from sklearn.decomposition import NMF

In [3]:
# Parsing options shared by the three ml-1m/*.dat files: fields are separated
# by '::', and a multi-character separator requires the Python parsing engine.
dat_options = dict(sep='::', encoding='latin1', engine='python')

# One row per (user, movie) rating event.
ratings = pd.read_table(
    'ml-1m/ratings.dat',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **dat_options
)
# Movie catalogue; Genres is a '|'-separated string (see movies.head() below).
movies = pd.read_table(
    'ml-1m/movies.dat',
    names=['MovieID', 'Title', 'Genres'],
    **dat_options
)
# Per-user attributes.
users = pd.read_table(
    'ml-1m/users.dat',
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip'],
    **dat_options
)

In [7]:
# Quick sanity check of the parsed movie catalogue (first five rows).
movies.head(n=5)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# Films with sufficient numbers of reviews
Keep only the titles that have at least N ratings (N is a minimum rating count, not a "top N" cutoff).

The output is **ratings_topN**

In [8]:
# Minimum number of ratings a movie must have to be kept.
# Note: N is a threshold on the rating count, not a "top N movies" cutoff.
N = 1000

# Number of ratings received by each movie, indexed by MovieID.
ratings_count = ratings.groupby('MovieID').size()

# Movies that reach the threshold (index = MovieID, value = rating count).
top_ratings = ratings_count.loc[ratings_count >= N]
top_ratings.head(10)

MovieID
1      2077
11     1033
21     1356
32     1511
34     1751
39     1362
47     1137
50     1783
110    2443
111    1240
dtype: int64

In [9]:
# Restrict the ratings to the movies that passed the rating-count threshold.
ratings_topN = ratings.loc[ratings['MovieID'].isin(top_ratings.index)]
print('Shape: {}'.format(ratings_topN.shape))
ratings_topN.head(10)

Shape: (305373, 4)


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
7,1,2804,5,978300719
9,1,919,4,978301368
10,1,595,5,978824268
13,1,2918,4,978302124
15,1,2791,4,978302188
19,1,2797,4,978302039


In [10]:
# Distinct users and movies left after filtering; these are the dimensions
# of the user-by-movie rating matrix built further down.
n_users = len(ratings_topN['UserID'].unique())
n_movies = len(ratings_topN['MovieID'].unique())
print('Number of users = {} | Number of movies = {}'.format(n_users, n_movies))

Number of users = 6039 | Number of movies = 207


# Low Rank Matrix Factorization
We model the rating that user $i$ assigns to movie $j$. To do so, we factor the user–movie rating matrix $M$ with a low-rank matrix factorization: $M \approx U V^\top$.

In [11]:
# Reshape the long ratings table into a wide user-by-movie table:
# one row per UserID, one column per MovieID, cell = Rating.
# (user, movie) pairs with no rating come out of the pivot as NaN and are
# replaced by 0, so a 0 in R_df means "not rated".
R_df = (
    ratings_topN
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)
R_df.head()

MovieID,1,11,21,32,34,39,47,50,110,111,...,3527,3578,3623,3671,3702,3703,3751,3753,3755,3793
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,4.0,4.0,3.0,3.0,5.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [12]:
# Dense user-by-movie rating matrix as a NumPy array.
# DataFrame.as_matrix() was deprecated (it emitted a FutureWarning) and was
# removed in pandas 1.0; .values returns the same 2-D array on old and new
# pandas versions alike.
M = R_df.values

# Sparsity = fraction of entries equal to 0, i.e. (user, movie) pairs without
# a rating, since missing ratings were filled with 0 when R_df was built.
# float() keeps this a true division under Python 2 as well.
sparsity = round(1.0 - np.count_nonzero(M) / float(M.size), 3)
print('Number of users = {} | Number of movies = {}'.format(n_users, n_movies))
print('The sparsity level is {}%'.format(sparsity*100))

Number of users = 6039 | Number of movies = 207
The sparsity level is 75.6%


  """Entry point for launching an IPython kernel.


We should define how many components we want for the low rank matrix factorization

In [13]:
# Number of latent components (rank of the factorization); passed as `k` to
# svds and as `n_components` to NMF in the cells below.
K = 30

### Sparse SVD

In [14]:
# Truncated SVD keeping K singular triplets: M ~ U . diag(s) . Vt
U, s, Vt = svds(M, k=K)

# Turn the vector of singular values into a K x K diagonal matrix and fold it
# into the left factor, so that afterwards M ~ U . Vt.
s = np.diag(s)
U = U.dot(s)

print('U: {}'.format(U.shape))
print('Vt: {}'.format(Vt.shape))

U: (6039, 30)
Vt: (30, 207)


### Non-negative matrix factorization (NMF)
Find two non-negative matrices (W, H) whose product approximates the non-negative matrix X (here, the rating matrix M).

In [15]:
# Rank-K non-negative factorization of the rating matrix: M ~ W . H.
# random_state is fixed so the random initialisation is reproducible.
model = NMF(
    n_components=K,
    init='random',
    random_state=0,
)
# W holds one row of K component weights per user (row of M) ...
W = model.fit_transform(M)
# ... and H holds one column of K component weights per movie (column of M).
H = model.components_

print('W: {}'.format(W.shape))
print('H: {}'.format(H.shape))

W: (6039, 30)
H: (30, 207)


In [16]:
# Persist the two factor matrices as comma-separated text files.
# NOTE(review): the NMF factors (W, H) are written under the SVD-style names
# 'U.csv' / 'Vt.csv', and the SVD factors U / Vt computed earlier are not
# saved -- presumably intentional for a downstream consumer; confirm.
np.savetxt(fname='U.csv', X=W, delimiter=',')
np.savetxt(fname='Vt.csv', X=H, delimiter=',')