In [0]:
# importing necessary libraries

from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
import numpy as np
import timeit

In [0]:
# generate a random matrix of user ratings for music items
# users = 1 million
# music items = 5 million

n_users = 10000
n_music_items = 50000

In [0]:
X = sparse_random_matrix(n_music_items,n_users, density=0.01, random_state=45)

# **SKLEARN**

In [0]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=123)

In [0]:
from sklearn.preprocessing import normalize
x = np.absolute(X)
x_normalized = normalize(x, norm='l1', axis=0)

In [39]:
print x_normalized

  (234, 0)	0.0020080321285140474
  (284, 0)	0.0020080321285140474
  (554, 0)	0.0020080321285140474
  (654, 0)	0.0020080321285140474
  (655, 0)	0.0020080321285140474
  (715, 0)	0.0020080321285140474
  (718, 0)	0.0020080321285140474
  (1068, 0)	0.0020080321285140474
  (1081, 0)	0.0020080321285140474
  (1168, 0)	0.0020080321285140474
  (1178, 0)	0.0020080321285140474
  (1329, 0)	0.0020080321285140474
  (1401, 0)	0.0020080321285140474
  (1410, 0)	0.0020080321285140474
  (1418, 0)	0.0020080321285140474
  (1505, 0)	0.0020080321285140474
  (1834, 0)	0.0020080321285140474
  (1864, 0)	0.0020080321285140474
  (2001, 0)	0.0020080321285140474
  (2009, 0)	0.0020080321285140474
  (2108, 0)	0.0020080321285140474
  (2568, 0)	0.0020080321285140474
  (2597, 0)	0.0020080321285140474
  (2614, 0)	0.0020080321285140474
  (2654, 0)	0.0020080321285140474
  :	:
  (46772, 9999)	0.0021367521367521274
  (46845, 9999)	0.0021367521367521274
  (46956, 9999)	0.0021367521367521274
  (47223, 9999)	0.0021367521367521274

In [0]:
start = timeit.default_timer()
svd.fit(b_normalized)
stop = timeit.default_timer()

# **speed (time taken)**

In [41]:
print stop-start

2.15900015831


# **memory utilization**

In [42]:
import sys
print sys.getsizeof(svd),sys.getsizeof(X) 

64 64


In [43]:
##printing the values of the decomposed components
print("VT")
print(svd.components_)  
print("Sigma")
print(svd.singular_values_)

VT
[[ 0.00998653  0.01002046  0.01006625 ...  0.00998956  0.00994894
   0.00997071]
 [-0.0192061  -0.00264812 -0.00685641 ... -0.03111749 -0.01327087
  -0.00676885]
 [ 0.00386587  0.01168584  0.00520264 ...  0.0148165  -0.00097988
   0.01013017]
 [ 0.00174292  0.00392516 -0.00325744 ... -0.00613744 -0.00048666
   0.00316204]
 [-0.00208161  0.00036121 -0.01118827 ... -0.00054636 -0.01055634
   0.01307337]]
Sigma
[0.44943705 0.06218267 0.06200957 0.06192296 0.06188153]


# **GENSIM**

In [22]:
!pip install gensim



In [0]:
import gensim
start = timeit.default_timer()
mat= gensim.models.lsimodel.stochastic_svd(X,2,50000)
stop = timeit.default_timer()

# **Speed (Time Taken)**

In [45]:
print stop-start

0.127331018448


# **Memory Utilization**

In [46]:
print sys.getsizeof(svd)

64


In [47]:
#sigma (singular value of corpus)
print mat[1]

[0.46064843 0.45576656]


In [48]:
#left singular value
print mat[0]

[[ 0.00639344 -0.00076639]
 [-0.01236334  0.00993784]
 [-0.00193718 -0.00540985]
 ...
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]]


# **Benchmark  (comparison)**

We found that in terms of time taken, the gensim method to compute SVD is almost 200 times faster than Sklearn SVD method.

However when we compared memory utilization it was almost the same.

Also the benefit of Gensim is that the whole matrix need not to be loaded into the RAM but can be operated in chunks.