Without using package

In [14]:
import sys
import numpy as np
import time
from collections import Counter
import random


def mycur(matrix, rank):
	
	"""
	INPUT: matrix: user-rating matrix, rank: desired rank.
	OUTPUT: returns C, U and R resulting from the CUR decomposition of the matrix.
	"""
	m = matrix.shape[0]
	n = matrix.shape[1]
	
	if((rank>m) or (rank>n)):
		print("error: rank greater than matrix dimensions.\n")
		return;
		
	C = np.zeros((m, rank))
	R = np.zeros((rank, n))
	
	matrix_sq = matrix**2
	sum_sq = np.sum(matrix_sq)
	
	frob_col = np.sum(matrix_sq, axis=0)
	probs_col = frob_col/sum_sq			
	
	count=0
	count1=0
	temp = 0
	idx = np.arange(n)					
	taken_c = []
	dup_c = []
	
	while(count<rank):
		i = np.random.choice(idx, p = probs_col)	
		count1 = count1+1
		if(i not in taken_c):
			C[:, count] = matrix[:, i]/np.sqrt(rank*probs_col[i])	
			count = count+1
			taken_c.append(i)
			dup_c.append(1)
		else:									
			temp = taken_c.index(i)
			dup_c[temp] = dup_c[temp]+1
			
	C = np.multiply(C, np.sqrt(dup_c))			
			
	frob_row = np.sum(matrix_sq, axis=1)
	probs_row = frob_row/sum_sq				
	
	count=0
	count1=0
	idx = np.arange(m)						
	taken_r = []
	dup_r = []
	
	while(count<rank):
		i = np.random.choice(idx, p = probs_row)		
		count1 = count1+1
		if(i not in taken_r):
			R[count, :] = matrix[i, :]/np.sqrt(rank*probs_row[i])	
			count = count+1
			taken_r.append(i)
			dup_r.append(1)
		else:
			temp = taken_r.index(i)						
			dup_r[temp] = dup_r[temp]+1
		
	R = np.multiply(R.T, np.sqrt(dup_r))			
	R = R.T
	
	W = np.zeros((rank, rank))
	
	for i, I in enumerate(taken_r):
		for j, J in enumerate(taken_c):			
			W[i, j] = matrix[I, J]
	
	X, sigma, Y_t = mysvd(W,rank)				
	
	for i in range(rank):
		if(sigma[i,i] >= 1):					
			sigma[i,i] = 1/sigma[i,i]
		else:
			sigma[i,i] = 0
	
	U = np.dot(Y_t.T, np.dot(np.dot(sigma,sigma), X.T))	
	
	return C, U, R
	

if __name__=='__main__':		

	np.random.seed(0)
	#filename = sys.argv[1]
	filename = "ratings.txt"
	list = [line.split(' ') for line in open(filename)]

	from sklearn.model_selection import train_test_split 
	train_data, test_data = train_test_split(list, test_size=1)

	num_users = len(Counter(elem[0] for elem in list))
	num_movies = len(Counter(elem[1] for elem in list))

	rating = np.zeros((num_users, num_movies))

	for elem in train_data:
		
		rating[int(elem[0])-1, int(elem[1])-1] = float(elem[2])
	
	print(num_users, num_movies)
	
	ranks = [50, 200, 500, 800, 1000]
	
	
	print("rating:", rating, "\n")
	
	for rank in ranks:
		print("\t***RANK :", rank, "***\n")
		t0 = time.time()
		t1 = time.time()
		
		
		t2 = time.time()
		C, U, R = mycur(rating, rank)
		t3 = time.time()
		
		ans = np.dot(C, np.dot(U, R))
		print("\n\nCUR error:", np.sum((ans - rating)**2))
		print("\nCUR time:", t3-t2, " secs\n")


1508 2071
rating: [[2.  4.  3.5 ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  4.  0.  ... 0.  0.  0. ]
 [1.5 3.  2.  ... 0.  0.  0. ]] 

	***RANK : 50 ***



CUR error: 9010298.693142464

CUR time: 0.0287325382232666  secs

	***RANK : 200 ***



CUR error: 1629353.4107519032

CUR time: 0.1482384204864502  secs

	***RANK : 500 ***



CUR error: 512258.57321098336

CUR time: 0.5383045673370361  secs

	***RANK : 800 ***



CUR error: 434251.9378108118

CUR time: 1.3130311965942383  secs

	***RANK : 1000 ***



CUR error: 381755.83182498976

CUR time: 2.183309316635132  secs



Using CUR Package

In [17]:
	np.random.seed(0)
	file = "/content/ratings.txt"
	list = [line.split(' ') for line in open(filename)]

	train_data, test_data = train_test_split(list, test_size=1)

	num_users = len(Counter(elem[0] for elem in list))
	num_movies = len(Counter(elem[1] for elem in list))

	rating = np.zeros((num_users, num_movies))

	for elem in train_data:
		
		rating[int(elem[0])-1, int(elem[1])-1] = float(elem[2])
	
	print(num_users, num_movies)
	
	ranks = [50, 200, 500, 800, 1000]
	
	
	print("rating:", rating, "\n")
r = 5
from cur import cur_decomposition
C, U, R = cur_decomposition(rating, r)
ans = np.dot(C, np.dot(U, R))
print(ans)
print("\n\nCUR error:", np.sum((ans - rating)**2))
print("\nCUR time:", t3-t2, " secs\n")

1508 2071
rating: [[2.  4.  3.5 ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  4.  0.  ... 0.  0.  0. ]
 [1.5 3.  2.  ... 0.  0.  0. ]] 

[[-1.45223555e+18 -7.32036317e+18 -1.29246722e+18 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 9.47899110e+17  3.53754642e+18  3.63485901e+17 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-2.93847622e+17 -2.70435415e+18 -7.65935163e+17 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


CUR error: 5.9088505074112e+42

CUR time: 2.183309316635132  secs

