# Importing the necessary packages

In [1]:
import numpy as np
import pandas as pd
import time
import sklearn as sk
import random
from sklearn.linear_model import Ridge
from scipy.sparse.linalg import svds

# Setting the parameters
fname: File containing the triplets.

N: Number of triplets to be read.

Cutoff: Treshold parameter for refining the matrix to avoid cold start.

B: Bin size.

TS: Testing set size.

K: Parameter K for Q P estimation.

MaxIter: Maximum number of iterations for alternating Q P estimation.

MaxMSE: Maximum MSE at each step for alternating Q P estimation between t and t+1.

Note: Alternating QP estimation has two cutoff parameters. One, if the max number of iterations have been reached. Two, if the matrices have converged. Convergence is measured by MSE. I.E: If MSE betwenn P and Q between t and t+1 are lower than MaxMSE, then the iterations are cut off.

In [2]:
fname = "train_triplets.txt"
N = 300000
cutoff = 5
B = 10
TS = 200
K = 30
MaxIter = 10
MaxMSE = 0.01

# Reading the dataset

In [3]:
begin = time.clock()

f = open(fname, "r")
usrs = []
songs = []
reps = []
for n in range(0, N):
    line = next(f)
    a = line.split('\t')
    usrs.append(a[0])
    songs.append(a[1])
    reps.append(int(a[2]))
    
f.close()

songList = list(set(songs))
usrList = list(set(usrs))

v = pd.DataFrame(0, index = usrList, columns = songList)
for i in range(0, N):
    v.set_value(usrs[i], songs[i], reps[i])
M = np.array(v.as_matrix()).astype(np.float32)

del v
del usrs
del songs
del reps

print("Loading the Dataset took ", time.clock() - begin, " seconds.")

Loading the Dataset took  52.88938484699528  seconds.


# Refining the dataset to avoid cold-start issue

In [4]:
begin = time.clock()

carryOn = True
while(carryOn):
    
    carryOn = False
    delCols = []
    for cols in range(0, len(M[0])):
        if(np.count_nonzero(M[:, cols]) <= cutoff):
            delCols.append(cols)
            
    if(delCols):
        M = np.delete(M, delCols, axis = 1)
        carryOn = True

    delRows = []
    for rows in range(0, len(M)):
        if(np.count_nonzero(M[rows, :]) <= cutoff):
            delRows.append(rows)

    if(delRows):
        M = np.delete(M, delRows, axis = 0)
        carryOn = True

print("Refining the data took ", time.clock() - begin, " seconds")

Refining the data took  23.596229252144298  seconds


# Binning the values in the dataset

In [5]:
begin = time.clock()
bins = [[np.power(2,i), np.power(2,(i + 1)) - 1, i + 1] for i in range(0, B)]
bins[len(bins) - 1][1] = np.infty

for i in range(0, len(bins)-1):
    b = bins[i]
    for r in range(b[0], b[1] + 1):
        M[M == r] = b[2]

M[M >= bins[len(bins) - 1][0]] = bins[len(bins) - 1][2]
print("Binning the dataset took ", time.clock() - begin, " seconds")

Binning the dataset took  30.12228684706639  seconds


# Creating the testing dataset

In [6]:
N = list(np.transpose([M.nonzero()[0], M.nonzero()[1]]))
testI = random.sample(N, 200)
testR = [M[i[0]][i[1]] for i in testI]
for i in testI:
    M[i[0]][i[1]] = 0

# MSE function to check for convergence 

In [7]:
def MSE(v1, v2):
    if(v1.shape == v2.shape):
        e = v1 - v2
    else:
        e = np.transpose(v1) - v2
    s = v1.shape[0] * v1.shape[1]
    return np.sqrt(sum(sum(e * e)) / s) 

# Estimation of Q and P

In [8]:
begin = time.clock()
clf = Ridge(alpha=1.0)
clf.fit_intercept = False

U, s, V = svds(M, k=K)
S = np.diag(s)
Q = U.dot(S)
P = V
print("Estimating P and Q")
MSE_P = 1
MSE_Q = 1
t = 1
for t in range(1, MaxIter+1):
    P_prev = P.copy()
    P = []
    print("Iteration: ", t)
    for i in range(0, len(M[0])):
        NZ = np.nonzero(M[:,i])
        R = M[NZ,i][0]
        Q_ = Q[NZ,:][0]
        clf.fit(Q_,R)
        P.append(clf.coef_)
    Q_prev = Q.copy()
    Q = []
    P = np.array(P)
    for i in range(0, len(M)):
        NZ = np.nonzero(M[i,:])
        R = M[i,NZ][0]
        P_ = P[NZ,:][0]
        clf.fit(P_,R)
        Q.append(clf.coef_)
    Q = np.array(Q)
    MSE_P = MSE(P, P_prev)
    MSE_Q = MSE(Q,Q_prev)
    print("\tMSE Q: ", MSE_P)
    print("\tMSE P  ", MSE_Q)
    if((MSE_P < MaxMSE) & (MSE_Q < MaxMSE)):
        break
    t+=1
print("P and Q estimation finished.")
print("Estimation process ended because", end="")
if(t > MaxIter):
    print(" max number of iterations reached")
else:
    print("P and Q both converged")
print("Estimating P and Q took ", time.clock() - begin, " seconds")

Estimating P and Q
Iteration:  1
	MSE Q:  0.331345920672
	MSE P   0.450759392173
Iteration:  2
	MSE Q:  0.157407080751
	MSE P   0.133100367238
Iteration:  3
	MSE Q:  0.0607244219264
	MSE P   0.0762062649589
Iteration:  4
	MSE Q:  0.0403068822261
	MSE P   0.0535943689368
Iteration:  5
	MSE Q:  0.0305574189905
	MSE P   0.0412994558812
Iteration:  6
	MSE Q:  0.0247230408254
	MSE P   0.0336061546415
Iteration:  7
	MSE Q:  0.020819437956
	MSE P   0.0283641400943
Iteration:  8
	MSE Q:  0.0180184472174
	MSE P   0.0245669169099
Iteration:  9
	MSE Q:  0.0159060511927
	MSE P   0.0216885941172
Iteration:  10
	MSE Q:  0.0142540735456
	MSE P   0.0194314551922
P and Q estimation finished.
Estimation process ended because max number of iterations reached
Estimating P and Q took  93.57609804501598  seconds


# Testing for results

In [9]:
err = 0
j = 0
N = np.dot(Q,np.transpose(P))
for i in testI:
    err += (N[i[0]][i[1]] - testR[j])**2
    j+=1
    
print("Testing SMSE is: ", np.sqrt(err / TS))

Testing SMSE is:  1.23787584705
