# Importing the necessary libraries

In [1]:
import numpy as np
import zipfile as zf
import json
import time
import scipy.sparse as sps
from sklearn.preprocessing import normalize
from itertools import count

# Setting up the parameters

* fileName: The name of the database file
* simCutoff: The minimum similarity there has to be for there to be a link between track i to j
* tagCutoff: The minimum Tag value there has to be for there to be a link between a track and a particular Tag
* Beta: The Beta value for $r^{new} = \beta M r^{old} + A$
* n_most_i: The parameter that defines n in "Top n most popluar songs"
* tagList: The selected list of tags for this experiment. Note that The songs will be selected as an intersection of tags. IE: If the taglist is ['a', 'b'], then only the songs that have both 'a' and 'b' tags will be selected

In [144]:
fileName = 'lastfm_test.zip'
simCutoff = 0.0
tagCutoff = 50
Beta = 0.2
n_most_i = 5
tagList = ['Hip-Hop']

# Reading the dataset

In [3]:
begin = time.clock()
simD = []
tagD = []
trkToIndex = {}
tagToIndex = {}
file = zf.ZipFile(fileName)
trkCounter = count()
tagCounter = count()

for i in range(0,len(file.filelist)):
    
    if(i%10000 == 0):
        print(round(100 * i / len(file.filelist)), " % of dataset loaded.")
    
    if(file.filelist[i].filename.endswith('.json')):
        fCont = file.read(file.filelist[i]).decode("utf-8")
        jCont = json.loads(fCont)
        
        sims = np.array(jCont['similars'])
        tags = np.array(jCont['tags'])
        tName = jCont['track_id']
        
        if(tName in trkToIndex):
            trkIndex = trkToIndex[tName]
        else:
            trkIndex = next(trkCounter)
            trkToIndex[tName] = trkIndex
        
        for s in sims:
            similarity = float(s[1])
            
            if((similarity >= simCutoff) & (similarity <= 1.0)):
                tName2 = s[0]
                
                
                if(tName2 in trkToIndex):
                    trkIndex2 = trkToIndex[tName2]
                else:
                    trkIndex2 = next(trkCounter)
                    trkToIndex[tName2] = trkIndex2
                    
                simD.append([trkIndex, trkIndex2])
                
        for t in tags:
            tagCount = int(t[1])
            
            if(tagCount >= tagCutoff):
                tagName = t[0]
                
                if(tagName in tagToIndex):
                    tagIndex = tagToIndex[tagName]
                else:
                    tagIndex = next(tagCounter)
                    tagToIndex[tagName] = tagIndex
                    
                tagD.append([trkIndex, tagIndex])
                
simD = np.array(list(set(tuple(sim) for sim in simD)))
file.close()
print("Loading the dataset took ", time.clock() - begin, " seconds.")

0  % of dataset loaded.
8  % of dataset loaded.
16  % of dataset loaded.
25  % of dataset loaded.
33  % of dataset loaded.
41  % of dataset loaded.
49  % of dataset loaded.
57  % of dataset loaded.
65  % of dataset loaded.
74  % of dataset loaded.
82  % of dataset loaded.
90  % of dataset loaded.
98  % of dataset loaded.
Loading the dataset took  61.35269417968627  seconds.


# Creating the Adjacency and Tag matrices

In [4]:
begin = time.clock()
simD = np.array(simD)

X = simD[:,0].astype(np.int)
Y = simD[:,1].astype(np.int)
Z = np.ones(len(simD))

mMat = sps.coo_matrix((Z,(Y,X)),shape=(len(trkToIndex),len(trkToIndex))).tocsc()
normalize(mMat, norm='l1', axis=0, copy=False)

tagD = np.array(tagD)

X = tagD[:,0].astype(np.int)
Y = tagD[:,1].astype(np.int)
Z = np.ones(len(tagD))

tagMat = sps.coo_matrix((Z,(Y,X))).tocsc()
#To get "from i to j", plug in tagMat[j,i]

del simD, tagD, X, Y
print("Creating the matrices for computation of R took ", time.clock() - begin, " seconds")

Creating the matrices for computation of R took  3.3804449745717235  seconds


# Selecting the songs that will be given a bias according to the tagList

In [5]:
trkByTag = []

for tagid in tagList:
    trkByTag.append(set(tagMat[tagToIndex[tagid]].nonzero()[1]))
trkByTag = np.array(list(set.intersection(*trkByTag)))

# Iterating through the $r^{new} = \beta M r^{old} + A$ to find the r matrix

* Please note the code "rNew += iMat * probDiff / N". This is done because in the M matrix, there are many columns that have no adjacencies. That means the column sum of those columsn don't add up to 1. This causes the sum of r matrix to be less than 0. IE: there remains a small probability that is not distributed amongst the dataset.

* This is normally tackled by removing the 0 columns and their corresponding rows from the M matrix. But we can't do that because to do that, we would need to spend a lot of computation time in reassigning the remaining indexes to remaining track ids in trkToIndex dictionary.

* One hack that works perfectly fine is giving $\frac{1}{N}$ probability to all entries of 0 columns. We also can't do that because: 1- It takes too much time to edit the M matrix. 2- That reduces the sparsity of our M matrix, so the rest of the computations become expensive as well.

* Instead, we implemented a workaround in the $\frac{1}{N}$ probability hack. At the end of each iteration, we calculate $1 - ||r||$, and divide this amongst all the values in r. That is because if we used the $\frac{1}{N}$ probability hack, these probabilites would have already be assigned in r matrix accordingly. Doing so, we're fixing the probability loss in an effective and efficient way.

In [61]:
begin = time.clock()

aMat = sps.lil_matrix((mMat.shape[0],1))
aMat[trkByTag,:] = (1 - Beta) / len(trkByTag)

iMat = sps.csc_matrix(np.ones(mMat.shape[0])).T

N = aMat.shape[0]
rOld = sps.coo_matrix(np.ones(N)*(1/N)).T.tocsc()

mMat = mMat * Beta
difference = 1

while(difference > 0.0001):
    
    rNew = (mMat).dot(rOld) + aMat
    probDiff = 1 - rNew.sum()
    rNew += iMat * probDiff / N
    
    difference = np.abs(rNew - rOld).sum()
    rOld = rNew.copy()
    
print("Computing the r matrix took ", time.clock() - begin, " seconds")

1.59531893905 1.0 1.0 0.000563593952689

0.0486634057893 1.0 1.0 0.000587570169797

0.000234524792369 1.0 1.0 0.000587691244906

1.29050777275e-06 1.0 1.0 0.000587691831623



# Selecting the top n highe values in r that correspond to the tagList and extracting the song names

In [147]:
selMat = np.array([np.array([trkByTag]), np.array(rNew[trkByTag].toarray()).T]).T
selMat = selMat.reshape(selMat.shape[0],selMat.shape[2])
selMat = sorted(selMat, key=lambda smE: smE[1])
MIT = np.array(selMat[-n_most_i-1:-1])[:,0].astype(np.int)
trkNameList = []
for trk in trkToIndex:
    if(trkToIndex[trk] in MIT):
        trkNameList.append(trk)

# Printing the results

In [152]:
print("The most influential tracks for the tag(s) ", tagList, " are:\n")
for i in trkNameList:
    print(i)


The most influential tracks for the tag(s)  ['Hip-Hop', 'Hip-Hop']  are:

TRVXYAU128F428B84B
TRFOVKJ128F92C3021
TRCCEXX128F4292E09
TRNGVCV128F149F339
TRGRONS128E0786070
