# Importing the necessary libraries

In [1]:
import numpy as np
import zipfile as zf
import json
import time
import scipy.sparse as sps
from sklearn.preprocessing import normalize
from itertools import count

# Setting up the parameters

* fileName: The name of the database file
* simCutoff: The minimum similarity there has to be for there to be a link between track i to j
* tagCutoff: The minimum Tag value there has to be for there to be a link between a track and a particular Tag
* Beta: The Beta value for $r^{new} = \beta M r^{old} + A$
* n_most_i: The parameter that defines n in "Top n most popluar songs"
* tagList: The selected list of tags for this experiment. Note that The songs will be selected as an intersection of tags. IE: If the taglist is ['a', 'b'], then only the songs that have both 'a' and 'b' tags will be selected

In [2]:
fileName = 'lastfm_test.zip'
simCutoff = 0.0
tagCutoff = 50
Beta = 0.2
n_most_i = 5
tagList = ['Hip-Hop']

# Reading the dataset

In [3]:
begin = time.clock()
simD = []
tagD = []
file = zf.ZipFile(fileName)

for i in range(0,len(file.filelist)):
    
    if(i%10000 == 0):
        print(round(100 * i / len(file.filelist)), " % of dataset loaded.")
    
    if(file.filelist[i].filename.endswith('.json')):
        fCont = file.read(file.filelist[i]).decode("utf-8")
        jCont = json.loads(fCont)
        
        sims = np.array(jCont['similars'])
        tags = np.array(jCont['tags'])
        tName = jCont['track_id']
        
        for s in sims:
            if(float(s[1]) >= simCutoff):
                simD.append([tName, s[0]])
                
        for t in tags:
            if(int(t[1]) >= tagCutoff):
                tagD.append([tName, t[0]])
                
file.close()
print("Loading the dataset took ", time.clock() - begin, " seconds.")

0  % of dataset loaded.
8  % of dataset loaded.
16  % of dataset loaded.
25  % of dataset loaded.
33  % of dataset loaded.
41  % of dataset loaded.
49  % of dataset loaded.
57  % of dataset loaded.
65  % of dataset loaded.
74  % of dataset loaded.
82  % of dataset loaded.
90  % of dataset loaded.
98  % of dataset loaded.
Loading the dataset took  44.250523722090605  seconds.


# Itertively removing the dead-end entries from similarity data

* In each iteration, dead end trakcs are trimmed. Since we delete some dead end trakcs in each iteration, some tracks that were not dead-end tracks in the previous iteration become dead end tracks in the next one.
* Please note that this process needs to go on until there are no more dead-end tracks left in the dataset.

In [4]:
begin = time.clock()
simD = np.array(simD)

prevLen = 0
currentLen = -1
unTrks = set(simD[:,0])

while(currentLen != prevLen):
    
    prevLen = len(unTrks)
    SD = []
    
    for sim in simD:
        if(sim[1] in unTrks):
            SD.append(sim)

    SD = np.array(SD)
    simD = SD.copy()
    unTrks = set(simD[:,0])
    currentLen = len(unTrks)
    print(prevLen - currentLen, " dead end tracks trimmed")
    
del SD, unTrks
print("Trimming the dataset took ", time.clock() - begin, " seconds.")

532  dead end tracks trimmed
58  dead end tracks trimmed
8  dead end tracks trimmed
0  dead end tracks trimmed
Trimming the dataset took  18.25097202453457  seconds.


# Indexing the track names and tag names to integers

In [5]:
begin = time.clock()

trkCounter = count()
tagCounter = count()

trkToIndex = {}
tagToIndex = {}

for trk in set(simD[:,0]):
    trkToIndex[trk] = next(trkCounter)

SD = []
for sim in simD:
    SD.append([trkToIndex[sim[0]], trkToIndex[sim[1]]])
    
SD = np.array(SD)
simD = SD.copy()
del SD

TD = []
for tag in tagD:
    if(tag[0] in trkToIndex):
        if(tag[1] in tagToIndex):
            tagId = tagToIndex[tag[1]]
        else:
            tagId = next(tagCounter)
            tagToIndex[tag[1]] = tagId
        TD.append([trkToIndex[tag[0]], tagId])

TD = np.array(TD)
tagD = TD.copy()
del TD

print("Indexing the track names and tag names to integers took ", time.clock() - begin, " seconds")

Indexing the track names and tag names to integers took  3.762965672789065  seconds


# Creating the Adjacency and Tag matrices

In [6]:
begin = time.clock()

X = simD[:,0].astype(np.int)
Y = simD[:,1].astype(np.int)
Z = np.ones(len(simD))

mMat = sps.coo_matrix((Z,(Y,X)),shape=(len(trkToIndex),len(trkToIndex))).tocsc()
normalize(mMat, norm='l1', axis=0, copy=False)

tagD = np.array(tagD)

X = tagD[:,0].astype(np.int)
Y = tagD[:,1].astype(np.int)
Z = np.ones(len(tagD))

tagMat = sps.coo_matrix((Z,(Y,X))).tocsc()

del simD, tagD, X, Y
print("Creating the matrices for computation of R took ", time.clock() - begin, " seconds")

Creating the matrices for computation of R took  0.3300070489419511  seconds


# Selecting the songs that will be given a bias according to the tagList

In [7]:
trkByTag = []

for tagid in tagList:
    trkByTag.append(set(tagMat[tagToIndex[tagid]].nonzero()[1]))
trkByTag = np.array(list(set.intersection(*trkByTag)))

# Iterating through the $r^{new} = \beta M r^{old} + A$ to find the r matrix

In [8]:
begin = time.clock()

aMat = sps.lil_matrix((mMat.shape[0],1))
aMat[trkByTag,:] = (1 - Beta) / len(trkByTag)

#iMat = sps.csc_matrix(np.ones(mMat.shape[0])).T

N = aMat.shape[0]
rOld = sps.coo_matrix(np.ones(N)*(1/N)).T.tocsc()

mMat = mMat * Beta
difference = 1

while(difference > 0.0001):
    
    rNew = (mMat).dot(rOld) + aMat
    #probDiff = 1 - rNew.sum()
    #rNew += iMat * probDiff / N
    difference = np.abs(rNew - rOld).sum()
    print("Difference between rOld and rNew is: ", difference)
    rOld = rNew.copy()
    
print("Computing the r matrix took ", time.clock() - begin, " seconds")

Difference between rOld and rNew is:  1.59560909265
Difference between rOld and rNew is:  0.293439678498
Difference between rOld and rNew is:  0.054406522751
Difference between rOld and rNew is:  0.0105038104061
Difference between rOld and rNew is:  0.00205705490342
Difference between rOld and rNew is:  0.000404667217653
Difference between rOld and rNew is:  7.98398374574e-05
Computing the r matrix took  0.4429143837637497  seconds


# Selecting the top n highe values in r that correspond to the tagList and extracting the song names

In [10]:
selMat = np.array([np.array([trkByTag]), np.array(rNew[trkByTag].toarray()).T]).T
selMat = selMat.reshape(selMat.shape[0],selMat.shape[2])
selMat = sorted(selMat, key=lambda smE: smE[1])
MIT = np.array(selMat[-n_most_i-1:-1])[:,0].astype(np.int)
trkNameList = []
for trk in trkToIndex:
    if(trkToIndex[trk] in MIT):
        trkNameList.append(trk)

# Printing the results

In [11]:
print("The most influential tracks for the tag(s) ", tagList, " are:\n")
for i in trkNameList:
    print(i)

The most influential tracks for the tag(s)  ['Hip-Hop']  are:

TRRWREZ128F932FAC0
TRRWXIB128F92E8647
TRFBPXX128F14744D8
TRNGVCV128F149F339
TRCCEXX128F4292E09
