In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


In [0]:
train_link_1="https://drive.google.com/open?id=1Lpmb_R5J1VApLIA22FfQmZKtG-B03X_s"
initial1, train_id1 = train_link_1.split('=')

downloaded1 = drive.CreateFile({'id':train_id1})
downloaded1.GetContentFile('song_data.csv')

In [0]:
train_link_2="https://drive.google.com/open?id=1frWsCzJ2ffCpJ-c-Ta19ybejrEaMmdSX"
initial2, train_id2 = train_link_2.split('=')

downloaded2 = drive.CreateFile({'id':train_id2})
downloaded2.GetContentFile('kaggle_visible_evaluation_triplets.txt')

**Data loading and pre-processing:**

In [0]:
#reading the triplets file and loading songs with the number of times all users used it
f1 = open('kaggle_visible_evaluation_triplets.txt', 'r')
test_song_to_count = dict() 
for line in f1:
    _, song, _ = line.strip().split('\t') 
    if song in test_song_to_count: 
        test_song_to_count[song] += 1 
    else: 
        test_song_to_count[song] = 1 
    

In [0]:
import pandas as pd
song_data = pd.read_csv('song_data.csv')

In [7]:
song_dict = dict.fromkeys(song_data['song_id'], [])
song_id = song_data['song_id']
len(song_id)

1000000

In [0]:
#getting titles, artist_names and release for a song
t = song_data['title'].tolist()
r = song_data['release'].tolist()
a = song_data['artist_name'].tolist()

In [0]:
#preprocessing
titles = []
for i in t:
  i = str(i)
  titles.append(i.replace('[^\w\s]','').replace('\n','').lower().replace('.','').replace('-',' ').replace(',','').replace('?','').replace('/','').replace('\'','').replace('(','').replace(')',''))
releases = []
for i in r:
  i = str(i)
  releases.append(i.replace('[^\w\s]','').replace('\n','').lower().replace('.','').replace('-',' ').replace(',','').replace('?','').replace('/','').replace('\'','').replace('(','').replace(')',''))
artists = []
for i in a:
  i = str(i)
  artists.append(i.replace('[^\w\s]','').replace('\n','').lower().replace('.','').replace('-',' ').replace(',','').replace('?','').replace('/','').replace('\'','').replace('(','').replace(')',''))

In [0]:
song_details = []
for i,j,k,l in zip(song_id, titles, releases, artists):
  song_details.append([i,j,k,l])

In [0]:
trial_songs = []
trial_id = []
for i in song_details:
  if i[0] in test_song_to_count:
    trial_songs.append(i)
    trial_id.append(i[0])

In [12]:
len(trial_songs)

163828

In [0]:
import numpy as np
d = np.arange(0,len(trial_songs))
songids = dict.fromkeys(d)
for i in range(0,len(trial_songs)):
  songids[i] = song_details[i][0]

In [0]:
song_deets = []
for i in trial_songs:
#for i,j,k in zip(titles, releases, artists):
  song_deets.append([i[1],i[2],i[3]])

In [0]:
song_d = []
for i in song_deets:
  temp = []
  for j in i:
    temp.extend(j.split(" "))
  song_d.append(temp)

In [0]:
import random
song_d = random.sample(song_d, 60000)   #taking any 60000 songs for similarity calculation

**Training the model:**

In [17]:
#training the Word2Vec model with the tokenized data
from gensim.models import Word2Vec
model = Word2Vec(song_d, size=10, window=2, sg=1, min_count=1)
model.train(song_d, total_examples=len(song_d), epochs=2)

(946080, 1066332)

In [18]:
## song detail representation based on the model generated
import numpy as np
song_sum = []
for i in range(0,len(song_d)):
  sum1 = np.zeros(10)
  for j in range(0,len(song_d[i])):
    sum1 += model[song_d[i][j]]
  #print(i)
  song_sum.append(sum1/len(sum1))

  


**Finding cosine similarity between songs and storing:**

In [0]:
from scipy import sparse
sp_song_sum = sparse.csr_matrix(song_sum)    #converting the matrix to a sparse matrix
#song_sum_arr = sp_song_sum.todense()

In [0]:
#converts the sparse matrix to a tensor
from scipy.sparse import coo_matrix
import torch
coo = coo_matrix(sp_song_sum)

values = coo.data
indices = np.vstack((coo.row, coo.col))

i = torch.cuda.LongTensor(indices)
v = torch.cuda.FloatTensor(values)
shape = coo.shape

torch_song = torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense()

In [0]:
#freeing up memory which is not being used
del(v)
del(i)
del(indices)
del(values)
del(coo)
del(song_sum)

In [0]:
#finds cosine similarity
a_norm = (torch_song) / (torch_song.norm(dim=1)[:, None])
b_norm = (torch_song[0:]) / (torch_song[0:].norm(dim=1)[:, None])
res1 = torch.mm((a_norm), (b_norm.transpose(0,1)))

In [0]:
del(torch_song)

In [0]:
#getting top k songs for each song
import tensorflow as tf

song_sim = dict.fromkeys(np.arange(60000))
for i in range(len(res1)):
  val, ind = tf.math.top_k(res1[i].cpu(), k=70, sorted=True, name=None)
  song_sim[i] = ind

In [0]:
#converting obtained tensor to numpy array and then list
for i in song_sim:
  song_sim[i] = song_sim[i].numpy()
  song_sim[i] = list(song_sim[i])

In [0]:
#getting song names for each similar other song
song_sim_names = {}
for i in song_sim:
  temp = []
  for j in song_sim[i]:
    temp.append(songids[j])
  song_sim_names[songids[i]] = temp

**Splitting into train and test data set:**

In [0]:
#making a list of all lines in the file, based on user_id
f2 = open('kaggle_visible_evaluation_triplets.txt', 'r')
user_list = []
for line in f2:
  user, song, _ = line.strip().split('\t') 
  user_list.append(user)

In [0]:
#for each user, all the songs he has listened to along with the number of times he has listened it to
from collections import defaultdict

f2 = open('kaggle_visible_evaluation_triplets.txt', 'r')
user_to_song = defaultdict(list)
for line in f2:
  user, song, sc = line.strip().split('\t') 
  sc = int(sc)
  user_to_song[user].extend([song]*sc) 

In [0]:
#splitting each user's items into testing and validation data
from math import floor
import sklearn.model_selection
user_train_data = dict.fromkeys(user_list)
user_test_data = dict.fromkeys(user_list)
for i in user_to_song:
  u_train,u_test = sklearn.model_selection.train_test_split(user_to_song[i],train_size=0.75, test_size = 0.25)
  user_train_data[i] = u_train
  user_test_data[i] = u_test

In [0]:
#taking all similar songs for a given user
user_data_used = dict.fromkeys(user_train_data, [])
for i in user_data_used:
  user_data_used[i] = []
  temp = []
  for j in user_train_data[i]:
    if j in song_sim_names and song_sim_names[j] not in temp:
      temp.extend(song_sim_names[j])
    temp.append(j)
    
  user_data_used[i].extend(temp)

In [0]:
#making predicitions based on the 75% data we have
testing_set = dict.fromkeys(user_test_data, [])
for i in testing_set:
  if i in user_data_used:
    testing_set[i] = (user_data_used[i])

**r-Precision:**

In [0]:
ts = dict()
for i in testing_set:
  ts[i] = testing_set[i][0:len(user_test_data)]    #taking only documents equal to relevant documents 

In [0]:
#finding matches 
user_match = dict()
for i in user_test_data:
  match = 0
  for j in range(len(user_test_data[i])):
    if user_test_data[i][j] in ts[i]:        
      match+=1
  user_match[i] = match

In [0]:
#calculating every user's r-precision
user_r_prec = dict()
for i in user_match:
  try:
    user_r_prec[i] = user_match[i]/len(ts[i])
  except:
    user_r_prec[i] = 0

In [0]:
#average r-precision
sum_prec = 0
for i in user_r_prec:
  sum_prec+=user_r_prec[i]
r_prec = sum_prec/len(user_r_prec)

In [36]:
print("The average r-precision = ", r_prec)

The average r-precision =  0.13029285514326572


**Precision:**

In [0]:
#finding matches
user_match_p = dict()
for i in user_test_data:
  match = 0
  for j in range(len(user_test_data[i])):
    if list(user_test_data[i])[j] in testing_set[i]:        
      match+=1
  user_match_p[i] = match

In [0]:
#calculating every user's precision
user_prec = dict()
for i in user_match_p:
  try:
    user_prec[i] = user_match_p[i]/len(testing_set[i])
  except:
    user_prec[i] = 0

In [0]:
#average precision
sum_p = 0
for i in user_prec:
  sum_p+=user_prec[i]
prec = sum_p/len(user_prec)

In [40]:
print("The average precision = ", prec)

The average precision =  0.13029285514326572


**Recall:**

In [0]:
#calculating every user's recall 
user_recall = dict.fromkeys(user_match_p)
for i in user_match_p:
  try:
    user_recall[i] = user_match_p[i]/len(user_test_data[i])
  except:
    user_recall[i] = 0

In [0]:
#average recall
sum_rec = 0
for i in user_recall:
  sum_rec+=user_recall[i]
recall = sum_rec/len(user_recall)

In [43]:
print("The average recall = ", recall)

The average recall =  0.602622555786544


**MRR:**

In [0]:
#calculating mrr
user_mrr = dict.fromkeys(testing_set)
for i in testing_set:
  rr=0
  for p, j in enumerate(testing_set[i]):
    if j in user_test_data[i]:        
      rr = 1 / (p + 1)
      break
  user_mrr[i] = rr

In [0]:
#average mrr
sum_mrr = 0
for i in user_mrr:
  sum_mrr+=user_mrr[i]
mrr = sum_mrr/len(user_mrr)

In [46]:
print("The average MRR = ", mrr)

The average MRR =  0.5675896037764139


In [47]:
summary = []
summary.append(('Word2Vec with song metadata',prec , recall, mrr, r_prec))

summary_df = pd.DataFrame(summary, columns = ['Model', 'Average Precision', 'Average Recall', 'Average MRR', 'Average r-Precision'])
summary_df

Unnamed: 0,Model,Average Precision,Average Recall,Average MRR,Average r-Precision
0,Word2Vec with song metadata,0.130293,0.602623,0.56759,0.130293
