In [1]:
import json
import os
import re
import pickle
import math
import copy
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

In [2]:
stopwords = ['is','a','at','an','the','of','it','and','as']

In [37]:
def normalize_name(name):
    """
    Normalize playlist names by removing some stopwords.
    
    Params
    ------
    name : str
        Playlist name to normalize
    """
    global stopwords
    name = name.lower()
    querywords = name.split()
    resultwords  = [word for word in querywords if word.lower() not in stopwords]
    name = ' '.join(resultwords)
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

def get_mapping(uri_list,uri2name):
    """
    Create a dictionary to obtain:
    1. ID from URI 
    2. Name from ID
    
    Params
    ------
    uri_list : list
        List of unique URI's.
    uri2name : dict
        Key value mapping between URI and Name.
    """
    uri2id = dict()
    id2name = []
    for index,item in enumerate(uri_list):
        uri2id[item] = index
        id2name.append(uri2name[item])
    return uri2id,id2name

def create_preprocess_info(num_files,dataset):
    """
    Obtain some preprocessing information such 
    as track_ids, artist_ids, etc. from the entire MPD data.
    
    Params
    ------
    num_files : int
        Number of files to read from MPD.
    """
    p_names = []
    #track_uri,artist_uri = (set() for i in range(2))
    track_uri2name,artist_uri2name,track_uri2artist_uri = (dict() for i in range(3))
    files = (os.listdir("../mpd/data"))
    ordered_files = sorted(files, key=lambda x: (int(re.sub('\D','',x)),x))
    num_occurences_track, num_occurences_artist = {},{}
    for filename in tqdm(ordered_files[:num_files],ncols=100):
        filename = os.path.join('../mpd/data',filename)
        with open(filename, 'r') as fp:
            playlists = json.load(fp)['playlists']
        for p in playlists:
            p_names.append(normalize_name(p['name']))
            for song in p['tracks']:
                if song['track_uri'] in num_occurences_track:
                    num_occurences_track[song['track_uri']] += 1
                    num_occurences_artist[song['artist_uri']] += 1
                else:
                    num_occurences_track[song['track_uri']] = 1
                    num_occurences_artist[song['artist_uri']] = num_occurences_artist.get(song['artist_uri'],0) + 1
                    track_uri2name[song['track_uri']] = song['track_name']
                    artist_uri2name[song['artist_uri']] = song['artist_name']
                    track_uri2artist_uri[song['track_uri']] = song['artist_uri']
    
    track_uri = [x for x in num_occurences_track.keys() if num_occurences_track[x]>=1]
    artist_uri = [x for x in num_occurences_artist.keys() if num_occurences_artist[x]>=1]
    
    remove_tracks = [x for x in num_occurences_track.keys() if num_occurences_track[x]<1]
    remove_artists = [x for x in num_occurences_artist.keys() if num_occurences_artist[x]<1]
    
    for t in remove_tracks:
        del track_uri2name[t]
        del track_uri2artist_uri[t]
    
    for a in remove_artists:
        del artist_uri2name[a]
    
    with open('../mpd/preprocessed/playlist_names.pkl','wb') as fp:
        pickle.dump(p_names,fp)                
    
    track_uri2id,track_id2name = get_mapping(track_uri,track_uri2name)
    
    with open('../mpd/preprocessed/tracks-'+dataset+'.pkl','wb') as fp:
        pickle.dump([track_uri2id,track_id2name],fp)
    
    #album_uri2id,album_id2name = get_mapping(album_uri,album_uri2name)
        
    #with open('../mpd/preprocessed/albums.pkl','wb') as fp:
    #    pickle.dump([album_uri2id,album_id2name],fp)

    artist_uri2id,artist_id2name = get_mapping(artist_uri,artist_uri2name) 
    #Removing a problem of same artist having multiple uri's.
    repeating_artists = {}
    for k, v in artist_uri2name.items():
        repeating_artists.setdefault(v, []).append(k)
    repeating_artists = { k:v for k, v in repeating_artists.items() if len(v)>1 }
    for k,v in repeating_artists.items():
        index = artist_uri2id[v[0]]
        for uri in v[1:]:
            artist_uri2id[uri] = index
    
    with open('../mpd/preprocessed/artists-'+dataset+'.pkl','wb') as fp:
        pickle.dump([artist_uri2id,artist_id2name],fp)
        
    #Track ID to Artist ID
    track_id2artist_id = dict()
    for k,v in track_uri2artist_uri.items():
        track_id2artist_id[track_uri2id[k]] = artist_uri2id[v]
    
    with open('../mpd/preprocessed/track2artist-'+dataset+'.pkl','wb') as fp:
        pickle.dump(track_id2artist_id,fp)
    
    return [track_uri2id,track_id2name,artist_uri2id,artist_id2name,track_id2artist_id]

def generate_data(num_files,dataset):
    """
    Create a Pandas DataFrame for the entire dataset.
    
    Params
    ------
    num_files : int
        Number of files to read from MPD.
    dataset : str
        Name of the dataset to be used while storing.
    """
    files = (os.listdir("../mpd/data"))
    ordered_files = sorted(files, key=lambda x: (int(re.sub('\D','',x)),x))
    data = [list() for x in range(num_files*1000)]
    count = 0
    index = 0
    for filename in tqdm(ordered_files[:num_files],ncols=100):
        filename = os.path.join('../mpd/data',filename)
        with open(filename, 'r') as fp:
            playlists = json.load(fp)['playlists']
        for p in playlists:
            for song in p['tracks']:
                if song['track_uri'] in track_uri2id and song['artist_uri'] in artist_uri2id:
                    data[index].append(str(track_uri2id[song['track_uri']]))
                else:
                    count+=1
                    continue
            index+=1
    print('Omitted songs =',count,'\nTotal Playlists =',len(data))
    return data

def train_test_split(ratings, split_count, fraction=None):
    """
    Split recommendation data into train and test sets
    
    Params
    ------
    data : list of lists
        List of users containing lists of items they like.
    split_count : int
        Number of user-item-interactions per user to move
        from training to test set.
    fractions : float
        Fraction of users to split off some of their
        interactions into test set. If None, then all 
        users are considered.
    """
    possible_users = [x for x in range(len(ratings)) if len(ratings[x])>=2*split_count]
    test_list = np.random.choice(possible_users, replace=False,
                                 size = np.int32(np.floor(fraction*len(ratings))))
    test = []
    for index in test_list:
        to_add = ratings[index][-split_count:]
        ratings[index] = ratings[index][:-split_count]
        test.append(copy.deepcopy(to_add))
    train = ratings
    return train,test,test_list



In [9]:
def get_recommendations(model,ratings,pid_list,k,update=False):
    """
    Get recommendations for playlists from the 
    trained recommendation model.
    
    Params
    ------
    model : object
        Trained recommendation model.
    ratings : scipy.sparse matrix
        Interactions between users and items.
    pid_list : list
        List of playlist ID's to recommend
    """
    recommended_tracks = []
    for pid in tqdm(pid_list):
        for rec_track, score in model.recommend(pid,ratings,N=k,recalculate_user=update) :
            recommended_tracks.append([pid,rec_track,score])
    return sorted(recommended_tracks,key=lambda x: (x[0],-x[2]))

def recommend(model,ratings,k,update=True):
    """
    Get recommendations for a particular user.
    
    Params
    ------
    model : model
        Trained recommendation model.
    ratings : scipy.sparse matrix
        Interactions between user and items. [1xN]
    k : int
        K for calculating precision@K.
    """
    recommendations = sorted(model.recommend(0,ratings,N=k,recalculate_user=update),key=lambda x: (-x[1]))
    tracks = []
    for rec in recommendations:
        tracks.append((track_id2name[rec[0]],track_id2artist_id[rec[0]]),rec[1])     
    return tracks

def save_model(model,filename):
    """
    Save the trained recommendation model.
    
    Params
    ------
    model : object
        Trained recommendation model
    """
    try:
        with open('../mpd/models/'+filename+'.pkl','wb') as fp:
            pickle.dump(model,fp)
    except:
        return None
    return 1

def precision_at_k(expected, predicted):
    """
    Compute precision@k metric. Also known as hit-rate.
    
    Params
    ------
    expected : list of list
        Ground truth recommendations for each playlist.
    predicted : list of list
        Predicted recommendations for each playlist.
    """
    precisions = []
    for i in range(len(expected)):
        precision = float(len(set(predicted[i]) & set(expected[i]))) / float(len(predicted[i]))
        precisions.append(precision)
    return np.mean(precisions) 

def compute_dcg(expected,predicted):
    """
    Compute DCG score for each user.
    
    Params
    ------
    expected : list
        Ground truth recommendations for single playlist.
    predicted : list
        Predicted recommendations for single playlist.
    """
    score = [float(el in expected) for el in predicted]
    dcg = np.sum(score / np.log2(1 + np.arange(1, len(score) + 1)))
    return dcg

def dcg_at_k(expected,predicted):
    """
    Compute dcg@k metric. (Discounted Continuous Gain)
    
    Params
    ------
    expected : list of list
        Ground truth recommendations for each playlist.
    predictions : list of list
        Predicted recommendations for each playlist.
    """
    dcg_scores = []
    for i in range(len(expected)):
        dcg = compute_dcg(expected[i],predicted[i])
        dcg_scores.append(dcg)
    return np.mean(dcg_scores)

def ndcg_at_k(expected,predicted):
    """
    Compute ndcg@k metric. (Normalized Discounted Continous Gain)
    
    Params
    ------
    expected : list of list
        Ground truth recommendations for each playlist.
    predicted : list of list
        Predicted recommendations for each playlist.
    """
    ndcg_scores = []
    for i in range(len(expected)):
        labels = expected[i]
        idcg = compute_dcg(labels,labels)
        true_dcg = compute_dcg(labels,predicted[i])
        ndcg_scores.append(true_dcg/idcg)
    return np.mean(ndcg_scores)

def compute_metrics(ratings,predictions,pid_list,k):
    """
    Wrapper function to compute all metrics.
    
    Params
    ------
    ratings : scipy.sparse matrix
        Interactions between users and items.
    predictions : pd.DataFrame
        DataFrame containing all predictions <pid,Recommended,Score>
    pid_list : list
        List of playlist ID's in predictions
    k : int
        K for calculating metric@K.
    """ 
    predicted = []
    expected = []
    for pid in tqdm(pid_list):
        top_k = predictions.loc[predictions['pid']==pid][['Recommended','Score']]\
                    .sort_values('Score',ascending=False)['Recommended'].tolist()
        labels = ratings.getrow(pid).indices
        predicted.append(top_k)
        expected.append(labels)
    
    patk = precision_at_k(expected,predicted)
    dcgatk = dcg_at_k(expected,predicted)
    ndcgatk = ndcg_at_k(expected,predicted)
    return [patk,dcgatk,ndcgatk]

def new_playlist(model,songs):
    track_ids = [track_uri2id[s] for s in songs if s in track_uri2id]
    return track_ids

In [38]:
num_files = 1
dataset = '{0}k'.format(num_files)
importance = 1
k = 1

if(1):
    print('Preprocessing data for future reference')
    track_uri2id,track_id2name,\
    artist_uri2id,artist_id2name,\
    track_id2artist_id = create_preprocess_info(num_files,dataset)
else:
    with open('../mpd/preprocessed/tracks-'+dataset+'.pkl','rb') as fp:
        track_uri2id,track_id2name = pickle.load(fp)

    with open('../mpd/preprocessed/artists-'+dataset+'.pkl','rb') as fp:
        artist_uri2id,artist_id2name = pickle.load(fp)

    with open('../mpd/preprocessed/track2artist-'+dataset+'.pkl','rb') as fp:
        track_id2artist_id = pickle.load(fp)

Preprocessing data for future reference


100%|█████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.70s/it]


In [39]:
if(1):
    print('Generating Training and Testing set for',dataset)
    data = generate_data(num_files,dataset)
    with open('../mpd/neural/data-'+dataset+'.pkl','wb') as fp:
        pickle.dump(data,fp)      
    train,test,pid_list = train_test_split(data,10,fraction=0.2)
    print('Saving Training and Testing Set...')
    with open('../mpd/neural/train-'+dataset+'.pkl','wb') as fp:
        pickle.dump(train,fp)   
    with open('../mpd/neural/test-'+dataset+'.pkl','wb') as fp:
        pickle.dump(test,fp)  
    np.save('../mpd/neural/pid_list-'+dataset+'.npy',pid_list)
else:
    with open('../mpd/neural/data-'+dataset+'.pkl','rb') as fp:
        data = pickle.load(fp) 
    with open('../mpd/neural/train-'+dataset+'.pkl','rb') as fp:
        train = pickle.load(fp)
    with open('../mpd/neural/test-'+dataset+'.pkl','rb') as fp:
        test = pickle.load(fp)
    pid_list = np.load('../mpd/neural/pid_list-'+dataset+'.npy')

Generating Training and Testing set for 1k


100%|█████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.45s/it]


Omitted songs = 0 
Total Playlists = 1000
Saving Training and Testing Set...


In [47]:
len(track_uri2id)

34443

## Deep Neural Network

In [12]:
from gensim.models import Word2Vec
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [13]:
print('Number of Playlists =',len(data))
print('Number of Tracks =',len(track_uri2id))

Number of Playlists = 1000
Number of Tracks = 34443


In [18]:
def getEmbeddingMatrix(data,embedding_dim,precomputed=False):
    if(not precomputed):
        print('Creating Word2Vec Model...')
        model = Word2Vec(data, size=embedding_dim, window=5, min_count=1, workers=6, sg=1)
        model.save('../mpd/preprocessed/word2vec-'+str(embedding_dim)+'.txt')
    else:
        model = Word2Vec.load('../mpd/preprocessed/word2vec-'+str(embedding_dim)+'.txt')

    embedding_matrix = np.zeros((len(model.wv.vocab), embedding_dim))
    for i in range(len(model.wv.vocab)):
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return model,embedding_matrix

def getBatch(data,w2v_model,batch_index,BATCH_SIZE=256):
    start_index = batch_index*BATCH_SIZE
    end_index = min((batch_index+1)*BATCH_SIZE,len(data))
    inputs = data[start_index:end_index]
    user_input,item_input,expected_score = map(lambda x: np.asarray(x).reshape((BATCH_SIZE,1)),zip(*inputs))
    w2v_input = [w2v_model.wv.vocab[str(x)].index for x in np.nditer(item_input)]
    w2v_input = np.asarray(w2v_input).reshape((BATCH_SIZE,1))
    
    return [user_input,
            item_input,
            w2v_input,
            expected_score
           ]
    
def reformatData(data):
    positive_samples = []
    for index,x in enumerate(tqdm(data)):
        for ele in x:
            positive_samples.append([index,ele,1])
    return positive_samples

def addNegativeSampling(dataset,num_negative = 100):
    all_tracks = np.arange(num_tracks)
    negative_samples = []
    for index,user_ratings in enumerate(tqdm(dataset)):
        possible_negs = np.setdiff1d(all_tracks,user_ratings,assume_unique=True)
        negative_tracks = np.random.choice(possible_negs,replace=False,size=num_negative).tolist()
        for ele in negative_tracks:
            negative_samples.append([index,ele,0])
    return negative_samples

In [19]:
embedding_dim = 256
num_users = len(data)
num_tracks = len(track_uri2id)
hidden_layers = [512,256,128]
num_epochs = 10
batch_size = 256

In [20]:
if(1):
    w2v_model,embedding_matrix = getEmbeddingMatrix(data,embedding_dim,precomputed=False)
    positive_samples = reformatData(train)
    negative_samples = addNegativeSampling(data)
    all_samples = positive_samples + negative_samples
    with open('../mpd/neural/all_samples-'+dataset+'.pkl','wb') as fp:
        pickle.dump(all_samples,fp)
else:
    w2v_model,embedding_matrix = getEmbeddingMatrix(data,embedding_dim,precomputed=True)
    with open('../mpd/neural/all_samples-'+dataset+'.pkl','rb') as fp:
        all_samples = pickle.load(fp)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:48<00:00, 20.69it/s]


In [21]:
tf.reset_default_graph()
with tf.device('/gpu:0'):
    user_input = tf.placeholder(dtype=tf.int32, shape=(None,1), name='user_input')
    item_input = tf.placeholder(dtype=tf.int32, shape=(None,1), name='item_input')
    w2v_input = tf.placeholder(dtype=tf.int32, shape=(None,1), name='w2v_input')
    expected_score = tf.placeholder(dtype=tf.float32, shape=(None,1), name='expected_score')
    with tf.name_scope('embedding'):
        #Item embedding for MLP
        W_item_mlp = tf.Variable(tf.constant(0.0, shape=[num_tracks, embedding_dim]),
                        trainable=False, name='W_item_mlp')
        embedding_placeholder = tf.placeholder(tf.float32, [num_tracks, embedding_dim],name='embedding_placeholder')
        embedding_init = W_item_mlp.assign(embedding_placeholder)
        item_mlp_embedding = tf.nn.embedding_lookup(W_item_mlp, w2v_input)
        
        #W_item_mlp = tf.Variable(tf.random_normal([num_tracks, embedding_dim], 0, 0.1),
        #                name='W_item_mlp')
        #item_mlp_embedding = tf.nn.embedding_lookup(W_item_mlp, item_input)
        
        #User Embedding for MLP
        W_user_mlp = tf.Variable(tf.random_normal([num_users, embedding_dim], 0, 0.1),
                        name='W_user_mlp')
        user_mlp_embedding = tf.nn.embedding_lookup(W_user_mlp, user_input)

        #Item Embedding for MF
        W_item_mf = tf.Variable(tf.random_normal([num_tracks, embedding_dim], 0, 0.1),
                        name='W_item_mf')
        item_mf_embedding = tf.nn.embedding_lookup(W_item_mf, item_input)

        #User Embedding for MF
        W_user_mf = tf.Variable(tf.random_normal([num_users, embedding_dim], 0, 0.1),
                        name='W_user_mf')
        user_mf_embedding = tf.nn.embedding_lookup(W_user_mf, user_input)
    
    with tf.name_scope('post_embedding'):
        mf_output = tf.multiply(tf.layers.Flatten()(user_mf_embedding),
                                   tf.layers.Flatten()(item_mf_embedding))

        mlp_latent_vec = tf.concat([tf.layers.Flatten()(user_mlp_embedding),
                                    tf.layers.Flatten()(item_mlp_embedding)],
                                    axis=1)
    layers = {}
    layers_compute = {}
    
    with tf.name_scope('hidden_layers'):
        for i in range(1,len(hidden_layers)+1):   
            if(i==1):
                l = tf.layers.dense(inputs=mlp_latent_vec,
                                    units=hidden_layers[i-1],
                                    activation=tf.nn.relu)
            else:
                l = tf.layers.dense(inputs=layers_compute[i-2],
                                    units=hidden_layers[i-1],
                                    activation=tf.nn.relu)
            layers_compute[i-1] = l
    mlp_output = layers_compute[i-1]
    predict_layer_input = tf.concat([tf.layers.Flatten()(mf_output),
                                   tf.layers.Flatten()(mlp_output)],
                                  axis=1)
    output = tf.layers.dense(inputs=predict_layer_input,
                             units=1,
                             activation=tf.nn.sigmoid)
    
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=expected_score,logits=output))
    optimizer = tf.train.AdamOptimizer().minimize(loss)

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())
    output_list = []
    for e in range(num_epochs):
        print('EPOCH:',e+1,end='\t')
        start = datetime.now()
        np.random.shuffle(all_samples)
        num_batches = np.int32(np.ceil(len(all_samples)//batch_size))
        epoch_loss = 0
        for i in range(num_batches):
            users,items,embedded_items,targets = getBatch(all_samples,w2v_model,i,batch_size)
            _,op,batch_loss = sess.run([optimizer,output,loss],feed_dict={user_input:users,
                                                                  item_input:items,
                                                                  w2v_input:embedded_items,
                                                                  expected_score:targets,
                                                                  embedding_placeholder:embedding_matrix})
            epoch_loss += batch_loss
        epoch_loss /= num_batches
        end = datetime.now()
        epoch_time = end-start
        print('LOSS:',epoch_loss,'\tTime:',epoch_time.total_seconds())

EPOCH: 1	

KeyError: '7335'

In [36]:
print(w2v_model.wv.vocab['7335'])
track_id2name[7335]

KeyError: '7335'

In [35]:
print(len(w2v_model.wv.vocab))
print(w2v_model.wv.index2word[7335])

33702
14355


In [13]:
import sys
import scipy.sparse as sp
import keras
from keras import backend as K
from keras import initializers
from keras.regularizers import l1, l2
from keras.models import Sequential, Model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten, Dropout
from keras.layers import Multiply, Concatenate
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from time import time
from keras.callbacks import Callback
from keras.utils import plot_model

Using TensorFlow backend.


In [17]:
def build_ncf_model(num_user, num_item, latent_v_dim=8, 
                dense_layers=[64, 32, 16, 8], reg_layers=[0, 0, 0, 0], reg_mf=0
                ):

    # Input layer
    input_user = Input(shape=(1,), dtype='int32', name='user_input')
    input_item = Input(shape=(1,), dtype='int32', name='item_input')
    
    # Embedding layer
    mf_user_embedding = Embedding(input_dim=num_user, output_dim=latent_v_dim,
                        name='mf_user_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mf_item_embedding = Embedding(input_dim=num_item, output_dim=latent_v_dim,
                        name='mf_item_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mlp_user_embedding = Embedding(input_dim=num_user, output_dim=dense_layers[0]//2,
                         name='mlp_user_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)
    mlp_item_embedding = Embedding(input_dim=num_item, output_dim=dense_layers[0]//2,
                         name='mlp_item_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)

    # Matrix Factorization latent vector
    mf_user_latent = Flatten()(mf_user_embedding(input_user))
    mf_item_latent = Flatten()(mf_item_embedding(input_item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # Multi layer perceptron latent vector
    mlp_user_latent = Flatten()(mlp_user_embedding(input_user))
    mlp_item_latent = Flatten()(mlp_item_embedding(input_item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])
    
    mlp_vector = mlp_cat_latent
    # Build dense layer for model
    for i in range(1,len(dense_layers)):
        layer = Dense(dense_layers[i],
                      activity_regularizer=l2(reg_layers[i]),
                      activation='relu',
                      name='layer%d' % i)
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])
    result = Dense(1, activation='sigmoid', 
                   kernel_initializer='lecun_uniform',name='result')

    model = Model(input=[input_user,input_item], output=result(predict_layer))

    return model

In [18]:
num_epochs = 61
batch_size = 256
latent_v_dim = 8
dense_layers = [64, 32, 16, 8]
reg_layers = [0, 0, 0, 0]
reg_mf = [0]
num_neg_sample = 4
learning_rate = 0.001
learner = 'adam'
verbose = 1
num_user = len(data)
num_item = len(track_uri2id)

In [19]:
model = build_ncf_model(num_user, num_item, latent_v_dim, dense_layers,
        reg_layers, reg_mf)

