In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import json

In [2]:
def json_reader(begin):
    if begin not in np.arange(0, 11000, 1000):
        raise Exception(
            "Invalid start pid! Start pids must be {0, 1000, 2000, ..., 11000}"
        )

    end = begin + 999
    path = "../data/playlists/playlist.slice." + str(begin) + "-" + str(end) + ".json"

    jsonData = json.load(open(path, "r"))
    actualSlice = pd.DataFrame.from_dict(jsonData["playlists"], orient="columns")
    return actualSlice

In [3]:
jsonList = []
for begin in np.arange(0, 11000, 1000):
    actual = json_reader(begin)
    jsonList.append(actual)

trainData = pd.concat(jsonList, ignore_index=True)
jsonList.clear()

print(trainData.shape)
trainData.head()

(11000, 12)


Unnamed: 0,collaborative,description,duration_ms,modified_at,name,num_albums,num_artists,num_edits,num_followers,num_tracks,pid,tracks
0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't..."
1,False,,11656470,1506556800,Awesome Playlist,23,21,5,1,39,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_..."
2,False,,14039958,1505692800,korean,51,31,18,1,64,2,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri..."
3,False,,28926058,1501027200,mat,107,86,4,1,126,3,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën..."
4,False,,4335282,1401667200,90s,16,16,7,2,17,4,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk..."


In [4]:
# Turn playlist level dataframe into song level dataframe
# is a df of all track ids, cooresponding artist names, track names and playlist ids

songPlaylistArray = []
for index, row in trainData.iterrows():
    for track in row["tracks"]:
        songPlaylistArray.append(
            [track["track_uri"], track["artist_name"], track["track_name"], row["pid"]]
        )
songPlaylist = pd.DataFrame(
    songPlaylistArray, columns=["trackid", "artist_name", "track_name", "pid"]
)
songPlaylistArray.clear()

print(songPlaylist.shape)
songPlaylist.head(10)

(731360, 4)


Unnamed: 0,trackid,artist_name,track_name,pid
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0
5,spotify:track:0XUfyU2QviPAs6bxSpXYG4,Usher,Yeah!,0
6,spotify:track:68vgtRHr7iZHpzGpon6Jlo,Usher,My Boo,0
7,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,The Pussycat Dolls,Buttons,0
8,spotify:track:7H6ev70Weq6DdpZyyTmUXk,Destiny's Child,Say My Name,0
9,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,OutKast,Hey Ya! - Radio Mix / Club Mix,0


In [5]:
# Turn songs into their unqiue cat codes so we have a 0-N index for tracks
songPlaylist["trackindex"] = songPlaylist["trackid"].astype("category").cat.codes

print(len(songPlaylist["trackindex"].unique()))
print(songPlaylist.shape)
songPlaylist.head(10)

180409
(731360, 5)


Unnamed: 0,trackid,artist_name,track_name,pid,trackindex
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,11647
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,145808
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,12511
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,27379
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,41255
5,spotify:track:0XUfyU2QviPAs6bxSpXYG4,Usher,Yeah!,0,12745
6,spotify:track:68vgtRHr7iZHpzGpon6Jlo,Usher,My Boo,0,142288
7,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,The Pussycat Dolls,Buttons,0,73997
8,spotify:track:7H6ev70Weq6DdpZyyTmUXk,Destiny's Child,Say My Name,0,168512
9,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,OutKast,Hey Ya! - Radio Mix / Club Mix,0,56109


In [6]:
# Save data in DOK (Dictionary Of Keys) matrix (optimized sparse matrix object)
# Create a sparse pid x trackindex matrix
# If a pid i has song j, mat[i,j]=1

mat = sp.dok_matrix((11000, 180409), dtype=np.float32)
for pid, trackindex in zip(songPlaylist["pid"], songPlaylist["trackindex"]):
    mat[pid, trackindex] = 1.0

In [7]:
import keras
from keras import backend as K
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Reshape, multiply, Flatten
from keras.optimizers import Adam
from keras.regularizers import l2
from time import time

Using TensorFlow backend.


In [8]:
def get_model(num_playlists, num_items, latent_dim, regs=[0, 0]):
    # Input variables
    playlist_input = Input(shape=(1,), dtype="int32", name="playlist_input")

    music_input = Input(shape=(1,), dtype="int32", name="music_input")

    playlist_embedding = Embedding(
        name="playlist_embedding",
        input_dim=num_playlists,
        output_dim=latent_dim,
        embeddings_regularizer=l2(regs[0]),
        input_length=1,
    )

    music_embedding = Embedding(
        name="music_embedding",
        input_dim=num_items,
        output_dim=latent_dim,
        embeddings_regularizer=l2(regs[1]),
        input_length=1,
    )

    # Flattens the embedding vector
    playlist_latent = Flatten()(playlist_embedding(playlist_input))
    music_latent = Flatten()(music_embedding(music_input))

    # Element-wise product of playlist and music embeddings
    predict_vector = multiply([playlist_latent, music_latent])

    # Final prediction layer
    prediction = Dense(
        1,
        activation="sigmoid",
        kernel_initializer="random_normal",
        name="prediction_layer",
    )(predict_vector)

    model = Model(inputs=[playlist_input, music_input], outputs=prediction)

    return model

In [9]:
def get_train_instances(train, num_negatives):
    playlist_input, music_input, labels = [], [], []
    num_playlists, num_musics = train.shape

    for (u, i) in train.keys():
        # positive instances
        playlist_input.append(u)
        music_input.append(i)
        labels.append(1)

        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_musics)

            while (u, j) in train:
                j = np.random.randint(num_musics)

            playlist_input.append(u)
            music_input.append(j)
            labels.append(0)

    return playlist_input, music_input, labels

In [10]:
# Specify hyperparameters
num_factors = 8
regs = [0, 0]
num_negatives = 4
learning_rate = 0.001
epochs = 30
batch_size = 1000

# Loading data
train = mat
num_playlists, num_musics = train.shape

# Build and compile model
model = get_model(num_playlists, num_musics, num_factors, regs)
model.compile(
    optimizer=Adam(lr=learning_rate), loss="binary_crossentropy", metrics=["acc"]
)
print(model.summary())

history = []
# Train model
for epoch in range(epochs):
    print("Training Epoch: %d/%d" % (epoch + 1, epochs))

    # Generate training instances
    playlist_input, music_input, labels = get_train_instances(train, num_negatives)

    # Train model
    hist = model.fit(
        [np.array(playlist_input), np.array(music_input)],  # input
        np.array(labels),  # labels
        validation_split=0.2,
        batch_size=batch_size,
        shuffle=True,
    )

    history.append(hist.history)

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
playlist_input (InputLayer)     (None, 1)            0                                            
__________________________________________________________________________________________________
music_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
playlist_embedding (Embedding)  (None, 1, 8)         88000       playlist_input[0][0]             
__________________________________________________________________________________________________
music_embedding (Embedding)     (None, 1, 8)         1443272     music_input[0][0]                
_____________________________________

Training Epoch: 27/30
Train on 2888180 samples, validate on 722045 samples
Epoch 1/1
Training Epoch: 28/30
Train on 2888180 samples, validate on 722045 samples
Epoch 1/1
Training Epoch: 29/30
Train on 2888180 samples, validate on 722045 samples
Epoch 1/1
Training Epoch: 30/30
Train on 2888180 samples, validate on 722045 samples
Epoch 1/1


In [11]:
history_df = pd.DataFrame(history)
history_df.set_index(pd.Series(range(1, epochs+1)), inplace=True)

history_df.index.name = "Epoch"
history_df.columns = [
    "Training Accuracy",
    "Training Loss",
    "Validation Accuracy",
    "Validation Loss",
]

pd.options.display.float_format = '{:,.20f}'.format

for column in history_df.columns:

    history_df.loc[:, column] = history_df[column].map(
        lambda x: x[0]
    )

display(history_df)

Unnamed: 0_level_0,Training Accuracy,Training Loss,Validation Accuracy,Validation Loss
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.8083017689264818,0.5131816268782403,0.800000011920929,0.500662212726493
2,0.8739427597944283,0.3277411378369006,0.800011091585233,0.5016190172726004
3,0.8926278837407731,0.2698610064718325,0.8000180163547856,0.5060231210407687
4,0.9041874122596184,0.2334211676627068,0.8000152463974348,0.5177799240268032
5,0.9145053290074464,0.2061197759686663,0.8000249411243383,0.5423326206633727
6,0.9245195947845716,0.1838470605056596,0.8000152463974348,0.5751147275803637
7,0.9330270282507764,0.165798380756635,0.8000249411243383,0.6182118538198512
8,0.9398396224253408,0.1510089002169145,0.8000249411243383,0.6704966508746762
9,0.945220519113016,0.139679802985865,0.8000249411243383,0.7285721759287999
10,0.9493369514295688,0.1302052460627015,0.8000221711669873,0.7923221570547538
