In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import json

In [None]:
def json_reader(begin):
    """Load one 1000-playlist slice file into a DataFrame.

    The slice is read from ``../data/mpd.slice.<begin>-<begin + 999>.json`` and
    the value stored under its ``"playlists"`` key becomes the returned frame.

    Parameters
    ----------
    begin : int
        First playlist id of the slice; must be one of 0, 1000, ..., 10000.

    Returns
    -------
    pandas.DataFrame
        Frame built from the ``"playlists"`` entry of the slice file.

    Raises
    ------
    Exception
        If ``begin`` is not an accepted slice start.
    """
    # NOTE(review): the message below advertises starts up to 999000, but this
    # check only accepts starts up to 10000 -- confirm which one is intended.
    if begin not in np.arange(0, 11000, 1000):
        raise Exception(
            "Invalid start pid! Start pids must be {0, 1000, 2000, ..., 999000}"
        )

    end = begin + 1000
    path = "../data/mpd.slice." + str(begin) + "-" + str(end - 1) + ".json"

    # The context manager closes the handle even when parsing fails, instead of
    # leaving an open file behind for the garbage collector.
    with open(path, "r") as slice_file:
        jsonData = json.load(slice_file)

    return pd.DataFrame.from_dict(jsonData["playlists"], orient="columns")

In [None]:
# Read every slice, then stack the per-slice frames into one playlist-level frame
jsonList = [json_reader(begin) for begin in np.arange(0, 11000, 1000)]

trainData = pd.concat(jsonList)
# The per-slice frames are not needed once they have been concatenated
jsonList.clear()

print(trainData.shape)
trainData.head()

In [None]:
# Turn the playlist-level dataframe into a song-level dataframe: one row per
# track occurrence, holding the track id, the corresponding artist name,
# the track name and the id of the playlist it appears in.

songPlaylistArray = [
    [track["track_uri"], track["artist_name"], track["track_name"], row["pid"]]
    for _, row in trainData.iterrows()
    for track in row["tracks"]
]
songPlaylist = pd.DataFrame(
    data=songPlaylistArray,
    columns=["trackid", "artist_name", "track_name", "pid"],
)

print(songPlaylist.shape)
songPlaylist.head(10)

In [None]:
# Map every track id to its unique category code so tracks get a 0-N integer index
trackCategories = songPlaylist["trackid"].astype("category")
songPlaylist["trackindex"] = trackCategories.cat.codes

# Number of distinct track codes, then the frame's dimensions
print(songPlaylist["trackindex"].unique().size)
print(songPlaylist.shape)
songPlaylist.head(10)

In [None]:
# Save data in DOK (Dictionary Of Keys) matrix (optimized sparse matrix object)
# Create a sparse pid x trackindex matrix
# If a pid i has song j, mat[i,j]=1

# Size the matrix from the data rather than from hard-coded counts, so it
# always has room for every (pid, trackindex) pair written below and does not
# go stale when a different set of slices is loaded.
# NOTE(review): like the original, this uses pid directly as the row index,
# which presumes pids start at 0 -- confirm against the loaded slices.
mat_shape = (
    int(songPlaylist["pid"].max()) + 1,
    int(songPlaylist["trackindex"].max()) + 1,
)

mat = sp.dok_matrix(mat_shape, dtype=np.float32)
for pid, trackindex in zip(songPlaylist["pid"], songPlaylist["trackindex"]):
    mat[pid, trackindex] = 1.0

In [None]:
import keras
from keras import backend as K
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Reshape, multiply, Flatten
from keras.optimizers import Adam
from keras.regularizers import l2
from time import time

In [None]:
def get_model(num_playlists, num_items, latent_dim, regs=[0, 0]):
    """Build the playlist/item embedding model (not compiled).

    Each of the two integer inputs is looked up in its own embedding table of
    width ``latent_dim``. The flattened embeddings are multiplied element-wise
    and fed to a single sigmoid unit.

    Parameters
    ----------
    num_playlists : int
        ``input_dim`` of the playlist embedding.
    num_items : int
        ``input_dim`` of the item embedding.
    latent_dim : int
        ``output_dim`` of both embeddings.
    regs : sequence
        ``regs[0]`` / ``regs[1]`` are the L2 factors applied to the playlist /
        item embedding weights.

    Returns
    -------
    Model
        Model taking ``[playlist_input, item_input]`` and producing the
        sigmoid prediction.
    """
    # One scalar int32 input per side
    pl_in = Input(name="playlist_input", shape=(1,), dtype="int32")
    it_in = Input(name="item_input", shape=(1,), dtype="int32")

    # Separate embedding table for playlists and for items
    pl_table = Embedding(
        input_dim=num_playlists,
        output_dim=latent_dim,
        input_length=1,
        embeddings_regularizer=l2(regs[0]),
        name="playlist_embedding",
    )
    it_table = Embedding(
        input_dim=num_items,
        output_dim=latent_dim,
        input_length=1,
        embeddings_regularizer=l2(regs[1]),
        name="item_embedding",
    )

    # Flatten each looked-up embedding
    pl_vec = Flatten()(pl_table(pl_in))
    it_vec = Flatten()(it_table(it_in))

    # Element-wise interaction between the two latent vectors
    interaction = multiply([pl_vec, it_vec])

    # Single sigmoid unit on top of the interaction vector
    output_layer = Dense(
        units=1,
        name="prediction_layer",
        kernel_initializer="random_normal",
        activation="sigmoid",
    )
    score = output_layer(interaction)

    return Model(inputs=[pl_in, it_in], outputs=score)

In [None]:
def get_train_instances(train, num_negatives):
    """Expand the stored interactions into labelled training triples.

    Every stored ``(playlist, item)`` key of ``train`` yields one positive
    instance (label 1) followed by ``num_negatives`` negative instances
    (label 0) for the same playlist, whose items are drawn with
    ``np.random.randint`` until one is found that is not stored in ``train``.

    Parameters
    ----------
    train : DOK-style sparse matrix
        Must expose ``shape``, ``keys()`` yielding ``(row, col)`` pairs and
        ``(row, col) in train`` membership.
    num_negatives : int
        Number of negatives sampled per positive.

    Returns
    -------
    tuple of (list, list, list)
        Parallel lists ``playlist_input``, ``item_input`` and ``labels``.

    Raises
    ------
    ValueError
        If negatives are requested for a playlist that already contains every
        item, since no negative exists and sampling would never terminate.
    """
    # Take the item count from the matrix itself rather than from a notebook
    # global that only exists after a later cell has run.
    num_items = train.shape[1]

    positives = list(train.keys())

    # Positives per playlist, used to detect rows with no free item left
    positives_per_playlist = {}
    for u, _ in positives:
        positives_per_playlist[u] = positives_per_playlist.get(u, 0) + 1

    playlist_input, item_input, labels = [], [], []
    for u, i in positives:
        # positive instance
        playlist_input.append(u)
        item_input.append(i)
        labels.append(1)

        if num_negatives > 0 and positives_per_playlist[u] >= num_items:
            raise ValueError(
                "Cannot sample a negative item: playlist %s contains every item" % u
            )

        # negative instances: resample until the pair is not a stored positive
        for _ in range(num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in train:
                j = np.random.randint(num_items)
            playlist_input.append(u)
            item_input.append(j)
            labels.append(0)

    return playlist_input, item_input, labels

In [None]:
# Specify hyperparameters
num_factors = 8
regs = [0, 0]
num_negatives = 4
learning_rate = 0.01
epochs = 15
batch_size = 500

# Loading data
train = mat
num_playlists, num_items = train.shape

# Build and compile model
# NOTE(review): `lr` is the older spelling of Adam's learning-rate argument;
# newer Keras releases call it `learning_rate` -- confirm against the pinned version.
model = get_model(num_playlists, num_items, num_factors, regs)
model.compile(
    optimizer=Adam(lr=learning_rate), loss="binary_crossentropy", metrics=["acc"]
)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# Generate training instances (each positive followed by its sampled negatives)
playlist_input, item_input, labels = get_train_instances(train, num_negatives)

# Train model
# NOTE(review): Keras documents validation_split as taking the last fraction of
# the arrays before shuffling; the instances here are in matrix key order, so
# the held-out 20% is not a random sample -- confirm this is intended.
model.fit(
    [np.array(playlist_input), np.array(item_input)],  # input
    np.array(labels),  # labels
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
)