In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import json

In [None]:
def json_reader(begin, data_dir="../data/playlists"):
    """Load one 1000-playlist slice file into a DataFrame.

    Parameters
    ----------
    begin : int
        First playlist id of the slice. Must be one of 0, 1000, ..., 10000.
    data_dir : str, optional
        Directory that holds the ``playlist.slice.<begin>-<end>.json`` files.
        Defaults to the location the notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        Built from the ``"playlists"`` entry of the slice file.

    Raises
    ------
    ValueError
        If ``begin`` is not one of the accepted slice starts.
    """
    if begin not in np.arange(0, 11000, 1000):
        # np.arange excludes its stop value, so the largest valid start is 10000
        raise ValueError(
            "Invalid start pid! Start pids must be {0, 1000, 2000, ..., 10000}"
        )

    end = begin + 999
    path = "{}/playlist.slice.{}-{}.json".format(data_dir, begin, end)

    # Context manager so the handle is closed even when JSON parsing fails
    with open(path, "r") as slice_file:
        jsonData = json.load(slice_file)

    return pd.DataFrame.from_dict(jsonData["playlists"], orient="columns")

In [None]:
# Read every slice file and stack the per-slice frames into one playlist-level frame
jsonList = [json_reader(slice_start) for slice_start in np.arange(0, 11000, 1000)]

trainData = pd.concat(jsonList, ignore_index=True)
# The per-slice frames are no longer needed once concatenated
jsonList.clear()

print(trainData.shape)
trainData.head()

In [None]:
# Explode the playlist-level dataframe into a song-level dataframe:
# one row per track occurrence, holding the track id, the corresponding
# artist name and track name, and the id of the playlist it appears in

songPlaylistArray = [
    [song["track_uri"], song["artist_name"], song["track_name"], playlist_id]
    for playlist_id, playlist_tracks in zip(trainData["pid"], trainData["tracks"])
    for song in playlist_tracks
]
songPlaylist = pd.DataFrame(
    songPlaylistArray, columns=["trackid", "artist_name", "track_name", "pid"]
)
# Release the intermediate list now that the frame owns the data
songPlaylistArray.clear()

print(songPlaylist.shape)
songPlaylist.head(10)

In [None]:
# Map every unique track id to its category code so tracks get a 0-N integer index
songPlaylist["trackindex"] = pd.Categorical(songPlaylist["trackid"]).codes

print(len(pd.unique(songPlaylist["trackid"])))
print(songPlaylist.shape)
songPlaylist.head(10)

In [None]:
# Save data in DOK (Dictionary Of Keys) matrix (optimized sparse matrix object)
# Create a sparse pid x trackindex matrix
# If a pid i has song j, mat[i,j]=1

# Size the matrix from the data rather than hard-coding (11000, 180409), so a
# different number of slices or tracks can neither index out of bounds nor
# leave unused rows/columns behind.
# NOTE(review): assumes pids and track indices are zero-based and the frame is non-empty.
num_playlist_rows = int(songPlaylist["pid"].max()) + 1
num_track_cols = int(songPlaylist["trackindex"].max()) + 1

mat = sp.dok_matrix((num_playlist_rows, num_track_cols), dtype=np.float32)
for pid, trackindex in zip(songPlaylist["pid"], songPlaylist["trackindex"]):
    mat[pid, trackindex] = 1.0

In [None]:
from keras.models import Model
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Embedding, Input, Dense, multiply, Flatten
from keras.optimizers import Adagrad
from keras.regularizers import l2
from time import time

In [None]:
def get_model(num_playlists, num_items, latent_dim, regs=[0, 0]):
    """Assemble the (uncompiled) playlist/track interaction network.

    Each input id is looked up in its own embedding table, the two latent
    vectors are multiplied element-wise, and the product is passed through
    two sigmoid dense layers (1024 and 256 units, each followed by 20%
    dropout) to a single sigmoid output unit.

    Parameters
    ----------
    num_playlists : int
        Number of rows in the playlist embedding table.
    num_items : int
        Number of rows in the music embedding table.
    latent_dim : int
        Width of both embedding tables.
    regs : sequence of two numbers
        L2 factors for the playlist (index 0) and music (index 1) embeddings.

    Returns
    -------
    Model taking ``[playlist_input, music_input]`` and producing the prediction.
    """
    # One integer id per sample on each side of the interaction
    playlist_input = Input(shape=(1,), dtype="int32", name="playlist_input")
    music_input = Input(shape=(1,), dtype="int32", name="music_input")

    playlist_embedding = Embedding(
        input_dim=num_playlists,
        output_dim=latent_dim,
        input_length=1,
        embeddings_regularizer=l2(regs[0]),
        name="playlist_embedding",
    )
    music_embedding = Embedding(
        input_dim=num_items,
        output_dim=latent_dim,
        input_length=1,
        embeddings_regularizer=l2(regs[1]),
        name="music_embedding",
    )

    # Drop the length-1 sequence axis so each lookup is a flat latent vector
    playlist_latent = Flatten()(playlist_embedding(playlist_input))
    music_latent = Flatten()(music_embedding(music_input))

    # Element-wise interaction of the two latent vectors
    hidden = multiply([playlist_latent, music_latent])

    # Hidden stack: sigmoid dense layer followed by dropout, applied twice
    for layer_name, units in (("dense_layer1", 2 ** 10), ("dense_layer2", 2 ** 8)):
        hidden = Dense(
            units,
            activation="sigmoid",
            kernel_initializer="random_normal",
            name=layer_name,
        )(hidden)
        hidden = Dropout(0.2, seed=1212)(hidden)

    prediction = Dense(
        1,
        activation="sigmoid",
        kernel_initializer="random_normal",
        name="prediction_layer",
    )(hidden)

    return Model(inputs=[playlist_input, music_input], outputs=prediction)

In [None]:
def get_train_instances(train, num_negatives, seed=None):
    """Turn the interaction matrix into labelled (playlist, track) pairs.

    Every stored entry of ``train`` yields one positive pair (label 1)
    immediately followed by ``num_negatives`` pairs for the same playlist
    whose track is drawn uniformly at random among tracks the playlist does
    not contain (label 0).

    Parameters
    ----------
    train : scipy.sparse.dok_matrix
        Playlist x track matrix; its keys are the positive pairs.
    num_negatives : int
        Number of negatives sampled per positive.
    seed : int, optional
        When given, sampling uses a private ``RandomState(seed)`` and is
        reproducible. When omitted, the global ``np.random`` generator is
        used, exactly as before.

    Returns
    -------
    tuple of three lists
        ``(playlist_input, music_input, labels)``, aligned element-wise.

    Raises
    ------
    ValueError
        If negatives are requested for a playlist that already contains
        every track (rejection sampling could never finish).
    """
    _, num_musics = train.shape
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Tracks already present in each playlist, for the rejection test below
    positives = {}
    for u, i in train.keys():
        positives.setdefault(u, set()).add(i)

    if num_negatives > 0:
        for u, taken in positives.items():
            if len(taken) >= num_musics:
                raise ValueError(
                    "Cannot sample negatives for playlist {}: it contains "
                    "every track".format(u)
                )

    playlist_input, music_input, labels = [], [], []
    for u, i in train.keys():
        # positive instance
        playlist_input.append(u)
        music_input.append(i)
        labels.append(1)

        # negative instances: redraw until the track is absent from playlist u
        taken = positives[u]
        for _ in range(num_negatives):
            j = rng.randint(num_musics)
            while j in taken:
                j = rng.randint(num_musics)

            playlist_input.append(u)
            music_input.append(j)
            labels.append(0)

    return playlist_input, music_input, labels

In [None]:
# Specify hyperparameters
num_factors = 8  # width of both embedding tables
regs = [0, 0]  # L2 factors for the [playlist, music] embeddings
num_negatives = 2  # negatives sampled per positive pair
epochs = 30
batch_size = 500
random_seed = 1212  # seeds NumPy's global generator for sampling and shuffling

# Loading data
train = mat
num_playlists, num_musics = train.shape

# Build and compile model
model = get_model(num_playlists, num_musics, num_factors, regs)
model.compile(
    optimizer=Adagrad(), loss="binary_crossentropy", metrics=["binary_accuracy"]
)
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model.summary()

# One history dict is appended per fit() call
history = []

# Generate training instances (seeded so the negatives are reproducible)
np.random.seed(random_seed)
playlist_input, music_input, labels = get_train_instances(train, num_negatives)

# validation_split holds out the *trailing* samples, taken before fit()'s own
# shuffling. The instances come out grouped by playlist, so without this
# permutation the hold-out would consist of playlists never seen in training.
shuffled_order = np.random.permutation(len(labels))
playlist_array = np.array(playlist_input)[shuffled_order]
music_array = np.array(music_input)[shuffled_order]
label_array = np.array(labels)[shuffled_order]

# Train model
hist = model.fit(
    [playlist_array, music_array],  # input
    label_array,  # labels
    validation_split=0.2,
    batch_size=batch_size,
    shuffle=True,
    epochs=epochs,
)

history.append(hist.history)

In [None]:
# Each entry of `history` is one fit() call's history dict, whose values are
# per-epoch lists. Expanding every dict into its own frame and stacking them
# gives one row per epoch whether training ran as a single multi-epoch fit or
# as several single-epoch fits.
history_df = pd.concat([pd.DataFrame(run) for run in history], ignore_index=True)

# Rename by metric key instead of by position, so the labels cannot be
# attached to the wrong column when the key order differs; a missing key
# raises a KeyError instead of silently mislabelling.
column_labels = {
    "binary_accuracy": "Training Accuracy",
    "loss": "Training Loss",
    "val_binary_accuracy": "Validation Accuracy",
    "val_loss": "Validation Loss",
}
history_df = history_df[list(column_labels)].rename(columns=column_labels)

# Epochs are numbered from 1; the index length follows the recorded rows
history_df.index = pd.RangeIndex(1, len(history_df) + 1, name="Epoch")

pd.options.display.float_format = '{:,.20f}'.format

display(history_df)