### All imports needed for notebook

In [None]:
import os
import json
import pickle as pk
import pandas as pd
import os
from typing import Dict, Text
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

# -- local files
from modules import spotify_methods as sm
from modules import content_recommender as cr
from modules import recommender_methods as rm

# 1) Dataset Preparation
We need to convert our 1,000,000 playlists into a format suitable for model training.

In [None]:
# Folder that is listed for the "mpd.slice.*.json" dataset files loaded further down.
# NOTE: absolute, machine-specific Windows path; change it when running on another machine.
dataset_path = "C:\\Users\\Shivesh\\Documents\\Repos\\Personal\\Projects\\music-recommendor\\millionplaylistsubset"

### Collect dataset songs

In [None]:
# Folder holding the "mpd.slice.*.json" dataset files.
# NOTE: absolute, machine-specific Windows path; change it when running on another machine.
dataset_path = "C:\\Users\\Shivesh\\Documents\\Repos\\Personal\\Projects\\music-recommendor\\millionplaylistsubset"

# pid -> {"name": <lower-cased playlist name>, "owner": <owner>}
playlist_id_ref = {}
# owner -> {pid: <lower-cased playlist name>}; every dataset playlist is filed under 'anonymous'.
playlist_users_ref = {}
playlist_users_ref['anonymous'] = {}

files_count = 0
playlist_count = 0

filenames = os.listdir(dataset_path)

# pid -> list of "<track name> --- <artist name>" strings (both lower-cased).
collect_songs_dict = {}
max_id = 0
for filename in sorted(filenames):

    # Ignore anything in the folder that is not a dataset slice.
    if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
        continue

    fullpath = os.path.join(dataset_path, filename)
    # JSON is read as UTF-8 explicitly: without it open() falls back to the
    # locale encoding, which is not UTF-8 on a typical Windows setup.
    # The context manager closes the file even when parsing fails.
    with open(fullpath, encoding="utf-8") as f:
        mpd_slice = json.load(f)

    for playlist in mpd_slice["playlists"]:

        pid = playlist.get("pid")

        # One "<track> --- <artist>" entry per track, in playlist order.
        collect_songs_dict[pid] = [
            track.get("track_name").lower() + " --- " + track.get("artist_name").lower()
            for track in playlist['tracks']
        ]

        playlist_name = playlist.get("name").lower()
        playlist_id_ref[pid] = {"name": playlist_name, "owner": "anonymous"}
        playlist_users_ref['anonymous'].update({pid: playlist_name})

        playlist_count += 1
        max_id += 1

    files_count += 1

print(f'''Files read: {files_count}''')
print(f'''Next available playlist ID: {max_id}''')

In [None]:
# Flatten {pid: [song, ...]} into a list of [pid-as-string, song] rows.
playlist_songs = [
    [str(playlist_key), song]
    for playlist_key, songs in collect_songs_dict.items()
    for song in songs
]

### Add additional Spotify playlists to the dataset

In [None]:
# Spotify user ids whose playlists are added on top of the dataset.
spotify_users = [
    'tmmb1pzyny780errk5d7vuz6r',
    '1yoo33uf0br9bvkmn1wlahjgr',
    '22q7rezq5nqdwct7ujf7knkua',
    'mgscx8hs8egcjamt31rfkm8ww',
]

Collect playlist ids

In [None]:
# Spotify user id -> whatever sm.collect_user_playlists returns for that user.
extra_playlists = {}
for su in spotify_users:

    # Give the user an (initially empty) entry in the owner -> playlists reference.
    playlist_users_ref[su] = {}
    extra_playlists[su] = sm.collect_user_playlists(su)

# extra_playlists is keyed by user, so len(extra_playlists) is the number of
# users; the playlist total is the sum of each user's collection size.
extra_playlist_total = sum(len(user_playlists) for user_playlists in extra_playlists.values())
print(f'''No of playlists: {extra_playlist_total}''')
print("Playlist Users and IDs:")
print((extra_playlists))

Collect all tracks for the above playlists

In [None]:
# Rows returned by sm.collect_tracks for the extra Spotify playlists; they are
# appended to playlist_songs at the end of this cell.
extra_playlist_songs = []
# Number of extra playlists handed to sm.collect_tracks, counted separately
# from the song rows so each figure is reported under the right label.
extra_playlist_count = 0

for ep_user, ep_playlists in extra_playlists.items():

    # playlist_count is passed as the third argument — presumably the first
    # free playlist id; confirm against spotify_methods.collect_tracks.
    collect_tracks, new_id_ref, new_users_ref = sm.collect_tracks(ep_playlists, ep_user, playlist_count)

    playlist_id_ref.update(new_id_ref)
    playlist_users_ref.update(new_users_ref)

    extra_playlist_songs += collect_tracks
    extra_playlist_count += len(ep_playlists)
    playlist_count += len(ep_playlists)

print(f'''Total playlists collected: {playlist_count}''')
print(f'''Extra playlists collected from spotify: {extra_playlist_count}''')
print(f'''Extra playlist songs collected from spotify: {len(extra_playlist_songs)}''')
playlist_songs += extra_playlist_songs

In [None]:
# Create the output folder up front so the writes below cannot fail on a
# fresh checkout where ./model does not exist yet.
model_dir = os.path.join(os.getcwd(), "model")
os.makedirs(model_dir, exist_ok=True)

with open('./model/extra_playlist_songs.json', 'w') as fp:
    json.dump(extra_playlist_songs, fp, indent=2)

with open('./model/playlists_id_ref.json', 'w') as fp:
    json.dump(playlist_id_ref, fp, indent=2)

with open('./model/playlists_users_ref.json', 'w') as fp:
    json.dump(playlist_users_ref, fp, indent=2)

# Built with os.path.join instead of "\\" concatenation so the pickle lands in
# the same folder as the JSON files on every platform.
filename = os.path.join(model_dir, "training_data.pkl")
with open(filename, 'wb') as fp:
    pk.dump(playlist_songs, fp)

# with open('./model/training_data.json', 'w') as fp:
#     json.dump(playlist_songs, fp, indent=2)

print("Saved references for playlist IDs names and owners in ./model/")
print("Saved training data within ./model")

# 2) Training Preparation

In [None]:
# Show the TensorFlow version and the physical devices it reports,
# to confirm whether the GPU has been picked up.
tf_version = tf.__version__
print(tf_version)
visible_devices = tf.config.list_physical_devices()
print(visible_devices)

In [None]:
# Turn the in-memory rows into a tf.data.Dataset (one element per row).
# NOTE: this rebinds playlist_songs, so the plain Python list is no longer
# reachable under this name afterwards.
playlist_songs = tf.data.Dataset.from_tensor_slices(playlist_songs)

Add columns names (features)

In [None]:

def _name_columns(row):
    """Label the two positional fields of a row as "pid" and "song_name"."""
    return {"pid": row[0], "song_name": row[1]}


playlist_songs = playlist_songs.map(_name_columns)

Dataset Overview

In [None]:
# Show the dataset's Python type, then record (and display) its row count.
dataset_type = type(playlist_songs)
print(dataset_type)
pl_songs_count = len(playlist_songs)
pl_songs_count

In [None]:
print(playlist_songs)

# Print only the first element as a sample.
for r in playlist_songs.take(1):
    print(r)

We need the songs stored separately for vocabulary creation

In [None]:
def _song_name_of(row):
    """Return only the "song_name" field of a row."""
    return row["song_name"]


all_songs = playlist_songs.map(_song_name_of)

# Print only the first song as a sample.
for r in all_songs.take(1):
    print(r)

Shuffle the data for training (the test split is currently commented out)

In [None]:
# Seed both the global generator and the shuffle itself, and keep the same
# order on every pass over the data.
tf.random.set_seed(42)
shuffled = playlist_songs.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# All pl_songs_count rows go to training; the held-out split stays disabled.
train = shuffled.take(pl_songs_count)
# test = shuffled.skip(pl_songs_count-10000).take(10000)

In [None]:
# Row count of the training set, followed by its first element as a sample.
train_size = len(train)
print(train_size)
for t in train.take(1):
    print(t)

In [None]:
def _unique_across_batches(batched_dataset):
    """Concatenate every batch of a dataset and return its sorted unique values."""
    return np.unique(np.concatenate(list(batched_dataset)))


song_names = all_songs.batch(2048)
u_ids = playlist_songs.map(lambda row: row["pid"])
playlist_ids = u_ids.batch(2048)

# The upcoming vocabularies need each distinct song and playlist id once.
unique_songs = _unique_across_batches(song_names)
unique_pids = _unique_across_batches(playlist_ids)

print(len(unique_pids))
print(unique_pids[:5])
print(len(unique_songs))
unique_songs[10:20]

# 3) Model Creation

In [None]:
embedding_dimension = 32

# String lookup over the known playlist ids (no mask token).
pid_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=unique_pids, mask_token=None)
# One row more than the vocabulary size, to account for unknown tokens.
pid_embedding = tf.keras.layers.Embedding(len(unique_pids) + 1, embedding_dimension)

# Playlist tower: id string -> lookup -> embedding of size embedding_dimension.
pid_model = tf.keras.Sequential([pid_lookup, pid_embedding])

In [None]:
# String lookup over the known song names (no mask token).
song_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=unique_songs, mask_token=None)
# One row more than the vocabulary size, to account for unknown tokens.
song_embedding = tf.keras.layers.Embedding(len(unique_songs) + 1, embedding_dimension)

# Song tower: song name -> lookup -> embedding of size embedding_dimension.
song_model = tf.keras.Sequential([song_lookup, song_embedding])

In [None]:
# Dataset of the distinct song names, embedded in batches of 4096 to serve
# as the candidate set of the top-K metric.
usongs = tf.data.Dataset.from_tensor_slices(unique_songs)
song_candidates = usongs.batch(4096).map(song_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=song_candidates)

In [None]:
# Retrieval task built with no arguments; the FactorizedTopK metrics are not
# attached (pass metrics=metrics to enable them).
task = tfrs.tasks.Retrieval()

In [None]:
class SongModel(tfrs.Model):
  """Retrieval model that pairs a playlist-id sub-model with a song sub-model.

  The loss is whatever the supplied task returns for the two sets of
  embeddings produced from a batch of features.
  """

  def __init__(self, pid_model, song_model, task):
    """Store the two embedding sub-models and the task that scores them.

    Args:
      pid_model: model called on the "pid" feature.
      song_model: model called on the "song_name" feature.
      task: layer called with (pid embeddings, song embeddings).
    """
    super().__init__()
    self.song_model: tf.keras.Model = song_model
    self.pid_model: tf.keras.Model = pid_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task's result for one batch of features.

    Args:
      features: mapping that must contain the "pid" and "song_name" entries.
      training: accepted for interface compatibility; not used here.
    """
    query_vectors = self.pid_model(features["pid"])
    candidate_vectors = self.song_model(features["song_name"])

    # Loss (and any metrics) are computed by the task itself.
    return self.task(query_vectors, candidate_vectors)

Combine the above 2 models

In [None]:
# Assemble the retrieval model and compile it with Adagrad.
model = SongModel(pid_model, song_model, task)
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

# Training input: shuffle, group into batches of 2048, then cache.
shuffled_train = train.shuffle(100_000)
batched_train = shuffled_train.batch(2048)
cached_train = batched_train.cache()
# cached_test = test.batch(1024).cache()

# 4) Training

In [None]:
# Train on the cached training batches for 10 epochs; verbose=2 is passed
# straight to Keras to control how progress is logged.
model.fit(cached_train, epochs=10, verbose=2)

# 5) Recommendation

Ensure we don't recommend the same song multiple times by passing a unique list of songs to recommend from

In [None]:
# Dataset over unique_songs, which comes from np.unique, so each song appears
# once among the candidates.
usongs = tf.data.Dataset.from_tensor_slices(unique_songs)

In [None]:
# Brute-force top-K layer that uses the trained playlist sub-model for
# queries and returns up to 1000 results.
index = tfrs.layers.factorized_top_k.BruteForce(model.pid_model, k=1000)

# Index (song name, song embedding) pairs, 32 songs per batch.
song_batches = usongs.batch(32)
song_vectors = usongs.batch(32).map(model.song_model)
index.index_from_dataset(tf.data.Dataset.zip((song_batches, song_vectors)))

In [None]:
# Query the index with playlist id "50000" and print the returned titles.
query_pid = tf.constant(["50000"])
scores, titles = index(query_pid)
print(f"Recommendations for user: {titles[0, :1000]}")

# 6) Save Model

In [None]:
# Export the retrieval index as a SavedModel into <cwd>/model.
cwd = os.getcwd()
path = os.path.join(cwd, "model")
tf.saved_model.save(index, path)

# Save the weights to <cwd>/model_weights/model_weights. The folder is joined
# with os.path.join rather than a hard-coded "\\" so the location is the same
# on Windows and resolves to a real sub-folder on other platforms.
cwd = os.path.join(os.getcwd(), "model_weights")
path = os.path.join(cwd, "model_weights")
model.save_weights(path)

In [None]:
def _write_vocab(path, values):
    """Write each byte string in values to path as one UTF-8 text line."""
    # The context manager closes the file even if decoding or writing fails.
    with open(path, "w", encoding="utf-8") as textfile:
        textfile.writelines(value.decode("utf-8") + "\n" for value in values)


# Both vocabularies go into <cwd>/model_weights; the path is built with
# os.path.join so it does not depend on Windows-style separators.
weights_dir = os.path.join(os.getcwd(), "model_weights")
_write_vocab(os.path.join(weights_dir, "unique_songs.txt"), unique_songs)
_write_vocab(os.path.join(weights_dir, "unique_pids.txt"), unique_pids)