In [1]:
import glob
import json
import random
from random import sample

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
# Every JSON file under the dataset's data directory. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort to get a stable listing.
data_files = sorted(glob.glob("../data/spotify_million_playlist_dataset/data/*.json"))

In [3]:
N_FILES = 20  # upper bound on how many dataset files are loaded
SEED = 42     # fixed seed so Restart & Run All picks the same files every time

if not data_files:
    # Fail early with a clear message instead of an opaque sampling error.
    raise FileNotFoundError("No dataset JSON files found; check the data directory path.")

# Sort before sampling: the listing order is filesystem-dependent and would
# otherwise defeat the fixed seed. Clamp the sample size so having fewer than
# N_FILES files available does not raise.
selected_files = random.Random(SEED).sample(
    sorted(data_files), min(N_FILES, len(data_files))
)

# Accumulates the entries of the "playlists" list from every selected file.
data = []

for fn in tqdm(selected_files):
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(fn, "r", encoding="utf-8") as file:
        data.extend(json.load(file)["playlists"])

100%|███████████████████████████████████████████| 20/20 [00:01<00:00, 10.66it/s]


In [4]:
# Inclusive bounds on playlist length; playlists outside the range are dropped.
# NOTE(review): the lower bound is spelled "10 + 1" in the original; the reason
# for the split is not visible in this notebook - confirm before simplifying.
MIN_TRACKS, MAX_TRACKS = 10 + 1, 100

# Despite its name, this is a flat list holding one record per *track*,
# tagged with the id ("pid") of the playlist it came from.
playlists = []

for pl in tqdm(data):
    tracks = pl["tracks"]

    if MIN_TRACKS <= len(tracks) <= MAX_TRACKS:
        playlists.extend(
            {
                "pos": track["pos"],
                "artist_name": track["artist_name"],
                "track_name": track["track_name"],
                "track_uri": track["track_uri"],
                "pid": pl["pid"],
            }
            for track in tracks
        )

100%|██████████████████████████████████| 20000/20000 [00:00<00:00, 98612.59it/s]


In [5]:
# One row per track record, with the raw keys renamed to descriptive columns.
df = pd.DataFrame(playlists).rename(
    columns={"pid": "playlist_id", "pos": "playlist_position"}
)

# Fit one encoder per identifier column; the fitted encoders are kept so the
# integer codes can be mapped back to the original values later.
track_encoder = LabelEncoder()
playlist_encoder = LabelEncoder()
df = df.assign(
    track_id=track_encoder.fit_transform(df["track_uri"]),
    playlist_id=playlist_encoder.fit_transform(df["playlist_id"]),
)

# Fix the column order, then persist without the positional index.
df = df[
    [
        "playlist_id",
        "track_id",
        "track_uri",
        "playlist_position",
        "artist_name",
        "track_name",
    ]
]
df.to_csv("../data/spotify_playlists.csv", index=False)

In [6]:
# Count distinct encoded ids in each identifier column and report both.
n_tracks = len(df["track_id"].unique())
n_playlists = len(df["playlist_id"].unique())
print(
    "Unique tracks: {}".format(n_tracks),
    "Playlists: {}".format(n_playlists),
    sep="\n",
)

Unique tracks: 164576
Playlists: 14750


In [7]:
# Show the assembled frame as the cell's output (the rendered view below is
# truncated to the leading and trailing rows).
df

Unnamed: 0,playlist_id,track_id,track_uri,playlist_position,artist_name,track_name
0,9615,146723,spotify:track:6w2VaBGWcColW05nZROWvS,0,Tory Lanez,To D.R.E.A.M.
1,9615,158343,spotify:track:7hMqz88kcK2Tx6wBka9INF,1,Tory Lanez,All The Girls
2,9615,123985,spotify:track:5rqfg8Hz5D1aCcn2XUxzNA,2,Tory Lanez,Traphouse
3,9615,117440,spotify:track:5Z2S8VemQCfM02SLq8o5cR,3,Tory Lanez,One Day
4,9615,66611,spotify:track:39Y7nKwnEoMSfzbSVcAL0J,4,Tory Lanez,B.L.O.W.
...,...,...,...,...,...,...
656048,2958,136011,spotify:track:6Qyc6fS4DsZjB2mRW9DsQs,53,The Goo Goo Dolls,Iris
656049,2958,139630,spotify:track:6bRgyHxbPkvrHiQFSWxo1B,54,Ane Brun,Halo
656050,2958,107358,spotify:track:5522Ut7YYZSv2TjLMLIpEU,55,Kelsea Ballerini,Peter Pan
656051,2958,98481,spotify:track:4fIWvT19w9PR0VVBuPYpWA,56,Michael Bublé,Haven't Met You Yet
