In [1]:
import glob
import json
import random
from random import sample

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
# Every JSON file in the dataset's data directory is a candidate input.
DATA_GLOB = "../data/spotify_million_playlist_dataset/data/*.json"
data_files = list(glob.iglob(DATA_GLOB))

In [3]:
N_FILES = 20
SEED = 42  # fixed seed so the same files are drawn on every run

# Draw N_FILES input files at random. The paths are sorted first because glob
# returns them in arbitrary order, and a seeded draw is only repeatable over a
# stable ordering. Raises ValueError if fewer than N_FILES files were found.
rng = random.Random(SEED)
sampled_files = rng.sample(sorted(data_files), N_FILES)

# Flat list of playlist objects collected from the "playlists" key of each file.
data = []

for fn in tqdm(sampled_files):
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding. NOTE(review): assumes the dataset files are UTF-8 — confirm.
    with open(fn, "r", encoding="utf-8") as file:
        data.extend(json.load(file)["playlists"])

100%|███████████████████████████████████████████| 20/20 [00:01<00:00, 11.28it/s]


In [4]:
# Keep only playlists whose track count lies in [MIN_TRACKS, MAX_TRACKS], inclusive.
# NOTE(review): the lower bound is spelled 10 + 1 — presumably 10 tracks plus
# one extra (e.g. a held-out track); confirm the intent.
MIN_TRACKS, MAX_TRACKS = 10 + 1, 100

# Despite the name, this holds one flat record per (playlist, track) pair.
playlists = []

for pl in tqdm(data):
    n_tracks = len(pl["tracks"])

    if MIN_TRACKS <= n_tracks <= MAX_TRACKS:
        # Each key appears once; the original literal listed "pos" twice, so the
        # second entry silently overwrote the first with the same value.
        playlists.extend(
            {
                "pos": track["pos"],
                "artist_name": track["artist_name"],
                "track_name": track["track_name"],
                "track_uri": track["track_uri"],
                "pid": pl["pid"],
            }
            for track in pl["tracks"]
        )

100%|█████████████████████████████████| 20000/20000 [00:00<00:00, 114936.53it/s]


In [5]:
# Separate encoders for track URIs and playlist ids; both stay available
# afterwards for mapping encoded labels back to the raw values.
track_encoder = LabelEncoder()
playlist_encoder = LabelEncoder()

# Tabulate the track records, encode both id columns, and fix the column order.
df = (
    pd.DataFrame(playlists)
    .rename(columns={"pid": "playlist_id", "pos": "playlist_position"})
    .assign(
        track_id=lambda frame: track_encoder.fit_transform(frame["track_uri"]),
        playlist_id=lambda frame: playlist_encoder.fit_transform(frame["playlist_id"]),
    )
    .loc[:, ["playlist_id", "track_id", "track_uri", "playlist_position", "artist_name", "track_name"]]
)

# Write without the index so the file contains exactly the six columns above.
df.to_csv("../data/spotify_playlists.csv", index=False)

In [6]:
# Report how many distinct encoded tracks and playlists the table contains.
n_tracks = len(df["track_id"].unique())
n_playlists = len(df["playlist_id"].unique())

for template, count in (("Unique tracks: {}", n_tracks), ("Playlists: {}", n_playlists)):
    print(template.format(count))

Unique tracks: 163517
Playlists: 14770


In [7]:
# Bare last expression so the notebook renders the frame; the captured output
# shows only the first and last rows of the table.
df

Unnamed: 0,playlist_id,track_id,track_uri,playlist_position,artist_name,track_name
0,11801,98552,spotify:track:4gowy3WT6D1yhMLgRBlf9C,0,Drake,Change Locations
1,11801,149313,spotify:track:76kyKtPLsFbQkdQ86QrkF4,1,Drake,Plastic Bag
2,11801,116025,spotify:track:5W7xC99N2Zzfh69r7I7zWK,2,Rick Ross,Aston Martin Music
3,11801,77328,spotify:track:3fyMH1t6UPeR5croea9PrR,3,Drake,Best I Ever Had
4,11801,126428,spotify:track:60geMByGdlcGGMR5R5ZjHE,4,Drake,I'm Goin In
...,...,...,...,...,...,...
656149,3667,137038,spotify:track:6WaIDdOdfmWdvXvUyBMq7F,26,NEEDTOBREATHE,DON'T BRING THAT TROUBLE
656150,3667,88747,spotify:track:4DZghpw50ZnO3ckfDuNkft,27,The Heydaze,New Religion
656151,3667,16017,spotify:track:0krby3AUjWWXAermpLRklq,28,for KING & COUNTRY,To the Dreamers
656152,3667,21144,spotify:track:0zkyplku4wDUCYlMJd5vTu,29,People & Songs,Let Revival Come (Revive Me) [feat. Kevin Jone...
