In [1]:
import json
import time
import os
import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm_notebook

In [2]:
def json_reader(begin):
    """Load one playlist slice file into a playlist-level DataFrame.

    Parameters
    ----------
    begin : int
        First pid of the slice to load. Must be one of
        0, 1000, 2000, ..., 10000.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the file's top-level "playlists" list.

    Raises
    ------
    ValueError
        If ``begin`` is not one of the accepted start pids.
    """
    if begin not in np.arange(0, 11000, 1000):
        # The message lists the range that is actually accepted by the check.
        raise ValueError(
            "Invalid start pid! Start pids must be {0, 1000, 2000, ..., 10000}"
        )

    end = begin + 1000
    path = "../data/playlists/playlist.slice.%s-%s.json" % (begin, end - 1)

    # Use a context manager so the handle is closed, and read as UTF-8 so
    # non-ASCII names decode the same way regardless of the platform locale.
    with open(path, "r", encoding="utf-8") as slice_file:
        jsonData = json.load(slice_file)

    return pd.DataFrame.from_dict(jsonData["playlists"], orient="columns")

In [3]:
# Read every playlist slice and stack the per-slice frames into one
# playlist-level frame.
jsonList = [json_reader(start_pid) for start_pid in np.arange(0, 11000, 1000)]

trainData = pd.concat(jsonList)
jsonList.clear()  # drop the per-slice frames once they are concatenated

print(trainData.shape)
trainData.head()

(11000, 12)


Unnamed: 0,collaborative,description,duration_ms,modified_at,name,num_albums,num_artists,num_edits,num_followers,num_tracks,pid,tracks
0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't..."
1,False,,11656470,1506556800,Awesome Playlist,23,21,5,1,39,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_..."
2,False,,14039958,1505692800,korean,51,31,18,1,64,2,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri..."
3,False,,28926058,1501027200,mat,107,86,4,1,126,3,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën..."
4,False,,4335282,1401667200,90s,16,16,7,2,17,4,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk..."


In [4]:
# Turn the playlist-level dataframe into a song-level dataframe: one row per
# track occurrence, holding the track id, the corresponding artist name and
# track name, and the pid of the playlist that contains it.

songPlaylistArray = [
    [song["track_uri"], song["artist_name"], song["track_name"], playlist_id]
    for playlist_id, songs in zip(trainData["pid"], trainData["tracks"])
    for song in songs
]
songPlaylist = pd.DataFrame(
    songPlaylistArray, columns=["trackid", "artist_name", "track_name", "pid"]
)

print(songPlaylist.shape)
songPlaylist.head(10)

(731360, 4)


Unnamed: 0,trackid,artist_name,track_name,pid
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0
5,spotify:track:0XUfyU2QviPAs6bxSpXYG4,Usher,Yeah!,0
6,spotify:track:68vgtRHr7iZHpzGpon6Jlo,Usher,My Boo,0
7,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,The Pussycat Dolls,Buttons,0
8,spotify:track:7H6ev70Weq6DdpZyyTmUXk,Destiny's Child,Say My Name,0
9,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,OutKast,Hey Ya! - Radio Mix / Club Mix,0


In [5]:
# Track id column of the song-level frame (one entry per track occurrence).
trackids = songPlaylist.loc[:, "trackid"]
print(trackids)

0         spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1         spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2         spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3         spotify:track:1AWQoqb9bSvzTjaLralEkT
4         spotify:track:1lzr43nnXAijIGYnCT8M8H
5         spotify:track:0XUfyU2QviPAs6bxSpXYG4
6         spotify:track:68vgtRHr7iZHpzGpon6Jlo
7         spotify:track:3BxWKCI06eQ5Od8TY2JBeA
8         spotify:track:7H6ev70Weq6DdpZyyTmUXk
9         spotify:track:2PpruBYCo4H7WOBJ7Q2EwM
10        spotify:track:2gam98EZKrF9XuOkU13ApN
11        spotify:track:4Y45aqo9QMa57rDsAJv40A
12        spotify:track:1HwpWwa6bnqqRhK8agG4RS
13        spotify:track:20ORwCJusz4KS2PbTPVNKo
14        spotify:track:7k6IzwMGpxnRghE7YosnXT
15        spotify:track:1Bv0Yl01xBDZD4OQP93fyl
16        spotify:track:4omisSlTk6Dsq2iQD7MA07
17        spotify:track:7xYnUQigPoIDAMPVK79NEq
18        spotify:track:6d8A5sAx9TfdeseDvfWNHd
19        spotify:track:4pmc2AxSEq6g7hPVlJCPyP
20        spotify:track:215JYyyUnrJ98NK3KEwu6d
21        spo

In [6]:
# The tokens file is read as two lines: the first is used as the client id,
# the second as the client secret (surrounding whitespace stripped).
with open(".spotifyTokens", "r") as tokens:
    client_id, client_secret = (tokens.readline().strip() for _ in range(2))

In [7]:
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Fetch the audio features of every entry in `trackids`, `batch_size` ids per
# request, and store them in files of `slice_size` records each, named
# content.slice.<first index>-<last index>.json (both indices inclusive).
batch_size = 100  # ids sent per audio_features request
slice_size = 1000  # records stored per output file
output_dir = "../data/music_contents"
os.makedirs(output_dir, exist_ok=True)


def _write_content_slice(features, path):
    """Write one slice of audio-feature records to ``path`` as a JSON array.

    Each record is serialised with ``indent=4``; records are separated by
    ",\n" and wrapped in "[\n" ... "\n]". The file is written in one pass, so
    no trailing separator ever has to be removed afterwards.
    """
    with open(path, "w") as output:
        output.write("[\n")
        output.write(",\n".join(json.dumps(music, indent=4) for music in features))
        output.write("\n]")


total_tracks = len(trackids)
slice_features = []  # records collected for the slice currently being built
for i in tqdm_notebook(
    range(0, total_tracks, batch_size), ascii=True, desc="Getting Musics Descriptions"
):
    tids = trackids[i : i + batch_size]
    slice_features.extend(sp.audio_features(tids))

    next_i = i + batch_size
    # Flush when the next batch would open a new slice, or after the last batch.
    if next_i % slice_size == 0 or next_i >= total_tracks:
        begin_slice = (i // slice_size) * slice_size
        # `- 1` keeps the end index inclusive, also for the final, shorter slice.
        end_slice = min(begin_slice + slice_size, total_tracks) - 1
        _write_content_slice(
            slice_features,
            "%s/content.slice.%s-%s.json" % (output_dir, begin_slice, end_slice),
        )
        slice_features = []

HBox(children=(IntProgress(value=0, description='Getting Musics Descriptions', max=7314, style=ProgressStyle(d…




In [8]:
# Sanity check: parse the first content slice and report how many records it holds.
with open("../data/music_contents/content.slice.0-999.json") as json_file:
    data = json.load(json_file)

print(len(data))

1000
