In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import json
import os

In [2]:
def json_reader(begin):
    """Load one playlist slice file into a DataFrame.

    Reads ``../data/playlists/playlist.slice.<begin>-<begin + 999>.json``
    (resolved against the current working directory) and builds a frame from
    the value stored under the file's top-level ``"playlists"`` key.

    Parameters
    ----------
    begin : int
        First pid of the slice; must be one of 0, 1000, 2000, ..., 10000.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.DataFrame.from_dict(..., orient="columns")`` applied to
        the ``"playlists"`` value.

    Raises
    ------
    ValueError
        If ``begin`` is not one of the valid slice starts.
    """
    # np.arange excludes its stop value, so the last valid start is 10000.
    if begin not in np.arange(0, 11000, 1000):
        raise ValueError(
            "Invalid start pid! Start pids must be {0, 1000, 2000, ..., 10000}"
        )

    end = begin + 999
    path = "../data/playlists/playlist.slice." + str(begin) + "-" + str(end) + ".json"

    # Context manager closes the handle even if parsing fails; JSON is UTF-8,
    # so do not depend on the platform's default text encoding.
    with open(path, "r", encoding="utf-8") as handle:
        jsonData = json.load(handle)

    return pd.DataFrame.from_dict(jsonData["playlists"], orient="columns")

In [3]:
# Read the playlist slice files starting at 0, 1000, ..., 10000 and stack
# them into a single frame with a fresh 0..n-1 index.
jsonList = [json_reader(start) for start in np.arange(0, 11000, 1000)]
trainData = pd.concat(jsonList, ignore_index=True)

# The per-slice frames have been copied into trainData; empty the list.
jsonList.clear()

print(trainData.shape)

(11000, 12)


In [4]:
# Turn the playlist-level dataframe into a song-level dataframe: one row per
# (track, playlist) occurrence, holding the track id, the corresponding
# artist name, the track name and the playlist id.

# Only the "pid" and "tracks" columns are needed, so iterate over those two
# columns directly instead of materialising a Series per row with iterrows().
songPlaylistArray = [
    [track["track_uri"], track["artist_name"], track["track_name"], pid]
    for pid, tracks in zip(trainData["pid"], trainData["tracks"])
    for track in tracks
]
songPlaylist = pd.DataFrame(
    songPlaylistArray, columns=["trackid", "artist_name", "track_name", "pid"]
)
songPlaylistArray.clear()  # rows now live in songPlaylist

print(songPlaylist.shape)
# Number of distinct tracks across all playlists (last expression -> cell output).
len(songPlaylist['trackid'].unique())

(731360, 4)


180409

In [5]:
def json_content_reader(begin):
    """Load one music-content slice file into a DataFrame.

    Reads ``../data/music_contents/content.slice.<begin>-<begin + 999>.json``
    (resolved against the current working directory) and builds a frame from
    the whole parsed JSON document.

    Parameters
    ----------
    begin : int
        First content id of the slice; must be one of 0, 1000, ..., 179000.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.DataFrame.from_dict(..., orient="columns")`` applied to
        the parsed file.

    Raises
    ------
    ValueError
        If ``begin`` is not one of the valid slice starts.
    """
    # np.arange excludes its stop value, so the last valid start is 179000.
    if begin not in np.arange(0, 180000, 1000):
        raise ValueError(
            "Invalid start contentid! Start contentids must be "
            "{0, 1000, 2000, ..., 179000}"
        )

    end = begin + 999
    path = (
        "../data/music_contents/content.slice." + str(begin) + "-" + str(end) + ".json"
    )

    # Context manager closes the handle even if parsing fails; JSON is UTF-8,
    # so do not depend on the platform's default text encoding.
    with open(path, "r", encoding="utf-8") as handle:
        jsonData = json.load(handle)

    return pd.DataFrame.from_dict(jsonData, orient="columns")

In [6]:
# Read the content slice files starting at 0, 1000, ..., 179000 and stack
# them into a single frame with a fresh 0..n-1 index.
jsonList = [json_content_reader(start) for start in np.arange(0, 180000, 1000)]
contentData = pd.concat(jsonList, ignore_index=True)

# The per-slice frames have been copied into contentData; empty the list.
jsonList.clear()

print(contentData.shape)
contentData.head(10)

(179999, 18)


Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.0311,https://api.spotify.com/v1/audio-analysis/0UaM...,0.904,226864,0.813,0UaMYEvWZi0ZqiDOoHU3YI,0.00697,4,0.0471,-7.105,0,0.121,125.461,4,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,audio_features,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,0.81
1,0.0249,https://api.spotify.com/v1/audio-analysis/6I9V...,0.774,198800,0.838,6I9VzXrHxO9rA9A5euc8Ak,0.025,5,0.242,-3.914,0,0.114,143.04,4,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,audio_features,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,0.924
2,0.00238,https://api.spotify.com/v1/audio-analysis/0WqI...,0.664,235933,0.758,0WqIKmW4BTrj3eJFmnCKMv,0.0,2,0.0598,-6.583,0,0.21,99.259,4,https://api.spotify.com/v1/tracks/0WqIKmW4BTrj...,audio_features,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,0.701
3,0.202,https://api.spotify.com/v1/audio-analysis/1AWQ...,0.891,267267,0.714,1AWQoqb9bSvzTjaLralEkT,0.000234,4,0.0521,-6.055,0,0.14,100.972,4,https://api.spotify.com/v1/tracks/1AWQoqb9bSvz...,audio_features,spotify:track:1AWQoqb9bSvzTjaLralEkT,0.818
4,0.0561,https://api.spotify.com/v1/audio-analysis/1lzr...,0.853,227600,0.606,1lzr43nnXAijIGYnCT8M8H,0.0,0,0.313,-4.596,1,0.0713,94.759,4,https://api.spotify.com/v1/tracks/1lzr43nnXAij...,audio_features,spotify:track:1lzr43nnXAijIGYnCT8M8H,0.654
5,0.0212,https://api.spotify.com/v1/audio-analysis/0XUf...,0.881,250373,0.788,0XUfyU2QviPAs6bxSpXYG4,0.0,2,0.0377,-4.669,1,0.168,104.997,4,https://api.spotify.com/v1/tracks/0XUfyU2QviPA...,audio_features,spotify:track:0XUfyU2QviPAs6bxSpXYG4,0.592
6,0.257,https://api.spotify.com/v1/audio-analysis/68vg...,0.662,223440,0.507,68vgtRHr7iZHpzGpon6Jlo,0.0,5,0.0465,-8.238,1,0.118,86.412,4,https://api.spotify.com/v1/tracks/68vgtRHr7iZH...,audio_features,spotify:track:68vgtRHr7iZHpzGpon6Jlo,0.676
7,0.158,https://api.spotify.com/v1/audio-analysis/3BxW...,0.544,225560,0.823,3BxWKCI06eQ5Od8TY2JBeA,0.0,2,0.268,-4.318,1,0.32,210.75,4,https://api.spotify.com/v1/tracks/3BxWKCI06eQ5...,audio_features,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,0.434
8,0.273,https://api.spotify.com/v1/audio-analysis/7H6e...,0.713,271333,0.678,7H6ev70Weq6DdpZyyTmUXk,0.0,5,0.149,-3.525,0,0.102,138.009,4,https://api.spotify.com/v1/tracks/7H6ev70Weq6D...,audio_features,spotify:track:7H6ev70Weq6DdpZyyTmUXk,0.734
9,0.103,https://api.spotify.com/v1/audio-analysis/2Ppr...,0.728,235213,0.974,2PpruBYCo4H7WOBJ7Q2EwM,0.000532,4,0.175,-2.261,0,0.0665,79.526,4,https://api.spotify.com/v1/tracks/2PpruBYCo4H7...,audio_features,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,0.965


In [7]:
# Remove the listed identifier, URL and categorical columns, leaving the
# `uri` key plus the numeric audio-feature columns.
# Rebinding (instead of inplace=True) together with errors="ignore" makes the
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
contentData = contentData.drop(
    columns=[
        "type",
        "track_href",
        "mode",
        "time_signature",
        "id",
        "analysis_url",
        "key",
    ],
    errors="ignore",
)

contentData.head(10)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,uri,valence
0,0.0311,0.904,226864,0.813,0.00697,0.0471,-7.105,0.121,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,0.81
1,0.0249,0.774,198800,0.838,0.025,0.242,-3.914,0.114,143.04,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,0.924
2,0.00238,0.664,235933,0.758,0.0,0.0598,-6.583,0.21,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,0.701
3,0.202,0.891,267267,0.714,0.000234,0.0521,-6.055,0.14,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT,0.818
4,0.0561,0.853,227600,0.606,0.0,0.313,-4.596,0.0713,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H,0.654
5,0.0212,0.881,250373,0.788,0.0,0.0377,-4.669,0.168,104.997,spotify:track:0XUfyU2QviPAs6bxSpXYG4,0.592
6,0.257,0.662,223440,0.507,0.0,0.0465,-8.238,0.118,86.412,spotify:track:68vgtRHr7iZHpzGpon6Jlo,0.676
7,0.158,0.544,225560,0.823,0.0,0.268,-4.318,0.32,210.75,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,0.434
8,0.273,0.713,271333,0.678,0.0,0.149,-3.525,0.102,138.009,spotify:track:7H6ev70Weq6DdpZyyTmUXk,0.734
9,0.103,0.728,235213,0.974,0.000532,0.175,-2.261,0.0665,79.526,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,0.965
