In [1]:
import pandas as pd
import numpy as np
import json
import os
import ast
import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

  return f(*args, **kwds)
  return f(*args, **kwds)


## Data Collection
Data Sources:
- Million Playlist Dataset
- Spotify API

The Million Playlist Dataset (MPD) provides playlist-level data, including every track that belongs to each playlist. Using the track URIs it provides, we can fetch additional track-level detail from the Spotify API. Building the aggregated CSV for a single MPD JSON file takes around 1 hour.

In [2]:
# read a sample of the json files in the folder (sorted by name) and load them in as dataframes
def read_files(path_to_json, start=115, stop=500, step=10):
    """Load a sample of the JSON files in a folder as DataFrames.

    The ``.json`` file names found in ``path_to_json`` are sorted and then
    sampled with ``[start:stop:step]``. The defaults reproduce the sample
    this notebook was originally run with (every 10th file from position
    115 up to, but not including, position 500).

    Parameters
    ----------
    path_to_json : str
        Folder containing the JSON files. A trailing path separator is
        optional.
    start, stop, step : int or None, optional
        Slice bounds applied to the sorted list of JSON file names.

    Returns
    -------
    master_list : list of pandas.DataFrame
        One DataFrame per selected file, built from its ``"playlists"`` entry.
    json_files : list of str
        The selected file names, in the same order as ``master_list``.
    """
    all_json = sorted(name for name in os.listdir(path_to_json) if name.endswith('.json'))
    json_files = all_json[start:stop:step]
    master_list = []
    for file_name in json_files:
        # os.path.join works with or without a trailing separator, and the
        # context manager closes each handle instead of leaking it.
        with open(os.path.join(path_to_json, file_name), encoding='utf-8') as handle:
            data = json.load(handle)
        master_list.append(pd.DataFrame(data["playlists"]))
    return master_list, json_files

In [3]:


# # Put all tracks of each playlist in the previous dataframe into a single dataframe
# def get_tracks_df(df):
#     df_new2 = []
#     for i in range(df.shape[0]):
#         for j in df[['tracks']].iloc[i][0]:
#             j['name'] = df[['name']].iloc[i].values[0]
# #             j['collaborative'] = df[['collaborative']].iloc[i].values[0]
# #             j['pid'] = df[['pid']].iloc[i].values[0]
# #             j['modified_at'] = df[['modified_at']].iloc[i].values[0]
# #             j['num_tracks'] = df[['num_tracks']].iloc[i].values[0]
# #             j['num_albums'] = df[['num_albums']].iloc[i].values[0]
# #             j['num_followers'] = df[['num_followers']].iloc[i].values[0]
# #             j['num_edits'] = df[['num_edits']].iloc[i].values[0]
# #             j['duration_ms'] = df[['duration_ms']].iloc[i].values[0]
# #             j['num_artists'] = df[['num_artists']].iloc[i].values[0]
#             df_new2.append(j)
#     df_updated = pd.DataFrame(df_new2)
#     print(df_updated.shape)
#     display(df_updated.head())
#     return df_updated
# path_to_json = 'mpd.v1/data/'

# master_list, json_files = read_files(path_to_json)
# df = master_list[0]
# get_tracks_df(df).shape

In [4]:
def get_tracks_df(df):
    """Expand a playlist-level DataFrame into one row per playlist track.

    Every entry of a playlist's ``tracks`` list becomes a row holding the
    track's own fields plus the playlist-level fields named in
    ``playlist_fields``. The playlist fields are merged in last, so if a
    track dict shares a key with them the playlist value is the one kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Playlist-level frame with a ``tracks`` column (list of dicts per
        row) and all of the ``playlist_fields`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (playlist, track) pair.

    Raises
    ------
    AssertionError
        If the number of rows produced differs from the sum of the
        ``num_tracks`` column.
    """
    playlist_fields = [
        "collaborative", "description", "duration_ms", "modified_at", "name",
        "num_albums", "num_artists", "num_edits", "num_followers",
        "num_tracks", "pid",
    ]
    rows = []
    for row_no in range(len(df)):
        playlist = df.iloc[row_no]
        shared = dict(zip(playlist_fields, playlist[playlist_fields].values.tolist()))
        # Later mapping wins: playlist-level values override same-named track keys.
        rows.extend({**track, **shared} for track in playlist['tracks'])
    expected_rows = df[['num_tracks']].sum().tolist()[0]
    assert len(rows) == expected_rows
    return pd.DataFrame(rows)

In [5]:
# parse the uris 
def parse_uri(columns, df_updated):
    """Reduce URI columns to their third ``':'``-separated component.

    For example ``"spotify:track:abc"`` becomes ``"abc"``. The frame is
    modified in place and also returned.

    Parameters
    ----------
    columns : iterable of str
        Names of the string columns to rewrite.
    df_updated : pandas.DataFrame
        Frame holding those columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df_updated`` object, with the listed columns rewritten.
        Missing values, and values with fewer than three components,
        become NaN.
    """
    for column in columns:
        # Element-wise ``.str[2]`` gives NaN for short or missing values.
        # Looking up column 2 of an ``expand=True`` frame raised KeyError
        # whenever no value in the column had three components (for
        # example a column consisting only of None).
        df_updated[column] = df_updated[column].str.split(':').str[2]
    return df_updated


In [6]:
# get token for the spotify API
def get_token():
    """Request a client-credentials access token from the Spotify accounts API.

    The client id and secret are read from the ``SPOTIFY_CLIENT_ID`` and
    ``SPOTIFY_CLIENT_SECRET`` environment variables, so no secret is stored
    in the notebook. The credential that used to be embedded here should be
    treated as leaked and rotated.

    Returns
    -------
    dict
        Decoded JSON body of the token response; callers read
        ``token['access_token']``.

    Raises
    ------
    RuntimeError
        If either environment variable is missing or empty.
    requests.HTTPError
        If the token endpoint answers with an error status.
    """
    client_id = os.environ.get('SPOTIFY_CLIENT_ID')
    client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise RuntimeError(
            'Set the SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET environment '
            'variables before requesting a Spotify token.'
        )

    auth_url = 'https://accounts.spotify.com/api/token'
    payload = {'grant_type': 'client_credentials'}

    # requests builds the "Authorization: Basic ..." header from the
    # (id, secret) pair and form-encodes the dict passed as ``data``.
    r = requests.post(auth_url, data=payload, auth=(client_id, client_secret))
    # Fail here with the HTTP status rather than later with a KeyError on
    # 'access_token' when the body is an error document.
    r.raise_for_status()
    return r.json()

In [7]:
# Make the spotify API requests using the token and track uris from the dataframe
def get_spotify_tracks(url, file_name, track_id_groups):
    """Call a batched Spotify endpoint once per group of track ids.

    Parameters
    ----------
    url : str
        Endpoint prefix ending in ``?ids=``; the comma-joined ids of each
        group are appended to it.
    file_name : str
        Currently unused; kept so existing callers keep working.
    track_id_groups : iterable of iterables
        Groups of track ids, one request per group.

    Returns
    -------
    list of dict
        One decoded JSON body per group, in request order. When a response
        body is not valid JSON the response is printed and an empty dict is
        stored for that group, so position ``k`` always corresponds to
        ``track_id_groups[k]``.
    """
    token_refresh_every = 5000  # fetch a fresh token every this many requests
    return_list = []
    token = None
    # One session for the whole run: connections are reused and the session
    # is closed on exit, instead of opening a new one per request.
    with requests.Session() as session:
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        for count, id_group in enumerate(track_id_groups):
            if count % token_refresh_every == 0:
                token = get_token()
            headers = {'user-agent': 'my-app/0.0.1', 'Authorization': 'Bearer '+ token['access_token']}
            response = session.get(url + ",".join(str(x) for x in id_group), headers=headers)
            try:
                payload = response.json()
            except ValueError:
                # Body was not JSON. Previously the result of the *previous*
                # request was appended again here (or a NameError was raised
                # on the first request). Store an empty placeholder instead.
                print(response)
                payload = {}
            return_list.append(payload)
    return return_list


In [8]:
# pull the popularity of each track out of the Spotify track-detail responses and add it to the dataframe; tracks or batches missing from the responses get None
def add_popularity_col(df_updated, tracks_detail, track_group):
    """Attach a ``popularity`` column taken from Spotify track-detail responses.

    Parameters
    ----------
    df_updated : pandas.DataFrame
        Track-level frame with a ``track_uri`` column.
    tracks_detail : list of dict
        One response body per request; a successful body has a ``'tracks'``
        list whose entries line up with the ids that were requested.
    track_group : list of sequences
        The groups of track ids that were requested, where
        ``track_group[k]`` produced ``tracks_detail[k]``.

    Returns
    -------
    pandas.DataFrame
        ``df_updated`` inner-merged on ``track_uri`` with the popularity
        values. Ids whose batch has no ``'tracks'`` key, whose entry is
        empty or missing, or whose entry lacks ``'popularity'`` get a
        missing value.
    """
    popularity_by_uri = {}
    # Pair every response with the ids that produced it. The ids come from
    # ``track_group`` rather than a notebook-level global, and a failed or
    # short batch cannot shift the popularity values of later batches.
    for batch, uris in zip(tracks_detail, track_group):
        tracks = (batch['tracks'] or []) if 'tracks' in batch else []
        for position, uri in enumerate(uris):
            track = tracks[position] if position < len(tracks) else None
            has_popularity = bool(track) and 'popularity' in track
            popularity_by_uri[uri] = track['popularity'] if has_popularity else None

    popular_df = pd.DataFrame({
        'track_uri': list(popularity_by_uri.keys()),
        'popularity': list(popularity_by_uri.values()),
    })
    return pd.merge(df_updated, popular_df, on='track_uri', how='inner')
    

In [9]:
# flatten the audio-feature responses from the API into one list, filling in all-None placeholders for missing tracks and failed batches
def parse_audio(track_audio_detail, track_group):
    """Flatten batched audio-feature responses into one list of dicts.

    Parameters
    ----------
    track_audio_detail : list of dict
        One response body per request. A body with an ``'audio_features'``
        key contributes that list; any other body contributes one empty
        slot per id in the matching ``track_group`` entry.
    track_group : list of sequences
        The groups of track ids that were requested, indexed like
        ``track_audio_detail``.

    Returns
    -------
    list of dict
        One dict per slot. Slots that were ``None`` are replaced by a
        placeholder dict holding every key in ``placeholder_keys`` with
        value ``None``. The total count is also printed.
    """
    placeholder_keys = (
        "danceability", "energy", "key", "loudness", "mode", "speechiness",
        "acousticness", "instrumentalness", "liveness", "valence", "tempo",
        "type", "id", "uri", "track_href", "analysis_url", "duration_ms",
        "time_signature",
    )

    collected = []
    for batch_no, batch in enumerate(track_audio_detail):
        if 'audio_features' in batch:
            collected.extend(batch.get('audio_features'))
        else:
            # No usable payload for this request: reserve one slot per id.
            collected.extend([None] * len(track_group[batch_no]))

    # Each empty slot gets its own fresh all-None placeholder dict.
    audio_features = [
        dict.fromkeys(placeholder_keys) if entry is None else entry
        for entry in collected
    ]
    print('total number of audio features: ', len(audio_features))
    return audio_features

In [10]:
def add_audio_cols(audio_features, df_updated,filename):
    """Merge audio features into the track frame and save the result as CSV.

    Parameters
    ----------
    audio_features : list of dict
        Audio-feature records; each must carry at least the ``type``,
        ``track_href``, ``id``, ``uri`` and ``duration_ms`` keys.
    df_updated : pandas.DataFrame
        Track-level frame with a ``track_uri`` column.
    filename : str
        Path of the CSV file to write (UTF-8, no index column).

    Returns
    -------
    pandas.DataFrame
        ``df_updated`` inner-merged on ``track_uri`` with the audio
        features; the features' ``duration_ms`` is exposed as
        ``track_duration_ms``.
    """
    features = (
        pd.DataFrame(audio_features)
        .drop(['type', 'track_href', 'id'], axis=1)
        .rename(columns={'uri': 'track_uri', 'duration_ms': 'track_duration_ms'})
    )
    # Reduce the full URI to the same bare form used by df_updated's key.
    features = parse_uri(['track_uri'], features)
    merged = df_updated.merge(features, on='track_uri', how='inner')

    merged.to_csv(filename, encoding='utf-8', index=False)
    return merged

In [11]:
# Driver: for every sampled MPD file, expand playlists to track rows, query the
# Spotify API for track details and audio features, and write one merged CSV.
path_to_json = 'mpd.v1/data/'
master_list, json_files = read_files(path_to_json)
print('total number of json files getting processed:', len(json_files))
print('dataframe example')
display(master_list[0].head())
master_tracks_detail, master_tracks_audio = [], []
for i, df in enumerate(master_list):
    current_file = json_files[i]
    print('current file is', current_file)
    # e.g. "mpd.slice.201000-201999.json" -> "mpd_slice_201000-201999.csv"
    csv_suffix = current_file.replace(".", "_").replace('_json', '.csv')
    df_updated = parse_uri(['track_uri', 'album_uri', 'artist_uri'], get_tracks_df(df))
    unique_track_uri = df_updated['track_uri'].unique()
    # break the unique track uris into groups of 50 so they can be fed to the spotify API
    track_id_groups = [unique_track_uri[start:start + 50] for start in range(0, len(unique_track_uri), 50)]
    print('load from spotify API')
    url = 'https://api.spotify.com/v1/tracks/?ids='
    tracks_detail = get_spotify_tracks(url, 'tracks_detail_' + csv_suffix, track_id_groups)
    print('finish loading track details, now audio features...')
    url = 'https://api.spotify.com/v1/audio-features/?ids='
    track_audio_detail = get_spotify_tracks(url, 'tracks_audio_' + csv_suffix, track_id_groups)
    df_updated = add_popularity_col(df_updated, tracks_detail, track_id_groups)
    audio_features = parse_audio(track_audio_detail, track_id_groups)
    final_df = add_audio_cols(audio_features, df_updated, 'final_' + csv_suffix)
    # keep the raw API responses of every file for later inspection
    master_tracks_detail.append(tracks_detail)
    master_tracks_audio.append(track_audio_detail)


total number of json files getting processed: 39
dataframe example


Unnamed: 0,collaborative,description,duration_ms,modified_at,name,num_albums,num_artists,num_edits,num_followers,num_tracks,pid,tracks
0,False,,9071991,1504051200,chillax,34,29,13,1,41,201000,"[{'pos': 0, 'artist_name': 'John Kenza', 'trac..."
1,False,,44053449,1493337600,The Playlist,86,45,7,1,191,201001,"[{'pos': 0, 'artist_name': 'G-Eazy', 'track_ur..."
2,False,[Cringe],8048561,1477008000,Summer 2016,35,35,16,1,35,201002,"[{'pos': 0, 'artist_name': 'John Legend', 'tra..."
3,False,,17512093,1509235200,Christian,50,33,32,1,75,201003,"[{'pos': 0, 'artist_name': 'Kutless', 'track_u..."
4,False,,9959719,1380672000,Jamzzz,38,37,13,3,41,201004,"[{'pos': 0, 'artist_name': 'Kishi Bashi', 'tra..."


current file is mpd.slice.201000-201999.json
load from spotify API
finish loading track details, now audio features...
total number of audio features:  34103
current file is mpd.slice.210000-210999.json
load from spotify API
finish loading track details, now audio features...
total number of audio features:  35567
current file is mpd.slice.22000-22999.json
load from spotify API
finish loading track details, now audio features...
total number of audio features:  35677
current file is mpd.slice.229000-229999.json
load from spotify API
finish loading track details, now audio features...
total number of audio features:  33918
current file is mpd.slice.238000-238999.json
load from spotify API
finish loading track details, now audio features...
total number of audio features:  34927
current file is mpd.slice.247000-247999.json
load from spotify API
finish loading track details, now audio features...
total number of audio features:  34859
current file is mpd.slice.256000-256999.json
load from