In [1]:
import os
import re
import sys
import json
import time
import config
import spotipy
import pandas as pd

from tqdm import tqdm
from datetime import datetime
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

In [2]:
# Change the path to where your Spotify Million Playlist is located. (Need to download the dataset first from 
# https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge) 
# The expected file structure is to have a data folder /data .csv files will be placed there.

path = 'data/spotify_million_playlist_dataset/data'

def loop_slices(path, num_slices=20):
    """
    Each slice is a .json file containing 1000 playlists i.e.: 1 slice is 1000 playlists 20 slices is: 20,000 playlists.
    Parameters:
        num_slices (int): Number of slices to return, max 1000.
        path (str): Path to the Spotify Million Playlist.
        
    Output:
        mpd_playlists (list): a list of dictionaries of all the playlists.
    """
    cnt=0
    mpd_playlists = []
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        print(filename)
        cnt+=1
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            current_slice = json.loads(js)
            
            # Create a list of all playlists
            for playlist in current_slice['playlists']:
                mpd_playlists.append(playlist)


            if cnt == num_slices:
                break
    return mpd_playlists

In [3]:
def create_csv(playlists, extended=False):
    """
    This function will construct a datafrme and write to .csv file for all the input playlists, this .csv will have a single cell with all the tracks
    in the playlist. 
    An extended parameter is available to extend the tracks in a list to be a single cell per song, this will return an 
    additional .csv file
    Parameters:
        playlists (list): a list of dictionaries such as that from the loop_slices() function.
        extended (boolean): boolean to enable the extended .csv file generation
        
    Output:
        MPD.csv: .csv file with the playlists
        MPD.csv: .csv file with extended song columns
    """
    df = pd.DataFrame(playlists)
    df.to_csv('data/MPD.csv', index=False)
    
    if extended:
        df_list = []
        for playlist in playlists:
            df_list.append(pd.DataFrame(playlist))
            
        df_extended = pd.concat(df_list, axis=0)
         
        cols_to_keep = ['name', 'collaborative', 'pid', 'modified_at', 'num_tracks', 'num_albums', 
                'num_followers','num_edits', 'duration_ms', 'num_artists']
        df_extended = df_extended.reset_index().pivot(values='tracks',index=cols_to_keep, columns='index')
        df_extended.reset_index(inplace=True)
        df_extended = df_extended.rename_axis(None, axis=1)    
        df_extended.sort_values('pid', inplace=True)
        df_extended.to_csv('data/{}_MPD_Extended.csv'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S")), index=False) 

# Run with num_slices=1000 to run the whole data set (1M)
# It will save a .json locally

In [8]:
#####################################################################################################################
# STOP, STOP, STOP, STOP, STOP                                                                                      #
# num_slices=1000 (all playlists) will take considerable time (>30min) and it will eat all your disk storage 30Gb+  #
# recommended to work with default num_slices=20, this will output 400Mb for each file                              #
#####################################################################################################################

playlists = loop_slices(path, num_slices=1000)
create_csv(playlists, extended=True)

# Save json file - This file will be used as database
with open('data/{}_playlists_{}.json'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S"), len(playlists)), 'w') as outfile:
    json.dump(playlists, outfile, indent=2)

mpd.slice.0-999.json


In [None]:
# # Read MPD
df = pd.read_csv('data/MPD.csv')
df

In [None]:
# # Read MPD_Extended
df = pd.read_csv('data/MPD_Extended.csv')
df

# Get song features from playlists extracted

In [3]:
# Spotify credentials
os.environ["SPOTIPY_CLIENT_ID"] = config.SPOTIPY_CLIENT_ID
os.environ["SPOTIPY_CLIENT_SECRET"] = config.SPOTIPY_CLIENT_SECRET
os.environ['SPOTIPY_REDIRECT_URI'] = config.SPOTIPY_REDIRECT_URI   # Needed for user authorization
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [14]:
# Code to retrieve and add data to an existing df
df_list = []
feats_df = pd.read_csv('Playlist_Feats.csv')
idx = len(feats_df)
df_list.append(feats_df)
feats_df

# The cell  below will run for 2+ days, let it run and monitor every few hours.

In [None]:
##################################################################################
# STOP - This computation will take ~ 40hrs to finish for every 20,000 playlists #
##################################################################################

################################################################################################################
# Code to retrieve song features from each playlist passed, average of song features in each playlist is       #
# computed in such a way that the final computation is each playlist is a row and it has the average of        #
# all songs in the playlist.                                                                                   #
# This code can be optimized in different ways:                                                                #
#      - Remove outliers from each playlist                                                                    #
#      - Keep only those playlist where the song variation is within 3 standard deviation for all features     #
#      - Add for loop to keep trying in case of an exception occurs while extracting features.                 #
################################################################################################################

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

cols_to_keep = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
dfs = []

for playlist in tqdm(playlists):
    audio_feats = []
    all_uris_in_plylst = []
    for track in playlist['tracks']:
        uri = track['track_uri'].split("k:")[1]
        all_uris_in_plylst.append(uri)

    chunks_uris = [all_uris_in_plylst[i:i + 100] for i in range(0, len(all_uris_in_plylst), 100)]
    for chunk in  chunks_uris:
        for attempt in range(10):
            try:
                chunk_audio_feats = sp.audio_features(chunk)
                audio_feats.append(chunk_audio_feats)
            except Exception as e: 
                print(e)
                print('playlist: {}, chunk: {}'.format(playlist['name'], chunk))
            else:
                break
        else:
            print('Everything failed')


    
    playlist_audio_feats = [item for sublist in audio_feats for item in sublist]
    name = playlist['name']
    pid = playlist['pid']
    s1 = pd.Series([name, pid], index=['name', 'pid'])
    try: # Try/Except for when there is a None in playlist
        s2 = pd.DataFrame(playlist_audio_feats)[cols_to_keep].mean()
    except:
        print('Playlist "{}" has a None. PID: {}'.format(name, pid))
        s2 = pd.DataFrame([i for i in playlist_audio_feats if i])[cols_to_keep].mean() # List comprehension to remove None from list

    dfs.append(pd.DataFrame(s1.append(s2)).T)
# Time to beat: 5% in 5min

# This cell will save a .csv file locally.

In [140]:
df1 = pd.concat(dfs, axis=0)
df1.to_csv('data/{}_Playlist_Feats_{}.csv'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S"), len(df1)), index=False)
df1

# Code to add data on top of an existing df
df_list.append(df1)
feats_df = pd.concat(df_list, axis=0)
feats_df.to_csv('data/Playlist_Feats_Copy.csv', index=False)
idx = len(feats_df)
feats_df

Unnamed: 0,name,pid,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Throwbacks,0,0.664077,0.781077,5.03846,-4.89121,0.692308,0.103698,0.0836741,0.000674382,0.187087,0.64275,121.158,221777,4
0,Awesome Playlist,1,0.492382,0.695923,4.46154,-8.10797,0.538462,0.0910103,0.162227,0.223708,0.179344,0.476667,124.987,298838,3.76923
0,korean,2,0.671062,0.692953,5,-4.87559,0.515625,0.096425,0.2691,0.000637812,0.168894,0.565078,114.596,219374,4
0,mat,3,0.514349,0.620901,5.10317,-9.61875,0.714286,0.067004,0.273514,0.203148,0.188278,0.451258,125.523,229575,3.95238
0,90s,4,0.576235,0.650418,3.35294,-7.63453,0.823529,0.0412176,0.177189,0.0817588,0.166524,0.490294,127.725,255014,3.94118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,gang gang,115995,0.804615,0.529026,4.20513,-8.16418,0.589744,0.219638,0.217947,0.00862146,0.166574,0.363818,129.452,230348,4
0,Chaos,115996,0.6889,0.6955,5.7,-8.23265,0.4,0.104215,0.243258,0.225892,0.15177,0.7178,135.314,283091,4.05
0,Spring 2014,115997,0.646571,0.603214,5.35714,-6.89093,0.714286,0.0787429,0.227477,0.0243475,0.146629,0.442571,125.139,268989,4
0,autumn,115998,0.549943,0.562457,5.11429,-8.66606,0.657143,0.06552,0.273361,0.0180656,0.16158,0.390406,114.145,237815,3.97143
