In [None]:
import spotipy
import json
import pandas as pd
import os

In [None]:
# Build the Spotify client used by the rest of the notebook.
# The client id/secret are read from a local JSON file rather than hardcoded.
with open('spotify_credentials.json') as cred_file:
    auth = json.load(cred_file)

client_id, client_secret = auth['client_id'], auth['client_secret']
credentials = spotipy.oauth2.SpotifyClientCredentials(
    client_id=client_id, client_secret=client_secret)
spotify = spotipy.Spotify(client_credentials_manager=credentials)

In [None]:
# Peek at one of the weekly chart files to see its layout.
# header=1: the column names are taken from the second line of the file.
sample_chart_path = 'top_weekly_lists/regional-global-weekly-2019-12-27--2020-01-03.csv'
pd.read_csv(sample_chart_path, header=1)

In [None]:
# get list of artists on the top 200 weekly lists 30/8/19 to 28/2/20 (about 6 months)
top_artists = set()
# sorted() makes the processing order independent of the filesystem's listing order
for filename in sorted(os.listdir('top_weekly_lists')):
    if not filename.lower().endswith('.csv'):
        # skip stray directory entries (e.g. .DS_Store, .ipynb_checkpoints)
        # that read_csv would choke on
        continue
    df = pd.read_csv(os.path.join('top_weekly_lists', filename), header=1)
    # dropna() keeps rows with a missing artist from adding NaN to the set
    top_artists.update(df['Artist'].dropna())

In [None]:
# get songs for top artists
# For each chart artist: take the first artist returned by a Spotify search,
# then collect the ids of the tracks on that artist's albums.
# NOTE(review): artist_albums / album_tracks are called once each, so only the
# first page of results is used (spotipy defaults are presumably 20 albums and
# 50 tracks per page -- confirm against the installed version).

n = len(top_artists)

song_ids = set()
for i, artist in enumerate(top_artists):
    print(f'Progress: {i}/{n}')
    matches = spotify.search(artist, type='artist')['artists']['items']
    if not matches:
        # without this guard, indexing [0] on an empty result list raises
        # IndexError and aborts the whole collection run
        print(f'No Spotify artist found for {artist!r}; skipping')
        continue
    # the first search hit is assumed to be the intended artist
    artist_dict = matches[0]
    if artist_dict['popularity'] < 50:
        # warn since this probably isn't the right artist
        print(f"Artist {artist_dict['name']} has popularity {artist_dict['popularity']}")
    # get the artist's albums
    albums = spotify.artist_albums(artist_dict['id'])['items']
    for album in albums:
        # get the songs in the album
        songs = spotify.album_tracks(album['id'])['items']
        song_ids.update(song['id'] for song in songs)
    print(f'\tSongs collected: {len(song_ids)}')

In [None]:
# save list of song ids to json file
# The ids are written in sorted order: get_info addresses songs by their
# position in this list, and the iteration order of a set of strings is not
# stable between interpreter runs, so an unsorted dump is not reproducible.
with open('song_ids.json', 'w') as fh:
    json.dump(sorted(song_id for song_id in song_ids if song_id is not None), fh)

In [None]:
# Restore the previously saved song ids (skips re-crawling when the file exists).
# After this cell `song_ids` is a list, so it can be indexed by position.
with open('song_ids.json', 'r') as ids_file:
    song_ids = json.loads(ids_file.read())

Features to include:

From `.track`:

* number of available markets
* is explicit
* release date (taken from the track's album)
* popularity
* track_number
* number of artists

From `.audio_features`:

* danceability
* energy
* key
* loudness
* mode
* speechiness
* acousticness
* instrumentalness
* liveness
* valence
* tempo
* duration_ms
* time_signature

Careful! A track's popularity changes over time, so the value recorded depends on the date the data was collected.

In [None]:
# function for getting the info for songs from song_ids list
def get_info(start_index=0, end_index=None):
    """Fetch track metadata and audio features for a slice of `song_ids` and save them as CSV.

    For every position from `start_index` to `end_index` (both inclusive) the
    song is looked up with `spotify.track` and `spotify.audio_features`, and one
    row is collected. The rows are written to
    `data/song_data_{start_index}_{end_index}.csv`, indexed by song id.
    Nothing is returned.

    Parameters
    ----------
    start_index : int
        Position in `song_ids` of the first song to fetch.
    end_index : int or None
        Position of the last song to fetch (inclusive). None means the last
        element of `song_ids`.

    Notes
    -----
    Relies on the notebook-level names `song_ids` (an indexable sequence of
    track ids) and `spotify` (the API client). Fields that the API does not
    return are stored as missing values; counts of missing lists are stored as 0.
    """
    if end_index is None:
        end_index = len(song_ids) - 1

    # Create the output directory before the slow fetch loop, so a missing
    # folder cannot cause the final write to fail after all the API calls.
    os.makedirs('data', exist_ok=True)

    feature_names = (
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'time_signature',
    )
    # Explicit column list: keeps the column order fixed and lets an empty
    # range still produce a valid (header-only) CSV instead of a KeyError.
    columns = [
        'id', 'popularity', 'explicit', 'availability', 'track_number',
        'release_date', 'num_artists', *feature_names,
    ]

    data = []
    for i in range(start_index, end_index + 1):
        print(f'Working... {start_index}->{i}->{end_index}')
        song_id = song_ids[i]
        track = spotify.track(song_id)
        # audio_features returns a list; its single entry may be None (or the
        # list may be empty) when no features exist for the track
        feature_list = spotify.audio_features(song_id)
        features = (feature_list[0] if feature_list else None) or {}
        row = {
            'id': song_id,
            'popularity': track.get('popularity'),
            'explicit': track.get('explicit'),
            # `or []` guards against the field being absent or None
            'availability': len(track.get('available_markets') or []),
            'track_number': track.get('track_number'),
            'release_date': (track.get('album') or {}).get('release_date'),
            'num_artists': len(track.get('artists') or []),
        }
        row.update((name, features.get(name)) for name in feature_names)
        data.append(row)

    out_path = os.path.join('data', f'song_data_{start_index}_{end_index}.csv')
    pd.DataFrame(data, columns=columns).set_index('id').to_csv(out_path)

In [None]:
%time get_info() # fetches every song in song_ids and reports the elapsed time; author's estimate: about 1/2 hour per 10,000 songs (~2.5 hrs for all songs)