In [13]:
import spotipy
import json
import pandas as pd
import os

In [4]:
# setup spotify api object
with open('spotify_credentials.json') as fh:
    auth = json.load(fh)
credentials = spotipy.oauth2.SpotifyClientCredentials(client_id=auth['client_id'], client_secret=auth['client_secret'])
spotify = spotipy.Spotify(client_credentials_manager=credentials)

In [19]:
pd.read_csv('top_weekly_lists/regional-global-weekly-2019-12-27--2020-01-03.csv', header=1)

Unnamed: 0,Position,Track Name,Artist,Streams,URL
0,1,Dance Monkey,Tones and I,50183626,https://open.spotify.com/track/1rgnBhdG2JDFTbY...
1,2,ROXANNE,Arizona Zervas,33254585,https://open.spotify.com/track/696DnlkuDOXcMAn...
2,3,Tusa,KAROL G,29349573,https://open.spotify.com/track/7k4t7uLgtOxPwTp...
3,4,Memories,Maroon 5,26067853,https://open.spotify.com/track/2b8fOow8UzyDFAE...
4,5,Falling,Trevor Daniel,25901488,https://open.spotify.com/track/4TnjEaWOeW0eKTK...
...,...,...,...,...,...
195,196,Cheirosa - Ao Vivo,Jorge & Mateus,4623030,https://open.spotify.com/track/2PWjKmjyTZeDpmO...
196,197,Havana (feat. Young Thug),Camila Cabello,4620876,https://open.spotify.com/track/1rfofaqEpACxVEH...
197,198,Surtada - Remix Brega Funk,Dadá Boladão,4607385,https://open.spotify.com/track/5F8ffc8KWKNawll...
198,199,Lover (Remix) [feat. Shawn Mendes],Taylor Swift,4595450,https://open.spotify.com/track/3i9UVldZOE0aD0J...


In [24]:
# get list of artists on the top 200 weekly lists 30/8/19 to 28/2/20 (about 6 months)
top_artists = set()
for filename in os.listdir('top_weekly_lists'):
    df = pd.read_csv(os.path.join('top_weekly_lists', filename), header=1)
    artists = set(df['Artist'])
    top_artists.update(artists)

In [38]:
# get songs for top artists

n = len(top_artists)

song_ids = set()
for i, artist in enumerate(top_artists):
    print(f'Progress: {i}/{n}')
    artist_dict = spotify.search(artist, type='artist')['artists']['items'][0]
    if artist_dict['popularity'] < 50:
        # warn since this probably isn't the right artist
        print(f"Artist {artist_dict['name']} has popularity {artist_dict['popularity']}")
    # get all the artist's albums
    albums = spotify.artist_albums(artist_dict['id'])['items']
    for album in albums:
        # get all songs in the album
        songs = spotify.album_tracks(album['id'])['items']
        song_ids.update(song['id'] for song in songs)
    print(f'\tSongs collected: {len(song_ids)}')

Progress: 0/280
	Songs collected: 183
Progress: 1/280
	Songs collected: 613
Progress: 2/280
	Songs collected: 796
Progress: 3/280
	Songs collected: 1027
Progress: 4/280
	Songs collected: 1056
Progress: 5/280
	Songs collected: 1658
Progress: 6/280
	Songs collected: 2073
Progress: 7/280
	Songs collected: 2166
Progress: 8/280
	Songs collected: 2205
Progress: 9/280
retrying ...2secs
	Songs collected: 2359
Progress: 10/280
	Songs collected: 2434
Progress: 11/280
	Songs collected: 2772
Progress: 12/280
	Songs collected: 3077
Progress: 13/280
	Songs collected: 3499
Progress: 14/280
	Songs collected: 3525
Progress: 15/280
	Songs collected: 3571
Progress: 16/280
	Songs collected: 3600
Progress: 17/280
	Songs collected: 3747
Progress: 18/280
	Songs collected: 3917
Progress: 19/280
retrying ...2secs
	Songs collected: 4242
Progress: 20/280
	Songs collected: 4464
Progress: 21/280
	Songs collected: 4780
Progress: 22/280
	Songs collected: 4906
Progress: 23/280
	Songs collected: 5003
Progress: 24/280


	Songs collected: 36242
Progress: 192/280
	Songs collected: 36581
Progress: 193/280
	Songs collected: 36868
Progress: 194/280
	Songs collected: 36933
Progress: 195/280
	Songs collected: 37008
Progress: 196/280
	Songs collected: 37053
Progress: 197/280
retrying ...2secs
	Songs collected: 37133
Progress: 198/280
	Songs collected: 37621
Progress: 199/280
	Songs collected: 37749
Progress: 200/280
	Songs collected: 37921
Progress: 201/280
	Songs collected: 38005
Progress: 202/280
	Songs collected: 38018
Progress: 203/280
	Songs collected: 38305
Progress: 204/280
	Songs collected: 38306
Progress: 205/280
	Songs collected: 38773
Progress: 206/280
	Songs collected: 38858
Progress: 207/280
	Songs collected: 39032
Progress: 208/280
	Songs collected: 39109
Progress: 209/280
	Songs collected: 39951
Progress: 210/280
retrying ...1secs
	Songs collected: 40314
Progress: 211/280
	Songs collected: 40319
Progress: 212/280
	Songs collected: 40414
Progress: 213/280
	Songs collected: 40670
Progress: 214/28

In [43]:
with open('song_ids.json', 'w') as fh:
    json.dump(list(song_ids), fh)

In [75]:
with open('song_ids.json', 'r') as fh:
    song_ids = json.load(fh)

Features to include:

From `.track`:

* number of available markets
* is explicit
* release date
* popularity
* track_number

from `.audio_features`:

* danceability
* energy
* key
* loudness
* mode
* speechiness
* instrumentalness
* liveness
* valence
* tempo
* duration_ms
* time_signature

Careful! Popularity will depend on date collected

In [88]:
def get_info(start_index=0, end_index=None):
    if end_index is None:
        end_index = len(song_ids) - 1
    data = []
    for i in range(start_index, end_index+1):
        print(f'Working... {start_index}->{i}->{end_index}')
        song_id = song_ids[i]
        track = spotify.track(song_id)
        features = spotify.audio_features(song_id)[0]
        if features is None:
            features = {}
        data.append({
            'id': song_id,
            'popularity': track.get('popularity'),
            'explicit': track.get('explicit'),
            'availability': len(track.get('available_markets')),
            'track_number': track.get('track_number'),
            'release_date': track['album'].get('release_date'),
            'num_artists': len(track.get('artists')),
            'danceability': features.get('danceability'),
            'energy': features.get('energy'),
            'key': features.get('key'),
            'loudness': features.get('loudness'),
            'mode': features.get('mode'),
            'speechiness': features.get('speechiness'),
            'acousticness': features.get('acousticness'),
            'instrumentalness': features.get('instrumentalness'),
            'liveness': features.get('liveness'),
            'valence': features.get('valence'),
            'tempo': features.get('tempo'),
            'duration_ms': features.get('duration_ms'),
            'time_signature': features.get('time_signature')
        })
    pd.DataFrame(data).set_index('id').to_csv(os.path.join('data', f'song_data_{start_index}_{end_index}.csv'))

In [None]:
%time get_info(start_index=40001)

Working... 40001->40001->52164
Working... 40001->40002->52164
Working... 40001->40003->52164
Working... 40001->40004->52164
Working... 40001->40005->52164
Working... 40001->40006->52164
Working... 40001->40007->52164
Working... 40001->40008->52164
Working... 40001->40009->52164
Working... 40001->40010->52164
Working... 40001->40011->52164
Working... 40001->40012->52164
Working... 40001->40013->52164
Working... 40001->40014->52164
Working... 40001->40015->52164
Working... 40001->40016->52164
Working... 40001->40017->52164
Working... 40001->40018->52164
Working... 40001->40019->52164
Working... 40001->40020->52164
Working... 40001->40021->52164
Working... 40001->40022->52164
Working... 40001->40023->52164
Working... 40001->40024->52164
Working... 40001->40025->52164
Working... 40001->40026->52164
Working... 40001->40027->52164
Working... 40001->40028->52164
Working... 40001->40029->52164
Working... 40001->40030->52164
Working... 40001->40031->52164
Working... 40001->40032->52164
Working.

Working... 40001->40263->52164
Working... 40001->40264->52164
Working... 40001->40265->52164
Working... 40001->40266->52164
Working... 40001->40267->52164
Working... 40001->40268->52164
Working... 40001->40269->52164
Working... 40001->40270->52164
Working... 40001->40271->52164
Working... 40001->40272->52164
Working... 40001->40273->52164
Working... 40001->40274->52164
Working... 40001->40275->52164
Working... 40001->40276->52164
Working... 40001->40277->52164
Working... 40001->40278->52164
Working... 40001->40279->52164
Working... 40001->40280->52164
Working... 40001->40281->52164
Working... 40001->40282->52164
Working... 40001->40283->52164
Working... 40001->40284->52164
Working... 40001->40285->52164
Working... 40001->40286->52164
Working... 40001->40287->52164
Working... 40001->40288->52164
Working... 40001->40289->52164
Working... 40001->40290->52164
Working... 40001->40291->52164
Working... 40001->40292->52164
Working... 40001->40293->52164
Working... 40001->40294->52164
Working.

Working... 40001->40525->52164
Working... 40001->40526->52164
Working... 40001->40527->52164
Working... 40001->40528->52164
Working... 40001->40529->52164
Working... 40001->40530->52164
Working... 40001->40531->52164
Working... 40001->40532->52164
Working... 40001->40533->52164
Working... 40001->40534->52164
Working... 40001->40535->52164
Working... 40001->40536->52164
Working... 40001->40537->52164
Working... 40001->40538->52164
Working... 40001->40539->52164
Working... 40001->40540->52164
Working... 40001->40541->52164
Working... 40001->40542->52164
Working... 40001->40543->52164
Working... 40001->40544->52164
Working... 40001->40545->52164
Working... 40001->40546->52164
Working... 40001->40547->52164
Working... 40001->40548->52164
Working... 40001->40549->52164
Working... 40001->40550->52164
Working... 40001->40551->52164
Working... 40001->40552->52164
Working... 40001->40553->52164
Working... 40001->40554->52164
Working... 40001->40555->52164
Working... 40001->40556->52164
Working.

Working... 40001->40789->52164
Working... 40001->40790->52164
Working... 40001->40791->52164
Working... 40001->40792->52164
Working... 40001->40793->52164
Working... 40001->40794->52164
Working... 40001->40795->52164
Working... 40001->40796->52164
Working... 40001->40797->52164
Working... 40001->40798->52164
Working... 40001->40799->52164
Working... 40001->40800->52164
Working... 40001->40801->52164
Working... 40001->40802->52164
Working... 40001->40803->52164
Working... 40001->40804->52164
Working... 40001->40805->52164
Working... 40001->40806->52164
Working... 40001->40807->52164
Working... 40001->40808->52164
Working... 40001->40809->52164
Working... 40001->40810->52164
Working... 40001->40811->52164
Working... 40001->40812->52164
Working... 40001->40813->52164
Working... 40001->40814->52164
Working... 40001->40815->52164
Working... 40001->40816->52164
Working... 40001->40817->52164
Working... 40001->40818->52164
Working... 40001->40819->52164
Working... 40001->40820->52164
Working.

Working... 40001->41054->52164
Working... 40001->41055->52164
Working... 40001->41056->52164
Working... 40001->41057->52164
Working... 40001->41058->52164
Working... 40001->41059->52164
Working... 40001->41060->52164
Working... 40001->41061->52164
Working... 40001->41062->52164
Working... 40001->41063->52164
Working... 40001->41064->52164
Working... 40001->41065->52164
Working... 40001->41066->52164
Working... 40001->41067->52164
Working... 40001->41068->52164
Working... 40001->41069->52164
Working... 40001->41070->52164
Working... 40001->41071->52164
retrying ...2secs
Working... 40001->41072->52164
Working... 40001->41073->52164
Working... 40001->41074->52164
Working... 40001->41075->52164
Working... 40001->41076->52164
Working... 40001->41077->52164
Working... 40001->41078->52164
Working... 40001->41079->52164
Working... 40001->41080->52164
Working... 40001->41081->52164
Working... 40001->41082->52164
Working... 40001->41083->52164
Working... 40001->41084->52164
Working... 40001->410

Working... 40001->41317->52164
Working... 40001->41318->52164
Working... 40001->41319->52164
Working... 40001->41320->52164
Working... 40001->41321->52164
Working... 40001->41322->52164
Working... 40001->41323->52164
Working... 40001->41324->52164
Working... 40001->41325->52164
Working... 40001->41326->52164
Working... 40001->41327->52164
Working... 40001->41328->52164
Working... 40001->41329->52164
Working... 40001->41330->52164
Working... 40001->41331->52164
Working... 40001->41332->52164
Working... 40001->41333->52164
Working... 40001->41334->52164
Working... 40001->41335->52164
Working... 40001->41336->52164
Working... 40001->41337->52164
Working... 40001->41338->52164
Working... 40001->41339->52164
Working... 40001->41340->52164
Working... 40001->41341->52164
Working... 40001->41342->52164
Working... 40001->41343->52164
Working... 40001->41344->52164
Working... 40001->41345->52164
Working... 40001->41346->52164
Working... 40001->41347->52164
Working... 40001->41348->52164
Working.

Working... 40001->41581->52164
Working... 40001->41582->52164
Working... 40001->41583->52164
Working... 40001->41584->52164
Working... 40001->41585->52164
Working... 40001->41586->52164
Working... 40001->41587->52164
Working... 40001->41588->52164
Working... 40001->41589->52164
Working... 40001->41590->52164
Working... 40001->41591->52164
Working... 40001->41592->52164
Working... 40001->41593->52164
Working... 40001->41594->52164
Working... 40001->41595->52164
Working... 40001->41596->52164
Working... 40001->41597->52164
Working... 40001->41598->52164
Working... 40001->41599->52164
Working... 40001->41600->52164
Working... 40001->41601->52164
Working... 40001->41602->52164
Working... 40001->41603->52164
Working... 40001->41604->52164
Working... 40001->41605->52164
Working... 40001->41606->52164
Working... 40001->41607->52164
Working... 40001->41608->52164
Working... 40001->41609->52164
Working... 40001->41610->52164
Working... 40001->41611->52164
Working... 40001->41612->52164
Working.

Working... 40001->41843->52164
Working... 40001->41844->52164
Working... 40001->41845->52164
Working... 40001->41846->52164
Working... 40001->41847->52164
Working... 40001->41848->52164
Working... 40001->41849->52164
Working... 40001->41850->52164
Working... 40001->41851->52164
Working... 40001->41852->52164
Working... 40001->41853->52164
Working... 40001->41854->52164
Working... 40001->41855->52164
Working... 40001->41856->52164
Working... 40001->41857->52164
Working... 40001->41858->52164
Working... 40001->41859->52164
Working... 40001->41860->52164
Working... 40001->41861->52164
Working... 40001->41862->52164
Working... 40001->41863->52164
Working... 40001->41864->52164
Working... 40001->41865->52164
Working... 40001->41866->52164
Working... 40001->41867->52164
Working... 40001->41868->52164
Working... 40001->41869->52164
Working... 40001->41870->52164
Working... 40001->41871->52164
Working... 40001->41872->52164
Working... 40001->41873->52164
Working... 40001->41874->52164
Working.

Working... 40001->42105->52164
Working... 40001->42106->52164
Working... 40001->42107->52164
Working... 40001->42108->52164
Working... 40001->42109->52164
Working... 40001->42110->52164
Working... 40001->42111->52164
Working... 40001->42112->52164
Working... 40001->42113->52164
Working... 40001->42114->52164
Working... 40001->42115->52164
Working... 40001->42116->52164
Working... 40001->42117->52164
Working... 40001->42118->52164
Working... 40001->42119->52164
Working... 40001->42120->52164
Working... 40001->42121->52164
Working... 40001->42122->52164
Working... 40001->42123->52164
Working... 40001->42124->52164
Working... 40001->42125->52164
Working... 40001->42126->52164
Working... 40001->42127->52164
Working... 40001->42128->52164
Working... 40001->42129->52164
Working... 40001->42130->52164
Working... 40001->42131->52164
Working... 40001->42132->52164
Working... 40001->42133->52164
Working... 40001->42134->52164
Working... 40001->42135->52164
Working... 40001->42136->52164
Working.

In [46]:
track = spotify.track('1DxSv8dlsrc0LYSiqzuco5')

In [60]:
features = spotify.audio_features('1DxSv8dlsrc0LYSiqzuco5')

In [61]:
features

[{'danceability': 0.683,
  'energy': 0.643,
  'key': 9,
  'loudness': -4.242,
  'mode': 0,
  'speechiness': 0.0939,
  'acousticness': 0.127,
  'instrumentalness': 0.000416,
  'liveness': 0.235,
  'valence': 0.385,
  'tempo': 125.108,
  'type': 'audio_features',
  'id': '1DxSv8dlsrc0LYSiqzuco5',
  'uri': 'spotify:track:1DxSv8dlsrc0LYSiqzuco5',
  'track_href': 'https://api.spotify.com/v1/tracks/1DxSv8dlsrc0LYSiqzuco5',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/1DxSv8dlsrc0LYSiqzuco5',
  'duration_ms': 112953,
  'time_signature': 4}]