In [1]:
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import librosa
import math
import time

In [2]:
# Load the Spotify API client id and secret from a local JSON file.
# The file is opened in a `with` block so the handle is closed as soon as
# the credentials have been parsed (the original left it open).
with open('data/apikeys/.apikeys.json') as f:
    apikeys = json.load(f)

CLIENT_ID = apikeys['clientId']
CLIENT_SECRET = apikeys['clientSecret']

In [3]:
#get access token
def authenticate_token():
    """Request a client-credentials access token and build the auth header.

    Posts CLIENT_ID and CLIENT_SECRET to the Spotify accounts token endpoint
    and wraps the returned access token in a bearer ``Authorization`` header.

    Returns:
        dict: ``{'Authorization': 'Bearer <access_token>'}``, suitable for
        the ``headers=`` argument of ``requests`` calls.

    Raises:
        requests.HTTPError: if the token endpoint answers with an error
            status (for example, rejected credentials). Without this check
            the failure would surface as ``KeyError: 'access_token'``.
    """
    AUTH_URL = 'https://accounts.spotify.com/api/token'

    # Form-encoded body, passed explicitly as `data=`.
    auth_response = requests.post(AUTH_URL, data={
        'grant_type': 'client_credentials',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
    })
    auth_response.raise_for_status()

    # Pull the access token out of the JSON payload.
    access_token = auth_response.json()['access_token']

    return {
        'Authorization': f'Bearer {access_token}'
    }

headers = authenticate_token()

In [4]:
# base URL of all Spotify API endpoints
BASE_URL = 'https://api.spotify.com/v1/'

# Request the genre names that may be used as recommendation seeds.
# `genre_seeds` holds the raw Response here; the next cell replaces it with
# the parsed payload.
genre_seeds = requests.get(BASE_URL + 'recommendations/available-genre-seeds', headers=headers)
# Stop on an HTTP error so the failure is reported where it happened rather
# than as a KeyError when the payload is unpacked.
genre_seeds.raise_for_status()

In [5]:
# Replace the Response object with the value stored under the 'genres' key
# of its parsed JSON body.
# NOTE(review): this rebinds `genre_seeds` to a different kind of object, so
# the cell cannot be re-run on its own without re-running the request cell.
genre_seeds = genre_seeds.json()['genres']

In [11]:
# Persist the genre seeds as a single-column ('genre') feather file.
# NOTE(review): this import sits mid-notebook and the cell's execution count
# (11) is out of order with its neighbours; consider moving the import to the
# import cell at the top.
from pyarrow import feather
feather.write_feather(pd.DataFrame(genre_seeds, columns=['genre']), 'data/genre_seeds.feather')

In [6]:
# For every genre seed, request up to 100 recommended tracks and record the
# first listed artist of each track.
results = []
for idx, genre in enumerate(genre_seeds):
    params = {
        'seed_genres': genre,
        'limit': 100
    }

    recs = requests.get(BASE_URL + 'recommendations', params=params, headers=headers)
    # Fail with the HTTP error itself (expired token, rate limit, ...) rather
    # than with a KeyError on the missing 'tracks' key.
    recs.raise_for_status()

    for track in recs.json()['tracks']:
        # Only the first artist credited on the track is kept.
        primary_artist = track['artists'][0]
        # Built inline: the original bound the artist id to a global named
        # `id`, shadowing the builtin for the rest of the kernel session.
        results.append({
            'artist_name': primary_artist['name'],
            'artist_id': primary_artist['id'],
        })
    # '\r' keeps the progress counter on a single output line.
    print(f'{idx+1} / {len(genre_seeds)}', end='\r')

126 / 126

In [7]:
# One row per collected record, with columns artist_name and artist_id.
genre_artists_df = pd.DataFrame(results)

In [8]:
# Drop rows that are exact duplicates (same artist_name and artist_id) and
# renumber the remaining rows with a fresh 0..n-1 index.
genre_artists_df = (
    genre_artists_df
    .drop_duplicates()
    .reset_index(drop=True)
)

In [9]:
# Inspect the artist_id column of the de-duplicated frame.
genre_artists_df.artist_id

0       1uNFoZAHBGtllmzznpCI3s
1       0YLUOdFiedWIWBttlDAQeO
2       3LRldYGJpCAQT5cZKQyi7o
3       2Q44GAsVvUt6y0TwPiZZvS
4       6olE6TJLqED3rqDCT0FyPh
                 ...          
5700    6V46XOVfnoCXHWBeZ92k5b
5701    4G5ZJny3HvX6Il7eHVfnNC
5702    5dss7E1Ph0KxuF1ULfAx4D
5703    4r4tRC1EUpfUCLjwQXP5EA
5704    2GUxWjR8cNgljddVLEp72u
Name: artist_id, Length: 5705, dtype: object

In [10]:
# Number of batches needed so that no batch holds more than 50 artists.
# NOTE(review): despite the name, ceil(n / 50) is a batch *count*, not a
# batch size.
chunk_size = math.ceil(len(genre_artists_df) / 50)

In [11]:
# Display the batch count computed in the previous cell.
chunk_size

115

In [12]:
# Add two placeholder columns filled with NaN. This mutates
# genre_artists_df in place.
genre_artists_df['genres'] = float('nan')
genre_artists_df['popularity'] = float('nan')

In [14]:
# Display the frame; genres and popularity are still NaN placeholders here.
genre_artists_df

Unnamed: 0,artist_name,artist_id,genres,popularity
0,Justin Bieber,1uNFoZAHBGtllmzznpCI3s,,
1,Joshua James,0YLUOdFiedWIWBttlDAQeO,,
2,Hannah Peel,3LRldYGJpCAQT5cZKQyi7o,,
3,Olle Hedberg,2Q44GAsVvUt6y0TwPiZZvS,,
4,Nirvana,6olE6TJLqED3rqDCT0FyPh,,
...,...,...,...,...
5700,Mapumba,6V46XOVfnoCXHWBeZ92k5b,,
5701,Fatoumata Diawara,4G5ZJny3HvX6Il7eHVfnNC,,
5702,Ibrahim Ferrer,5dss7E1Ph0KxuF1ULfAx4D,,
5703,Brooklyn Funk Essentials,4r4tRC1EUpfUCLjwQXP5EA,,


In [19]:
# Enrich every artist with genres and popularity from the "several artists"
# endpoint, requesting the ids in batches.
ARTIST_BATCH_SIZE = 50  # same per-request cap that chunk_size was derived from

genre_artists_full_results = []
# Plain positional slices are used instead of np.array_split, which routes a
# DataFrame through the deprecated DataFrame.swapaxes.
for start in range(0, len(genre_artists_df), ARTIST_BATCH_SIZE):
    artists = genre_artists_df.iloc[start:start + ARTIST_BATCH_SIZE]
    params = {'ids': ','.join(artists.artist_id)}
    several_artists = requests.get(BASE_URL+'artists/', params=params, headers=headers)
    # Surface HTTP errors directly instead of as a KeyError on 'artists'.
    several_artists.raise_for_status()

    # Parse the response body once per batch (it was re-parsed twice per row).
    artist_details = several_artists.json()['artists']
    if len(artist_details) != len(artists):
        raise ValueError(
            f'expected {len(artists)} artists in the response, got {len(artist_details)}'
        )

    # Rows are paired with response entries by position, i.e. this assumes
    # the API answers in the order the ids were sent (as the original did).
    for offset, (artist_name, artist_id, details) in enumerate(
            zip(artists.artist_name, artists.artist_id, artist_details)):
        genre_artists_full_results.append({
            'artist_name': artist_name,
            'artist_id': artist_id,
            'genres': details['genres'],
            'popularity': details['popularity'],
        })
        print(f'{start + offset + 1} / {len(genre_artists_df)}', end='\r')

# Rebuild the frame from the enriched records.
genre_artists_df = pd.DataFrame(genre_artists_full_results)

5705 / 5705

In [20]:
# Display the rebuilt frame, now carrying genres and popularity per artist.
genre_artists_df

Unnamed: 0,artist_name,artist_id,genres,popularity
0,Justin Bieber,1uNFoZAHBGtllmzznpCI3s,"[canadian pop, pop]",90
1,Joshua James,0YLUOdFiedWIWBttlDAQeO,[utah indie],38
2,Hannah Peel,3LRldYGJpCAQT5cZKQyi7o,"[chamber psych, folktronica, spectra]",33
3,Olle Hedberg,2Q44GAsVvUt6y0TwPiZZvS,[],8
4,Nirvana,6olE6TJLqED3rqDCT0FyPh,"[alternative rock, grunge, permanent wave, rock]",80
...,...,...,...,...
5700,Mapumba,6V46XOVfnoCXHWBeZ92k5b,[],6
5701,Fatoumata Diawara,4G5ZJny3HvX6Il7eHVfnNC,"[afropop, desert blues, malian blues, mande po...",54
5702,Ibrahim Ferrer,5dss7E1Ph0KxuF1ULfAx4D,"[bolero, cuban rumba, latin jazz, musica tradi...",47
5703,Brooklyn Funk Essentials,4r4tRC1EUpfUCLjwQXP5EA,"[modern funk, nu jazz]",41


In [21]:
# List of artist frames. Element 0 is the seed set gathered from the genre
# recommendations; the expansion loop appends one more frame per hop.
related_dfs = [genre_artists_df]

In [None]:
# Two-hop expansion: hop 1 collects the related artists of the seed artists
# in related_dfs[0], hop 2 collects the related artists of hop 1.
for i in range(1, 3):
    # Discard frames left over from an earlier run of this cell, so that a
    # re-run rebuilds hop i instead of appending a stale extra frame.
    del related_dfs[i:]
    source_df = related_dfs[i-1]

    new_artists = []
    for idx, artist in source_df.iterrows():
        related_url = BASE_URL+'artists/'+artist.artist_id+'/related-artists'
        while True:
            related = requests.get(related_url, headers=headers)
            if related.ok:
                break
            if related.status_code == 401:
                # Access token no longer accepted: get a fresh one and retry.
                headers = authenticate_token()
            elif related.status_code == 429:
                # Rate limited: wait as long as the API asks, falling back
                # to the original fixed 30 s when no Retry-After is sent.
                time.sleep(int(related.headers.get('Retry-After', 30)))
            else:
                # Any other error is not retried; raise it here rather than
                # hit a KeyError on the missing 'artists' key below.
                related.raise_for_status()

        for new_artist in related.json()['artists']:
            new_artists.append({
                'artist_name': new_artist['name'],
                'artist_id': new_artist['id'],
                'genres': new_artist['genres'],
                'popularity': new_artist['popularity'],
            })
        print(f'{idx+1} / {len(source_df)}', end='\r')
    print('\n')

    # Explicit columns keep drop_duplicates(subset=...) valid even when no
    # related artists were found. The index is reset so the next hop's
    # progress counter (idx+1) counts 1..n instead of showing the sparse
    # labels that survive de-duplication.
    hop_df = pd.DataFrame(
        new_artists,
        columns=['artist_name', 'artist_id', 'genres', 'popularity'],
    )
    related_dfs.append(
        hop_df.drop_duplicates(subset=['artist_id']).reset_index(drop=True)
    )
        

In [37]:
# Stack the seed frame and every expansion hop into one frame.
all_artists = pd.concat(related_dfs)

In [38]:
# Keep the first row seen for each artist_id and renumber the index 0..n-1.
all_artists = (
    all_artists
    .drop_duplicates(subset=['artist_id'])
    .reset_index(drop=True)
)

In [None]:
# Accumulator for per-track records, filled by the loop in the next cell.
# NOTE(review): because it lives in its own cell, re-running only the loop
# cell appends to whatever is already in this list.
all_tracks = []

In [121]:
# Fetch the top tracks (US market) of every artist and flatten them into one
# record per track, carrying the artist's metadata along.
for idx, artist in all_artists.iterrows():
    top_tracks_url = BASE_URL+'artists/'+artist.artist_id+'/top-tracks?market=US'
    # Same recovery policy as the related-artists loop: this cell issues one
    # request per artist, so token expiry and rate limiting must not abort it.
    while True:
        top_tracks = requests.get(top_tracks_url, headers=headers)
        if top_tracks.ok:
            break
        if top_tracks.status_code == 401:
            # Access token no longer accepted: get a fresh one and retry.
            headers = authenticate_token()
        elif top_tracks.status_code == 429:
            # Rate limited: wait as long as the API asks (30 s fallback).
            time.sleep(int(top_tracks.headers.get('Retry-After', 30)))
        else:
            # Not retryable; raise here rather than hit KeyError: 'tracks'.
            top_tracks.raise_for_status()

    for track in top_tracks.json()['tracks']:
        all_tracks.append({
            'track_id': track['id'],
            'track_name': track['name'],
            'track_preview_link': track['preview_url'],
            'track_popularity': track['popularity'],
            'track_uri': track['uri'],
            'release_date': track['album']['release_date'],
            'artist_name': artist.artist_name,
            'artist_id': artist.artist_id,
            'artist_genres': artist.genres,
            'artist_popularity': artist.popularity,
        })
    print(f'{idx+1} / {len(all_artists)}', end='\r')

# One row per (artist, top track) record collected above.
all_tracks_df = pd.DataFrame(all_tracks)

72259 / 72259

In [162]:
# NOTE(review): in the visible notebook track_chunk_size is only assigned in
# the cell *below* this one, so running top to bottom on a fresh kernel
# raises NameError here.
track_chunk_size

12771

In [None]:
# Look up the album release date of every track via the "several tracks"
# endpoint, requesting the ids in batches.
# NOTE(review): all_tracks_2 is not defined in any visible cell (the
# top-tracks loop produces all_tracks_df) - presumably it came from a removed
# cell; confirm what it should be before re-running.
TRACK_BATCH_SIZE = 50

# Number of batches needed so that no batch exceeds TRACK_BATCH_SIZE tracks.
track_chunk_size = math.ceil(len(all_tracks_2) / TRACK_BATCH_SIZE)

release_dates = []
# Plain positional slices are used instead of np.array_split, which routes a
# DataFrame through the deprecated DataFrame.swapaxes.
for start in range(0, len(all_tracks_2), TRACK_BATCH_SIZE):
    tracks = all_tracks_2.iloc[start:start + TRACK_BATCH_SIZE]
    params = {'ids': ','.join(tracks.track_id),
              'market': 'US'}
    while True:
        several_tracks = requests.get(BASE_URL+'tracks/', params=params, headers=headers)
        if several_tracks.ok:
            break
        if several_tracks.status_code == 401:
            # Access token no longer accepted: get a fresh one and retry.
            headers = authenticate_token()
        elif several_tracks.status_code == 429:
            # Rate limited: wait as long as the API asks (30 s fallback).
            time.sleep(int(several_tracks.headers.get('Retry-After', 30)))
        else:
            # Not retryable; raise here rather than hit KeyError: 'tracks'.
            several_tracks.raise_for_status()

    # Parse the response body once per batch (it was re-parsed twice per row).
    track_details = several_tracks.json()['tracks']
    if len(track_details) != len(tracks):
        raise ValueError(
            f'expected {len(tracks)} tracks in the response, got {len(track_details)}'
        )

    for offset, details in enumerate(track_details):
        # Both fields are taken from the API response, as in the original.
        release_dates.append({
            'track_id': details['id'],
            'release_date': details['album']['release_date'],
        })
        print(f'{start + offset + 1} / {len(all_tracks_2)}', end='\r')

In [164]:
# Number of {track_id, release_date} records collected so far.
len(release_dates)

459111

In [174]:
# Persist the track and artist frames as feather files.
# NOTE(review): `feather` is already imported in an earlier cell, so this
# import is redundant. all_tracks_2 is not defined in any visible cell, and
# release_dates is not written by this cell - confirm both are intended.
from pyarrow import feather
feather.write_feather(all_tracks_2, 'data/all_tracks.feather')
feather.write_feather(all_artists, 'data/all_artists.feather')