# Functions to scrape the Spotify API
These functions query the Spotify Web API and build the artist data set for the Instagram Reels playlist.

In [1]:
import requests
import regex as re
import json
import time

In [2]:
def get_spotify_token(client_id=None,client_secret=None):
    '''
    Get the spotify api access token using client id and secret from post request
    (client-credentials grant).

    PARAMETERS
    ----------
    client_id: str, optional
        client id for spotify developer account. When omitted, the
        SPOTIFY_CLIENT_ID environment variable is used.
    client_secret: str, optional
        client secret for spotify developer account. When omitted, the
        SPOTIFY_CLIENT_SECRET environment variable is used.

    RETURNS
    -------
    str
        bearer access token for spotify api

    RAISES
    ------
    ValueError
        if a credential is neither passed in nor present in the environment
    requests.exceptions.HTTPError
        if the token endpoint answers with a 4xx/5xx status
    '''
    import os  # local import: only needed for the environment fallback

    client_id = client_id or os.environ.get("SPOTIFY_CLIENT_ID")
    client_secret = client_secret or os.environ.get("SPOTIFY_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise ValueError(
            "Spotify credentials are missing: pass client_id/client_secret or set "
            "SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET."
        )

    # Passing a dict lets requests form-encode the values, so credentials that
    # contain characters such as '&' or '=' are transmitted correctly.
    response = requests.post(
        url="https://accounts.spotify.com/api/token",
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data={
            "grant_type": "client_credentials",
            "client_id": client_id,
            "client_secret": client_secret,
        },
        timeout=30,
    )
    # Fail with the HTTP error instead of a KeyError on 'access_token' below.
    response.raise_for_status()
    return response.json()['access_token']

In [3]:
def parse_playlist_id(link='skyeler'):
    '''
    Pull the spotify playlist id out of a playlist link.

    PARAMETERS
    ----------
    link: str
        the link to a playlist on spotify

    RETURNS
    -------
    str or None
        the alphanumeric id that follows "/playlist/" in the link,
        or None when the link contains no such segment
    '''
    found = re.search(r'/playlist/([a-zA-Z0-9]+)', link)
    return found.group(1) if found else None

In [4]:
def get_title_info(headers,playlist_id):
    '''
    Looks a playlist up through the spotify api and formats its name and owner as one readable string.

    PARAMETERS
    ----------
    headers: dict
        api headers to use in the request, should be a dict with a bearer token.
    playlist_id: str
        the spotify unique id for a playlist

    RETURNS
    -------
    str
        the playlist name and owner in the format "<playlist name> by <playlist author>"
    '''
    endpoint = f"https://api.spotify.com/v1/playlists/{playlist_id}"
    playlist = requests.get(endpoint, headers=headers).json()
    playlist_name = playlist['name']
    owner_name = playlist['owner']['display_name']
    return f"{playlist_name} by {owner_name}"

In [5]:
def parse_playlist(api_output):
    '''
    Parses the api output of a spotify api playlist tracks call.
    Finds each artist per each song on the playlist and adds the artist to a dictionary,
    with their spotify id and their api_link.

    Playlist items whose 'track' entry is missing or null, and items that carry no
    'artists' list, are skipped instead of raising.

    PARAMETERS
    ----------
    api_output: requests response type
        the output of an api get request; only its .json() method is used

    RETURNS
    -------
    dict
        dictionary where the key is the name of an artist (str) and the value is a
        dictionary with the keys 'id' and 'api_link'. An artist that appears on
        several tracks is stored once (later occurrences overwrite earlier ones).
    '''
    output = {}

    for item in api_output.json()['items']:
        track = item.get('track')
        if not track:
            # nothing to read artists from for this entry
            continue
        for artist in track.get('artists') or []:
            output[artist['name']] = {'id': artist['id'],
                                      'api_link': artist['href']}
    return output

In [6]:
def query_playlist(spotify_token,playlist_link:str):
    '''
    Queries the playlist object to get the tracks.

    PARAMETERS
    ----------
    spotify_token: str
        bearer access token for the spotify api
    playlist_link: str
        link to a playlist on spotify; the playlist id is extracted from it

    RETURNS
    -------
    request object
        the raw response of the playlist tracks request

    RAISES
    ------
    ValueError
        if no playlist id can be found in playlist_link
    '''
    playlist_id = parse_playlist_id(playlist_link)
    if playlist_id is None:
        # Without this guard the request would be sent to ".../playlists/None/tracks".
        raise ValueError(f"Could not find a playlist id in link: {playlist_link!r}")

    headers = {"Authorization": f"Bearer {spotify_token}"}  # set the spotify headers
    # NOTE(review): a single request is made here; if the endpoint paginates its
    # results, long playlists will only yield their first page. Confirm.
    return requests.get(f"https://api.spotify.com/v1/playlists/{playlist_id}/tracks", headers=headers)

In [7]:
def get_artist_info(link,headers):
    '''
    Makes a request based on an artist's link and returns a dictionary with additional
    info about that artist.

    The response object and its decoded payload are printed as progress output.
    Fields that are absent from the payload fall back to an empty list
    (genres, img_info) or None (popularity, followers).

    PARAMETERS
    ----------
    link: str
        spotify api link to get information about an artist
    headers: dict
        api headers to use in the request, should be a dict with a bearer token.

    RETURNS
    -------
    dict
        a dictionary with the artist's genres, image information, popularity,
        and follower count
    '''
    response = requests.get(link, headers=headers)
    print(response)
    payload = response.json()
    print(payload)

    # Explicit lookups replace the former bare `except:` clauses, which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    fields = payload if isinstance(payload, dict) else {}
    if 'genres' not in fields:
        # keep the extra diagnostic print for payloads without genre data
        print(payload)

    followers = fields.get('followers')
    return {'genres': fields.get('genres', []),
            'img_info': fields.get('images', []),
            'popularity': fields.get('popularity'),
            'followers': followers.get('total') if isinstance(followers, dict) else None}

In [8]:
def get_collaborators(albums:dict)->list:
    '''
    Collects every distinct artist credited on the given albums.

    PARAMETERS
    ----------
    albums: dict
        album listing with an 'items' list; each item has an 'artists' list
        whose entries carry 'name' and 'uri'

    RETURNS
    -------
    list
        unique (name, uri) tuples in no particular order
    '''
    unique_credits = {
        (credit['name'], credit['uri'])
        for album in albums['items']
        for credit in album['artists']
    }
    return list(unique_credits)

In [10]:
import os

# get_spotify_token requires the developer credentials; they are read from the
# environment so they never have to be written into the notebook.
spotify_token = get_spotify_token(os.environ["SPOTIFY_CLIENT_ID"], os.environ["SPOTIFY_CLIENT_SECRET"])
# Download the playlist's tracks and index the artists that appear on them.
playlist_ex = query_playlist(spotify_token,'https://open.spotify.com/playlist/6dSz0AWxkgBfhlNBFyzqFf')
playlist_dict = parse_playlist(playlist_ex)

In [10]:
# Spot-check one artist's entry in the parsed playlist dictionary.
playlist_dict['Billie Eilish']

{'id': '6qqNVTkY8uBg9cP3Jd7DAH',
 'api_link': 'https://api.spotify.com/v1/artists/6qqNVTkY8uBg9cP3Jd7DAH'}

In [11]:
# Build the metadata table for every artist that appears on the playlist.
headers = {"Authorization": f"Bearer {spotify_token}"}
artist_data = {}
for celeb, playlist_entry in playlist_dict.items():
    artist_link = playlist_entry['api_link']
    artist_metadata = get_artist_info(artist_link, headers)
    # The artist's album listing is where the collaborator list comes from.
    albums = requests.get(artist_link + '/albums', headers=headers).json()
    artist_metadata['collaborators'] = get_collaborators(albums)
    artist_metadata['in_playlist'] = True
    artist_data[celeb] = artist_metadata

<Response [200]>
{'external_urls': {'spotify': 'https://open.spotify.com/artist/3c0gDdb9lhnHGFtP4prQpn'}, 'followers': {'href': None, 'total': 463558}, 'genres': ['5th gen k-pop'], 'href': 'https://api.spotify.com/v1/artists/3c0gDdb9lhnHGFtP4prQpn', 'id': '3c0gDdb9lhnHGFtP4prQpn', 'images': [{'url': 'https://i.scdn.co/image/ab6761610000e5eb031e8e2a9c4893810a02f863', 'height': 640, 'width': 640}, {'url': 'https://i.scdn.co/image/ab67616100005174031e8e2a9c4893810a02f863', 'height': 320, 'width': 320}, {'url': 'https://i.scdn.co/image/ab6761610000f178031e8e2a9c4893810a02f863', 'height': 160, 'width': 160}], 'name': 'KATSEYE', 'popularity': 74, 'type': 'artist', 'uri': 'spotify:artist:3c0gDdb9lhnHGFtP4prQpn'}
<Response [200]>
{'external_urls': {'spotify': 'https://open.spotify.com/artist/4nVa6XlBFlIkF6msW57PHp'}, 'followers': {'href': None, 'total': 496990}, 'genres': ['desi hip hop', 'indian underground rap', 'malayalam hip hop'], 'href': 'https://api.spotify.com/v1/artists/4nVa6XlBFlIkF6

In [12]:
# Persist the playlist-artist metadata so it can be reloaded without re-querying the api.
dumped_json_cache = json.dumps(artist_data)
# `with` closes the file even if the write fails part-way.
with open('instagram_reels_data_10_07_24.json', "w") as fw:
    fw.write(dumped_json_cache)

In [13]:
# Count the distinct graph nodes: every playlist artist plus every collaborator entry.
# NOTE(review): playlist artists are added as plain name strings while collaborator
# entries are added as whole entries, so the same person may be counted under both
# forms - confirm this is intended.
nodes = []
for artist_name, metadata in artist_data.items():
    nodes.append(artist_name)
    nodes.extend(metadata['collaborators'])
unique_nodes = set(nodes)
print(len(unique_nodes))

1303


In [18]:
# Reload the cached playlist-artist metadata from disk.
# `with` closes the file even if reading fails.
with open('instagram_reels_data_10_07_24.json', 'r') as cache_file:
    cache_contents = cache_file.read()
# JSON has no tuple type, so collaborator entries come back as [name, uri] lists.
artist_data = json.loads(cache_contents)

In [16]:
import os

# Request a fresh access token; get_spotify_token requires the developer
# credentials, which are read from the environment.
spotify_token = get_spotify_token(os.environ["SPOTIFY_CLIENT_ID"], os.environ["SPOTIFY_CLIENT_SECRET"])
headers = {"Authorization": f"Bearer {spotify_token}"}

In [17]:
import time

In [19]:
import os

# Fetch metadata for the collaborators who are NOT in the playlist (the second layer).
non_playlist_copy = artist_data.copy()
counter = 0
for item in artist_data:
    for x in non_playlist_copy[item]['collaborators']:
        counter += 1
        print(counter)
        name = x[0]
        # non_playlist_copy starts as a copy of artist_data and only ever grows,
        # so one membership test covers playlist artists and collaborators
        # that were already fetched.
        if name in non_playlist_copy:
            continue
        print(name)
        time.sleep(1)  # throttle only when requests are actually about to be sent
        uri = x[1].split(':')[2]
        artist_url = f'https://api.spotify.com/v1/artists/{uri}'
        try:
            artist_metadata = get_artist_info(artist_url, headers)
        except (requests.exceptions.RequestException, ValueError):
            # Request or JSON-decoding failure: get a new token and retry once.
            # A narrow except keeps Ctrl-C able to stop this long-running loop.
            spotify_token = get_spotify_token(os.environ["SPOTIFY_CLIENT_ID"], os.environ["SPOTIFY_CLIENT_SECRET"])
            headers = {"Authorization": f"Bearer {spotify_token}"}
            artist_metadata = get_artist_info(artist_url, headers)
        # want to get albums
        albums = requests.get(artist_url + '/albums', headers=headers).json()
        artist_metadata['collaborators'] = get_collaborators(albums)
        artist_metadata['in_playlist'] = False  # this is false now
        non_playlist_copy[name] = artist_metadata

1
2
KSHMR
<Response [200]>
{'external_urls': {'spotify': 'https://open.spotify.com/artist/2wX6xSig4Rig5kZU6ePlWe'}, 'followers': {'href': None, 'total': 1770167}, 'genres': ['big room', 'dutch house', 'edm', 'electro house', 'indian edm', 'pop dance', 'progressive electro house', 'slap house'], 'href': 'https://api.spotify.com/v1/artists/2wX6xSig4Rig5kZU6ePlWe', 'id': '2wX6xSig4Rig5kZU6ePlWe', 'images': [{'url': 'https://i.scdn.co/image/ab6761610000e5ebd1db941d6e307ab229651fae', 'height': 640, 'width': 640}, {'url': 'https://i.scdn.co/image/ab67616100005174d1db941d6e307ab229651fae', 'height': 320, 'width': 320}, {'url': 'https://i.scdn.co/image/ab6761610000f178d1db941d6e307ab229651fae', 'height': 160, 'width': 160}], 'name': 'KSHMR', 'popularity': 65, 'type': 'artist', 'uri': 'spotify:artist:2wX6xSig4Rig5kZU6ePlWe'}
3
Shalmali Kholgade
<Response [200]>
{'external_urls': {'spotify': 'https://open.spotify.com/artist/6uskWv5K2FA0YOAYr7JOUz'}, 'followers': {'href': None, 'total': 133578}, 

In [20]:
# Persist the combined data (playlist artists plus their collaborators) to disk.
dumped_json_cache = json.dumps(non_playlist_copy)
# `with` closes the file even if the write fails part-way.
with open('instagram_reels_data_10_07_24_second_layer.json', "w") as fw:
    fw.write(dumped_json_cache)