# Functions to scrape the Spotify API
These functions scrape the Spotify API and generate the data needed from the Instagram Reels playlist.

In [2]:
import requests
import regex as re
import json
import time

In [3]:
def get_spotify_token(client_id,client_secret):
    '''
    Get the spotify api access token using client id and secret from a post request.

    PARAMETERS
    ----------
    client_id: str
        client id for spotify developer account
    client_secret: str
        client secret for spotify developer account

    RETURNS
    -------
    str
        bearer access token for spotify api

    RAISES
    ------
    requests.HTTPError
        if the token endpoint answers with a 4xx/5xx status (e.g. bad credentials)
    '''
    response = requests.post(
        url="https://accounts.spotify.com/api/token",
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        # Passing a dict lets requests form-encode the body, so characters such
        # as '&', '=' or '+' inside the credentials are percent-escaped instead
        # of corrupting the request.
        data={
            "grant_type": "client_credentials",
            "client_id": client_id,
            "client_secret": client_secret,
        },
    )
    # Fail with the HTTP error itself rather than a KeyError on 'access_token'.
    response.raise_for_status()
    return response.json()['access_token']

In [4]:
def parse_playlist_id(link='skyeler'):
    '''
    Extract the spotify playlist id from a playlist link.

    PARAMETERS
    ----------
    link: str
        the link to a playlist on spotify; the id is the run of letters and
        digits that follows "/playlist/"

    RETURNS
    -------
    str or None
        the spotify playlist id, or None when the link contains no
        "/playlist/<id>" segment
    '''
    found = re.search(r'/playlist/([a-zA-Z0-9]+)', link)
    return found.group(1) if found else None

In [5]:
def get_title_info(headers,playlist_id):
    '''
    Requests a playlist from the spotify api and formats its name and owner as one string.

    PARAMETERS
    ----------
    headers: dict
        api headers to use in the request, should be a dict with a bearer token.
    playlist_id: str
        the spotify unique id for a playlist

    RETURNS
    -------
    str
        the playlist name and owner in the format "<playlist name> by <playlist author>"
    '''
    endpoint = f"https://api.spotify.com/v1/playlists/{playlist_id}"
    response = requests.get(endpoint, headers=headers)
    payload = response.json()
    title = payload['name']
    owner = payload['owner']['display_name']
    return f"{title} by {owner}"

In [6]:
def parse_playlist(api_output):
    '''
    Parses the api output of a spotify api playlist tracks call.
    Collects every artist credited on every track and maps the artist name to
    their spotify id and api link.

    Playlist items whose 'track' entry is missing or None are skipped instead
    of raising. If the same artist name appears more than once, the last
    occurrence wins.

    PARAMETERS
    ----------
    api_output: requests response type
        the output of an api get request; its .json() must contain an 'items' list

    RETURNS
    -------
    dict
        dictionary where the key is the name of an artist (str) and the value is
        a dictionary of the form {'id': <artist id>, 'api_link': <artist href>}
    '''
    track_list = api_output.json()['items']
    output = {}

    for item in track_list:
        track = item.get('track')
        if not track:
            # No track payload for this playlist entry: nothing to collect.
            continue
        for artist in track['artists']:
            output[artist['name']] = {'id': artist['id'],
                                      'api_link': artist['href']}
    return output

In [7]:
def query_playlist(spotify_token,playlist_link:str):
    '''
    Queries the playlist object to get the tracks.
    A single GET request is made to the playlist's /tracks endpoint; no
    further pages are followed.

    PARAMETERS
    ----------
    spotify_token: str
        bearer access token for the spotify api
    playlist_link: str
        link to a spotify playlist; must contain a "/playlist/<id>" segment

    RETURNS
    -------
    request object
        the raw response of the playlist tracks request

    RAISES
    ------
    ValueError
        if no playlist id can be extracted from playlist_link
    '''
    headers = {"Authorization": f"Bearer {spotify_token}"}  # set the spotify headers
    playlist_id = parse_playlist_id(playlist_link)
    if playlist_id is None:
        # Without this guard the request would go to ".../playlists/None/tracks".
        raise ValueError(f"could not find a playlist id in link: {playlist_link!r}")
    return requests.get(f"https://api.spotify.com/v1/playlists/{playlist_id}/tracks",
                        headers=headers)  # get the tracks

In [8]:
def get_artist_info(link,headers):
    '''
    Makes a request based on an artist's link and returns a dictionary with
    additional info about that artist. The response object and its decoded
    body are printed as progress/debug output. Fields that are absent from
    the response fall back to an empty list or None.

    PARAMETERS
    ----------
    link: str
        spotify api link to get information about an artist
    headers: dict
        api headers to use in the request, should be a dict with a bearer token.

    RETURNS
    -------
    dict
        a dictionary with the keys 'genres' (default []), 'img_info'
        (default []), 'popularity' (default None) and 'followers'
        (the follower total, default None)
    '''
    response = requests.get(link, headers=headers)
    print(response)
    payload = response.json()
    print(payload)

    # Anything other than a JSON object is treated as "no fields present".
    fields = payload if isinstance(payload, dict) else {}

    if 'genres' not in fields:
        # Echo the body again to flag a response that lacks the artist fields.
        print(payload)

    follower_info = fields.get('followers')
    followers = follower_info.get('total') if isinstance(follower_info, dict) else None

    return {'genres': fields.get('genres', []),
            'img_info': fields.get('images', []),
            'popularity': fields.get('popularity'),
            'followers': followers}

In [9]:
def get_collaborators(albums:dict)->list:
    '''
    Gets the unique artists credited on a set of albums.

    PARAMETERS
    ----------
    albums: dict
        album listing with an 'items' list; every item needs an 'artists'
        list whose entries have 'name' and 'uri'

    RETURNS
    -------
    list
        unique (name, uri) tuples, in order of first appearance
    '''
    # dict keys de-duplicate while keeping first-seen order, which makes the
    # result deterministic (a plain set would not be).
    seen = {}
    for album in albums['items']:
        for artist in album['artists']:
            seen[(artist['name'], artist['uri'])] = None
    return list(seen)

In [21]:
def get_collaborators_number(albums:dict)->dict:
    '''
    Counts how many albums in a listing credit each artist.

    PARAMETERS
    ----------
    albums: dict
        album listing with an 'items' list; every item needs an 'artists'
        list whose entries have 'name' and 'uri'. A missing 'items' key
        raises KeyError.

    RETURNS
    -------
    dict
        maps artist name (str) to {'count': <number of credits>, 'uri': <uri
        seen on the first credit>}
    '''
    collaborators = {}
    for album in albums['items']:
        for artist in album['artists']:
            name = artist['name']
            entry = collaborators.get(name)
            if entry is None:
                collaborators[name] = {'count': 1, 'uri': artist['uri']}
            else:
                entry['count'] += 1
    return collaborators

In [20]:
def save_file(data:dict,filepath:str='cache.json')->None:
    '''
    Writes a dictionary to disk as JSON, replacing any existing file.

    PARAMETERS
    ----------
    data: dict
        JSON-serialisable data to store
    filepath: str
        destination path, defaults to 'cache.json'

    RETURNS
    -------
    None
    '''
    # Serialise before opening so a serialisation error cannot truncate an
    # existing file.
    serialized = json.dumps(data)
    # The context manager closes the handle even if the write fails.
    with open(filepath, "w") as fw:
        fw.write(serialized)

In [23]:
def open_file(filepath:str='cache.json')->dict:
    '''
    Reads a JSON file and returns its decoded contents.

    PARAMETERS
    ----------
    filepath: str
        path of the JSON file to read, defaults to 'cache.json'

    RETURNS
    -------
    dict
        the decoded JSON content of the file
    '''
    # The context manager closes the handle even if decoding fails.
    with open(filepath, 'r') as cache_file:
        return json.load(cache_file)

In [24]:
import os

# get the token
# The client id and secret are read from the environment so the credentials
# never have to be written into the notebook itself.
tok1, tok2 = os.environ['SPOTIFY_CLIENT_ID'], os.environ['SPOTIFY_CLIENT_SECRET']
spotify_token = get_spotify_token(tok1,tok2)
headers={"Authorization": f"Bearer {spotify_token}"}
# query the playlist
playlist_ex = query_playlist(spotify_token,'https://open.spotify.com/playlist/6dSz0AWxkgBfhlNBFyzqFf')
playlist_dict = parse_playlist(playlist_ex)
# open cache (artist name -> previously fetched metadata)
cache = open_file('cache.json')

# initialize output
output = {}

# for each celeb in the playlist
for celeb in playlist_dict:
    # reuse the cached record when there is one; no api calls needed
    if celeb in cache:
        output[celeb]=cache[celeb]
        continue
    # otherwise fetch the artist metadata and the collaborators from their albums
    artist_link = playlist_dict[celeb]['api_link']
    artist_metadata = get_artist_info(artist_link,headers)
    albums = requests.get(artist_link+'/albums',headers=headers).json()
    artist_metadata['collaborators']=get_collaborators_number(albums)
    artist_metadata['in_playlist']=True
    # update output & cache
    output[celeb]=artist_metadata
    cache[celeb]=artist_metadata

# update cache
save_file(cache,'cache.json')
# save output
save_file(output,'10_26_1.json')


<Response [200]>
{'external_urls': {'spotify': 'https://open.spotify.com/artist/5HK6QtizXJzCmoYTkvFRik'}, 'followers': {'href': None, 'total': 105313}, 'genres': ['chill drill'], 'href': 'https://api.spotify.com/v1/artists/5HK6QtizXJzCmoYTkvFRik', 'id': '5HK6QtizXJzCmoYTkvFRik', 'images': [{'url': 'https://i.scdn.co/image/ab6761610000e5ebf7cf118ff58706a19039bfdc', 'height': 640, 'width': 640}, {'url': 'https://i.scdn.co/image/ab67616100005174f7cf118ff58706a19039bfdc', 'height': 320, 'width': 320}, {'url': 'https://i.scdn.co/image/ab6761610000f178f7cf118ff58706a19039bfdc', 'height': 160, 'width': 160}], 'name': 'Ndotz', 'popularity': 74, 'type': 'artist', 'uri': 'spotify:artist:5HK6QtizXJzCmoYTkvFRik'}
<Response [200]>
{'external_urls': {'spotify': 'https://open.spotify.com/artist/09cKncAQn28NqTUORLMwSR'}, 'followers': {'href': None, 'total': 129708}, 'genres': ['gym phonk'], 'href': 'https://api.spotify.com/v1/artists/09cKncAQn28NqTUORLMwSR', 'id': '09cKncAQn28NqTUORLMwSR', 'images': [

In [33]:
# collect metadata on the collaborators who are NOT in the playlist
def _fetch_artist_record(artist_id, headers):
    # Fetch one artist's metadata plus the collaborator counts from their albums listing.
    artist_url = f'https://api.spotify.com/v1/artists/{artist_id}'
    record = get_artist_info(artist_url,headers)
    albums = requests.get(artist_url+'/albums',headers=headers).json()
    record['collaborators']=get_collaborators_number(albums)
    return record


non_playlist_copy = output.copy()
counter = 0
for item in output: # for each artist gathered from the playlist
    for name in non_playlist_copy[item]['collaborators']: # for each collaborator
        print(name)
        counter +=1
        print(counter)
        # non_playlist_copy starts as a copy of output and only ever gains
        # keys, so this one test also covers "already in output"
        if name in non_playlist_copy:
            continue
        print(name)
        # check if it is in the cache
        if name in cache:
            artist_metadata = cache[name]
            artist_metadata['in_playlist']=False
            non_playlist_copy[name] = artist_metadata
            continue

        # the uri has the form "spotify:artist:<id>"; keep the id part
        artist_id = non_playlist_copy[item]['collaborators'][name]['uri'].split(':')[2]
        time.sleep(1) # pause only when the api is actually about to be called
        try:
            artist_metadata = _fetch_artist_record(artist_id,headers)
        except Exception:
            # Any failure is treated as a possibly expired token: refresh it
            # and retry once. Catching Exception (not a bare except) lets
            # Ctrl-C still stop the run; a second failure propagates.
            spotify_token = get_spotify_token(tok1,tok2)
            headers={"Authorization": f"Bearer {spotify_token}"}
            artist_metadata = _fetch_artist_record(artist_id,headers)

        artist_metadata['in_playlist']=False # this is false now
        non_playlist_copy[name]=artist_metadata
        #update cache
        cache[name]=artist_metadata

# update cache
save_file(cache,'cache.json')
# save output
save_file(non_playlist_copy,'10_26_2.json')

Hanumankind
1
Kalmi
2
Parimal Shais
3
Parimal Shais
<Response [200]>
{'external_urls': {'spotify': 'https://open.spotify.com/artist/2epRpDIZDqjmxSyysULdTQ'}, 'followers': {'href': None, 'total': 38998}, 'genres': ['malayalam hip hop'], 'href': 'https://api.spotify.com/v1/artists/2epRpDIZDqjmxSyysULdTQ', 'id': '2epRpDIZDqjmxSyysULdTQ', 'images': [{'url': 'https://i.scdn.co/image/ab6761610000e5eb3c069be9b4dd5075b68aaef7', 'height': 640, 'width': 640}, {'url': 'https://i.scdn.co/image/ab676161000051743c069be9b4dd5075b68aaef7', 'height': 320, 'width': 320}, {'url': 'https://i.scdn.co/image/ab6761610000f1783c069be9b4dd5075b68aaef7', 'height': 160, 'width': 160}], 'name': 'Parimal Shais', 'popularity': 51, 'type': 'artist', 'uri': 'spotify:artist:2epRpDIZDqjmxSyysULdTQ'}
Shalmali Kholgade
4
Shalmali Kholgade
<Response [200]>
{'external_urls': {'spotify': 'https://open.spotify.com/artist/6uskWv5K2FA0YOAYr7JOUz'}, 'followers': {'href': None, 'total': 155041}, 'genres': ['filmi', 'modern bollyw

We need to get the number of occurrences of each collaboration count.

In [1]:
def open_file(path:str)->dict:
    '''
    Reads a JSON file and returns its decoded contents.

    PARAMETERS
    ----------
    path: str
        path of the JSON file to read

    RETURNS
    -------
    dict
        the decoded JSON content of the file
    '''
    # The context manager closes the handle even if decoding fails.
    with open(path, 'r') as cache_file:
        return json.load(cache_file)