# Exploratory Text Analytics Final Project

In [51]:
import numpy as np
import pandas as pd
import os
import json
import dotenv
import requests
import lyricsgenius
import base64


## Lyrics Genius API Calls
### Read .env for API tokens

In [50]:
dotenv.load_dotenv()
geniusclientid = os.getenv('geniusclientid')
geniusclientsecret = os.getenv('geniusclientsecret')
geniusclientaccesstoken = os.getenv('geniusclientaccesstoken')
spotifyclientid = os.getenv('spotifyclientid')
spotifyclientsecret = os.getenv('spotifyclientsecret')

### Lyrics Genius Requests

In [3]:
genius = lyricsgenius.Genius(geniusclientaccesstoken)
artist = genius.search_artist('St. Vincent', max_songs=3)

Searching for songs by St. Vincent...

Song 1: "Los Ageless"
Song 2: "New York"
Song 3: "Prince Johnny"

Reached user-specified song limit (3).
Done. Found 3 songs.


The wrapper `lyricsgenius` is a little slow. Let's try calling the api directly

In [4]:
# headers
r = requests.get('https://httpbin.org/user-agent')
useragent = json.loads(r.text)['user-agent']
headers = {'User-Agent':useragent,
           'From':'rhl8pk@virginia.edu'}

In [5]:
root = 'https://api.genius.com'
endpoint = '/search?'
parameters = {'access_token':geniusclientaccesstoken,
              'q':'St. Vincent'}
r = requests.get(root+endpoint, params=parameters, headers=headers)
r

<Response [200]>

In [6]:
myjson = json.loads(r.text)
myjson

{'meta': {'status': 200},
 'response': {'hits': [{'highlights': [],
    'index': 'song',
    'type': 'song',
    'result': {'annotation_count': 9,
     'api_path': '/songs/66160',
     'artist_names': 'Bon Iver & St. Vincent',
     'full_title': 'Rosyln by\xa0Bon\xa0Iver & St. Vincent',
     'header_image_thumbnail_url': 'https://images.genius.com/9ee556816f731ec19f706cd65abe509a.300x300x1.png',
     'header_image_url': 'https://images.genius.com/9ee556816f731ec19f706cd65abe509a.1000x1000x1.png',
     'id': 66160,
     'lyrics_owner_id': 27148,
     'lyrics_state': 'complete',
     'path': '/Bon-iver-and-st-vincent-rosyln-lyrics',
     'pyongs_count': 44,
     'relationships_index_url': 'https://genius.com/Bon-iver-and-st-vincent-rosyln-sample',
     'release_date_components': {'year': 2009, 'month': 10, 'day': 16},
     'release_date_for_display': 'October 16, 2009',
     'release_date_with_abbreviated_month_for_display': 'Oct. 16, 2009',
     'song_art_image_thumbnail_url': 'https://

In [7]:
myjson['response']['hits'][1]['result']['primary_artist']['api_path']

'/artists/2373'

### Loop over artist list to get IDs for more in-depth API calls

In [8]:
artist_list = [
'St. Vincent', 
'LCD Soundsystem',
'Dirty Projectors',
'Ava Luna', 
'Caroline Polachek', 
'Mr. Twin Sister', 
'The Marias', 
'Pure Bathing Culture',
'Cults', 
'The National', 
'Ethel Cain',
'CHVRCHES', 
'Big Thief', 
'Holychild', 
'Yeah Yeah Yeahs', 
'MGMT', 
'Radiohead', 
'Metric', 
'Broken Social Scene'
]

In [10]:
artist_endpoints = {}
for artist in artist_list:
    parameters = {'access_token':geniusclientaccesstoken,
              'q':artist}
    r = requests.get(root+endpoint, params=parameters, headers=headers)
    myjson = json.loads(r.text)
    parsed_endpoint = myjson['response']['hits'][1]['result']['primary_artist']['api_path']
    genius_name = myjson['response']['hits'][1]['result']['primary_artist']['name']
    artist_endpoints[artist] = parsed_endpoint
    print(f'{artist} matched to {genius_name}. Endpoint: {parsed_endpoint}')

St. Vincent matched to St. Vincent. Endpoint: /artists/2373
LCD Soundsystem matched to LCD Soundsystem. Endpoint: /artists/1169
Dirty Projectors matched to Dirty Projectors. Endpoint: /artists/12968
Ava Luna matched to Ava Luna. Endpoint: /artists/390647
Caroline Polachek matched to Caroline Polachek. Endpoint: /artists/63882
Mr. Twin Sister matched to Mr Twin Sister. Endpoint: /artists/230800
The Marias matched to The Marías. Endpoint: /artists/1127174
Pure Bathing Culture matched to Pure Bathing Culture. Endpoint: /artists/216604
Cults matched to Cults. Endpoint: /artists/15724
The National matched to The National. Endpoint: /artists/658
Ethel Cain matched to Ethel Cain. Endpoint: /artists/1904291
CHVRCHES matched to CHVRCHES. Endpoint: /artists/40863
Big Thief matched to Big Thief. Endpoint: /artists/668810
Holychild matched to HOLYCHILD. Endpoint: /artists/309275
Yeah Yeah Yeahs matched to Yeah Yeah Yeahs. Endpoint: /artists/17657
MGMT matched to MGMT. Endpoint: /artists/764
Radioh

Looks like everyone matched except for Metric. Let's scope the JSON for their artist search

In [11]:
endpoint = '/search?'
parameters = {'access_token':geniusclientaccesstoken,
              'q':'Metric'}
r = requests.get(root+endpoint, params=parameters, headers=headers)
myjson = json.loads(r.text)
myjson

{'meta': {'status': 200},
 'response': {'hits': [{'highlights': [],
    'index': 'song',
    'type': 'song',
    'result': {'annotation_count': 11,
     'api_path': '/songs/121943',
     'artist_names': 'Metric',
     'full_title': 'Black Sheep by\xa0Metric',
     'header_image_thumbnail_url': 'https://images.genius.com/921bc72852619829d6fd80f55bb47c3a.300x300x1.jpg',
     'header_image_url': 'https://images.genius.com/921bc72852619829d6fd80f55bb47c3a.1000x1000x1.jpg',
     'id': 121943,
     'lyrics_owner_id': 131,
     'lyrics_state': 'complete',
     'path': '/Metric-black-sheep-lyrics',
     'pyongs_count': 29,
     'relationships_index_url': 'https://genius.com/Metric-black-sheep-sample',
     'release_date_components': {'year': 2010, 'month': 8, 'day': 10},
     'release_date_for_display': 'August 10, 2010',
     'release_date_with_abbreviated_month_for_display': 'Aug. 10, 2010',
     'song_art_image_thumbnail_url': 'https://images.genius.com/b5ff8bf8f3e07cf5a318e90e032cd395.300x

Turns out the [1] to try and skip possible collaborations overshot Metric, but we can use [0] to get their artist endpoint.

For simplicity I'm just manually copying their artist endpoint into the dictionary.

In [12]:
artist_endpoints['Metric'] = '/artists/17675'
artist_endpoints['Metric']

'/artists/17675'

## Spotify API for album information and tracklists
Lyrics Genius API doesn't want me to get albums by artist.....so we will try Spotify's API

In [253]:
def get_spotify_token():
    auth_string = spotifyclientid + ":" + spotifyclientsecret
    auth_bytes = auth_string.encode('utf-8')
    auth_base64 = str(base64.b64encode(auth_bytes), 'utf-8')
    url = 'https://accounts.spotify.com/api/token'
    headers = {
        'Authorization':'Basic ' + auth_base64,
        'Content-Type':'application/x-www-form-urlencoded'
    }
    data = {'grant_type':'client_credentials'}
    result = requests.post(url, headers=headers, data=data)
    json_result = json.loads(result.content)
    token = json_result['access_token']
    return token

def get_spotify_auth_header(token):
    return {'Authorization':'Bearer ' + token}

def search_for_artist(token, artist_name):
    url = 'https://api.spotify.com/v1/search'
    headers = get_spotify_auth_header(token)
    query = f'?q={artist_name}&type=artist&limit=1'
    query_url = url + query
    result = requests.get(query_url, headers=headers)
    json_result = json.loads(result.content)['artists']['items']
    if len(json_result) == 0:
        print(f'Could not find artist with name {artist_name}')
        return None
    return {'artist_name':json_result[0]['name'],
            'artist_id':json_result[0]['id'],
            'followers':json_result[0]['followers']['total'],
            'genres':json_result[0]['genres'],
            'popularity':json_result[0]['popularity']
    }

def get_albums_by_artist(token, artist_id):
    url = f'https://api.spotify.com/v1/artists/{artist_id}/albums?include_groups=album&market=US'
    headers = get_spotify_auth_header(token)
    result = requests.get(url, headers=headers)
    album_list = json.loads(result.content)['items']
    return album_list

def get_tracklist_by_album(token, album_id):
    url = f'https://api.spotify.com/v1/albums/{album_id}/?market=US'
    headers = get_spotify_auth_header(token)
    result = requests.get(url, headers=headers)
    track_list = json.loads(result.content)
    return track_list

def get_audio_features_by_song(token, song_id):
    url = f'https://api.spotify.com/v1/audio-features/{song_id}'
    headers = get_spotify_auth_header(token)
    result = requests.get(url, headers=headers)
    audio_features = json.loads(result.content)
    return audio_features

def parse_album_list(album_list):
    album_dict = {}
    for album in album_list:
        album_name = album['name']
        album_dict[album_name] = {'album_id':album['id'],
                                  'release_date':album['release_date'],
                                  'total_tracks':album['total_tracks'],
                                  'tracklist':{}
        }
    return album_dict

# Since we get to the song level here, we might as well get the audio information here too
def parse_tracklist(token, track_list):
    tracklist_dict = {}
    album_label = track_list['label']
    for track in track_list['tracks']['items']:
        track_name = track['name']
        track_id = track['id']
        audio_info = get_audio_features_by_song(token, track_id)
        tracklist_dict[track_name] = {'track_id':track['id'],
                                      'disc_number':track['disc_number'],
                                      'duration_ms':track['duration_ms'],
                                      'track_number':track['track_number'],
                                      'audio_information':audio_info}
    return album_label, tracklist_dict

In [254]:
spotify_token = get_spotify_token()

In [123]:
search_for_artist(spotify_token, 'St. Vincent')

{'artist_name': 'St. Vincent',
 'artist_id': '7bcbShaqKdcyjnmv4Ix8j6',
 'followers': 832341,
 'genres': ['art pop',
  'etherpop',
  'indie rock',
  'indietronica',
  'metropopolis',
  'neo-synthpop'],
 'popularity': 62}

### Generate Artist Data Dictionary
We want to store artist metadata, albums, album info, tracklist, songs, and lyrics

In [255]:
spotify_artist_data = {}
for artist in artist_list:
    artist_data = search_for_artist(spotify_token, artist)
    spotify_artist_data[artist] = {'Metadata':artist_data}

In [256]:
spotify_artist_data.keys()

dict_keys(['St. Vincent', 'LCD Soundsystem', 'Dirty Projectors', 'Ava Luna', 'Caroline Polachek', 'Mr. Twin Sister', 'The Marias', 'Pure Bathing Culture', 'Cults', 'The National', 'Ethel Cain', 'CHVRCHES', 'Big Thief', 'Holychild', 'Yeah Yeah Yeahs', 'MGMT', 'Radiohead', 'Metric', 'Broken Social Scene'])

In [257]:
spotify_artist_data['St. Vincent']['Metadata']['artist_id']

'7bcbShaqKdcyjnmv4Ix8j6'

In [258]:
spotify_artist_data

{'St. Vincent': {'Metadata': {'artist_name': 'St. Vincent',
   'artist_id': '7bcbShaqKdcyjnmv4Ix8j6',
   'followers': 832341,
   'genres': ['art pop',
    'etherpop',
    'indie rock',
    'indietronica',
    'metropopolis',
    'neo-synthpop'],
   'popularity': 62}},
 'LCD Soundsystem': {'Metadata': {'artist_name': 'LCD Soundsystem',
   'artist_id': '066X20Nz7iquqkkCW6Jxy6',
   'followers': 1080622,
   'genres': ['alternative dance',
    'alternative rock',
    'art pop',
    'dance rock',
    'dance-punk',
    'electronic rock',
    'electronica',
    'indie rock',
    'indietronica',
    'neo-synthpop',
    'new rave'],
   'popularity': 59}},
 'Dirty Projectors': {'Metadata': {'artist_name': 'Dirty Projectors',
   'artist_id': '5VF0YkVLeVD4ytyiyVSIiF',
   'followers': 252286,
   'genres': ['art pop',
    'brooklyn indie',
    'indie rock',
    'indietronica',
    'noise pop'],
   'popularity': 40}},
 'Ava Luna': {'Metadata': {'artist_name': 'Ava Luna',
   'artist_id': '7xkkeemZnfafR

### Get Albums by Artist

In [259]:
for artist in spotify_artist_data:
    artist_id = spotify_artist_data[artist]['Metadata']['artist_id']
    spotify_artist_data[artist]['Albums'] = parse_album_list(get_albums_by_artist(spotify_token, artist_id))

In [260]:
spotify_artist_data['Dirty Projectors']['Albums']

{'Mount Wittenberg Orca (Expanded Edition)': {'album_id': '14xINYt1omt6jlxv8AFbly',
  'release_date': '2023-06-16',
  'total_tracks': 27,
  'tracklist': {}},
 '5EPs': {'album_id': '7u0QlPnsqX06KfavgPeSET',
  'release_date': '2020-11-20',
  'total_tracks': 20,
  'tracklist': {}},
 'Sing The Melody': {'album_id': '48gORfOPT5iEm9eooBAQiv',
  'release_date': '2019-12-10',
  'total_tracks': 8,
  'tracklist': {}},
 'Lamp Lit Prose': {'album_id': '03CZTX0lcoZGy71rOHnDxn',
  'release_date': '2018-07-13',
  'total_tracks': 10,
  'tracklist': {}},
 'Dirty Projectors': {'album_id': '6HA2YKWabsGi6XWkhWoZuA',
  'release_date': '2017-02-21',
  'total_tracks': 9,
  'tracklist': {}},
 'Swing Lo Magellan': {'album_id': '5zQgu9qi6mK8KuleuzM22C',
  'release_date': '2012-07-10',
  'total_tracks': 12,
  'tracklist': {}},
 'Mount Wittenberg Orca': {'album_id': '1C9sn8azB8qNZyPSCgkNJz',
  'release_date': '2011-10-25',
  'total_tracks': 7,
  'tracklist': {}},
 'Bitte Orca (Expanded Edition)': {'album_id': '2M

## Get tracklist by album

In [263]:
for artist in spotify_artist_data:
    for album in spotify_artist_data[artist]['Albums']:
        album_id = spotify_artist_data[artist]['Albums'][album]['album_id']
        album_tracklist = get_tracklist_by_album(spotify_token, album_id)
        spotify_artist_data[artist]['Albums'][album]['label'], spotify_artist_data[artist]['Albums'][album]['tracklist'] = parse_tracklist(spotify_token, album_tracklist)

In [264]:
spotify_artist_data['LCD Soundsystem']['Albums']['american dream']

{'album_id': '0hdimlCTCms7otJCX9OvqM',
 'release_date': '2017-10-06',
 'total_tracks': 11,
 'tracklist': {'oh baby': {'track_id': '1Y5EqnpZLLoAp4RQQTr10O',
   'disc_number': 1,
   'duration_ms': 349693,
   'track_number': 1,
   'audio_information': {'danceability': 0.581,
    'energy': 0.623,
    'key': 5,
    'loudness': -12.005,
    'mode': 1,
    'speechiness': 0.0351,
    'acousticness': 0.0196,
    'instrumentalness': 0.626,
    'liveness': 0.653,
    'valence': 0.78,
    'tempo': 169.443,
    'type': 'audio_features',
    'id': '1Y5EqnpZLLoAp4RQQTr10O',
    'uri': 'spotify:track:1Y5EqnpZLLoAp4RQQTr10O',
    'track_href': 'https://api.spotify.com/v1/tracks/1Y5EqnpZLLoAp4RQQTr10O',
    'analysis_url': 'https://api.spotify.com/v1/audio-analysis/1Y5EqnpZLLoAp4RQQTr10O',
    'duration_ms': 349693,
    'time_signature': 4}},
  'other voices': {'track_id': '0nTW4n6goPkpnoaBBeSjXm',
   'disc_number': 1,
   'duration_ms': 403346,
   'track_number': 2,
   'audio_information': {'danceabilit

## Append lyrics artist endpoint for ease of access

In [265]:
for sp_artist in spotify_artist_data:
    for lg_artist in artist_endpoints:
        if sp_artist == lg_artist:
            spotify_artist_data[sp_artist]['Metadata']['lg_endpoint'] = artist_endpoints[lg_artist]

In [266]:
spotify_artist_data['Metric']

{'Metadata': {'artist_name': 'Metric',
  'artist_id': '1rCIEwPp5OnXW0ornlSsRl',
  'followers': 804601,
  'genres': ['canadian indie',
   'canadian rock',
   'indie rock',
   'indietronica',
   'metropopolis',
   'neo-synthpop'],
  'popularity': 62,
  'lg_endpoint': '/artists/17675'},
 'Albums': {'Formentera II': {'album_id': '4SH7eiRuT8MdrLnL52gB1O',
   'release_date': '2023-10-13',
   'total_tracks': 9,
   'tracklist': {'Detour Up': {'track_id': '0urzesdU9xIlWnvUaRBDaV',
     'disc_number': 1,
     'duration_ms': 218416,
     'track_number': 1,
     'audio_information': {'danceability': 0.682,
      'energy': 0.738,
      'key': 9,
      'loudness': -6.394,
      'mode': 0,
      'speechiness': 0.0317,
      'acousticness': 0.00393,
      'instrumentalness': 0.00495,
      'liveness': 0.562,
      'valence': 0.584,
      'tempo': 116.906,
      'type': 'audio_features',
      'id': '0urzesdU9xIlWnvUaRBDaV',
      'uri': 'spotify:track:0urzesdU9xIlWnvUaRBDaV',
      'track_href': 'http

## Get song lyrics from genius using spotify artist name and song name
The idea here is that if we match on both fields, it is a song we want.

## Save current work to json file

In [None]:
# with open('spotify_artist_data.json', 'w') as outfile:
#     json.dump(spotify_artist_data, outfile)

# To Read
# f = open('spotify_artist_data.json')
# data = json.load(f)
# f.close()