<img src="https://github.com/rjpost20/Onramp-Project/blob/main/data/pexels-vishnu-r-nair-1105666.jpg?raw=true">
Image by <a href="https://www.pexels.com/@vishnurnair/" >Vishnu R Nair</a> on <a href="https://www.pexels.com/photo/people-at-concert-1105666/" >Pexels.com</a>

# *Onramp x Vanguard Spotify Project*

## By Ryan Posternak

<br>

## Imports

In [148]:
import numpy as np
import pandas as pd
from pprint import pprint
import time
import re

# import http.client
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

<br>

# Section 1: Data Ingestion

### Establish connection to Spotify's API

In [255]:
# Load the Spotify API credentials from a local text file (sensitive information!).
# API.txt is read line by line: line 1 = client ID, line 2 = client secret, line 3 = redirect URI.
with open("API.txt") as f:
    text = f.readlines()

# Fail with a readable message instead of a bare IndexError when the file is incomplete
if len(text) < 3:
    raise ValueError("API.txt must contain three lines: client ID, client secret and redirect URI")

client_id = text[0].strip()
client_secret = text[1].strip()
redirect_uri = text[2].strip()

# Timeout (in seconds) applied to HTTP requests; the library default is too short
REQUESTS_TIMEOUT_S = 100

# Assign API keys to a Spotipy credential manager (this timeout covers token requests)
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret,
                                                      requests_timeout=REQUESTS_TIMEOUT_S)

# Connect to Spotipy by passing in credential manager. The client has its own
# requests_timeout for Web API calls, so the longer timeout must be passed here as well.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager,
                     requests_timeout=REQUESTS_TIMEOUT_S)

sp

<spotipy.client.Spotify at 0x160e6b790>

## Part 1: `Artist` Dataframe

### Obtain artist data for top 20 favorite artists

In [73]:
# Names of the 20 favorite artists to look up on Spotify, listed alphabetically
artists = [
    'Beyoncé',
    'Billie Eilish',
    'blackbear',
    'Bob Dylan',
    'Bob Marley',
    'Cuco',
    'Doja Cat',
    'Drake',
    'Ellie Goulding',
    'J. Cole',
    'Jack Johnson',
    'Khalid',
    'Kid Cudi',
    'Pink Floyd',
    'Post Malone',
    'Simon & Garfunkel',
    'The Beatles',
    'The Chainsmokers',
    'The Weeknd',
    'Tove Lo',
]

# Sanity check: the list must hold exactly 20 names
assert len(artists) == 20

In [74]:
# Run a single-result artist search to inspect the structure of the raw response
preview = sp.search(q='The Beatles', type='artist', limit=1)
pprint(preview)

{'artists': {'href': 'https://api.spotify.com/v1/search?query=The+Beatles&type=artist&offset=0&limit=1',
             'items': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3WrFJ7ztbogyGnTHbHJFl2'},
                        'followers': {'href': None, 'total': 23541504},
                        'genres': ['beatlesque',
                                   'british invasion',
                                   'classic rock',
                                   'merseybeat',
                                   'psychedelic rock',
                                   'rock'],
                        'href': 'https://api.spotify.com/v1/artists/3WrFJ7ztbogyGnTHbHJFl2',
                        'id': '3WrFJ7ztbogyGnTHbHJFl2',
                        'images': [{'height': 640,
                                    'url': 'https://i.scdn.co/image/ab6761610000e5ebe9348cc01ff5d55971b22433',
                                    'width': 640},
                                   {'height': 

In [186]:
# Define dictionary to contain all artist data (one key per dataframe column)
artists_dict = {}

# Define list containers for artists info; index i of every list describes the same artist
artist_ids = []
artist_names = []
external_urls = []
genres = []
image_urls = []
followers = []
popularities = []
types = []
artist_uris = []

# Append artist info to each respective list
for artist in artists:
    # Only the top search hit is used, so the stored name can differ from the search term
    # (e.g. searching 'Bob Marley' returns 'Bob Marley & The Wailers').
    artist_info = sp.search(artist, limit=1, type='artist')
    search_hits = artist_info['artists']['items']
    if not search_hits:
        # Name the offending search term instead of failing with an anonymous IndexError
        raise LookupError(f"No Spotify artist found for search term {artist!r}")
    info_items = search_hits[0]

    artist_ids.append(info_items['id'])
    artist_names.append(info_items['name'])
    external_urls.append(info_items['external_urls']['spotify'])
    # The genres and images lists may be empty; store None rather than crash on [0]
    genres.append(info_items['genres'][0] if info_items['genres'] else None)  # First genre from list
    image_urls.append(info_items['images'][0]['url'] if info_items['images'] else None)  # First image url
    followers.append(info_items['followers']['total'])
    popularities.append(info_items['popularity'])
    types.append(info_items['type'])
    artist_uris.append(info_items['uri'])

    # Set a delay after each API call just to be safe (don't want to get in trouble with the API!)
    time.sleep(0.5)

# Add lists to dictionary holding compiled artist data
artists_dict['artist_id'] = artist_ids
artists_dict['artist_name'] = artist_names
artists_dict['external_url'] = external_urls
artists_dict['genre'] = genres
artists_dict['image_url'] = image_urls
artists_dict['followers'] = followers
artists_dict['popularity'] = popularities
artists_dict['type'] = types
artists_dict['artist_uri'] = artist_uris

In [116]:
# Build the artists table from the column-wise dictionary
artists_df = pd.DataFrame(artists_dict)

# There must be one row for each of the 20 artists
assert len(artists_df) == 20

# Show the first rows of the artists table
artists_df.head()

Unnamed: 0,artist_id,artist_name,external_url,genre,image_url,followers,popularity,type,artist_uri
0,6vWDO969PvNqNYHIOW5v0m,Beyoncé,https://open.spotify.com/artist/6vWDO969PvNqNY...,dance pop,https://i.scdn.co/image/ab6761610000e5eb676338...,32114388,88,artist,spotify:artist:6vWDO969PvNqNYHIOW5v0m
1,6qqNVTkY8uBg9cP3Jd7DAH,Billie Eilish,https://open.spotify.com/artist/6qqNVTkY8uBg9c...,art pop,https://i.scdn.co/image/ab6761610000e5ebd8b998...,68569580,88,artist,spotify:artist:6qqNVTkY8uBg9cP3Jd7DAH
2,2cFrymmkijnjDg9SS92EPM,blackbear,https://open.spotify.com/artist/2cFrymmkijnjDg...,electropop,https://i.scdn.co/image/ab6761610000e5eb4f7d04...,4774278,80,artist,spotify:artist:2cFrymmkijnjDg9SS92EPM
3,74ASZWbe4lXaubB36ztrGX,Bob Dylan,https://open.spotify.com/artist/74ASZWbe4lXaub...,album rock,https://i.scdn.co/image/ab6772690000c46cf79ca0...,5774389,71,artist,spotify:artist:74ASZWbe4lXaubB36ztrGX
4,2QsynagSdAqZj3U9HgDzjD,Bob Marley & The Wailers,https://open.spotify.com/artist/2QsynagSdAqZj3...,reggae,https://i.scdn.co/image/b5aae2067db80f694a980e...,10849534,78,artist,spotify:artist:2QsynagSdAqZj3U9HgDzjD


<br>

## Part 2: `Album` Dataframe

### Obtain album data for up to 10 albums (minus duplicates) for each of the top 20 favorite artists

In [221]:
# Request ten albums for one artist ID and print the first record to see its fields
preview = sp.artist_albums('6vWDO969PvNqNYHIOW5v0m', country='US', limit=10)
pprint(preview['items'][0])

{'album_group': 'album',
 'album_type': 'album',
 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/6vWDO969PvNqNYHIOW5v0m'},
              'href': 'https://api.spotify.com/v1/artists/6vWDO969PvNqNYHIOW5v0m',
              'id': '6vWDO969PvNqNYHIOW5v0m',
              'name': 'Beyoncé',
              'type': 'artist',
              'uri': 'spotify:artist:6vWDO969PvNqNYHIOW5v0m'}],
 'external_urls': {'spotify': 'https://open.spotify.com/album/6FJxoadUE4JNVwWHghBwnb'},
 'href': 'https://api.spotify.com/v1/albums/6FJxoadUE4JNVwWHghBwnb',
 'id': '6FJxoadUE4JNVwWHghBwnb',
 'images': [{'height': 640,
             'url': 'https://i.scdn.co/image/ab67616d0000b2730e58a0f8308c1ad403d105e7',
             'width': 640},
            {'height': 300,
             'url': 'https://i.scdn.co/image/ab67616d00001e020e58a0f8308c1ad403d105e7',
             'width': 300},
            {'height': 64,
             'url': 'https://i.scdn.co/image/ab67616d000048510e58a0f8308c1ad403d105e7'

In [222]:
# Print the title of every album in the preview response
for album_item in preview['items']:
    pprint(album_item['name'])

'RENAISSANCE'
'RENAISSANCE'
'The Lion King: The Gift [Deluxe Edition]'
'The Lion King: The Gift [Deluxe Edition]'
'The Lion King: The Gift'
'HOMECOMING: THE LIVE ALBUM'
'HOMECOMING: THE LIVE ALBUM'
'Lemonade'
'Lemonade'
'BEYONCÉ [Platinum Edition]'


**Remarks:**
- It looks like duplicate albums will be an issue. Further, it looks like slight variations on the name or edition (e.g. regular edition vs. deluxe edition) will also be an issue. We'll address this with a RegEx search to remove any text inside square brackets or parentheses when checking for duplicate albums, which should catch most of these duplicates.

In [166]:
# Display names of albums with album variation comments removed, i.e. any text inside
# parentheses or square brackets such as "[Deluxe Edition]".
# The pattern is a raw string: in a normal string literal "\(" and "\[" are invalid
# escape sequences, which Python reports as a DeprecationWarning/SyntaxWarning.
for album in preview['items']:
    pprint(re.sub(r"[\(\[].*?[\)\]]", "", album['name']).strip())

'RENAISSANCE'
'RENAISSANCE'
'The Lion King: The Gift'
'The Lion King: The Gift'
'The Lion King: The Gift'
'HOMECOMING: THE LIVE ALBUM'
'HOMECOMING: THE LIVE ALBUM'
'Lemonade'
'Lemonade'
'BEYONCÉ'


In [203]:
# Define dictionary to contain all album data (one key per dataframe column)
albums_dict = {}

# Set up containers for albums info; index i of every list describes the same album
album_ids = []
album_names = []
external_urls = []
image_urls = []
release_dates = []
total_tracks = []
types = []
album_uris = []
album_artist_ids = []

# Matches text inside parentheses or square brackets, e.g. "[Deluxe Edition]".
# Raw string avoids invalid-escape warnings; compiled once instead of once per album.
edition_note_pattern = re.compile(r"[\(\[].*?[\)\]]")

# Append album info to each respective list
for artist_id in artist_ids:
    # API call for up to 10 albums by artist (fewer are kept once duplicates are skipped)
    albums_info = sp.artist_albums(artist_id=artist_id, limit=10, country='US')

    # Base names already kept for this artist; a set gives constant-time membership checks
    dup_album_check = set()

    # Retrieve info for each album
    for album in albums_info['items']:

        # Compare albums by name with any edition note removed, so that e.g. the regular and
        # deluxe editions count as the same album. Only the first occurrence is kept.
        unique_album_name = edition_note_pattern.sub("", album['name']).strip()
        if unique_album_name in dup_album_check:  # Skip album if already kept
            continue

        album_ids.append(album['id'])
        album_names.append(album['name'])  # Original name, edition note included
        external_urls.append(album['external_urls']['spotify'])
        # Store None rather than crash on [0] if an album has no images
        image_urls.append(album['images'][0]['url'] if album['images'] else None)  # First image url
        release_dates.append(album['release_date'])
        total_tracks.append(album['total_tracks'])
        types.append(album['type'])
        album_uris.append(album['uri'])
        album_artist_ids.append(artist_id)

        dup_album_check.add(unique_album_name)

    # Set a delay after each API call
    time.sleep(0.5)

# Add lists to dictionary holding compiled albums data
albums_dict['album_id'] = album_ids
albums_dict['album_name'] = album_names
albums_dict['external_url'] = external_urls
albums_dict['image_url'] = image_urls
albums_dict['release_date'] = release_dates
albums_dict['total_tracks'] = total_tracks
albums_dict['type'] = types
albums_dict['album_uri'] = album_uris
albums_dict['artist_id'] = album_artist_ids

In [268]:
# Build the albums table from the column-wise dictionary
albums_df = pd.DataFrame(albums_dict)

# Every artist ID must appear among the collected albums, and no other ID may
assert set(album_artist_ids) == set(artist_ids)

# Report the number of albums, then show the first rows
print('Albums:', len(albums_df))
albums_df.head()

Albums: 135


Unnamed: 0,album_id,album_name,external_url,image_url,release_date,total_tracks,type,album_uri,artist_id
0,6FJxoadUE4JNVwWHghBwnb,RENAISSANCE,https://open.spotify.com/album/6FJxoadUE4JNVwW...,https://i.scdn.co/image/ab67616d0000b2730e58a0...,2022-07-29,16,album,spotify:album:6FJxoadUE4JNVwWHghBwnb,6vWDO969PvNqNYHIOW5v0m
1,7kUuNU2LRmr9XbwLHXU9UZ,The Lion King: The Gift [Deluxe Edition],https://open.spotify.com/album/7kUuNU2LRmr9Xbw...,https://i.scdn.co/image/ab67616d0000b27360e232...,2020-07-31,17,album,spotify:album:7kUuNU2LRmr9XbwLHXU9UZ,6vWDO969PvNqNYHIOW5v0m
2,35S1JCj5paIfElT2GODl6x,HOMECOMING: THE LIVE ALBUM,https://open.spotify.com/album/35S1JCj5paIfElT...,https://i.scdn.co/image/ab67616d0000b2738e5252...,2019-04-17,40,album,spotify:album:35S1JCj5paIfElT2GODl6x,6vWDO969PvNqNYHIOW5v0m
3,7dK54iZuOxXFarGhXwEXfF,Lemonade,https://open.spotify.com/album/7dK54iZuOxXFarG...,https://i.scdn.co/image/ab67616d0000b27389992f...,2016-04-23,13,album,spotify:album:7dK54iZuOxXFarGhXwEXfF,6vWDO969PvNqNYHIOW5v0m
4,2UJwKSBUz6rtW4QLK74kQu,BEYONCÉ [Platinum Edition],https://open.spotify.com/album/2UJwKSBUz6rtW4Q...,https://i.scdn.co/image/ab67616d0000b2730d1d6e...,2014-11-24,20,album,spotify:album:2UJwKSBUz6rtW4QLK74kQu,6vWDO969PvNqNYHIOW5v0m


<br>

## Part 3: `Track` Dataframe

### Obtain track data for up to 20 tracks (minus duplicates) from each album obtained above

In [231]:
# Request ten tracks from one album ID and print the first record to see its fields
preview = sp.album_tracks('6FJxoadUE4JNVwWHghBwnb', market='US', limit=10)
pprint(preview['items'][0])

{'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/6vWDO969PvNqNYHIOW5v0m'},
              'href': 'https://api.spotify.com/v1/artists/6vWDO969PvNqNYHIOW5v0m',
              'id': '6vWDO969PvNqNYHIOW5v0m',
              'name': 'Beyoncé',
              'type': 'artist',
              'uri': 'spotify:artist:6vWDO969PvNqNYHIOW5v0m'}],
 'disc_number': 1,
 'duration_ms': 208014,
 'explicit': True,
 'external_urls': {'spotify': 'https://open.spotify.com/track/1MpCaOeUWhox2Fgigbe1cL'},
 'href': 'https://api.spotify.com/v1/tracks/1MpCaOeUWhox2Fgigbe1cL',
 'id': '1MpCaOeUWhox2Fgigbe1cL',
 'is_local': False,
 'is_playable': True,
 'name': "I'M THAT GIRL",
 'preview_url': 'https://p.scdn.co/mp3-preview/c7cece6b1b9cb3637fc48924f23baf9c7e1ec15c?cid=a3c3419e623d4410ad1aadf01bc737d5',
 'track_number': 1,
 'type': 'track',
 'uri': 'spotify:track:1MpCaOeUWhox2Fgigbe1cL'}


In [247]:
# Print the title of each of the 10 tracks in the preview response
for track_item in preview['items']:
    pprint(track_item['name'])

"I'M THAT GIRL"
'COZY'
'ALIEN SUPERSTAR'
'CUFF IT'
'ENERGY (feat. Beam)'
'BREAK MY SOUL'
'CHURCH GIRL'
'PLASTIC OFF THE SOFA'
"VIRGO'S GROOVE"
'MOVE (feat. Grace Jones & Tems)'


**Remarks:**
- It looks like duplicate songs might not be as big an issue with track searches. I don't see a need to use a RegEx search like in the last API call, but we'll still check for identically named songs and skip those.

In [250]:
# Define dictionary to contain all tracks data (one key per dataframe column)
tracks_dict = {}

# Set up containers for tracks info; index i of every list describes the same track
track_ids = []
song_names = []
external_urls = []
durations_ms = []
explicit = []
disc_numbers = []
types = []
song_uris = []
track_album_ids = []

# Append track info to each respective list
for album_id in album_ids:
    # API call for (max) 20 tracks per album.
    # NOTE: only the first page is read, so albums with more than 20 tracks are truncated.
    tracks_info = sp.album_tracks(album_id=album_id, limit=20, market='US')

    # Track names already kept for this album; a set gives constant-time membership checks
    dup_track_check = set()

    # Retrieve info for each track on the album
    for track in tracks_info['items']:

        track_name = track['name']
        if track_name in dup_track_check:  # Skip identically named track within the same album
            continue

        track_ids.append(track['id'])
        song_names.append(track_name)
        external_urls.append(track['external_urls']['spotify'])
        durations_ms.append(track['duration_ms'])
        explicit.append(track['explicit'])
        disc_numbers.append(track['disc_number'])
        types.append(track['type'])
        song_uris.append(track['uri'])
        track_album_ids.append(album_id)

        dup_track_check.add(track_name)

    # Set a (shorter) delay after each API call
    time.sleep(0.01)

# Add lists to dictionary holding compiled tracks data
tracks_dict['track_id'] = track_ids
tracks_dict['song_name'] = song_names
tracks_dict['external_url'] = external_urls
tracks_dict['duration_ms'] = durations_ms
tracks_dict['explicit'] = explicit
tracks_dict['disc_number'] = disc_numbers
tracks_dict['type'] = types
tracks_dict['song_uri'] = song_uris
tracks_dict['album_id'] = track_album_ids

In [251]:
# Build the tracks table from the column-wise dictionary
tracks_df = pd.DataFrame(tracks_dict)

# Every album ID must appear among the collected tracks, and no other ID may
assert set(track_album_ids) == set(album_ids)

# Report the number of tracks, then show the first rows
print('Tracks:', len(tracks_df))
tracks_df.head()

Tracks: 1765


Unnamed: 0,track_id,song_name,external_url,duration_ms,explicit,disc_number,type,song_uri,album_id
0,1MpCaOeUWhox2Fgigbe1cL,I'M THAT GIRL,https://open.spotify.com/track/1MpCaOeUWhox2Fg...,208014,True,1,track,spotify:track:1MpCaOeUWhox2Fgigbe1cL,6FJxoadUE4JNVwWHghBwnb
1,0mKGwFMHzTprtS2vpR3b6s,COZY,https://open.spotify.com/track/0mKGwFMHzTprtS2...,210372,True,1,track,spotify:track:0mKGwFMHzTprtS2vpR3b6s,6FJxoadUE4JNVwWHghBwnb
2,1Hohk6AufHZOrrhMXZppax,ALIEN SUPERSTAR,https://open.spotify.com/track/1Hohk6AufHZOrrh...,215459,True,1,track,spotify:track:1Hohk6AufHZOrrhMXZppax,6FJxoadUE4JNVwWHghBwnb
3,1xzi1Jcr7mEi9K2RfzLOqS,CUFF IT,https://open.spotify.com/track/1xzi1Jcr7mEi9K2...,225388,True,1,track,spotify:track:1xzi1Jcr7mEi9K2RfzLOqS,6FJxoadUE4JNVwWHghBwnb
4,0314PeD1sQNonfVWix3B2K,ENERGY (feat. Beam),https://open.spotify.com/track/0314PeD1sQNonfV...,116727,False,1,track,spotify:track:0314PeD1sQNonfVWix3B2K,6FJxoadUE4JNVwWHghBwnb


<br>

## Part 4: `Track_Feature` Dataframe

### Obtain track features data for each of the 1,765 tracks obtained above

In [260]:
# Fetch the audio features of a single track ID to see which fields are returned;
# the call returns a list, so take its first (and only) element
features_response = sp.audio_features('1MpCaOeUWhox2Fgigbe1cL')
preview = features_response[0]
pprint(preview)

{'acousticness': 0.0616,
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/1MpCaOeUWhox2Fgigbe1cL',
 'danceability': 0.554,
 'duration_ms': 208014,
 'energy': 0.535,
 'id': '1MpCaOeUWhox2Fgigbe1cL',
 'instrumentalness': 1.32e-05,
 'key': 5,
 'liveness': 0.124,
 'loudness': -8.959,
 'mode': 0,
 'speechiness': 0.186,
 'tempo': 105.865,
 'time_signature': 4,
 'track_href': 'https://api.spotify.com/v1/tracks/1MpCaOeUWhox2Fgigbe1cL',
 'type': 'audio_features',
 'uri': 'spotify:track:1MpCaOeUWhox2Fgigbe1cL',
 'valence': 0.136}


In [271]:
# Define dictionary to contain all track features data (one key per dataframe column)
track_features_dict = {}

# Set up containers for track features info; index i of every list describes the same track
track_features_track_ids = []
danceability = []
energy = []
instrumentalness = []
liveness = []
loudness = []
speechiness = []
tempo = []
types = []
valence = []
track_features_song_uris = []

# The audio-features endpoint accepts up to 100 track IDs per request, so the tracks are
# sent in batches instead of issuing one API call per track.
FEATURES_BATCH_SIZE = 100

# Append track features info to each respective list
for batch_start in range(0, len(track_ids), FEATURES_BATCH_SIZE):
    batch_ids = track_ids[batch_start:batch_start + FEATURES_BATCH_SIZE]

    # API call for track features info; results come back in the order the IDs were sent
    batch_features = sp.audio_features(batch_ids)

    for track_id, track_features in zip(batch_ids, batch_features):
        # A track without audio features comes back as None; treat it as an empty record
        # so every feature is stored as None instead of raising a TypeError.
        if track_features is None:
            track_features = {}

        track_features_track_ids.append(track_id)
        danceability.append(track_features.get('danceability'))
        energy.append(track_features.get('energy'))
        instrumentalness.append(track_features.get('instrumentalness'))
        liveness.append(track_features.get('liveness'))
        loudness.append(track_features.get('loudness'))
        speechiness.append(track_features.get('speechiness'))
        tempo.append(track_features.get('tempo'))
        types.append(track_features.get('type'))
        valence.append(track_features.get('valence'))
        track_features_song_uris.append(track_features.get('uri'))

    # Set a (even shorter) delay after each track features call
    time.sleep(0.001)

# Add lists to dictionary holding compiled track features data
track_features_dict['track_id'] = track_features_track_ids
track_features_dict['danceability'] = danceability
track_features_dict['energy'] = energy
track_features_dict['instrumentalness'] = instrumentalness
track_features_dict['liveness'] = liveness
track_features_dict['loudness'] = loudness
track_features_dict['speechiness'] = speechiness
track_features_dict['tempo'] = tempo
track_features_dict['type'] = types
track_features_dict['valence'] = valence
track_features_dict['song_uri'] = track_features_song_uris

In [272]:
# Build the track features table from the column-wise dictionary
track_features_df = pd.DataFrame(track_features_dict)

# Every track ID must appear among the collected track features, and no other ID may
assert set(track_features_track_ids) == set(track_ids)

# Report the number of track feature rows, then show the first rows
print('Track features:', len(track_features_df))
track_features_df.head()

Track features: 1765


Unnamed: 0,track_id,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,type,valence,song_uri
0,1MpCaOeUWhox2Fgigbe1cL,0.554,0.535,1.3e-05,0.124,-8.959,0.186,105.865,audio_features,0.136,spotify:track:1MpCaOeUWhox2Fgigbe1cL
1,0mKGwFMHzTprtS2vpR3b6s,0.556,0.63,0.00468,0.155,-8.15,0.102,149.147,audio_features,0.367,spotify:track:0mKGwFMHzTprtS2vpR3b6s
2,1Hohk6AufHZOrrhMXZppax,0.545,0.641,6.6e-05,0.171,-6.398,0.0998,121.892,audio_features,0.464,spotify:track:1Hohk6AufHZOrrhMXZppax
3,1xzi1Jcr7mEi9K2RfzLOqS,0.78,0.689,1e-05,0.0698,-5.668,0.141,115.042,audio_features,0.642,spotify:track:1xzi1Jcr7mEi9K2RfzLOqS
4,0314PeD1sQNonfVWix3B2K,0.903,0.519,0.000106,0.155,-9.151,0.26,114.991,audio_features,0.587,spotify:track:0314PeD1sQNonfVWix3B2K
