# Spotify ETL Pipeline - local version

In [173]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
import os

Load Spotify credentials from `.env` file

In [174]:
# Read the local .env file into the process environment so the os.getenv() lookups below can find the credentials
load_dotenv()

True

Connect to Spotify API using the ID and Secret stored in `.env`

In [175]:
# Build the API client; the client ID and secret are read from the environment
# (populated from `.env`), so no credentials are stored in the notebook itself.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=os.getenv('CLIENT_ID'),
        client_secret=os.getenv('CLIENT_SECRET'),
    )
)

Define a dictionary with playlists that you want to look into

In [176]:
# Short label -> Spotify playlist URI; add more entries here to analyse other playlists
playlists = {
    'rock': 'spotify:playlist:37i9dQZF1DX4DZAVUAwHMT',
}

Pull in the track data for the playlist defined above

In [177]:
# Request the rock playlist's tracks for the GB market
playlist_tracks = spotify.playlist_tracks(
    playlist_id=playlists['rock'],
    market='GB',
)

See the list of keys in the returned dictionary. Information about each track is contained under the `items` key

In [178]:
# Top-level keys of the response: paging fields plus the entries themselves under 'items'
playlist_tracks.keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

Data for each song is under the `track` key

In [9]:
# Fields of the first playlist entry
playlist_tracks['items'][0].keys()

dict_keys(['added_at', 'added_by', 'is_local', 'primary_color', 'track', 'video_thumbnail'])

And each track has a bunch of data about it within its own dictionary

In [10]:
# Show the first playlist entry in full (long output)
playlist_tracks['items'][0]

{'added_at': '2023-06-29T23:00:00Z',
 'added_by': {'external_urls': {'spotify': 'https://open.spotify.com/user/'},
  'href': 'https://api.spotify.com/v1/users/',
  'id': '',
  'type': 'user',
  'uri': 'spotify:user:'},
 'is_local': False,
 'primary_color': None,
 'track': {'album': {'album_type': 'single',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/6Ad91Jof8Niiw0lGLLi3NW'},
     'href': 'https://api.spotify.com/v1/artists/6Ad91Jof8Niiw0lGLLi3NW',
     'id': '6Ad91Jof8Niiw0lGLLi3NW',
     'name': 'YUNGBLUD',
     'type': 'artist',
     'uri': 'spotify:artist:6Ad91Jof8Niiw0lGLLi3NW'}],
   'external_urls': {'spotify': 'https://open.spotify.com/album/4MwosQ9tTm95DAaHdoNs5E'},
   'href': 'https://api.spotify.com/v1/albums/4MwosQ9tTm95DAaHdoNs5E',
   'id': '4MwosQ9tTm95DAaHdoNs5E',
   'images': [{'height': 640,
     'url': 'https://i.scdn.co/image/ab67616d0000b273df5bd10ce6f6fc2723f892b4',
     'width': 640},
    {'height': 300,
     'url': 'https://i.scdn.

In [11]:
# Fields of the nested track object inside the first playlist entry
playlist_tracks['items'][0]['track'].keys()

dict_keys(['album', 'artists', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'is_playable', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

We're going to use a function (imported from `spotify_tools`) that goes through a playlist and identifies all artists and their Spotify URIs. The result is a dictionary of the form:

{uri : artist}

In [179]:
from spotify_tools import get_artists_from_playlist

In [180]:
# Collect the artists of the rock playlist as a {artist URI: artist name} mapping
artists = get_artists_from_playlist(playlists['rock'])

In [181]:
# Display the {artist URI: artist name} mapping
artists

{'spotify:artist:6Ad91Jof8Niiw0lGLLi3NW': 'YUNGBLUD',
 'spotify:artist:1kDGbuxWknIKx4FlgWxiSp': 'Nothing But Thieves',
 'spotify:artist:1Ffb6ejR6Fe5IamqA5oRUF': 'Bring Me The Horizon',
 'spotify:artist:1caBfBEapzw8z2Qz9q0OaQ': 'Asking Alexandria',
 'spotify:artist:2n2RSaZqBuUUukhbLlpnE6': 'Sleep Token',
 'spotify:artist:7jy3rLJdDQY21OgRLCZ9sD': 'Foo Fighters',
 'spotify:artist:6blEmsLU25ewy8hHtgZaSL': 'Jazmin Bean',
 'spotify:artist:1koutXdSFq2PHqtxSWj9tK': 'Hot Milk',
 'spotify:artist:3Wcyta3gkOdQ4TfY0WyZpu': 'YONAKA',
 'spotify:artist:0RqtSIYZmd4fiBKVFqyIqD': 'Thirty Seconds To Mars',
 'spotify:artist:24XtlMhEMNdi822vi0MhY1': 'Taking Back Sunday',
 'spotify:artist:0KDuKk6YdEu3hR56HtXmxt': 'Story Of The Year',
 'spotify:artist:64EHXDoln95lnccszdPum0': 'Royal & the Serpent',
 'spotify:artist:4UXqAaa6dQYAk18Lv7PEgX': 'Fall Out Boy',
 'spotify:artist:4QxIol1JzAa4ePmDytv0e4': 'KID BRUNSWICK',
 'spotify:artist:6ekYAO2D1JkI58CF4uRRqw': 'Tigercub',
 'spotify:artist:7MhMgCo0Bl0Kukl93PZbYS': '

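Next we need to build a function that pulls in album data for each artist in the playlist. The goal is to produce a dictionary of columns (returned as a DataFrame) with one row per album and the following fields:

- `Artist`
- `Album Name`
- `Year Released`
- `Album Length` (sum of the track durations, in milliseconds)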

Let's have a look at the structure of album response

In [24]:
# Sample request: the first two releases for one artist URI (Foo Fighters), GB market
albums = spotify.artist_albums(
    artist_id='spotify:artist:7jy3rLJdDQY21OgRLCZ9sD',
    limit=2,
    country='GB',
)

In [25]:
# Top-level keys of the album listing: the same paging fields as the playlist response
albums.keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [39]:
# Fields available on a single album entry
albums['items'][0].keys()

dict_keys(['album_group', 'album_type', 'artists', 'external_urls', 'href', 'id', 'images', 'is_playable', 'name', 'release_date', 'release_date_precision', 'total_tracks', 'type', 'uri'])

In [44]:
# Show the first album entry in full (long output)
albums['items'][0]

{'album_group': 'album',
 'album_type': 'album',
 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/7jy3rLJdDQY21OgRLCZ9sD'},
   'href': 'https://api.spotify.com/v1/artists/7jy3rLJdDQY21OgRLCZ9sD',
   'id': '7jy3rLJdDQY21OgRLCZ9sD',
   'name': 'Foo Fighters',
   'type': 'artist',
   'uri': 'spotify:artist:7jy3rLJdDQY21OgRLCZ9sD'}],
 'external_urls': {'spotify': 'https://open.spotify.com/album/4wp4aWWpoYfNcspimVAnel'},
 'href': 'https://api.spotify.com/v1/albums/4wp4aWWpoYfNcspimVAnel',
 'id': '4wp4aWWpoYfNcspimVAnel',
 'images': [{'height': 640,
   'url': 'https://i.scdn.co/image/ab67616d0000b27384c85afa887f664fef3c5e8a',
   'width': 640},
  {'height': 300,
   'url': 'https://i.scdn.co/image/ab67616d00001e0284c85afa887f664fef3c5e8a',
   'width': 300},
  {'height': 64,
   'url': 'https://i.scdn.co/image/ab67616d0000485184c85afa887f664fef3c5e8a',
   'width': 64}],
 'is_playable': True,
 'name': 'But Here We Are',
 'release_date': '2023-06-02',
 'release_date_prec

Album URIs then go into another call, which retrieves information about the tracks on each album.

For each artist in the dictionary, pull out the list of albums and then, for each album, calculate its length by summing the track durations.

In [190]:
def _iter_pages(first_page):
    """Yield a paged Spotify response followed by every subsequent page.

    Each page is a dict with an `items` list and a `next` link; the link is
    followed with `spotify.next` until a page has no `next` value.
    """
    page = first_page
    while page:
        yield page
        # Stop explicitly when there is no further link instead of relying on
        # the client to return None for us.
        page = spotify.next(page) if page.get('next') else None


def get_data(playlist, market='GB'):
    """Collect album-level data for every artist that appears in a playlist.

    For each artist returned by `get_artists_from_playlist(playlist)`, every
    release of type 'album' is fetched and its length is computed by summing
    the `duration_ms` of all of its tracks. Both the album listing and the
    track listing are paged responses, so all pages are followed; reading only
    the first page would silently drop albums and under-count long albums.

    Parameters
    ----------
    playlist :
        Playlist identifier, passed straight to `get_artists_from_playlist`
        (the notebook passes a `spotify:playlist:...` URI).
    market : str, optional
        Market/country code used for both the album listing and the album
        lookup. Defaults to 'GB'.

    Returns
    -------
    pandas.DataFrame
        One row per album with the columns 'Artist', 'Album Name',
        'Year Released' and 'Album Length'. 'Year Released' holds the
        `release_date` string exactly as returned by the API (for example
        '2023-06-02'), not just the year. 'Album Length' is in milliseconds.
        The frame is empty (but has all four columns) when no albums are found.
    """
    columns = ['Artist', 'Album Name', 'Year Released', 'Album Length']
    records = []

    artists = get_artists_from_playlist(playlist)
    for artist_uri, artist_name in artists.items():
        # first page of albums by this artist; later pages are followed below
        first_albums_page = spotify.artist_albums(
            artist_id=artist_uri, country=market, album_type='album', limit=50
        )

        for albums_page in _iter_pages(first_albums_page):
            for album in albums_page['items']:
                # the album lookup embeds the (paged) track listing under 'tracks'
                first_tracks_page = spotify.album(album['uri'], market=market)['tracks']
                album_duration = sum(
                    song['duration_ms']
                    for tracks_page in _iter_pages(first_tracks_page)
                    for song in tracks_page['items']
                )

                records.append({
                    'Artist': artist_name,
                    'Album Name': album['name'],
                    'Year Released': album['release_date'],
                    'Album Length': album_duration,
                })

    # passing `columns` keeps the schema stable even when `records` is empty
    return pd.DataFrame(records, columns=columns)
            

In [191]:
# Despite its name, output_file is the DataFrame returned by get_data, not a path or file handle
output_file = get_data(playlists['rock'])

In [169]:
# The parquet writer does not create missing parent directories, so make sure
# tmp/ exists before writing; exist_ok keeps the cell safe to re-run.
os.makedirs('tmp', exist_ok=True)
output_file.to_parquet('tmp/test.parquet')