# Get Spotify data
This Jupyter notebook provides a Python script for accessing Spotify's Web API to retrieve data about playlists, playlist items, track audio features, and track audio analysis. The Spotify Web API exposes a wide range of information about a user's music preferences, which can be used for various music analytics applications.

Please refer to Spotify's official documentation on [how to request an access token](https://developer.spotify.com/documentation/web-api/tutorials/getting-started#request-an-access-token).

You'll need your own `CLIENT_ID`, `CLIENT_SECRET` and `USER_ID` credentials to proceed.

In [1]:
import json
import os
import re
import time
import urllib.parse

import pandas as pd
import requests

from utils import *
%load_ext dotenv
%dotenv

In [2]:
# Read the Spotify credentials from the process environment (the %dotenv magic
# in the import cell loads them); a missing variable raises KeyError here.
CLIENT_ID, CLIENT_SECRET, USER_ID, REDIRECT_URI = (
    os.environ[name]
    for name in ('CLIENT_ID', 'CLIENT_SECRET', 'USER_ID', 'REDIRECT_URI')
)

# Sent as the `scope` parameter of the authorization request; empty here
SCOPE = ''

## Get authorization
Initiates the Spotify API authorization flow using a temporary local server on port 8889. Once the user grants access, the path of the redirect request (which carries the authorization code) is stored in the `callback_url` variable.

The `CallbackHandler` class is designed to handle incoming GET requests to extract the callback URL.

In [3]:
import http.server
import socketserver
import webbrowser

In [4]:
class CallbackHandler(http.server.SimpleHTTPRequestHandler):
    """
    Request handler that records the path of the authorization redirect.

    The requested path (including its query string) is stored in the
    module-level `callback_url` variable so that later cells can parse it.
    """

    def log_message(self, format, *args):
        """
        Override log_message method to suppress server logs.
        """
        pass

    def do_GET(self):
        """
        Handle the GET request: capture the callback URL and confirm to the user.
        """

        # Declare callback_url as global to modify the global variable
        global callback_url

        # Capture first so the value is kept even if writing the response fails
        callback_url = self.path

        # Give the browser tab something to show instead of a blank page
        body = (
            b'<html><body>'
            b'<p>Authorization response received. '
            b'You can close this tab and return to the notebook.</p>'
            b'</body></html>'
        )

        self.send_response(200)
        self.send_header('Content-type', 'text/html; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

In [5]:
# Spotify authorization URL
auth_url = 'https://accounts.spotify.com/authorize'

# Reset so a value captured by an earlier run of this cell is never reused
callback_url = None

# Parameters for the authorization request
params = {
    'client_id': CLIENT_ID,
    'response_type': 'code',
    'redirect_uri': REDIRECT_URI,
    'scope': SCOPE
}

# Build the URL the browser should open. This is pure string encoding, so no
# HTTP request is made from the notebook itself.
auth_response_url = f'{auth_url}?{urllib.parse.urlencode(params)}'


class _ReusableTCPServer(socketserver.TCPServer):
    """TCPServer that may rebind its port immediately after a previous run closed it."""

    # Without this, re-running the cell shortly after a previous run can fail
    # with "Address already in use".
    allow_reuse_address = True


# Start the local server to handle the callback.
# REDIRECT_URI is expected to point at this host and port.
with _ReusableTCPServer(('localhost', 8889), CallbackHandler) as httpd:
    # Open the authorization URL in a browser
    webbrowser.open(auth_response_url)

    # Handle a single request (blocks until the browser is redirected back)
    httpd.handle_request()

## Generate access token

In [6]:
# Extract the authorization code from the query string of the captured callback.
# Parsing the query (instead of slicing at a fixed offset) keeps this correct
# when the redirect path changes or extra parameters such as `state` are present.
if callback_url is None:
    raise RuntimeError('No callback was captured; re-run the authorization cell first.')

callback_query = urllib.parse.parse_qs(urllib.parse.urlsplit(callback_url).query)

# A denied or failed authorization is reported via an `error` parameter
if 'error' in callback_query:
    raise RuntimeError(f"Spotify authorization failed: {callback_query['error'][0]}")

if 'code' not in callback_query:
    raise RuntimeError(f'No authorization code found in callback: {callback_url!r}')

AUTH_CODE = callback_query['code'][0]

In [7]:
# Exchange the authorization code for an access token
# (helper presumably provided by `from utils import *` — verify there)
access_token = request_an_access_token(
    CLIENT_ID,
    CLIENT_SECRET,
    AUTH_CODE,
    REDIRECT_URI,
)

## Get playlist
Get a playlist owned by a Spotify user. [See Spotify documentation](https://developer.spotify.com/documentation/web-api/reference/get-playlist)

In [8]:
# Raw playlist payload is cached on disk: the API is only called when the
# cache file is missing, so the notebook runs top to bottom without edits.
PLAYLISTS_RAW_PATH = './data/raw/playlists.json'

if not os.path.exists(PLAYLISTS_RAW_PATH):
    # Fetch before opening the file so a failed request leaves no empty cache
    fetched_playlists = get_playlist(USER_ID, access_token)

    os.makedirs(os.path.dirname(PLAYLISTS_RAW_PATH), exist_ok = True)
    with open(PLAYLISTS_RAW_PATH, 'w') as f:
        json.dump(fetched_playlists, f)

# Always load from the cache so both paths yield the same `data`
with open(PLAYLISTS_RAW_PATH, 'r') as f:
    data = json.load(f)

In [9]:
# Build the playlist table in one pass:
#   1. flatten the raw payload and give every entry of 'items' its own row
#   2. flatten each playlist record into columns
#   3. keep only the columns needed for the analysis
df_playlists = (
    pd.json_normalize(data)
    .explode('items')
    .pipe(lambda pages: pd.json_normalize(pages['items']))
    .loc[:, ['id', 'name', 'tracks.total']]
)

In [10]:
# Preview the first rows of the tabulated playlists
df_playlists.head()

Unnamed: 0,id,name,tracks.total
0,13Qsm0axSPpL11U5yGhwFS,Awkward,2
1,730Ce3gbzbasRtac5l8eXs,My Songs,1085
2,2gibcQ6TCJyyQgdJaDNWsT,Nicole&Shaun Wedding Playlist :),106
3,78TQufEn9zE564Is7DKk46,Karaoke,10
4,6mAJ1EqzlLuD5o97BB1VNP,1) 29.10 Pre-walk in,16


In [11]:
# # Save tabulated playlists to a file
# df_playlists.to_csv('./data/playlists.csv', index = False)

## Get playlist items
Get full details of the items of a playlist owned by a Spotify user. [See Spotify documentation](https://developer.spotify.com/documentation/web-api/reference/get-playlists-tracks)

In [12]:
df_playlists = pd.read_csv('./data/playlists.csv')

# Number of 100-track pages needed to retrieve all tracks in each playlist.
# Ceiling division: a playlist with exactly 100 tracks needs 1 page (not 2) and
# an empty playlist needs none, so no empty page is requested.
df_playlists['offsets_needed'] = (df_playlists['tracks.total'] + 99) // 100

In [13]:
# Raw playlist items are cached on disk: the API is only called when the
# cache file is missing, so the notebook runs top to bottom without edits.
TRACKS_RAW_PATH = './data/raw/tracks.json'

if not os.path.exists(TRACKS_RAW_PATH):
    # Collect one normalized frame per page and concatenate once at the end
    track_pages = []

    for index, row in df_playlists.iterrows():
        playlist_id = row['id']

        for i in range(row['offsets_needed']):
            offset = i * 100

            page = get_playlist_tracks(playlist_id, access_token, offset)
            track_pages.append(pd.json_normalize(page))

            print(f'Playlist: {index}; Offset: {offset}')

    os.makedirs(os.path.dirname(TRACKS_RAW_PATH), exist_ok = True)
    pd.concat(track_pages).to_json(TRACKS_RAW_PATH, orient = 'records')

# Always load from the cache so both paths yield the same `df_raw_tracks`
df_raw_tracks = pd.read_json(TRACKS_RAW_PATH)

Playlist: 0; Offset: 0
Playlist: 1; Offset: 0
Playlist: 1; Offset: 100
Playlist: 1; Offset: 200
Playlist: 1; Offset: 300
Playlist: 1; Offset: 400
Playlist: 1; Offset: 500
Playlist: 1; Offset: 600
Playlist: 1; Offset: 700
Playlist: 1; Offset: 800
Playlist: 1; Offset: 900
Playlist: 1; Offset: 1000
Playlist: 2; Offset: 0
Playlist: 2; Offset: 100
Playlist: 3; Offset: 0
Playlist: 4; Offset: 0
Playlist: 5; Offset: 0
Playlist: 6; Offset: 0
Playlist: 7; Offset: 0
Playlist: 8; Offset: 0


In [14]:
# One row per playlist item, keeping the page URL ('href') it was fetched from
exploded_items = (
    df_raw_tracks[['href', 'items']]
    .explode('items')
    .reset_index(drop = True)
)

# Flatten each item into columns and attach them side by side (aligned by position)
item_fields = pd.json_normalize(exploded_items['items']).reset_index(drop = True)
df_tracks = pd.concat([exploded_items, item_fields], axis = 1)

# The playlist ID is the path segment captured by the pattern below;
# an href that does not match is kept unchanged
playlist_id_pattern = re.compile(r'https:\/\/.*\/([\w-]+)\/.*$')
df_tracks['playlist_id'] = df_tracks['href'].map(
    lambda href: playlist_id_pattern.sub(r'\1', href)
)

# Select only the columns we need for our analysis
df_tracks = df_tracks[['href', 'track.id', 'track.name', 'playlist_id']]

In [15]:
# Preview the first rows of the tabulated playlist items
df_tracks.head()

Unnamed: 0,href,track.id,track.name,playlist_id
0,https://api.spotify.com/v1/playlists/13Qsm0axS...,4CPYZtb4tX2V03jcsJAZCD,Where's Kevin (From 'Overcooked! 2'),13Qsm0axSPpL11U5yGhwFS
1,https://api.spotify.com/v1/playlists/13Qsm0axS...,4cmRCH5q4Mp5DKqsGkQ2eu,"Super Mario Theme (From ""Super Mario"")",13Qsm0axSPpL11U5yGhwFS
2,https://api.spotify.com/v1/playlists/730Ce3gbz...,5x5JM1BSB6vollcIzDocqT,The Climb,730Ce3gbzbasRtac5l8eXs
3,https://api.spotify.com/v1/playlists/730Ce3gbz...,05mAIVLkIWc2d1UBYZBCp8,1999,730Ce3gbzbasRtac5l8eXs
4,https://api.spotify.com/v1/playlists/730Ce3gbz...,5ojMDcYCghyRb4JSI7JL0Z,Starving,730Ce3gbzbasRtac5l8eXs


In [16]:
# # Save tabulated playlist items to a file
# df_tracks.to_csv('./data/tracks.csv', index = False)

## Get track's audio features
Get audio feature information for a single track identified by its unique Spotify ID. [See Spotify documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-features)

In [17]:
df_tracks = pd.read_csv('./data/tracks.csv')

# Unique (track ID, track name) pairs, each tagged with a 1-based batch number:
# rows 0-99 are batch 1, rows 100-199 are batch 2, and so on
df_unq_tracks = (
    df_tracks[['track.id', 'track.name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(batch=lambda unique: unique.index.values.astype(int) // 100 + 1)
)

In [18]:
# Show the unique tracks together with their batch assignment
df_unq_tracks

Unnamed: 0,track.id,track.name,batch
0,4CPYZtb4tX2V03jcsJAZCD,Where's Kevin (From 'Overcooked! 2'),1
1,4cmRCH5q4Mp5DKqsGkQ2eu,"Super Mario Theme (From ""Super Mario"")",1
2,5x5JM1BSB6vollcIzDocqT,The Climb,1
3,05mAIVLkIWc2d1UBYZBCp8,1999,1
4,5ojMDcYCghyRb4JSI7JL0Z,Starving,1
...,...,...,...
1184,37dYAkMa4lzRCH6kDbMT1L,We No Speak Americano (Edit),12
1185,3dxDj8pDPlIHCIrUPXuCeG,Sandstorm,12
1186,09cM9BjyNFizKUOXh6j9rT,Sparks Fly,12
1187,0tr6XR58KBdDYd8qvHVTs8,Back To December,12


In [19]:
# Raw audio features are cached on disk: the API is only called when the
# cache file is missing, so the notebook runs top to bottom without edits.
AUDIO_FEATURES_RAW_PATH = './data/raw/audio_features.csv'

if not os.path.exists(AUDIO_FEATURES_RAW_PATH):
    # Collect one normalized frame per batch and concatenate once at the end
    feature_frames = []

    for i in df_unq_tracks['batch'].unique():
        df_batch_i = df_unq_tracks[df_unq_tracks['batch'] == i]
        batch_features = df_batch_i.apply(lambda x: get_audio_features(x['track.id'], access_token), axis = 1)
        feature_frames.append(pd.json_normalize(batch_features))

        print(f'Batch: {i}')
        # Pause between batches to space out the API requests
        time.sleep(30)

    os.makedirs(os.path.dirname(AUDIO_FEATURES_RAW_PATH), exist_ok = True)
    pd.concat(feature_frames).to_csv(AUDIO_FEATURES_RAW_PATH, index = False)

# Always load from the cache so both paths yield the same `df_audio_features`
df_audio_features = pd.read_csv(AUDIO_FEATURES_RAW_PATH)

Batch: 1
Batch: 2
Batch: 3
Batch: 4
Batch: 5
Batch: 6
Batch: 7
Batch: 8
Batch: 9
Batch: 10
Batch: 11
Batch: 12


In [20]:
# Preview the first rows of the audio features table
df_audio_features.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.9,0.316,0,-14.0,0,0.171,0.137,0.757,0.0824,0.972,110.026,audio_features,4CPYZtb4tX2V03jcsJAZCD,spotify:track:4CPYZtb4tX2V03jcsJAZCD,https://api.spotify.com/v1/tracks/4CPYZtb4tX2V...,https://api.spotify.com/v1/audio-analysis/4CPY...,132620,4
1,0.769,0.324,0,-13.916,1,0.43,0.214,0.846,0.0505,0.964,100.123,audio_features,4cmRCH5q4Mp5DKqsGkQ2eu,spotify:track:4cmRCH5q4Mp5DKqsGkQ2eu,https://api.spotify.com/v1/tracks/4cmRCH5q4Mp5...,https://api.spotify.com/v1/audio-analysis/4cmR...,188843,4
2,0.336,0.602,4,-6.354,1,0.0325,0.0635,0.0,0.141,0.299,161.01,audio_features,5x5JM1BSB6vollcIzDocqT,spotify:track:5x5JM1BSB6vollcIzDocqT,https://api.spotify.com/v1/tracks/5x5JM1BSB6vo...,https://api.spotify.com/v1/audio-analysis/5x5J...,234520,4
3,0.739,0.742,7,-4.586,1,0.0329,0.0227,1e-06,0.229,0.659,124.016,audio_features,05mAIVLkIWc2d1UBYZBCp8,spotify:track:05mAIVLkIWc2d1UBYZBCp8,https://api.spotify.com/v1/tracks/05mAIVLkIWc2...,https://api.spotify.com/v1/audio-analysis/05mA...,189000,4
4,0.734,0.614,4,-4.219,1,0.0616,0.374,0.0,0.103,0.533,100.021,audio_features,5ojMDcYCghyRb4JSI7JL0Z,spotify:track:5ojMDcYCghyRb4JSI7JL0Z,https://api.spotify.com/v1/tracks/5ojMDcYCghyR...,https://api.spotify.com/v1/audio-analysis/5ojM...,181880,4


## Get track's audio analysis
Get a low-level audio analysis for a track in the Spotify catalog. The audio analysis describes the track’s structure and musical content, including rhythm, pitch, and timbre. [See Spotify documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-analysis)

In [21]:
# Raw audio analysis is cached on disk: the API is only called when the
# cache file is missing, so the notebook runs top to bottom without edits.
AUDIO_ANALYSIS_RAW_PATH = './data/raw/audio_analysis.csv'

if not os.path.exists(AUDIO_ANALYSIS_RAW_PATH):
    # Collect one normalized frame per batch and concatenate once at the end
    analysis_frames = []

    for i in df_unq_tracks['batch'].unique():
        df_batch_i = df_unq_tracks[df_unq_tracks['batch'] == i]
        batch_analysis = df_batch_i.apply(lambda x: get_audio_analysis(x['track.id'], access_token), axis = 1)
        analysis_frames.append(pd.json_normalize(batch_analysis))

        print(f'Batch: {i}')
        # Pause between batches to space out the API requests
        time.sleep(30)

    # Put the track ID, name and batch columns in front of the analysis columns (aligned by position)
    fetched_analysis = pd.concat(
        [df_unq_tracks.reset_index(drop = True), pd.concat(analysis_frames).reset_index(drop = True)],
        axis = 1
    )

    os.makedirs(os.path.dirname(AUDIO_ANALYSIS_RAW_PATH), exist_ok = True)
    fetched_analysis.to_csv(AUDIO_ANALYSIS_RAW_PATH, index = False)

# Always load from the cache so both paths yield the same `df_audio_analysis`
df_audio_analysis = pd.read_csv(AUDIO_ANALYSIS_RAW_PATH)

Batch: 1
Batch: 2
Batch: 3
Batch: 4
Batch: 5
Batch: 6
Batch: 7
Batch: 8
Batch: 9
Batch: 10
Batch: 11
Batch: 12


In [22]:
# Preview the first rows of the audio analysis table
df_audio_analysis.head()

Unnamed: 0,track.id,track.name,batch,bars,beats,sections,segments,tatums,meta.analyzer_version,meta.platform,...,track.mode,track.mode_confidence,track.codestring,track.code_version,track.echoprintstring,track.echoprint_version,track.synchstring,track.synch_version,track.rhythmstring,track.rhythm_version
0,4CPYZtb4tX2V03jcsJAZCD,Where's Kevin (From 'Overcooked! 2'),1,"[{'start': 0.68467, 'duration': 2.18383, 'conf...","[{'start': 0.13989, 'duration': 0.54478, 'conf...","[{'start': 0.0, 'duration': 18.13542, 'confide...","[{'start': 0.0, 'duration': 0.13338, 'confiden...","[{'start': 0.13989, 'duration': 0.27239, 'conf...",4.0.0,Linux,...,0,0.253,eJxVmVuC5CgMBK_iI_AUcP-LTYRwV9V87GzjwjZIqVQmXn...,3.15,eJzlnQuOJDmPpK_kekvH0fP-R9jPGD0V1QmEcgu5tY3FDj...,4.12,eJxNlwmSLDkIQ6_iI6TB6_0vNtLD1X-iK7oqnV5ACIEjvv...,1.0,eJyNWgm2HDEKu0odwbvN_S82RgLs6vqZmfeSdKcWLyCEwJ...,1.0
1,4cmRCH5q4Mp5DKqsGkQ2eu,"Super Mario Theme (From ""Super Mario"")",1,"[{'start': 1.36865, 'duration': 2.39713, 'conf...","[{'start': 0.76593, 'duration': 0.60272, 'conf...","[{'start': 0.0, 'duration': 16.95381, 'confide...","[{'start': 0.0, 'duration': 0.16259, 'confiden...","[{'start': 0.76593, 'duration': 0.30136, 'conf...",4.0.0,Linux,...,1,0.543,eJxdm9mB6zoMQ1txCdop9d_YHEBelPl4940Ux6YpEgSX5J...,3.15,eJzNnQvSa7mNpLd0-CaXw-f-lzBfQmXLfaNFhUdTUdMOo8...,4.12,eJxVWQmS3CAM_IqfYCQu__9joQ8xm8rWZMZg0NGSGhHxjv...,1.0,eJxtXA2S5LwKu0qOEP9h-_4XWyMJnJ7aqm_fzHR3EtuAEI...,1.0
2,5x5JM1BSB6vollcIzDocqT,The Climb,1,"[{'start': 0.31047, 'duration': 1.56331, 'conf...","[{'start': 0.31047, 'duration': 0.38741, 'conf...","[{'start': 0.0, 'duration': 8.20337, 'confiden...","[{'start': 0.0, 'duration': 0.081, 'confidence...","[{'start': 0.31047, 'duration': 0.1937, 'confi...",4.0.0,Linux,...,1,0.543,eJxVmokRHDcMBFPZEPg_-Sfm7uHJJZXtssjb5ZLAYDAAtW...,3.15,eJzdvQmS7LC1XTslkiBAYjho5z-EvxbKoSspgijr36eww-...,4.12,eJx1WYmV5TYMa8UlmIcO999YRAD0eHY3L5MfW9bBAwRJxf...,1.0,eJxVnIlxYzkMRFNRCLyP_BNbdD_we7ZqvR7L0ieJs9EAXW...,1.0
3,05mAIVLkIWc2d1UBYZBCp8,1999,1,"[{'start': 0.26254, 'duration': 1.96513, 'conf...","[{'start': 0.26254, 'duration': 0.50354, 'conf...","[{'start': 0.0, 'duration': 47.23179, 'confide...","[{'start': 0.0, 'duration': 0.26685, 'confiden...","[{'start': 0.26254, 'duration': 0.25177, 'conf...",4.0.0,Linux,...,1,0.582,eJxVmwcWHbcRBK-yR0AO97-YqxqfIi0_iVz8DcCEnp4GfP...,3.15,eJztnQma5bqRnbdEYsZyMBD7X4L_EywlS-kmrsupaqn9uY...,4.12,eJxdWguW3DAIu4qPEMDf-1-sRhKZbN_uTjOJ4w9gSeC6P-...,1.0,eJxlWwm2JCmOvEocAcQiuP_FRrbgP6vnva7OzAhfQGgxMy...,1.0
4,5ojMDcYCghyRb4JSI7JL0Z,Starving,1,"[{'start': 1.26645, 'duration': 2.40117, 'conf...","[{'start': 0.07445, 'duration': 0.59677, 'conf...","[{'start': 0.0, 'duration': 9.0533, 'confidenc...","[{'start': 0.0, 'duration': 0.31796, 'confiden...","[{'start': 0.07445, 'duration': 0.29839, 'conf...",4.0.0,Linux,...,1,0.335,eJxVmgmy5TYOBK-iI3Bf7n-xySy-dk87HPYnn0SRWAoFgK...,3.15,eJztvWuyJDmOrLkl45tcDp_7X8J8Cq9Oz4pp47k1p1Nu_x...,4.12,eJx1WYe15DgMa0UlWKRi_40dAVCy59_d29kZWYFJYPI3e_...,1.0,eJx1mwuSazmOQ7fiJej_2f_GhsCh7HzVMRFdXVlOX12JIk...,1.0
