## Loading credentials from the config file

In [1]:
import config
import pandas as pd
import requests

## Initialising Spotipy API

In [2]:
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials


# Initialise the Spotipy client with the client ID and secret read from config
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=config.client_id,
        client_secret=config.client_secret,
    )
)

## Collecting songs from Spotify's public playlists

The first step is building a database of the songs that will be offered to the user as recommendations. It is important to include songs from many different genres so that, whatever song the user inputs, the database is likely to contain a suitable match.

To speed up gathering songs, I opted to 'scrape' Spotify's own public playlists, assuming that they would contain a wide range of songs across different genres. According to the following webpage, there are 172 playlists curated by Spotify: https://open.spotify.com/user/spotify/playlists.

In [4]:
# Page through the public playlists of the user "spotify" and collect each playlist's ID
playlist_ids = []
offset = 0
limit = 50  # number of playlists requested per page
more_pages = True
while more_pages:
    playlists = sp.user_playlists(user='spotify', limit=limit, offset=offset)
    for playlist in playlists['items']:
        playlist_ids.append(playlist['id'])
    offset += limit
    # a falsy 'next' entry means the page just fetched was the last one
    more_pages = bool(playlists['next'])

In [5]:
# Keep only the first 172 playlist IDs and display them
n_playlists = 172
playlist_ids = playlist_ids[:n_playlists]
playlist_ids

['37i9dQZF1DXcBWIGoYBM5M',
 '37i9dQZF1DX0XUsuxWHRQd',
 '37i9dQZF1DX1lVhptIYRda',
 '37i9dQZF1DX10zKzsJ2jva',
 '37i9dQZF1DX4JAvHpjipBk',
 '37i9dQZF1DX4sWSpwq3LiO',
 '37i9dQZF1DX4SBhb3fqCJd',
 '37i9dQZF1DWXRqgorJj26U',
 '37i9dQZF1DX4dyzvuaRJ0n',
 '37i9dQZF1DXcF6B6QPhFDv',
 '37i9dQZF1DWXJfnUiYjUKT',
 '37i9dQZF1DXcRXFNfZr7Tp',
 '37i9dQZF1DX4o1oenSJRJd',
 '37i9dQZF1DXbTxeAdrVG2l',
 '37i9dQZF1DX4UtSsGT1Sbe',
 '37i9dQZF1DWTJ7xPn4vNaz',
 '37i9dQZF1DXaKIA8E7WcJj',
 '37i9dQZF1DWSV3Tk4GO2fq',
 '37i9dQZF1DWTwnEm1IYyoj',
 '37i9dQZF1DX2A29LI7xHn1',
 '37i9dQZF1DX2RxBh64BHjQ',
 '37i9dQZF1DWVA1Gq4XHa6U',
 '37i9dQZF1DWY4xHQp97fN6',
 '37i9dQZF1DWX3387IZmjNa',
 '37i9dQZF1DWYkaDif7Ztbp',
 '37i9dQZF1DXan38dNVDdl4',
 '37i9dQZF1DWSvKsRPPnv5o',
 '37i9dQZF1DWUVpAXiEPK8P',
 '37i9dQZF1DX0Tkc6ltcBfU',
 '37i9dQZF1DX1YPTAhwehsC',
 '37i9dQZF1DWTggY0yqBxES',
 '37i9dQZF1DX0HRj9P7NxeE',
 '37i9dQZF1DWT6SJaitNDax',
 '37i9dQZF1DX2r0FByV5U4C',
 '37i9dQZF1DWT2jS7NwYPVI',
 '37i9dQZF1DX82GYcclJ3Ug',
 '37i9dQZF1DX49jUV2NfGku',
 

## Defining functions to get audio features from playlist

With the IDs of the selected playlists in hand, we need two functions that:

1. Retrieve all tracks from the playlists
2. Extract audio features for the tracks

In [6]:
def get_playlist_tracks(username, playlist_id):
    """Return every item of a playlist, following pagination to the last page.

    Parameters
    ----------
    username : str
        Kept so existing callers keep working; it is not used, because the
        playlist is looked up by its ID alone.
    playlist_id : str
        ID of the playlist whose items are fetched.

    Returns
    -------
    list
        The ``'items'`` entries of all result pages, in page order.
    """
    # `user_playlist_tracks` is deprecated in Spotipy; `playlist_items` is its
    # replacement. Restricting `additional_types` to tracks keeps the result
    # limited to track items, as the deprecated call did.
    results = sp.playlist_items(playlist_id, additional_types=('track',))
    # Copy the first page so extending the result never mutates the API response.
    tracks = list(results['items'])
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    return tracks

In [7]:
def get_audio_features(track_ids):
    """Fetch audio features for the given track IDs in batches of 50.

    Parameters
    ----------
    track_ids : list
        Track IDs to look up.

    Returns
    -------
    list
        The audio-feature records returned by the API, in request order;
        falsy entries (e.g. ``None``) are left out.
    """
    batch_size = 50
    collected = []
    for start in range(0, len(track_ids), batch_size):
        chunk = track_ids[start:start + batch_size]
        # skip entries for which the API returned nothing
        collected.extend(record for record in sp.audio_features(chunk) if record)
    return collected

## Creating a database with all songs in Spotify playlists

In [8]:
import time

start_time = time.time()

# Gather the items of every selected playlist
tracks = []
counter = 0  # stays 0 when there are no playlists to process
for counter, playlist_id in enumerate(playlist_ids, start=1):
    tracks.extend(get_playlist_tracks('spotify', playlist_id))
    if counter % 50 == 0:
        # pause for 5 seconds after every 50th playlist
        time.sleep(5)

# Drop playlist items that came back as None
tracks = [item for item in tracks if item is not None]

# Keep the ID of every item that carries track data and whose ID is a string
track_ids = [
    item['track']['id']
    for item in tracks
    if item.get('track') and isinstance(item['track']['id'], str)
]
audio_features = get_audio_features(track_ids)

# One DataFrame row per audio-features record
df = pd.DataFrame(audio_features)

end_time = time.time()

print("Time taken:", end_time - start_time, "seconds")


Time taken: 122.13358116149902 seconds


In [9]:
# Display the DataFrame built from the audio features
df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.696,0.8090,5,-8.254,1,0.0500,0.2520,0.000128,0.2480,0.8570,132.962,audio_features,6AQbmUe0Qwf5PZnt4HmTXv,spotify:track:6AQbmUe0Qwf5PZnt4HmTXv,https://api.spotify.com/v1/tracks/6AQbmUe0Qwf5...,https://api.spotify.com/v1/audio-analysis/6AQb...,131013,4
1,0.707,0.6810,0,-4.325,1,0.0668,0.0632,0.000005,0.0322,0.6460,117.999,audio_features,0yLdNVWF3Srea0uzk55zFn,spotify:track:0yLdNVWF3Srea0uzk55zFn,https://api.spotify.com/v1/tracks/0yLdNVWF3Sre...,https://api.spotify.com/v1/audio-analysis/0yLd...,200455,4
2,0.644,0.7350,8,-5.747,1,0.0391,0.0521,0.144000,0.1610,0.4180,88.980,audio_features,1Qrg8KqiBpW07V7PNxwwwL,spotify:track:1Qrg8KqiBpW07V7PNxwwwL,https://api.spotify.com/v1/tracks/1Qrg8KqiBpW0...,https://api.spotify.com/v1/audio-analysis/1Qrg...,153947,4
3,0.538,0.7420,2,-5.355,1,0.1140,0.1380,0.000047,0.0934,0.2500,96.107,audio_features,5Z2MiIZ5I3jJvvmeWMLbOQ,spotify:track:5Z2MiIZ5I3jJvvmeWMLbOQ,https://api.spotify.com/v1/tracks/5Z2MiIZ5I3jJ...,https://api.spotify.com/v1/audio-analysis/5Z2M...,272373,4
4,0.715,0.6200,1,-6.005,0,0.0484,0.4170,0.000000,0.0822,0.1720,97.950,audio_features,2dHHgzDwk4BJdRwy9uXhTO,spotify:track:2dHHgzDwk4BJdRwy9uXhTO,https://api.spotify.com/v1/tracks/2dHHgzDwk4BJ...,https://api.spotify.com/v1/audio-analysis/2dHH...,221520,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15374,0.261,0.0585,5,-22.054,0,0.0473,0.9870,0.872000,0.0887,0.1380,67.242,audio_features,2IAnU53rak76TpGpA515Xl,spotify:track:2IAnU53rak76TpGpA515Xl,https://api.spotify.com/v1/tracks/2IAnU53rak76...,https://api.spotify.com/v1/audio-analysis/2IAn...,439213,4
15375,0.322,0.1290,1,-20.671,1,0.0437,0.9920,0.003620,0.1010,0.0449,129.018,audio_features,4zPZbj1sNxCOtU1CdgCK53,spotify:track:4zPZbj1sNxCOtU1CdgCK53,https://api.spotify.com/v1/tracks/4zPZbj1sNxCO...,https://api.spotify.com/v1/audio-analysis/4zPZ...,260187,4
15376,0.311,0.1580,10,-16.724,1,0.0380,0.9690,0.901000,0.2400,0.1700,96.367,audio_features,2qYu0I9yKKpGnbySmN6w3K,spotify:track:2qYu0I9yKKpGnbySmN6w3K,https://api.spotify.com/v1/tracks/2qYu0I9yKKpG...,https://api.spotify.com/v1/audio-analysis/2qYu...,427653,4
15377,0.456,0.1090,10,-20.268,1,0.0604,0.9940,0.000266,0.0836,0.2400,68.848,audio_features,1xU1yjwGAGCCE3RiB8YKLe,spotify:track:1xU1yjwGAGCCE3RiB8YKLe,https://api.spotify.com/v1/tracks/1xU1yjwGAGCC...,https://api.spotify.com/v1/audio-analysis/1xU1...,201640,5


In [10]:
# Write the audio-features table to CSV, leaving out the index column
output_path = 'Data/df_top_playlists_all.csv'
df.to_csv(output_path, index=False)