# Spotify track features analysis

In [123]:
import os

import pandas as pd
import plotly.express as px
from spotipy import Spotify

## Login and get data

I used the accompanying `login.py` script to generate an access token, which can be supplied directly as the `auth` kwarg to the client initializer.

In [126]:
# Authenticate with an access token. Prefer the SPOTIFY_ACCESS_TOKEN environment
# variable so the token never has to be saved inside the notebook; the inline
# placeholder remains as a fallback for pasting a token in by hand.
sp = Spotify(auth=os.environ.get("SPOTIFY_ACCESS_TOKEN", "REDACTED"))

Start off by getting my playlists, my saved songs, and my most recent plays.

In [127]:
# Ask for the current user's playlists and keep only the list of playlist
# objects from the response.
playlists_response = sp.current_user_playlists()
playlists = playlists_response['items']

In [128]:
# Reduce each playlist object to its ID and display name, one row per playlist.
playlist_rows = []
for playlist in playlists:
    playlist_rows.append({"ID": playlist['id'], "Name": playlist['name']})

plists = pd.DataFrame(playlist_rows)
plists.head()

Unnamed: 0,ID,Name
0,05uRh5eogpZNyLGXLzLPrW,buffalo
1,4po0sG9mdx1phmlhH8adQL,dark and wavvy
2,21k0bkuYPlawkg6w2429rn,where's jon - part i
3,06zvFArrpSrYTtCqz69nDy,songs
4,5FwP3qXtWiiXlMALrhWsm3,synthy?


In [129]:
# Fetch the current user's recently played tracks. This is the raw API
# response; the played items are read from its 'items' key further down.
recents = sp.current_user_recently_played()

In [130]:
# Flatten every recently played item into one row of track metadata.
recent_rows = []
for played in recents['items']:
    track = played['track']
    performers = track['artists']
    recent_rows.append({
        # Names are joined for display; IDs are joined without a space.
        "Artists": ", ".join(artist['name'] for artist in performers),
        "Artist IDs": ",".join(artist['id'] for artist in performers),
        "Name": track['name'],
        "Track ID": track['id'],
        "Popularity": track['popularity'],
        "Album ID": track['album']['id'],
    })

# `recents` is rebound here from the raw response to its tabular form.
recents = pd.DataFrame(recent_rows)
recents.head()

Unnamed: 0,Artists,Artist IDs,Name,Track ID,Popularity,Album ID
0,Veeshy,4I9gvXHCMWMzarL8yOyjIL,Beverly Blues,13NCh4WiC6l7AGyWDMvLqh,20,2JRlWH7ozsFUDU9aOrqeM0
1,"Veeshy, Phonic Youth","4I9gvXHCMWMzarL8yOyjIL,19j2mxQkCDl6jNS9s6bf3D",Sunset Strip,54leWgkYwmwTGXf97IyQAZ,35,2JRlWH7ozsFUDU9aOrqeM0
2,Veeshy,4I9gvXHCMWMzarL8yOyjIL,Re-L.A.X.,6Ig7e0deA3hFCuOQtPXjDk,20,2JRlWH7ozsFUDU9aOrqeM0
3,Veeshy,4I9gvXHCMWMzarL8yOyjIL,Mulholland Drive,0bWRGbQRGJeMZnTwryl09m,23,2JRlWH7ozsFUDU9aOrqeM0
4,"Veeshy, Brandon Mignacca","4I9gvXHCMWMzarL8yOyjIL,5k1EecpyjkutxtmZKlsBOj",Let Me Down,1njLPXRavhN7KTt7nRLO3X,27,2JRlWH7ozsFUDU9aOrqeM0


In [131]:
# Page through the user's saved tracks until the whole library is collected.
page_size = 50
saved_songs_res = sp.current_user_saved_tracks(limit=page_size)
# Copy the first page so later extends do not mutate the response object.
saved_tracks = list(saved_songs_res['items'])
total = saved_songs_res['total']
# Remaining pages start at offsets page_size, 2 * page_size, ... strictly below
# the reported total. Looping up to `total + page_size` always issued one extra
# request that started past the end of the library.
for off in range(page_size, total, page_size):
    next_res = sp.current_user_saved_tracks(offset=off, limit=page_size)
    saved_tracks.extend(next_res['items'])

In [132]:
# Turn each saved-library entry into a flat record of track metadata.
# Artist names are joined with ", " for display; artist IDs with a bare ",".
_saved_tracks = [
    {
        "Artists": ", ".join(artist['name'] for artist in entry['track']['artists']),
        "Artist IDs": ",".join(artist['id'] for artist in entry['track']['artists']),
        "Name": entry['track']['name'],
        "Track ID": entry['track']['id'],
        "Popularity": entry['track']['popularity'],
        "Album ID": entry['track']['album']['id'],
    }
    for entry in saved_tracks
]

# `saved_tracks` is rebound from the list of raw items to a DataFrame.
saved_tracks = pd.DataFrame(_saved_tracks)
saved_tracks.head()

Unnamed: 0,Artists,Artist IDs,Name,Track ID,Popularity,Album ID
0,Grum,3VEqFWRt47xQAZJMBF3duQ,Lose Control,0bqjS54zmDlYanW8mlx28k,54,52beQVaTRkEsdip6N4nWZo
1,KASHIWA Daisuke,5sGsy5o8hBSMmDUFTC5Q2P,april.#02,6P3bha1HLEaiwaDA5SlTRB,19,2Gl1cVFZAs3DX3uvaTHCHG
2,Memtrix,3bffaBH7akOhsSLsVJcJug,All You Are,1Pz2FNGmyvRYnlKd58ANEW,52,4CF7zjsdRMyUjjvQ2cvb6l
3,"Sound Quelle, Matt Fax","5mdTuNl23tON1WlsVbvD18,1XgI1X3xjXCKRP1ZjhqgkV",Sunburst,67b4kyYW9PcSu9ChwywKYe,36,1MYWKh9RzeEcToi4chhpoC
4,"Last Heroes, Satellite Empire","3HHfEn7yPOy3IiHS6CHG97,1DGpuIJ6KAI5bcaFGbJZJs",Take Your Time (feat. Satellite Empire),2piFAX26ski1mMmhWGnZQ8,35,4oQg6odo4r4sOG1GbG5TDk


In [133]:
# Sanity check: (number of saved tracks, number of metadata columns).
saved_tracks.shape

(286, 6)

## Audio features

Now that we have basic track info, we can make more requests for some more granular track features. That's the data we _really_ want.

In [134]:
# Request audio features for the saved tracks in batches of `page_size` IDs.
track_ids = saved_tracks['Track ID'].tolist()
features = []
page_size = 100
# NOTE: the range runs one page past the end of `track_ids`, so the final
# request is made with an empty ID slice; whatever that request returns is
# appended to `features` as well.
for start in range(0, len(track_ids) + page_size, page_size):
    batch_ids = track_ids[start:start + page_size]
    features.extend(sp.audio_features(batch_ids))

Convert to a dataframe and take the interesting columns.

In [135]:
# Build the audio-feature table for the saved tracks.
# Keep only real feature records: entries that carry no data (e.g. `None`) are
# dropped wherever they occur, instead of assuming exactly one junk entry sits
# at the very end of the list.
feature_records = [record for record in features if record]
features_df = pd.DataFrame(feature_records)
# Restrict to the identifier plus the numeric descriptors used in the analysis.
features_df = features_df[['id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]
features_df.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0bqjS54zmDlYanW8mlx28k,0.542,0.857,2,-6.133,0,0.0352,0.00113,0.571,0.0779,0.162,128.011,337500,4
1,6P3bha1HLEaiwaDA5SlTRB,0.387,0.47,0,-8.508,0,0.0476,0.613,0.859,0.133,0.219,119.95,1662987,4
2,1Pz2FNGmyvRYnlKd58ANEW,0.349,0.696,3,-3.274,0,0.0362,0.0137,0.00407,0.116,0.0782,173.901,330862,4
3,67b4kyYW9PcSu9ChwywKYe,0.638,0.823,0,-9.17,1,0.0401,0.0379,0.902,0.187,0.0384,124.017,210968,4
4,2piFAX26ski1mMmhWGnZQ8,0.475,0.44,2,-7.699,1,0.0297,0.126,3e-06,0.107,0.0584,95.043,267790,4


Do the same thing for the recent tracks.

In [136]:
# Audio features for the recently played tracks, fetched in a single request.
track_ids = recents['Track ID']
features = sp.audio_features(track_ids.tolist())

# Same column selection as used for the saved-track features.
feature_columns = [
    'id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
]
recents_features_df = pd.DataFrame(features)[feature_columns]
recents_features_df.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,13NCh4WiC6l7AGyWDMvLqh,0.363,0.893,1,-8.242,1,0.0309,0.0016,0.934,0.355,0.0557,90.461,350000,4
1,54leWgkYwmwTGXf97IyQAZ,0.428,0.862,8,-7.327,1,0.0381,7.4e-05,0.0101,0.213,0.325,89.5,281564,4
2,6Ig7e0deA3hFCuOQtPXjDk,0.174,0.635,11,-9.17,0,0.0281,0.00128,0.924,0.142,0.103,90.018,346750,4
3,0bWRGbQRGJeMZnTwryl09m,0.349,0.947,4,-7.222,0,0.0421,0.00414,0.803,0.25,0.0437,178.028,350500,4
4,1njLPXRavhN7KTt7nRLO3X,0.487,0.71,2,-8.642,0,0.0314,0.0134,2.3e-05,0.114,0.176,88.467,293000,4


Combine the features from my library and my recents into one dataframe, and drop duplicates based on track ID.

In [137]:
# Stack the library features on top of the recents features (row-wise).
feature_frames = [features_df, recents_features_df]
master_features_df = pd.concat(feature_frames)

In [138]:
# A track can appear in both sources (or more than once in recents); keep the
# first row seen for every track ID.
master_features_df = master_features_df.drop_duplicates(subset=['id'], keep='first')
master_features_df.shape

(311, 14)

Merge in the human-readable information (song name, artists' names, etc.).

In [139]:
# Human-readable metadata for every track, one row per track ID.
# The features table was de-duplicated by track ID, so the metadata must be
# de-duplicated as well: otherwise the right join below emits one row per
# *occurrence* of a track (saved and recently played, or played repeatedly)
# and every plot double-counts those tracks.
human_song_info = pd.concat([recents, saved_tracks]).drop_duplicates(subset='Track ID')
# Right join keeps every track that has metadata, even if it has no features.
master_df = master_features_df.merge(right=human_song_info, how='right', left_on='id', right_on='Track ID')
# Label shown when hovering over a point in the plots: "<artists> - <title>".
master_df['hoverdata'] = master_df['Artists'] + " - " + master_df["Name"]
master_df.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,tempo,duration_ms,time_signature,Artists,Artist IDs,Name,Track ID,Popularity,Album ID,hoverdata
0,13NCh4WiC6l7AGyWDMvLqh,0.363,0.893,1,-8.242,1,0.0309,0.0016,0.934,0.355,...,90.461,350000,4,Veeshy,4I9gvXHCMWMzarL8yOyjIL,Beverly Blues,13NCh4WiC6l7AGyWDMvLqh,20,2JRlWH7ozsFUDU9aOrqeM0,Veeshy - Beverly Blues
1,54leWgkYwmwTGXf97IyQAZ,0.428,0.862,8,-7.327,1,0.0381,7.4e-05,0.0101,0.213,...,89.5,281564,4,"Veeshy, Phonic Youth","4I9gvXHCMWMzarL8yOyjIL,19j2mxQkCDl6jNS9s6bf3D",Sunset Strip,54leWgkYwmwTGXf97IyQAZ,35,2JRlWH7ozsFUDU9aOrqeM0,"Veeshy, Phonic Youth - Sunset Strip"
2,6Ig7e0deA3hFCuOQtPXjDk,0.174,0.635,11,-9.17,0,0.0281,0.00128,0.924,0.142,...,90.018,346750,4,Veeshy,4I9gvXHCMWMzarL8yOyjIL,Re-L.A.X.,6Ig7e0deA3hFCuOQtPXjDk,20,2JRlWH7ozsFUDU9aOrqeM0,Veeshy - Re-L.A.X.
3,0bWRGbQRGJeMZnTwryl09m,0.349,0.947,4,-7.222,0,0.0421,0.00414,0.803,0.25,...,178.028,350500,4,Veeshy,4I9gvXHCMWMzarL8yOyjIL,Mulholland Drive,0bWRGbQRGJeMZnTwryl09m,23,2JRlWH7ozsFUDU9aOrqeM0,Veeshy - Mulholland Drive
4,1njLPXRavhN7KTt7nRLO3X,0.487,0.71,2,-8.642,0,0.0314,0.0134,2.3e-05,0.114,...,88.467,293000,4,"Veeshy, Brandon Mignacca","4I9gvXHCMWMzarL8yOyjIL,5k1EecpyjkutxtmZKlsBOj",Let Me Down,1njLPXRavhN7KTt7nRLO3X,27,2JRlWH7ozsFUDU9aOrqeM0,"Veeshy, Brandon Mignacca - Let Me Down"


## Plotting the data

Finally, let's make some cool graphs.

In [140]:
# Two-stop colour scale (red to blue); the later plots reuse it.
color_scheme = ['#ff0000', '#0000ff']

# 3D scatter of danceability / energy / speechiness, coloured by valence.
fig = px.scatter_3d(
    master_df,
    x='danceability',
    y='energy',
    z='speechiness',
    width=800,
    height=600,
    color='valence',
    color_continuous_scale=color_scheme,
    hover_name='hoverdata',
)

# Place the camera on the cube diagonal and shift the view centre downwards.
zoom_level = 1.3
camera = {
    "eye": {"x": zoom_level, "y": zoom_level, "z": zoom_level},
    "center_z": -0.25,
}
fig.update_layout(
    scene_camera=camera,
    title='Speechiness, Energy, and Danceability of saved tracks and most recent 50 tracks',
    margin={"b": 0, "l": 0, "r": 0, "t": 50},
)
fig.show()

In [141]:
# 2D density of danceability against energy on a 20 x 20 grid of bins.
fig = px.density_heatmap(
    master_df,
    x='danceability',
    y='energy',
    width=800,
    height=800,
    nbinsx=20,
    nbinsy=20,
    color_continuous_scale=color_scheme,
)
fig.update_layout(
    title='Danceability and energy histogram of saved tracks and most recent 50 tracks',
)
fig.show()

In [142]:
# Distribution of track tempo across the combined library and recents.
fig = px.histogram(
    master_df,
    x='tempo',
    width=800,
    height=800,
)
fig.update_layout(
    title='Tempo histogram of saved tracks and most recent 50 tracks',
)
fig.show()

## Playlists

Let's generate the same plots, but this time for a specific playlist, and see how it compares.

In [143]:
# Spotify ID of the playlist to analyse; it is passed to `sp.playlist` below.
playlist_id = '2fYyFd58VaANjZ1uDAOsfZ'

In [144]:
# Fetch the playlist and flatten its track entries into one row per track.
pl = sp.playlist(playlist_id)
_playlist_tracks = list()
for i in pl['tracks']['items']:
    t = i['track']
    # Playlist entries can come without a track object (e.g. a track that is
    # no longer available) or with a track that has no Spotify ID (e.g. a local
    # file). Neither can be looked up for audio features, so skip them here
    # instead of crashing on the subscript below or in the features request.
    if not t or not t.get('id'):
        continue
    artists = ", ".join(x['name'] for x in t['artists'])
    artist_ids = ",".join(x['id'] for x in t['artists'])
    name = t['name']
    track_id = t['id']
    popularity = t['popularity']
    album_id = t['album']['id']
    _playlist_tracks.append({
        "Artists": artists,
        "Artist IDs": artist_ids,
        "Name": name,
        "Track ID": track_id,
        "Popularity": popularity,
        "Album ID": album_id
    })

pl_df = pd.DataFrame(_playlist_tracks)
pl_df.head()

Unnamed: 0,Artists,Artist IDs,Name,Track ID,Popularity,Album ID
0,"Matt Fax, Jack Dawson","1XgI1X3xjXCKRP1ZjhqgkV,2UrCf8LmvXFfXAR0HtGkSx",Close My Eyes,2eQfD774357Ymh9NzCl5T2,44,3NPt95B03k5zZptWmquiJu
1,"Anyma, Meg Myers","4iBwchw0U0GZv5RfVYSMxN,0W8xe7IqAPlnBRMUpWOUuJ",Running (feat. Meg Myers) - Extended Mix,2T5lAs95i1Gx6FjYYJZ1zr,31,16Kon7rTm0D4jOwEu2me2c
2,"Lipless, Blue Noir","0XmmX4fE4SiRMu3ICsP5sA,5N4aUAJA011nkAumBGSeAN",Cimmerian,0dmXGQ2zRVOm6N60S0UdHK,47,1a0g4P7lv4e9Mw9IjNxNLf
3,Nora En Pure,24DO0PijjITGIEWsO8XaPs,Reminiscing,1jCfoQwzEeIuawlCbhM1Kp,55,1qBiRuMAIssv7uT0xGksv3
4,Klur,5Y1YwWzFX7BIxBbdAOXOEJ,Between,4ZJXmXQeC0qL1nPVAWNRGd,39,6yMAZmRCZjavW98B0aF9Of


In [145]:
# Sanity check: (number of playlist tracks, number of metadata columns).
pl_df.shape

(80, 6)

In [146]:
# Audio features for every track of the playlist, fetched in one request.
pl_track_ids = pl_df['Track ID'].tolist()
# Keep every real feature record. Slicing with `[:-1]` unconditionally threw
# away the last track's features (one request for N IDs yields N entries, so
# there is no trailing extra element here); only entries that carry no data
# (e.g. `None`) are dropped.
pl_track_features = [record for record in sp.audio_features(pl_track_ids) if record]
pl_features_df = pd.DataFrame(pl_track_features)[['id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]
pl_features_df.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,2eQfD774357Ymh9NzCl5T2,0.434,0.841,10,-6.182,0,0.0382,0.000664,6e-06,0.115,0.233,123.899,222581,4
1,2T5lAs95i1Gx6FjYYJZ1zr,0.699,0.803,9,-7.437,0,0.0435,0.0347,0.693,0.115,0.0289,124.979,385437,4
2,0dmXGQ2zRVOm6N60S0UdHK,0.702,0.875,4,-6.652,1,0.0435,0.215,0.922,0.102,0.0509,120.998,216198,4
3,1jCfoQwzEeIuawlCbhM1Kp,0.612,0.848,0,-7.733,0,0.0587,0.129,0.917,0.275,0.0383,123.063,258537,4
4,4ZJXmXQeC0qL1nPVAWNRGd,0.674,0.606,10,-10.777,1,0.0354,0.359,0.802,0.108,0.238,122.011,302184,4


In [147]:
# Attach the human-readable metadata to each feature row by track ID
# (default join: only tracks present in both tables are kept).
pl_master_df = pl_features_df.merge(
    right=pl_df,
    left_on='id',
    right_on='Track ID',
)
# Hover label for the plots: "<artists> - <title>".
hover_labels = pl_master_df['Artists'] + " - " + pl_master_df["Name"]
pl_master_df['hoverdata'] = hover_labels
pl_master_df.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,tempo,duration_ms,time_signature,Artists,Artist IDs,Name,Track ID,Popularity,Album ID,hoverdata
0,2eQfD774357Ymh9NzCl5T2,0.434,0.841,10,-6.182,0,0.0382,0.000664,6e-06,0.115,...,123.899,222581,4,"Matt Fax, Jack Dawson","1XgI1X3xjXCKRP1ZjhqgkV,2UrCf8LmvXFfXAR0HtGkSx",Close My Eyes,2eQfD774357Ymh9NzCl5T2,44,3NPt95B03k5zZptWmquiJu,"Matt Fax, Jack Dawson - Close My Eyes"
1,2T5lAs95i1Gx6FjYYJZ1zr,0.699,0.803,9,-7.437,0,0.0435,0.0347,0.693,0.115,...,124.979,385437,4,"Anyma, Meg Myers","4iBwchw0U0GZv5RfVYSMxN,0W8xe7IqAPlnBRMUpWOUuJ",Running (feat. Meg Myers) - Extended Mix,2T5lAs95i1Gx6FjYYJZ1zr,31,16Kon7rTm0D4jOwEu2me2c,"Anyma, Meg Myers - Running (feat. Meg Myers) -..."
2,0dmXGQ2zRVOm6N60S0UdHK,0.702,0.875,4,-6.652,1,0.0435,0.215,0.922,0.102,...,120.998,216198,4,"Lipless, Blue Noir","0XmmX4fE4SiRMu3ICsP5sA,5N4aUAJA011nkAumBGSeAN",Cimmerian,0dmXGQ2zRVOm6N60S0UdHK,47,1a0g4P7lv4e9Mw9IjNxNLf,"Lipless, Blue Noir - Cimmerian"
3,1jCfoQwzEeIuawlCbhM1Kp,0.612,0.848,0,-7.733,0,0.0587,0.129,0.917,0.275,...,123.063,258537,4,Nora En Pure,24DO0PijjITGIEWsO8XaPs,Reminiscing,1jCfoQwzEeIuawlCbhM1Kp,55,1qBiRuMAIssv7uT0xGksv3,Nora En Pure - Reminiscing
4,4ZJXmXQeC0qL1nPVAWNRGd,0.674,0.606,10,-10.777,1,0.0354,0.359,0.802,0.108,...,122.011,302184,4,Klur,5Y1YwWzFX7BIxBbdAOXOEJ,Between,4ZJXmXQeC0qL1nPVAWNRGd,39,6yMAZmRCZjavW98B0aF9Of,Klur - Between


In [148]:
# 3D scatter for the playlist: danceability / energy / speechiness, coloured
# by valence using the shared colour scale.
scatter_options = dict(
    x='danceability',
    y='energy',
    z='speechiness',
    width=800,
    height=600,
    color='valence',
    color_continuous_scale=color_scheme,
    hover_name='hoverdata',
)
fig = px.scatter_3d(pl_master_df, **scatter_options)

# Camera sits on the cube diagonal with the view centre shifted downwards.
zoom_level = 1.3
camera = {
    "eye": {"x": zoom_level, "y": zoom_level, "z": zoom_level},
    "center_z": -0.25,
}
fig.update_layout(
    scene_camera=camera,
    title='Speechiness, Energy, and Danceability of playlist',
    margin={"b": 0, "l": 0, "r": 0, "t": 50},
)
fig.show()

In [149]:
# 2D density of danceability against energy for the playlist, with the
# library's default binning.
fig = px.density_heatmap(
    pl_master_df,
    x='danceability',
    y='energy',
    width=800,
    height=800,
    color_continuous_scale=color_scheme,
)
fig.update_layout(title='Danceability and energy histogram of playlist')
fig.show()

In [150]:
# Distribution of track tempo within the playlist.
fig = px.histogram(
    pl_master_df,
    x='tempo',
    width=800,
    height=800,
)
fig.update_layout(title='Tempo histogram of playlist')
fig.show()