In [1]:
import os
os.environ["ROOT"] = "/Users/per.morten.halvorsen@schibsted.com/personal"
os.chdir(os.environ["ROOT"])
os.environ.get("SPOTIFY_REDIRECT_URI")

'https://perhalvorsen.com/radio'

If the below cell doesn't work, go through the steps in the notebook `authorize.ipynb`. 

In [2]:
from website.radio.worker.authorize import get_token
TOKEN = get_token()
assert TOKEN

In [3]:
# other imports 
import numpy as np
import pandas as pd
import plotly.express as px
import requests

from dash import Dash, dcc, html, Input, Output, callback
from datetime import date 
from typing import List

pd.options.plotting.backend = "plotly"

# Track Audio Features

We will explore the returned audio features for a track.
For more info on these data, see: https://developer.spotify.com/documentation/web-api/reference/get-several-audio-features


Example request:
```bash 
curl --request GET \
  --url 'https://api.spotify.com/v1/audio-features?ids=7ouMYWpwJ422jRcDASZB7P%2C4VqPOruhp5EdPBeR92t6lQ%2C2takcwOaAZWiXQijPHIx7B' \
  --header 'Authorization: Bearer 1POdFZRZbvb...qqillRxMr2z'
```


In [4]:
def make_headers(token=TOKEN):
    return {"Authorization": f"Bearer {token}", "Content-Type": "application/x-www-form-urlencoded"}


def get_audio_features(ids, token=TOKEN):
    endpoint = "https://api.spotify.com/v1/audio-features"
    params = {
        "ids": ",".join(ids)
    }
    headers = make_headers(token)
    return requests.get(
        url=endpoint,
        params=params,
        headers=headers
    ).json().get("audio_features")

ids = ["7ouMYWpwJ422jRcDASZB7P", "4VqPOruhp5EdPBeR92t6lQ"]
names = ["Knights of Cydonia", "Uprising"]

example_features = get_audio_features(ids)
example_features

[{'danceability': 0.366,
  'energy': 0.963,
  'key': 11,
  'loudness': -5.301,
  'mode': 0,
  'speechiness': 0.142,
  'acousticness': 0.000273,
  'instrumentalness': 0.0122,
  'liveness': 0.115,
  'valence': 0.211,
  'tempo': 137.114,
  'type': 'audio_features',
  'id': '7ouMYWpwJ422jRcDASZB7P',
  'uri': 'spotify:track:7ouMYWpwJ422jRcDASZB7P',
  'track_href': 'https://api.spotify.com/v1/tracks/7ouMYWpwJ422jRcDASZB7P',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7ouMYWpwJ422jRcDASZB7P',
  'duration_ms': 366213,
  'time_signature': 4},
 {'danceability': 0.602,
  'energy': 0.905,
  'key': 2,
  'loudness': -4.046,
  'mode': 1,
  'speechiness': 0.0775,
  'acousticness': 0.000202,
  'instrumentalness': 0.064,
  'liveness': 0.117,
  'valence': 0.411,
  'tempo': 128.019,
  'type': 'audio_features',
  'id': '4VqPOruhp5EdPBeR92t6lQ',
  'uri': 'spotify:track:4VqPOruhp5EdPBeR92t6lQ',
  'track_href': 'https://api.spotify.com/v1/tracks/4VqPOruhp5EdPBeR92t6lQ',
  'analysis_url': 'htt

In [5]:
def get_audio_analysis(id, token=TOKEN):
    endpoint = f"https://api.spotify.com/v1/audio-analysis/{id}"
    headers = make_headers(token)
    return requests.get(
        url=endpoint,
        headers=headers
    ).json()

example_analysis = get_audio_analysis(ids[0])
example_analysis

{'meta': {'analyzer_version': '4.0.0',
  'platform': 'Linux',
  'detailed_status': 'OK',
  'status_code': 0,
  'timestamp': 1571738086,
  'analysis_time': 12.0111,
  'input_process': 'libvorbisfile L+R 44100->22050'},
 'track': {'num_samples': 8074998,
  'duration': 366.21307,
  'sample_md5': '',
  'offset_seconds': 0,
  'window_seconds': 0,
  'analysis_sample_rate': 22050,
  'analysis_channels': 1,
  'end_of_fade_in': 2.66449,
  'start_of_fade_out': 353.7676,
  'loudness': -5.301,
  'tempo': 137.114,
  'tempo_confidence': 0.181,
  'time_signature': 4,
  'time_signature_confidence': 0.982,
  'key': 11,
  'key_confidence': 0.649,
  'mode': 0,
  'mode_confidence': 0.485,
  'codestring': 'eJxVnAuW7LCNZLeSSxD_5P431vcG6rntnmOPxVJKFAkEAgHwzbNXe_2O3_fb47z31rq_cefv3jb7vnf_9uRvd-_-5uk_bs7fzhxn_9rX5--8uUe_k8u2-m-efftbrf9anzxs3Pad1tevjTn967iD5_3avfe3et9rf3v_euMXY8_Bm8799cWLuHl-b4zv1089ebV75vn1e7_ffevN19f8je9zVvv1ts_3G310p8YkXntcT2bZe-vfHvPxkrZ_77ud_zfOb7xv8oFjfe_x48l6_NbtzKJ5Ofr3e2Pe9fHq35z8lrHH3

Audio analysis currently not needed, since we are not allowed to train any models on Spotify data. 
We instead want to focus on track meta data (artist, name, genre, etc.).

In [6]:
def get_track(id, token=TOKEN):
    endpoint = f"https://api.spotify.com/v1/tracks/"
    params = {
        "ids": ",".join(ids)
    }
    headers = make_headers(token)
    return requests.get(
        url=endpoint,
        params=params,
        headers=headers
    ).json()

get_track(ids[0])

{'tracks': [{'album': {'album_type': 'album',
    'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/12Chz98pHFMPJEknJQMWvI'},
      'href': 'https://api.spotify.com/v1/artists/12Chz98pHFMPJEknJQMWvI',
      'id': '12Chz98pHFMPJEknJQMWvI',
      'name': 'Muse',
      'type': 'artist',
      'uri': 'spotify:artist:12Chz98pHFMPJEknJQMWvI'}],
    'available_markets': ['AR',
     'AU',
     'AT',
     'BE',
     'BO',
     'BR',
     'BG',
     'CA',
     'CL',
     'CO',
     'CR',
     'CY',
     'CZ',
     'DK',
     'DO',
     'DE',
     'EC',
     'EE',
     'SV',
     'FI',
     'FR',
     'GR',
     'GT',
     'HN',
     'HK',
     'HU',
     'IS',
     'IE',
     'IT',
     'LV',
     'LT',
     'LU',
     'MY',
     'MT',
     'MX',
     'NL',
     'NZ',
     'NI',
     'NO',
     'PA',
     'PY',
     'PE',
     'PH',
     'PL',
     'PT',
     'SG',
     'SK',
     'ES',
     'SE',
     'CH',
     'TW',
     'TR',
     'UY',
     'US',
     'GB',
     'AD

In [7]:
def get_track_names(ids, token=TOKEN):
    tracks = get_track(ids, token)
    return [track.get("name") for track in tracks.get("tracks") if track] if tracks.get("tracks") else None


def get_track_artists(ids, token=TOKEN):
    tracks = get_track(ids, token)
    return [
        ", ".join([artist.get("name") for artist in track.get("artists")])
        for track in tracks.get("tracks") if track
    ] if tracks.get("tracks") else None

get_track_names(ids), get_track_artists(ids)

(['Knights of Cydonia', 'Uprising'], ['Muse', 'Muse'])

Let's bake all of this into a single function.

In [8]:
def get_audio_features_df(ids, token=TOKEN, names=None, artists=None):
    features = get_audio_features(ids, token)
    names = get_track_names(ids, token) if names is None else names
    artists = get_track_artists(ids, token) if artists is None else artists

    if not features:
        return None
    
    for feature, name, artist in zip(features.copy(), names, artists):
        if not feature:
            features.remove(feature)
            continue    
        feature["name"] = name
        feature["artist"] = artist

    return pd.DataFrame(features).set_index(["name", "artist"])

df = get_audio_features_df(ids)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
name,artist,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Knights of Cydonia,Muse,0.366,0.963,11,-5.301,0,0.142,0.000273,0.0122,0.115,0.211,137.114,audio_features,7ouMYWpwJ422jRcDASZB7P,spotify:track:7ouMYWpwJ422jRcDASZB7P,https://api.spotify.com/v1/tracks/7ouMYWpwJ422...,https://api.spotify.com/v1/audio-analysis/7ouM...,366213,4
Uprising,Muse,0.602,0.905,2,-4.046,1,0.0775,0.000202,0.064,0.117,0.411,128.019,audio_features,4VqPOruhp5EdPBeR92t6lQ,spotify:track:4VqPOruhp5EdPBeR92t6lQ,https://api.spotify.com/v1/tracks/4VqPOruhp5Ed...,https://api.spotify.com/v1/audio-analysis/4VqP...,304840,4


In [9]:
df.index.map(str).map(lambda x: x.replace("', '", " - ").strip("(')"))

Index(['Knights of Cydonia - Muse', 'Uprising - Muse'], dtype='object')

Some of the features exist on different scales, but most have values between 0 and 1. Let's scale the features we want to keep for our analysis to be between 0 and 1, keep the columns we want, and remove the rest.

In [10]:
def scale(df, columns=["tempo"], min=0, max=220):
    df = df.copy()
    tmp_df = df[columns]
    df[columns] = (tmp_df - min) / (max - min)
    return df


def select(df, columns=["danceability", "energy", "valence", "tempo"]):
    return df[columns]


def squeeze_index(df):
    df = df.copy()
    # df.index = df.index.map(" - ".join).rename("track")
    df.index = df.index.map(str).map(lambda x: x.replace("', '", " - ").strip("(')")).rename("track")
    return df

preprocessed_df = scale(select(squeeze_index(df)))
preprocessed_df

Unnamed: 0_level_0,danceability,energy,valence,tempo
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Knights of Cydonia - Muse,0.366,0.963,0.211,0.623245
Uprising - Muse,0.602,0.905,0.411,0.581905


In [11]:
def preprocessing(
        ids: List | pd.Series | np.array, 
        columns=["danceability", "energy", "valence", "tempo"], 
        scale_columns=[], 
        names=None,
        artists=None,
        token=TOKEN
    ) -> pd.DataFrame:

    df = get_audio_features_df(ids, token, names, artists)
    df = select(df, columns)
    df = scale(df, scale_columns)
    df = squeeze_index(df)

    return df.drop_duplicates()

preprocessed_df = preprocessing(ids)
preprocessed_df

Unnamed: 0_level_0,danceability,energy,valence,tempo
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Knights of Cydonia - Muse,0.366,0.963,0.211,137.114
Uprising - Muse,0.602,0.905,0.411,128.019


## Basic feature plotting

Now that preprocessing is handled, let's do some basic plotting to get a feel for the feature data.

In [12]:
def bar(df):
    df = scale(df, ["tempo"], 220, 0)
    return df.plot.bar(barmode='group', title="Audio Features (bar)", template="plotly_dark")

bar(preprocessed_df)

In [13]:
def scatter(df, labels={}):
    return px.scatter(
        df.reset_index(), 
        x="valence", 
        range_x=[0, 1.1],

        y="energy", 
        range_y=[0, 1.1],
        
        color="tempo",
        # range_color=[0, 1.1],

        size="danceability",
        
        hover_name="track", 
        hover_data=[*labels.keys()],
        template="plotly_dark",

        title="Audio Features (scatter)",
        labels={label:label.upper() for label in df.columns} if not labels else labels
    )

scatter(preprocessed_df)

# Load my listening history
Great, now we can look at my particular listening history, and analyze the basic plots for some random samples. 

In [13]:
history = pd.read_csv("data/history.csv").tail(10000)
history

Unnamed: 0,played_at,id,artist,name
43212,2023-03-16T13:37:07.009Z,74bVVMDluzzrbB6YHT7rX7,Kevin de Vries,Dance With Me - Kölsch Remix
43213,2023-03-16T13:40:31.007Z,3bYEYuHHiX1TmdLUiWykIc,Brutalismus 3000,DIE LIEBE KOMMT NICHT AUS BERLIN
43214,2023-03-16T13:46:54.040Z,7i08AhQcrdD4GLlr2Pmamg,Kevin de Vries,Dance With Me
43215,2023-03-16T13:51:53.066Z,1UopEDdYwbavBAvlY1gA6b,Chris Avantgarde,Mind Control
43216,2023-03-16T13:57:43.033Z,1BUK2OhjoaxPi6n7CPq25q,Private Agenda,P.S.R.
...,...,...,...,...
53207,2023-12-21T13:10:26.358Z,465nAxOI5pErqke80RoQTF,Norberg,Feliz navidad
53208,2023-12-21T13:13:05.237Z,29FQ6xgwKNDioJpAem6ht3,Red Rudolph Band,Holly Jolly Christmas
53209,2023-12-21T13:14:59.389Z,3DQFbtGXEnr4uWIh04ljQr,Greydon Park Band,Let It Snow
53210,2023-12-21T13:20:14.605Z,4z3fcWQdyZEs6lULJ87TI0,James Jackson Jazz Trio,Frosty The Snowman


In [14]:
unique_ids = history["id"].unique()
unique_ids.shape

(4769,)

In [15]:
unique_artists = history["artist"].unique()
unique_artists.shape

(2722,)

In [16]:
unique_tracks = history["name"].unique()
unique_tracks.shape

(4560,)

In [17]:
random_sample = history.sample(100, random_state=42)
random_sample

Unnamed: 0,played_at,id,artist,name
49464,2023-09-04T09:33:27.698Z,1JTQG8J8jN8tjDTyUWnaRe,Warren Zeiders,Ride the Lightning - 717 Tapes
47896,2023-07-27T08:00:17.877Z,0pH2mn7degF1qlAYpLubbi,Sebastian Gahler,Naoko
44943,2023-05-10T15:02:04.379Z,1J48wURibctLxcMvjOQYPK,Albert King,Sweet Fingers
47954,2023-07-27T13:24:06.286Z,6iJUSrA6XoX4FNLGPqzHOv,Headie One,Martin's Sofa
47733,2023-07-24T18:58:41.261Z,1AdYZ6X00nXmO613Y7GJOl,blink-182,I Miss You
...,...,...,...,...
46999,2023-07-13T13:05:26.370Z,3duP6mBVDsHM8ioN7o7dAK,Little Big Town,Boondocks
52401,2023-12-06T15:27:08.961Z,3YgtkOxZsTuaZdL8McA1FQ,Fred again..,adore u
51037,2023-10-24T20:58:40.030Z,5hmcw6bW5V24CvJqsz87We,Diplo,On My Mind
50751,2023-10-16T07:29:59.422Z,7DjHiDck3fqsiBBnRX5FLb,Lavern,Hold Me - Sped Up


In [38]:
preprocessed_sample = preprocessing(random_sample["id"], names=random_sample["name"], artists=random_sample["artist"])

In [39]:
preprocessed_sample

Unnamed: 0_level_0,danceability,energy,valence,tempo
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ride the Lightning - 717 Tapes - Warren Zeiders,0.576,0.474,0.4000,77.834
Naoko - Sebastian Gahler,0.487,0.261,0.1620,118.670
Sweet Fingers - Albert King,0.774,0.742,0.9080,114.306
"""Martin's Sofa"", 'Headie One",0.737,0.612,0.5950,143.066
I Miss You - blink-182,0.423,0.714,0.5930,110.017
...,...,...,...,...
Boondocks - Little Big Town,0.410,0.780,0.6950,88.016
adore u - Fred again..,0.747,0.867,0.6840,124.015
On My Mind - Diplo,0.771,0.731,0.6290,123.026
Hold Me - Sped Up - Lavern,0.649,0.970,0.0667,142.309


In [20]:
bar(preprocessed_sample.head(25))

In [37]:
scatter(preprocessed_sample)

It looks like a random sample of my listening history is pretty dispersed. 

A lot of the low-energy, low-valence songs also have low danceability (smaller bubbles), and a lot of the high-energy, high-valence songs have high danceability (bigger bubbles).

I would have expected to see a stronger correlation between tempo and these other features, but it looks like there is a lot of variation here, represented by a high spread of coloring in the plot. 

## Time specific listening plots

Now, instead of looking only at a random sample of my listening history, let's look at the listening history for a specific time period.
To do this, we can build an `audio_features` dataframe for the entire listening history. Note, we've only read in the more recent 10k tracks, in order to limit the scope of this analysis. Another important thing to note is that the Spotify API only allows us to get the audio features for 100 tracks at a time, so we will need to make multiple requests to get all of the audio features for our listening history.

In [40]:
s_ids = history["id"].unique()
s_ids = np.array_split(s_ids, len(s_ids) // 100 + 1)
len(s_ids), len(s_ids[0])

(48, 100)

### Get audio features for all tracks in listening history

In [41]:
def preprocess_full_history(history):
    # only preprocess 100 unique ids at a time
    unique_tracks = history[["id", "name", "artist"]].drop_duplicates().copy()
    unique_tracks_split = np.array_split(unique_tracks, len(unique_tracks) // 100 + 1)

    dfs = [
        preprocessing(ids=tracks["id"], names=tracks["name"], artists=tracks["artist"])
        for tracks in unique_tracks_split
    ]
    
    return pd.concat(dfs)

# actual path: history_features_f23-03-16_t23-12-21.csv
assert os.path.exists("data/history_features.csv"), "only rerun if history_features is missing"

preprocessed_history = preprocess_full_history(history) 
preprocessed_history


'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.



Unnamed: 0_level_0,danceability,energy,valence,tempo
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dance With Me - Kölsch Remix - Kevin de Vries,0.705,0.7090,0.0390,128.005
DIE LIEBE KOMMT NICHT AUS BERLIN - Brutalismus 3000,0.552,0.8820,0.4840,165.009
Dance With Me - Kevin de Vries,0.792,0.7040,0.4720,124.003
Mind Control - Chris Avantgarde,0.730,0.8380,0.1740,127.005
P.S.R. - Private Agenda,0.107,0.0261,0.0382,67.007
...,...,...,...,...
When We Kissed Under The Mistletoe - Gus Anderson,0.709,0.2690,0.4070,123.966
Winter Serenade - Andy Goodman,0.539,0.1660,0.2990,101.393
Christmas Blues - Ramsey Lewis Trio,0.644,0.3950,0.5920,139.373
Come Thou Fount of Every Blessing - Upstage Trio,0.516,0.1140,0.5410,89.728


In [37]:
scatter(preprocessed_history.head(1000))

NameError: name 'preprocessed_history' is not defined

We can rejoin this into the `history` dataframe, and then plot the features for a specific time period.

In [43]:
preprocessed_history.head()

Unnamed: 0_level_0,danceability,energy,valence,tempo
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dance With Me - Kölsch Remix - Kevin de Vries,0.705,0.709,0.039,128.005
DIE LIEBE KOMMT NICHT AUS BERLIN - Brutalismus 3000,0.552,0.882,0.484,165.009
Dance With Me - Kevin de Vries,0.792,0.704,0.472,124.003
Mind Control - Chris Avantgarde,0.73,0.838,0.174,127.005
P.S.R. - Private Agenda,0.107,0.0261,0.0382,67.007


In [44]:
history["track"] = history["name"] + " - " + history["artist"]
history.head()

Unnamed: 0,played_at,id,artist,name,track
43212,2023-03-16T13:37:07.009Z,74bVVMDluzzrbB6YHT7rX7,Kevin de Vries,Dance With Me - Kölsch Remix,Dance With Me - Kölsch Remix - Kevin de Vries
43213,2023-03-16T13:40:31.007Z,3bYEYuHHiX1TmdLUiWykIc,Brutalismus 3000,DIE LIEBE KOMMT NICHT AUS BERLIN,DIE LIEBE KOMMT NICHT AUS BERLIN - Brutalismus...
43214,2023-03-16T13:46:54.040Z,7i08AhQcrdD4GLlr2Pmamg,Kevin de Vries,Dance With Me,Dance With Me - Kevin de Vries
43215,2023-03-16T13:51:53.066Z,1UopEDdYwbavBAvlY1gA6b,Chris Avantgarde,Mind Control,Mind Control - Chris Avantgarde
43216,2023-03-16T13:57:43.033Z,1BUK2OhjoaxPi6n7CPq25q,Private Agenda,P.S.R.,P.S.R. - Private Agenda


In [45]:
history_features = history.merge(preprocessed_history, on="track")
history_features

Unnamed: 0,played_at,id,artist,name,track,danceability,energy,valence,tempo
0,2023-03-16T13:37:07.009Z,74bVVMDluzzrbB6YHT7rX7,Kevin de Vries,Dance With Me - Kölsch Remix,Dance With Me - Kölsch Remix - Kevin de Vries,0.705,0.7090,0.039,128.005
1,2023-03-16T13:40:31.007Z,3bYEYuHHiX1TmdLUiWykIc,Brutalismus 3000,DIE LIEBE KOMMT NICHT AUS BERLIN,DIE LIEBE KOMMT NICHT AUS BERLIN - Brutalismus...,0.552,0.8820,0.484,165.009
2,2023-03-16T13:46:54.040Z,7i08AhQcrdD4GLlr2Pmamg,Kevin de Vries,Dance With Me,Dance With Me - Kevin de Vries,0.792,0.7040,0.472,124.003
3,2023-05-09T11:51:41.802Z,7i08AhQcrdD4GLlr2Pmamg,Kevin de Vries,Dance With Me,Dance With Me - Kevin de Vries,0.792,0.7040,0.472,124.003
4,2023-05-16T17:43:04.343Z,7i08AhQcrdD4GLlr2Pmamg,Kevin de Vries,Dance With Me,Dance With Me - Kevin de Vries,0.792,0.7040,0.472,124.003
...,...,...,...,...,...,...,...,...,...
9811,2023-12-21T12:46:32.235Z,5vt00tH4PyhiAuV4kGIUAK,Gus Anderson,When We Kissed Under The Mistletoe,When We Kissed Under The Mistletoe - Gus Anderson,0.709,0.2690,0.407,123.966
9812,2023-12-21T12:49:31.497Z,0RjianjAK0XaKKeKzFKDIJ,Andy Goodman,Winter Serenade,Winter Serenade - Andy Goodman,0.539,0.1660,0.299,101.393
9813,2023-12-21T12:52:10.337Z,0KyArXxWFOxjKQOxl0P97i,Ramsey Lewis Trio,Christmas Blues,Christmas Blues - Ramsey Lewis Trio,0.644,0.3950,0.592,139.373
9814,2023-12-21T12:57:26.130Z,55tU7r4O96CUsaWZuMDaBo,Upstage Trio,Come Thou Fount of Every Blessing,Come Thou Fount of Every Blessing - Upstage Trio,0.516,0.1140,0.541,89.728


# Save & Load

Checkpoint to avoid re-running API calls.

In [46]:
history_features.to_csv("data/history_features_f23-03-16_t23-12-21.csv", index=False)

In [49]:
history_features = pd.read_csv("data/history_features_f23-03-16_t23-12-21.csv")

# Time specific listening plots

## Build columns for easy Dash filtering


#### Time features

In [50]:
# make sure time zone is Europe Oslo
history_features["timestamp"] = pd.to_datetime(history_features["played_at"], format='mixed').dt.tz_convert("Europe/Oslo")
history_features = history_features.sort_values("timestamp")

# hour and month are good ways to partition listening patterns 
history_features["hour"] = history_features["timestamp"].dt.hour
history_features["month"] = history_features["timestamp"].dt.month

#### Play count

In [59]:
# add play count
play_count = history_features.groupby(["id"]).size().rename("play_count")
history_features.drop(columns=["play_count"], inplace=True) if "play_count" in history_features.columns else None
history_features = history_features.merge(play_count, on="id", how="left")
history_features = history_features.sort_values("timestamp").drop_duplicates(subset=["timestamp"], keep="last")  # clean up after join
history_features

Unnamed: 0,played_at,id,artist,name,track,danceability,energy,valence,tempo,timestamp,hour,month,session_id,play_count
0,2023-03-16T13:37:07.009Z,74bVVMDluzzrbB6YHT7rX7,Kevin de Vries,Dance With Me - Kölsch Remix,Dance With Me - Kölsch Remix - Kevin de Vries,0.705,0.7090,0.0390,128.005,2023-03-16 14:37:07.009000+01:00,14,3,0,1
1,2023-03-16T13:40:31.007Z,3bYEYuHHiX1TmdLUiWykIc,Brutalismus 3000,DIE LIEBE KOMMT NICHT AUS BERLIN,DIE LIEBE KOMMT NICHT AUS BERLIN - Brutalismus...,0.552,0.8820,0.4840,165.009,2023-03-16 14:40:31.007000+01:00,14,3,0,1
2,2023-03-16T13:46:54.040Z,7i08AhQcrdD4GLlr2Pmamg,Kevin de Vries,Dance With Me,Dance With Me - Kevin de Vries,0.792,0.7040,0.4720,124.003,2023-03-16 14:46:54.040000+01:00,14,3,0,3
3,2023-03-16T13:51:53.066Z,1UopEDdYwbavBAvlY1gA6b,Chris Avantgarde,Mind Control,Mind Control - Chris Avantgarde,0.730,0.8380,0.1740,127.005,2023-03-16 14:51:53.066000+01:00,14,3,0,2
4,2023-03-16T13:57:43.033Z,1BUK2OhjoaxPi6n7CPq25q,Private Agenda,P.S.R.,P.S.R. - Private Agenda,0.107,0.0261,0.0382,67.007,2023-03-16 14:57:43.033000+01:00,14,3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9461,2023-12-21T13:10:26.358Z,465nAxOI5pErqke80RoQTF,Norberg,Feliz navidad,Feliz navidad - Norberg,0.889,0.1980,0.7480,123.788,2023-12-21 14:10:26.358000+01:00,14,12,717,4
9462,2023-12-21T13:13:05.237Z,29FQ6xgwKNDioJpAem6ht3,Red Rudolph Band,Holly Jolly Christmas,Holly Jolly Christmas - Red Rudolph Band,0.647,0.2710,0.3460,148.227,2023-12-21 14:13:05.237000+01:00,14,12,717,4
9463,2023-12-21T13:14:59.389Z,3DQFbtGXEnr4uWIh04ljQr,Greydon Park Band,Let It Snow,Let It Snow - Greydon Park Band,0.687,0.2580,0.5680,134.261,2023-12-21 14:14:59.389000+01:00,14,12,717,4
9464,2023-12-21T13:20:14.605Z,4z3fcWQdyZEs6lULJ87TI0,James Jackson Jazz Trio,Frosty The Snowman,Frosty The Snowman - James Jackson Jazz Trio,0.566,0.2540,0.4580,132.701,2023-12-21 14:20:14.605000+01:00,14,12,717,4


#### Session

In [60]:
# assign "listening session id", i.e. tracks played within 30 minutes of each other are considered part of the same session
history_features["session_id"] = history_features["timestamp"].diff().dt.total_seconds().gt(1800).cumsum()
history_features.head(50)

Unnamed: 0,played_at,id,artist,name,track,danceability,energy,valence,tempo,timestamp,hour,month,session_id,play_count
0,2023-03-16T13:37:07.009Z,74bVVMDluzzrbB6YHT7rX7,Kevin de Vries,Dance With Me - Kölsch Remix,Dance With Me - Kölsch Remix - Kevin de Vries,0.705,0.709,0.039,128.005,2023-03-16 14:37:07.009000+01:00,14,3,0,1
1,2023-03-16T13:40:31.007Z,3bYEYuHHiX1TmdLUiWykIc,Brutalismus 3000,DIE LIEBE KOMMT NICHT AUS BERLIN,DIE LIEBE KOMMT NICHT AUS BERLIN - Brutalismus...,0.552,0.882,0.484,165.009,2023-03-16 14:40:31.007000+01:00,14,3,0,1
2,2023-03-16T13:46:54.040Z,7i08AhQcrdD4GLlr2Pmamg,Kevin de Vries,Dance With Me,Dance With Me - Kevin de Vries,0.792,0.704,0.472,124.003,2023-03-16 14:46:54.040000+01:00,14,3,0,3
3,2023-03-16T13:51:53.066Z,1UopEDdYwbavBAvlY1gA6b,Chris Avantgarde,Mind Control,Mind Control - Chris Avantgarde,0.73,0.838,0.174,127.005,2023-03-16 14:51:53.066000+01:00,14,3,0,2
4,2023-03-16T13:57:43.033Z,1BUK2OhjoaxPi6n7CPq25q,Private Agenda,P.S.R.,P.S.R. - Private Agenda,0.107,0.0261,0.0382,67.007,2023-03-16 14:57:43.033000+01:00,14,3,0,2
5,2023-03-16T14:00:45.964Z,5z4mF8QSQRsABywXxYrLUB,Patricia Wolf,Upward Swimming Fish,Upward Swimming Fish - Patricia Wolf,0.247,0.286,0.0352,72.105,2023-03-16 15:00:45.964000+01:00,15,3,0,1
6,2023-03-16T14:05:36.231Z,2OGcwhOAldo9aiDwBnXMfc,Gilligan Moss,Special Thing,Special Thing - Gilligan Moss,0.655,0.904,0.632,125.028,2023-03-16 15:05:36.231000+01:00,15,3,0,1
7,2023-03-16T14:09:16.857Z,75rGONmoi48LLYBFaGiYsv,Jayda G,Both Of Us - Edit,Both Of Us - Edit - Jayda G,0.735,0.71,0.507,123.962,2023-03-16 15:09:16.857000+01:00,15,3,0,6
8,2023-03-16T14:12:23.174Z,5aH41smzTehoCIqj7VmAT3,Kill Them With Colour,Wave,Wave - Kill Them With Colour,0.925,0.756,0.598,123.526,2023-03-16 15:12:23.174000+01:00,15,3,0,2
9,2023-03-16T14:16:06.137Z,2wi0QZ8SagYzfSeAee6Rm4,Elkka,I Just Want To Love You,I Just Want To Love You - Elkka,0.596,0.781,0.306,134.0,2023-03-16 15:16:06.137000+01:00,15,3,0,2


Double check that a session can last longer than 30 minutes. 

In [61]:
longest_session_id = history_features["session_id"].value_counts().sort_values(ascending=False).head(1).index.item()
longest_session_id

491

In [62]:
history_features[history_features["session_id"] == longest_session_id]

Unnamed: 0,played_at,id,artist,name,track,danceability,energy,valence,tempo,timestamp,hour,month,session_id,play_count
6115,2023-09-14T12:45:52.221Z,0hSdx0sbSmYRVJzZTuspu0,Malaa,Deep (with DJ Snake & Yung Felix),Deep (with DJ Snake & Yung Felix) - Malaa,0.954,0.791,0.493,128.005,2023-09-14 14:45:52.221000+02:00,14,9,491,56
6116,2023-09-14T12:52:22.591Z,4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,As It Was - Harry Styles,0.520,0.731,0.662,173.930,2023-09-14 14:52:22.591000+02:00,14,9,491,16
6117,2023-09-14T12:55:45.220Z,3DKCTIiJ97bS9TGiqcABjo,Dave,Thiago Silva,Thiago Silva - Dave,0.667,0.673,0.669,144.256,2023-09-14 14:55:45.220000+02:00,14,9,491,6
6118,2023-09-14T12:58:32.108Z,6Ku2dAf2mm5ne7tT7QUbdF,Alok,Deep Down (feat. Never Dull) - Nathan Dawe Remix,Deep Down (feat. Never Dull) - Nathan Dawe Rem...,0.675,0.758,0.535,125.943,2023-09-14 14:58:32.108000+02:00,14,9,491,4
6119,2023-09-14T13:02:09.119Z,5yOx2ezoz2hBfoE9mItGVo,Skrillex,Hydrate,Hydrate - Skrillex,0.683,0.786,0.282,75.000,2023-09-14 15:02:09.119000+02:00,15,9,491,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6225,2023-09-14T20:55:12.338Z,3JvKfv6T31zO0ini8iNItO,Tom Odell,Another Love,Another Love - Tom Odell,0.445,0.537,0.131,122.769,2023-09-14 22:55:12.338000+02:00,22,9,491,3
6226,2023-09-14T20:59:07.012Z,2PLi7OmleXPNBrGLon3sUy,The Temper Trap,Sweet Disposition,Sweet Disposition - The Temper Trap,0.542,0.813,0.305,129.090,2023-09-14 22:59:07.012000+02:00,22,9,491,3
6227,2023-09-14T21:02:04.615Z,7oAD6PCPiKu02kLqagi9Up,Tom Santa,Bad For Me,Bad For Me - Tom Santa,0.668,0.957,0.666,137.919,2023-09-14 23:02:04.615000+02:00,23,9,491,2
6228,2023-09-14T21:05:52.994Z,6tifCCTIVBLC2TmTquYG7G,Skrillex,Fine Day Anthem,Fine Day Anthem - Skrillex,0.745,0.850,0.540,138.030,2023-09-14 23:05:52.994000+02:00,23,9,491,27


## Interactive plots

Using Dash, we can build an interactive plot that allows us to filter the listening history by time and date.

In [63]:
history_features.head()

Unnamed: 0,played_at,id,artist,name,track,danceability,energy,valence,tempo,timestamp,hour,month,session_id,play_count
0,2023-03-16T13:37:07.009Z,74bVVMDluzzrbB6YHT7rX7,Kevin de Vries,Dance With Me - Kölsch Remix,Dance With Me - Kölsch Remix - Kevin de Vries,0.705,0.709,0.039,128.005,2023-03-16 14:37:07.009000+01:00,14,3,0,1
1,2023-03-16T13:40:31.007Z,3bYEYuHHiX1TmdLUiWykIc,Brutalismus 3000,DIE LIEBE KOMMT NICHT AUS BERLIN,DIE LIEBE KOMMT NICHT AUS BERLIN - Brutalismus...,0.552,0.882,0.484,165.009,2023-03-16 14:40:31.007000+01:00,14,3,0,1
2,2023-03-16T13:46:54.040Z,7i08AhQcrdD4GLlr2Pmamg,Kevin de Vries,Dance With Me,Dance With Me - Kevin de Vries,0.792,0.704,0.472,124.003,2023-03-16 14:46:54.040000+01:00,14,3,0,3
3,2023-03-16T13:51:53.066Z,1UopEDdYwbavBAvlY1gA6b,Chris Avantgarde,Mind Control,Mind Control - Chris Avantgarde,0.73,0.838,0.174,127.005,2023-03-16 14:51:53.066000+01:00,14,3,0,2
4,2023-03-16T13:57:43.033Z,1BUK2OhjoaxPi6n7CPq25q,Private Agenda,P.S.R.,P.S.R. - Private Agenda,0.107,0.0261,0.0382,67.007,2023-03-16 14:57:43.033000+01:00,14,3,0,2


In [73]:
def interative_scatter(df):
    app = Dash(__name__)

    style = {
        "font-family": "sans-serif",
        "font-size": "16px",
        "color": "white",
    }

    # set up sliders for month and hour
    app.layout = html.Div([
        dcc.Graph(id='graph-with-slider'),

        # month
        dcc.Markdown("Month", style=style),
        dcc.RangeSlider(
            id='month-slider',
            marks=None,
            # marks={str(month): str(month) for month in df['month'].unique()},
            max=df['month'].max(),
            min=df['month'].min(),
            step=1,
            tooltip={"placement": "bottom", "always_visible": True},
            value=[7, 10],
        ),

        # hour range slider
        dcc.Markdown("Hour", style=style),
        dcc.RangeSlider(
            id='hour-slider', 
            marks=None,
            # marks={str(hour): str(hour) for hour in df['hour'].unique()},
            max=df['hour'].max(),
            min=df['hour'].min(),
            step=1,
            tooltip={"placement": "bottom", "always_visible": True},
            value=[9, 16],
        ),

        # play count slider
        dcc.Markdown("Play count", style=style),
        dcc.RangeSlider(
            id='play-count-slider', 
            marks=None, 
            # marks={str(play_count): str(play_count) for play_count in df['play_count'].unique()},
            max=df['play_count'].max(),
            min=1,
            step=1,
            tooltip={"placement": "bottom", "always_visible": True},
            value=[3, df['play_count'].max()],
        ),
    ])

    @app.callback(
        Output('graph-with-slider', 'figure'),
        Input('month-slider', 'value'),
        Input('hour-slider', 'value'),
        Input('play-count-slider', 'value')
    )
    def update_figure(
        selected_month, 
        selected_hour,
        selected_play_count
        ):
        filtered_df = df.copy()
        filtered_df = filtered_df[filtered_df.month.between(selected_month[0], selected_month[1])]
        filtered_df = filtered_df[filtered_df.hour.between(selected_hour[0], selected_hour[1])]

        # play count should be recalculated based on filtered data
        play_count = filtered_df.groupby(["id"]).size().rename("play_count")
        filtered_df.drop(columns=["play_count"], inplace=True) if "play_count" in filtered_df.columns else None
        filtered_df = filtered_df.merge(play_count, on="id")
        filtered_df = filtered_df[filtered_df.play_count.between(selected_play_count[0], selected_play_count[1])]

        # clean labels 
        labels={
            "valence": "Positivity",
            "energy": "Energy",
            "tempo": "Tempo",
            "danceability": "Danceability",
            "track": "Track",
            "play_count": "Play count",
        }

        return scatter(filtered_df, labels).update_layout(
            transition_duration=500, 
            title="Audio Features (scatter) - {months} btwn hours {hours} - Play counts {play_counts}".format(
                months=str([
                    date(2023, selected_month[0], 1).strftime('%B'),
                    date(2023, selected_month[1], 1).strftime('%B')
                ]).replace("'", ""),
                hours=str(selected_hour),
                play_counts=str(selected_play_count)
            ),
        )

    app.run_server(debug=True)

    return app

# interative_scatter(history_features)

[1;31m---------------------------------------------------------------------------[0m
[1;31mValueError[0m                                Traceback (most recent call last)
File [1;32mnattype.pyx:58[0m, in [0;36mpandas._libs.tslibs.nattype._make_error_func.f[1;34m()[0m

[1;31mValueError[0m: NaTType does not support strftime



In [75]:
def interative_scatter_per_session(df, max_sessions=25):
    app = Dash(__name__)

    style = {
        "font-family": "sans-serif",
        "font-size": "16px",
        "font-weight": "bold",
        "color": "white",
    }

    # instead of month and hour, we use session id dropdown
    app.layout = html.Div([
        dcc.Graph(id='graph-with-slider'),

        dcc.Markdown("Date range and session id", style=style),
        html.Div([
            # date picker range
            dcc.DatePickerRange(
                id='date-picker-range',
                min_date_allowed=df['timestamp'].min().date(),
                max_date_allowed=df['timestamp'].max().date(),
                initial_visible_month=df['timestamp'].max().date(),
                start_date=df['timestamp'].min().date(),
                end_date=df['timestamp'].max().date(),
                style=dict(**style, **{"width": "40%", "float": "left"}),
            ),

            # session id dropdown
            dcc.Dropdown(
                id='session-id-dropdown',
                options=[{'label': str(session_id), 'value': session_id} for session_id in df['session_id'].unique()],
                value=[df['session_id'].unique()[0]],
                multi=True,
                placeholder="Select session id from current date range",
                style={"width": "60%", "float": "left"}
            ),

        ], style={"display":"inline-block", "backgroundColor": "#333333", "padding": "10px", "width": "90%"}),           

        # play count slider
        dcc.Markdown("Play count", style=style),
        dcc.RangeSlider(
            id='play-count-range-slider', 
            marks=None, 
            # marks={str(play_count): str(play_count) for play_count in df['play_count'].unique()},
            max=df['play_count'].max(),
            min=1,
            step=1,
            tooltip={"placement": "bottom", "always_visible": True},
            value=[0, df['play_count'].max()],
        ),

    ])

    @app.callback(
        Output('session-id-dropdown', 'value'),
        Output('session-id-dropdown', 'options'),
        Input('date-picker-range', 'start_date'),
        Input('date-picker-range', 'end_date'),
    )
    def update_session_id_dropdown(start_date, end_date):
        filtered_df = df.copy()
        filtered_df = filtered_df[filtered_df.timestamp.between(
            start_date,
            end_date
        )]

        return (
            filtered_df['session_id'].unique()[:max_sessions],  # values default to first 'max_sessions' session ids
            [{'label': str(session_id), 'value': session_id} for session_id in filtered_df['session_id'].unique()] # options for all session ids within date range
        )
    

    @app.callback(
        Output('play-count-range-slider', 'max'),
        Output('play-count-range-slider', 'value'),
        Input('date-picker-range', 'start_date'),
        Input('date-picker-range', 'end_date'),
    )
    def update_play_count_range(start_date, end_date):
        filtered_df = df.copy()
        filtered_df = filtered_df[filtered_df.timestamp.between(
            start_date,
            end_date
        )]

        # play count should be recalculated based on filtered data
        play_count = filtered_df.groupby(["id"]).size().rename("play_count")
        max_play_count = play_count.max() if play_count.any() else 1

        return (
            max_play_count,
            [0, max_play_count],
        )
    

    @app.callback(
        Output('graph-with-slider', 'figure'),
        Input('session-id-dropdown', 'value'),
        Input('play-count-range-slider', 'value'),
        Input('date-picker-range', 'start_date'),
        Input('date-picker-range', 'end_date'),
    )
    def update_figure(
        session_ids_dropdown,
        play_count_range,
        start_date, 
        end_date
        ):
        filtered_df = df.copy()

        # filter by date range
        filtered_df = filtered_df[filtered_df.timestamp.between(
            start_date, 
            end_date
        )]

        # default to first session id if none selected
        session_ids_dropdown = session_ids_dropdown if session_ids_dropdown else filtered_df['session_id'].unique()[:1]
        filtered_df = filtered_df[filtered_df.session_id.isin(session_ids_dropdown)]

        # play count should be recalculated based on filtered data
        play_count = filtered_df.groupby(["id"]).size().rename("play_count")
        filtered_df.drop(columns=["play_count"], inplace=True) if "play_count" in filtered_df.columns else None
        filtered_df = filtered_df.merge(play_count, on="id")
        filtered_df = filtered_df[filtered_df.play_count.between(play_count_range[0], play_count_range[1])]

        # clean labels 
        labels={
            "valence": "Positivity",
            "energy": "Energy",
            "tempo": "Tempo",
            "danceability": "Danceability",
            "track": "Track",
            "play_count": "Play count",
            "session_id": "Session id",
        }

        return scatter(filtered_df, labels).update_layout(
            transition_duration=500, 
            title="Audio Features (scatter) <br>" \
                "<sup>Session IDs {session_ids} - Play counts {play_counts} - Dates {dates}</sup>".format(
                session_ids=str([session_ids_dropdown[0], session_ids_dropdown[-1]]).replace("'", ""),
                play_counts=str(play_count_range),
                dates=str([
                    filtered_df[filtered_df["session_id"].isin(session_ids_dropdown)]["timestamp"].min().date().strftime('%d %B %Y'),
                    filtered_df[filtered_df["session_id"].isin(session_ids_dropdown)]["timestamp"].max().date().strftime('%d %B %Y'),
                ]).replace("'", ""),
            ),
        )
     
    app.run_server(debug=True)

    return app

_=interative_scatter_per_session(history_features, max_sessions=25)