# Part 1: Analyzing My Spotify Streaming History

Spotify's "Spotify.me" feature - access @ https://spotify.me/en - provides a snapshot of your Spotify listening history. Under GDPR, Spotify allows the export of all of your streaming history (saved for as long as you've been a Spotify user). I downloaded my streaming history - and proceeded to run an analysis on when I listen to music, what I listen to, and how it fits in with the rest of my life.

Public Code: https://github.com/shomilj/Explore-Spotify

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from loader import SpotifyAPI, HealthAPI
from dateutil.parser import parse
from pytz import timezone
from datetime import timedelta
import pytz
from datetime import datetime
from collections import defaultdict
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd, numpy as np
from tqdm import tqdm_notebook as tqdm
import plotly
from sklearn import preprocessing
plotly.offline.init_notebook_mode(connected=True)

In [3]:
ROOT = '/Users/shomil/Documents/Datasets/personal'

In [4]:
spotify = SpotifyAPI(ROOT)

In [5]:
spotify.help()


        Available Features:
        • load_searches (199 records)
        • load_streaming (57328 records)
        • load_tracks (1156 records)
        


In [6]:
def range_axis(start_date, end_date):
    X = []
    delta = timedelta(days=1)
    while start_date <= end_date:
        ts = start_date.strftime('%Y-%m-%d')
        X.append(ts)
        start_date += delta
    return sorted(X)

def range_axis_months(start_date, end_date):
    r = range_axis(start_date, end_date)
    r = np.unique([x[:7] for x in r]) # remove the --d part
    return r

def date_bucket(dt):
    return dt.strftime("%Y-%m-%d")

def day_axis():
    X = []
    start_date = datetime.now()
    end_date = start_date + timedelta(days=1)
    delta = timedelta(minutes=1)
    while start_date <= end_date:
        ts = start_date.strftime('%H:%M')
        X.append(ts)
        start_date += delta
    return sorted(X)

def time_bucket(dt):
    return dt.strftime("%H:%M")

def plot(X, y, title, xaxis='', yaxis=''):
    fig = go.Figure(data=[go.Scatter(x=X, y=y, line_shape='linear')])
    fig.update_layout(
        title=title,
        yaxis_title=yaxis,
        xaxis_title=xaxis,
        font=dict(size=12)
    )
    fig.show()

### Extracting Time-Relevant Information

In [15]:
actions = []
days = set()

for search in spotify.load_searches():
    dt = parse(search.get('searchTime'), fuzzy=True, ignoretz=True)
    dt = pytz.utc.localize(dt)
    dt = dt.astimezone(timezone('US/Pacific'))
    if dt in days:
        continue
    days.add(dt)
    actions.append((dt, 'search', search))
    
    
for track in spotify.load_streaming():
    dt = parse(track.get('endTime'), fuzzy=True, ignoretz=True)
    dt = pytz.utc.localize(dt)
    dt = dt.astimezone(timezone('US/Pacific'))
    if dt in days:
        continue
    days.add(dt)
    actions.append((dt, 'stream', track))

In [16]:
actions = list(sorted(actions, key=lambda a : a[0]))

In [17]:
# # Filter to Time in the USA for Testing Purposes (we don't have DateTime accurate yet)
# actions = list(filter(lambda a : a[0].year == 2019 and a[0].month < 12 and a[0].month > 8, actions))

### Analyze Historical Usage
How has my Spotify streaming frequency changed over time?

In [18]:
data = defaultdict(int)

for action in actions:
    dt = action[0]
    bucket = date_bucket(dt)
    data[bucket] += 1
    
X = range_axis(actions[0][0], actions[-1][0])
y = [data[bucket] for bucket in X]

plot(X, y, title='Spotify Streaming over All Time', xaxis='Time', yaxis='Count')

### Analyze Daily Usage
When, during the day, do I listen to Spotify?

In [19]:
data = defaultdict(int)

for action in actions:
    data[time_bucket(action[0])] += 1
    
X = day_axis()
y = [data[bucket] for bucket in X]
plot(X, y, title='Spotify Streaming over Day', xaxis='Time', yaxis='Count')

### Most Popular Tracks & Artists
What do I listen to the most?

In [20]:
df = pd.DataFrame.from_dict(spotify.load_streaming())

In [21]:
favorite_tracks = df.groupby('trackName').sum().sort_values('msPlayed', ascending=False)
favorite_tracks.head(15)

Unnamed: 0_level_0,msPlayed
trackName,Unnamed: 1_level_1
Another Place,32173559
Outnumbered,29362714
Phases,27054957
Mean It,26576389
You & Me,26574604
Breathe,25794466
Soldier,24237992
Better,23220595
Feelings,22881410
Dreamer,22672846


In [22]:
favorite_artists = df.groupby('artistName').sum().sort_values('msPlayed', ascending=False)
favorite_artists.head(15)

Unnamed: 0_level_0,msPlayed
artistName,Unnamed: 1_level_1
Lauv,204296202
Bastille,150401342
OneRepublic,149374250
ILLENIUM,86868143
Kygo,83514507
Avicii,81348055
Coldplay,78151513
Why Don't We,75756793
James TW,73590651
John Williams,73389203


## Comparision to All Music

Let's take a look at these on a plot. It appears that the difference between songs that I really enjoy and those that fit into the "general" category is striking; there's a sharp curve for both of these graphs.

In [23]:
plot(favorite_artists.index,favorite_artists['msPlayed'], title="My Favorite Artists", xaxis='Artist Name', yaxis='ms played')

In [24]:
plot(favorite_tracks.index,favorite_tracks['msPlayed'], title="My Favorite Tracks", yaxis='ms played')

## Top Tracks Over Time

In [25]:
top = favorite_tracks.head(20).index.to_list()
top = df[df.trackName.isin(top)]
top = top.assign(endTime=lambda df: df['endTime'].apply(lambda x : x[:10]))
top.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
108,2019-03-21,James TW,You & Me,231653
257,2019-03-24,Khalid,Better,229320
357,2019-03-26,James TW,You & Me,231653
395,2019-03-27,Lewis Capaldi,Someone You Loved,182160
646,2019-04-05,James TW,Soldier,224720


In [26]:
def get_data(tracker):
    data_cumulative = []
    data_monthly = []
    for artist, cum_dict in tracker.items():
        y_cum = []
        y_daily = []
        cum_ms = 0
        for dt in X:
            cum_ms += cum_dict.get(dt, 0)
            y_cum.append(cum_ms)

        y_months = [sum([v for k, v in cum_dict.items() if month in k]) for month in X_months]
        data_cumulative.append(go.Scatter(x=X, y=y_cum, name=artist, line_shape='spline'))
        data_monthly.append(go.Scatter(x=X_months, y=y_months, name=artist, line_shape='spline'))
        
    return data_cumulative, data_monthly

def plot_tracker(data, title):
    fig = go.Figure(data=data)
    fig.update_layout(
        title=title,
        yaxis_title='Total Time Listened To',
        xaxis_title='Time',
        font=dict(size=12)
    )
    fig.show()

In [27]:
artist_tracker = defaultdict(dict)
track_tracker = defaultdict(dict)

for i, row in top.iterrows():
    dt = row.get('endTime')
    artist = row.get('artistName')[:18]
    track = row.get('trackName')[:18]
    ms = int(row.get('msPlayed'))
    artist_tracker[artist][dt] = artist_tracker[artist].setdefault(dt, 0) + ms
    track_tracker[track][dt] = track_tracker[track].setdefault(dt, 0) + ms
    
r = list(sorted(top['endTime']))
X = range_axis(parse(r[0]), parse(r[-1]))
X_months = range_axis_months(parse(r[0]), parse(r[-1]))

In [28]:
artists_cum, artists_mon = get_data(artist_tracker)
tracks_cum, tracks_mon = get_data(track_tracker)

plot_tracker(artists_cum, 'Artists over Time (Cumulative)')
plot_tracker(artists_mon, 'Artists over Time (Monthly)')
plot_tracker(tracks_cum, 'Tracks over Time (Cumulative)')
plot_tracker(tracks_mon, 'Tracks over Time (Monthly)')

## Normalized Monthly Charts

In [29]:
def plot_normed(tracker, title):
    normalizer = pd.DataFrame([list(row.y) for row in tracker])
    x = normalizer.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    normalizer = pd.DataFrame(x_scaled)

    for i, row in enumerate(tracker):
        row.y = normalizer.iloc[i]

    plot_tracker(tracker, title)

In [30]:
plot_normed(artists_mon, 'Artists over Time (Monthly, Normalized)')
plot_normed(tracks_mon, 'Tracks over Time (Monthly, Normalized)')

## Top Songs from Each Month

In [31]:
df = pd.DataFrame.from_dict(spotify.load_streaming())
df['endTime'] = df['endTime'].apply(lambda x : x[:7])
filtered = df.groupby(['endTime','trackName']).size().reset_index().sort_values(0, ascending=False).sort_values('endTime')
filtered = filtered.rename(columns={0: 'count'})

In [32]:
for d in reversed(sorted(set(df['endTime']))):
    print(f"TOP SONGS FOR {d}")
    for i, r in df[(df['endTime'] == d)].groupby(['trackName']).sum().sort_values('msPlayed', ascending=False).head(10).iterrows():
        print(str(r['msPlayed']) + ' - ' + i)
    print('---------------------------')

TOP SONGS FOR 2020-08
917866 - Up With End Credits
675143 - Another Place
674160 - Soldier
639484 - Run Free
524206 - Bracelet
522306 - Castle on the Hill
521812 - Show Yourself
502294 - Dreamer
494852 - Wake Me Up
492800 - Beautiful Day
---------------------------
TOP SONGS FOR 2020-07
4924040 - Got It In You
4408153 - Ancient Light
4343762 - All I Got
3991391 - Build It Better
3834488 - Someone To You
3634363 - My Shot
3526420 - Half Light
3420917 - Past Lives
3403691 - Shine A Light
3317634 - Someone To Stay
---------------------------
TOP SONGS FOR 2020-06
7027228 - Broken Lovers
6062370 - All I Got
5636146 - Future Looks Good
4288932 - The Woods - Acoustic
4271386 - Beam
3993086 - Glass Houses
3761967 - Fire for You
3176874 - Starlight
3156924 - Half Light
3094440 - Say Something
---------------------------
TOP SONGS FOR 2020-05
7092936 - Choke
6255918 - Quite Miss Home
5916886 - Rollercoaster
5744407 - Sometimes
5319516 - Beautiful Day
4937341 - Carry You
4791732 - Home
4412930 -

In [33]:
for d in reversed(sorted(set(df['endTime']))):
    print(f"TOP ARTISTS FOR {d}")
    for i, r in df[(df['endTime'] == d)].groupby(['artistName']).sum().sort_values('msPlayed', ascending=False).head(10).iterrows():
        print(str(r['msPlayed']) + ' - ' + i)
    print('---------------------------')

TOP ARTISTS FOR 2020-08
2042283 - Lauv
1218861 - Taylor Swift
1136788 - James TW
1016855 - OneRepublic
917866 - Michael Giacchino
772113 - Avicii
749382 - Why Don't We
749252 - Alan Silvestri
716660 - Idina Menzel
713055 - ILLENIUM
---------------------------
TOP ARTISTS FOR 2020-07
31181787 - BANNERS
22535835 - Harry Gregson-Williams
21426780 - Taylor Swift
16399318 - John Williams
9055446 - OneRepublic
8507590 - Bastille
7213083 - Lauv
6711704 - Parachute
6406638 - Kodaline
5507299 - James TW
---------------------------
TOP ARTISTS FOR 2020-06
21227805 - OneRepublic
15803817 - The Piano Guys
13243818 - Cannons
9342137 - Kodaline
7961470 - Lauv
7838250 - BØRNS
7563757 - BANNERS
7054964 - Stormfolk
6419900 - Hollow Coves
6314692 - Said the Sky
---------------------------
TOP ARTISTS FOR 2020-05
19058035 - OneRepublic
17855654 - Kodaline
17664423 - Kygo
10739312 - The Lumineers
10437762 - James Arthur
8006604 - Bastille
7083982 - Oh Wonder
7001903 - Imagine Dragons
6482539 - Novo Amor
6

# Part 2: How Does Music Affect My Heartbeat?
Or rather, what type of music do I listen to when my heart's pumping? (could be <==>)

In [37]:
health = HealthAPI(ROOT)

In [38]:
health.help()


Available Features:
• load_heartbeats()
        


In [39]:
hb_df = health.load_heartbeats()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [40]:
print(f"We have {len(hb_df)} heartbeat data points available!")

We have 150927 heartbeat data points available!


In [41]:
hb_df.head()

Unnamed: 0,creationDate,startDate,endDate,value
0,2019-11-13 17:43:25+00:00,2019-11-13 17:43:18+00:00,2019-11-13 17:43:18+00:00,103
1,2019-11-13 17:48:49+00:00,2019-11-13 17:39:14+00:00,2019-11-13 17:39:14+00:00,76
2,2019-11-13 17:53:32+00:00,2019-11-13 17:48:36+00:00,2019-11-13 17:48:36+00:00,75
3,2019-11-13 17:57:15+00:00,2019-11-13 17:53:21+00:00,2019-11-13 17:53:21+00:00,81
4,2019-11-13 18:00:38+00:00,2019-11-13 18:00:37+00:00,2019-11-13 18:00:37+00:00,92


In [42]:
sp_df = pd.DataFrame.from_dict(spotify.load_streaming())

In [43]:
sp_df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2019-03-16 16:51,Cash Cash,Hero (feat. Christina Perri) - Deep Mix,60669
1,2019-03-17 05:17,Cash Cash,Hero (feat. Christina Perri) - Deep Mix,76252
2,2019-03-17 05:18,Cash Cash,Hero (feat. Christina Perri) - Deep Mix,81775
3,2019-03-17 05:22,Steve Void,Perfect Mess,216046
4,2019-03-17 05:26,San Holo,The Future - GOSLO Remix,274480


In [44]:
track_hb = defaultdict(list)
artist_hb = defaultdict(list)

for i, row in tqdm(sp_df.iterrows()):
    dt = parse(row.get('endTime'))
    dt = pytz.utc.localize(dt)
    
    filtered = hb_df[(hb_df["startDate"] <= dt) & (dt <= hb_df["endDate"])]
    if len(filtered) > 0:
        for j, hb_row in filtered.iterrows():
            track_hb[row.get('trackName')].append(hb_row.get('value'))
            artist_hb[row.get('artistName')].append(hb_row.get('value'))


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [45]:
def get_Xy(tracker):
    X, y = tracker.keys(), [np.mean(list(map(float, tracker[t]))) for t in tracker.keys()]
    X, y = list(zip(*sorted(list(zip(X, y)), key=lambda x : x[1])))
    return X, y

def plot_hb(tracker, title):
    X, y = get_Xy(tracker)
    plot(y, list(X), title)

# Results – The Final Heartbeat/Music Correlation
Do these make sense? Judge for yourself! The DataFrames at the bottom may provide a better visualization.

In [46]:
plot_hb(track_hb, 'Heartbeat by Tracks')
plot_hb(artist_hb, 'Heartbeat by Artists')

### Correlation of Tracks & Heartbeat

In [47]:
X, y = get_Xy(track_hb)
track_df = pd.DataFrame(np.array([X, y]).T, columns=['Track', 'Average Heartbeat'])
track_df = track_df.astype({'Average Heartbeat': 'float'})
track_df = track_df.sort_values('Average Heartbeat', ascending=False)
track_df

Unnamed: 0,Track,Average Heartbeat
429,Everywhere - 2017 Remaster,175.062
423,Carry on My Wayward Son - Brass Version,175.062
418,Supremacy - Brass Version,175.062
419,SOS (feat. Aloe Blacc),175.062
420,Just My Type,175.062
...,...,...
4,FRIENDS,62.000
2,Dance Monkey,61.000
1,All You Need To Know (feat. Calle Lehmann),61.000
3,Guiding Light,61.000


### Correlation of Artists & Heartbeat

In [48]:
X, y = get_Xy(artist_hb)
artist_df = pd.DataFrame(np.array([X, y]).T, columns=['Artist', 'Average Heartbeat'])
artist_df = artist_df.astype({'Average Heartbeat': 'float'})
artist_df = artist_df.sort_values('Average Heartbeat', ascending=False)
artist_df

Unnamed: 0,Artist,Average Heartbeat
283,BANNERS,175.0620
282,TOTO,175.0620
281,Lord Huron,175.0620
280,Ariana Grande,171.9845
279,Rodney Atkins,168.9070
...,...,...
4,Sonu Nigam,67.0000
3,The Blue Notes,66.0000
2,Paul Cesar,64.0000
1,Mumford & Sons,61.0000
