# Part 1: Analyzing My Spotify Streaming History

Spotify's "Spotify.me" feature (available at https://spotify.me/en) provides a snapshot of your Spotify listening history. Under GDPR, Spotify also lets you export your entire streaming history (saved for as long as you've been a Spotify user). I downloaded my streaming history and analyzed when I listen to music, what I listen to, and how it fits in with the rest of my life.

Public Code: https://github.com/shomilj/Explore-Spotify

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from loader import SpotifyAPI, HealthAPI
from dateutil.parser import parse
from pytz import timezone
from datetime import timedelta
import pytz
from datetime import datetime
from collections import defaultdict
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd, numpy as np
from tqdm import tqdm_notebook as tqdm
import plotly
from sklearn import preprocessing
plotly.offline.init_notebook_mode(connected=True)

In [3]:
ROOT = '/Users/shomil/Documents/Datasets/personal'

In [4]:
spotify = SpotifyAPI(ROOT)

In [5]:
spotify.help()


        Available Features:
        • load_searches (199 records)
        • load_streaming (97636 records)
        • load_tracks (1156 records)
        


In [6]:
def range_axis(start_date, end_date):
    """Return every calendar day from start_date to end_date as 'YYYY-MM-DD'.

    Both endpoints are inclusive and only their calendar day matters; any
    time-of-day component is ignored.

    Args:
        start_date: First day of the range (``date`` or ``datetime``).
        end_date: Last day of the range (``date`` or ``datetime``).

    Returns:
        A chronologically ordered list of 'YYYY-MM-DD' strings. The list is
        empty when end_date falls on an earlier day than start_date.
    """
    # Walk calendar days, not timestamps: stepping a full datetime by 24h
    # overshoots end_date (and so skips the final day) whenever end_date's
    # time of day is earlier than start_date's.
    current = start_date.date() if isinstance(start_date, datetime) else start_date
    last = end_date.date() if isinstance(end_date, datetime) else end_date
    one_day = timedelta(days=1)

    X = []
    while current <= last:
        X.append(current.strftime('%Y-%m-%d'))
        current += one_day
    return X

def range_axis_months(start_date, end_date):
    """Return the distinct 'YYYY-MM' months covered by the given date range.

    The daily labels from range_axis are cut down to their first seven
    characters, and np.unique collapses them to one sorted entry per month.
    """
    months = [day[:7] for day in range_axis(start_date, end_date)]  # keep 'YYYY-MM', drop '-DD'
    return np.unique(months)

def date_bucket(dt):
    """Return the calendar-day bucket label ('YYYY-MM-DD') for ``dt``."""
    return f"{dt:%Y-%m-%d}"

def day_axis():
    """Return every minute of the day as a sorted list of 'HH:MM' labels.

    The labels use the same 'HH:MM' format that time_bucket produces, so they
    can serve as the x-axis for per-minute counts.

    Returns:
        1440 unique strings running from '00:00' to '23:59'.
    """
    # Enumerating the clock directly guarantees exactly one label per minute
    # and makes the result independent of the current time.
    return [f"{hour:02d}:{minute:02d}" for hour in range(24) for minute in range(60)]

def time_bucket(dt):
    """Return the minute-of-day bucket label ('HH:MM') for ``dt``."""
    return f"{dt:%H:%M}"

def plot(X, y, title, xaxis='', yaxis=''):
    """Show a single-series Plotly line chart of ``y`` against ``X``.

    Args:
        X: Values for the x-axis.
        y: Values for the y-axis.
        title: Chart title.
        xaxis: Optional x-axis label.
        yaxis: Optional y-axis label.
    """
    trace = go.Scatter(x=X, y=y, line_shape='linear')
    layout = dict(
        title=title,
        yaxis_title=yaxis,
        xaxis_title=xaxis,
        font=dict(size=12),
    )
    fig = go.Figure(data=[trace])
    fig.update_layout(**layout)
    fig.show()

### Extracting Time-Relevant Information

In [7]:
# Collect every search and every stream as a (timestamp, kind, record) tuple.
actions = []
days = set()  # timestamps already recorded (full datetimes, despite the name)

# (kind label, key holding the raw timestamp, loader) for each data source.
# The loaders are called inside the loop so searches are read before streams.
sources = (
    ('search', 'searchTime', spotify.load_searches),
    ('stream', 'endTime', spotify.load_streaming),
)

for kind, time_key, load in sources:
    for record in load():
        # Parse without timezone info, mark the value as UTC, then express
        # it in US/Pacific.
        dt = parse(record.get(time_key), fuzzy=True, ignoretz=True)
        dt = pytz.utc.localize(dt).astimezone(timezone('US/Pacific'))
        # Only the first action seen at any given timestamp is kept.
        if dt not in days:
            days.add(dt)
            actions.append((dt, kind, record))

In [8]:
# Put all actions in chronological order (the timestamp is element 0).
actions = sorted(actions, key=lambda entry: entry[0])

In [9]:
# # Filter to Time in the USA for Testing Purposes (we don't have DateTime accurate yet)
# actions = list(filter(lambda a : a[0].year == 2019 and a[0].month < 12 and a[0].month > 8, actions))

### Analyze Historical Usage
How has my Spotify streaming frequency changed over time?

In [10]:
# Count actions per calendar day.
data = defaultdict(int)
for dt, *_ in actions:
    data[date_bucket(dt)] += 1

# One x-axis entry per day between the first and last action; days without
# any activity read as 0 through the defaultdict.
first_dt, last_dt = actions[0][0], actions[-1][0]
X = range_axis(first_dt, last_dt)
y = [data[day] for day in X]

plot(X, y, title='Spotify Streaming over All Time', xaxis='Time', yaxis='Count')

### Analyze Daily Usage
When, during the day, do I listen to Spotify?

In [11]:
# Count actions per minute of the day ('HH:MM'), across all dates.
data = defaultdict(int)
for dt, *_ in actions:
    data[time_bucket(dt)] += 1

X = day_axis()
y = [data[minute] for minute in X]
plot(X, y, title='Spotify Streaming over Day', xaxis='Time', yaxis='Count')

### Most Popular Tracks & Artists
What do I listen to the most?

In [12]:
df = pd.DataFrame.from_dict(spotify.load_streaming())

In [13]:
# Total listening time (ms) per track title, most-played first.
# 'msPlayed' is selected explicitly so only the numeric column is summed:
# pandas 2.0+ no longer silently drops non-numeric columns in groupby sums.
# NOTE(review): grouping on the title alone merges same-named tracks by
# different artists — confirm that is intended.
favorite_tracks = (
    df.groupby('trackName')[['msPlayed']]
    .sum()
    .sort_values('msPlayed', ascending=False)
)
favorite_tracks.head(15)

Unnamed: 0_level_0,msPlayed
trackName,Unnamed: 1_level_1
Another Place,68598398
Soldier,49281386
Outnumbered,48912004
Phases,48270310
Mean It,46704155
Cough Syrup,41587134
Breathe,40040702
Circles,39297192
Feelings,39217854
Better,39001768


In [14]:
# Total listening time (ms) per artist, most-played first.
# 'msPlayed' is selected explicitly so only the numeric column is summed:
# pandas 2.0+ no longer silently drops non-numeric columns in groupby sums.
favorite_artists = (
    df.groupby('artistName')[['msPlayed']]
    .sum()
    .sort_values('msPlayed', ascending=False)
)
favorite_artists.head(15)

Unnamed: 0_level_0,msPlayed
artistName,Unnamed: 1_level_1
Lauv,329653852
Bastille,304291830
OneRepublic,292095065
Kygo,148217544
BANNERS,147403291
ILLENIUM,143402237
Avicii,127149299
Why Don't We,126698303
Coldplay,124913296
James TW,124730610


## Comparison to All Music

Let's take a look at these on a plot. It appears that the difference between songs that I really enjoy and those that fit into the "general" category is striking; there's a sharp curve for both of these graphs.

In [15]:
plot(favorite_artists.index,favorite_artists['msPlayed'], title="My Favorite Artists", xaxis='Artist Name', yaxis='ms played')

In [16]:
plot(favorite_tracks.index,favorite_tracks['msPlayed'], title="My Favorite Tracks", yaxis='ms played')

## Top Tracks Over Time

In [17]:
# Keep only the plays of the 20 most-listened tracks and cut each 'endTime'
# down to its first ten characters (the 'YYYY-MM-DD' date part).
top_names = favorite_tracks.head(20).index.to_list()
top = df[df.trackName.isin(top_names)]
top = top.assign(endTime=top['endTime'].apply(lambda stamp: stamp[:10]))
top.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
108,2019-03-21,James TW,You & Me,231653
257,2019-03-24,Khalid,Better,229320
357,2019-03-26,James TW,You & Me,231653
422,2019-03-27,Avicii,Wake Me Up,247426
646,2019-04-05,James TW,Soldier,224720


In [18]:
def get_data(tracker):
    """Build cumulative and monthly Plotly traces for every series in ``tracker``.

    Reads the module-level ``X`` (daily labels) and ``X_months`` (monthly
    labels) as the shared x-axes, so both must be defined before calling.

    Args:
        tracker: Mapping of series name (artist or track) to a dict of
            'YYYY-MM-DD' day -> milliseconds played on that day.

    Returns:
        A ``(data_cumulative, data_monthly)`` pair of lists, each holding one
        ``go.Scatter`` trace per series in ``tracker``.
    """
    data_cumulative = []
    data_monthly = []
    for name, ms_by_day in tracker.items():
        # Running total of listening time along the daily axis.
        y_cum = []
        cum_ms = 0
        for day in X:
            cum_ms += ms_by_day.get(day, 0)
            y_cum.append(cum_ms)

        # Sum each month in a single pass over the recorded days (the month
        # is the 'YYYY-MM' prefix of the day key) instead of rescanning
        # every day once per month.
        ms_by_month = {}
        for day, ms in ms_by_day.items():
            month = day[:7]
            ms_by_month[month] = ms_by_month.get(month, 0) + ms
        y_months = [ms_by_month.get(month, 0) for month in X_months]

        data_cumulative.append(go.Scatter(x=X, y=y_cum, name=name, line_shape='spline'))
        data_monthly.append(go.Scatter(x=X_months, y=y_months, name=name, line_shape='spline'))

    return data_cumulative, data_monthly

def plot_tracker(data, title):
    """Show a Plotly figure containing the given traces.

    Args:
        data: Traces to draw, passed straight to ``go.Figure``.
        title: Chart title.
    """
    layout = dict(
        title=title,
        yaxis_title='Total Time Listened To',
        xaxis_title='Time',
        font=dict(size=12),
    )
    fig = go.Figure(data=data)
    fig.update_layout(**layout)
    fig.show()

In [19]:
# Per-day listening time (ms) for each artist and each track among the plays
# in `top`: {name: {'YYYY-MM-DD': ms}}.
artist_tracker = defaultdict(dict)
track_tracker = defaultdict(dict)

columns = zip(top['endTime'], top['artistName'], top['trackName'], top['msPlayed'])
for dt, artist_name, track_name, played in columns:
    # Names are cut to 18 characters; names sharing that prefix are merged.
    artist = artist_name[:18]
    track = track_name[:18]
    ms = int(played)

    artist_days = artist_tracker[artist]
    artist_days[dt] = artist_days.get(dt, 0) + ms
    track_days = track_tracker[track]
    track_days[dt] = track_days.get(dt, 0) + ms

# Shared x-axes spanning the earliest to the latest day present in `top`.
r = sorted(top['endTime'])
first_day, last_day = r[0], r[-1]
X = range_axis(parse(first_day), parse(last_day))
X_months = range_axis_months(parse(first_day), parse(last_day))

In [20]:
# Build the traces once, then draw the four charts in a fixed order.
artists_cum, artists_mon = get_data(artist_tracker)
tracks_cum, tracks_mon = get_data(track_tracker)

charts = [
    (artists_cum, 'Artists over Time (Cumulative)'),
    (artists_mon, 'Artists over Time (Monthly)'),
    (tracks_cum, 'Tracks over Time (Cumulative)'),
    (tracks_mon, 'Tracks over Time (Monthly)'),
]
for traces, chart_title in charts:
    plot_tracker(traces, chart_title)

## Normalized Monthly Charts

In [21]:
def plot_normed(tracker, title):
    """Min-max scale the y-values of the given traces, then plot them.

    The y-values of all traces are stacked into a matrix (one row per trace)
    and passed through sklearn's ``MinMaxScaler``. Each trace's ``y`` is then
    overwritten with its scaled row, so the traces in ``tracker`` are
    modified in place.

    NOTE(review): MinMaxScaler scales each column independently, which here
    appears to mean per x-position across traces rather than per trace over
    time — confirm this is the intended normalization.

    Args:
        tracker: Sequence of traces exposing a ``y`` attribute.
        title: Chart title handed to plot_tracker.
    """
    raw = pd.DataFrame([list(trace.y) for trace in tracker]).values
    scaled = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(raw))

    for position, trace in enumerate(tracker):
        trace.y = scaled.iloc[position]

    plot_tracker(tracker, title)

In [22]:
plot_normed(artists_mon, 'Artists over Time (Monthly, Normalized)')
plot_normed(tracks_mon, 'Tracks over Time (Monthly, Normalized)')

## Top Songs from Each Month

In [23]:
# Reload the streaming history and bucket every play by month ('YYYY-MM').
df = pd.DataFrame.from_dict(spotify.load_streaming())
df['endTime'] = df['endTime'].apply(lambda stamp: stamp[:7])

# Play count per (month, track), ordered by month and, within each month,
# from most- to least-played. Both keys go into one sort_values call:
# chaining two separate sorts loses the first ordering because the default
# sort algorithm is not stable.
filtered = (
    df.groupby(['endTime', 'trackName'])
    .size()
    .reset_index(name='count')
    .sort_values(['endTime', 'count'], ascending=[True, False])
)

In [24]:
# For each month (newest first), print the ten tracks with the most
# listening time.
for d in sorted(set(df['endTime']), reverse=True):
    print(f"TOP SONGS FOR {d}")
    # Sum only 'msPlayed': pandas 2.0+ no longer silently drops the text
    # columns when summing a whole grouped frame.
    monthly_ms = df[df['endTime'] == d].groupby('trackName')['msPlayed'].sum()
    for name, ms in monthly_ms.sort_values(ascending=False).head(10).items():
        print(str(ms) + ' - ' + name)
    print('---------------------------')

TOP SONGS FOR 2020-10
14655288 - Out of the Old
7507727 - All I Want
7440716 - Shine
6845581 - Heaven
6557309 - Wondering
6538082 - Welcome Home, Son
6054155 - Let Your Heart Hold Fast
5953483 - In My Veins - Feat. Erin Mccarley
5832056 - Into the Unknown
5782480 - Lost In The Wild
---------------------------
TOP SONGS FOR 2020-09
16132653 - The Funeral
9018869 - Daydreamer
6154711 - Always Gold
5938657 - Let Your Heart Hold Fast
5722772 - Welcome Home, Son
4986193 - Simple Song
4396759 - Surrender
4373862 - The Mute
4117348 - If the Hudson Overflows
3590259 - Circles
---------------------------
TOP SONGS FOR 2020-08
7496503 - It’s All So Incredibly Loud
6644478 - Another Place
6605746 - Soldier
5288994 - Circles
5174298 - Papercut
5134510 - Run Free
4872239 - To Aslan's Camp
4722148 - Up With End Credits
4649241 - The Heart Never Lies - Live
4577233 - Show Yourself
---------------------------
TOP SONGS FOR 2020-07
9848080 - Got It In You
8816306 - Ancient Light
8687524 - All I Got
798

In [25]:
# For each month (newest first), print the ten artists with the most
# listening time.
for d in sorted(set(df['endTime']), reverse=True):
    print(f"TOP ARTISTS FOR {d}")
    # Sum only 'msPlayed': pandas 2.0+ no longer silently drops the text
    # columns when summing a whole grouped frame.
    monthly_ms = df[df['endTime'] == d].groupby('artistName')['msPlayed'].sum()
    for name, ms in monthly_ms.sort_values(ascending=False).head(10).items():
        print(str(ms) + ' - ' + name)
    print('---------------------------')

TOP ARTISTS FOR 2020-10
37983992 - Olivia Rodrigo
29382670 - Glass Animals
24109606 - Years & Years
14104952 - Radical Face
8479379 - Frank Sinatra
8449812 - John Swihart
7311437 - Imagine Dragons
7300959 - Bastille
7220463 - OneRepublic
6936978 - WALK THE MOON
---------------------------
TOP ARTISTS FOR 2020-09
17534647 - Band of Horses
16868972 - Radical Face
9239506 - AURORA
8623513 - Years & Years
8061197 - BANNERS
7738792 - Bastille
7331714 - Coldplay
7163307 - OneRepublic
6779237 - ILLENIUM
6577866 - Glass Animals
---------------------------
TOP ARTISTS FOR 2020-08
21291843 - Glass Animals
20123446 - Taylor Swift
16923741 - BANNERS
14043132 - James TW
12903786 - Christophe Beck
12104389 - Hans Zimmer
11873930 - Bastille
11472844 - James Bay
8544897 - OneRepublic
7901084 - Lauv
---------------------------
TOP ARTISTS FOR 2020-07
62363574 - BANNERS
45071670 - Harry Gregson-Williams
42853560 - Taylor Swift
32798636 - John Williams
18110892 - OneRepublic
17015180 - Bastille
14426166 

# Part 2: How Does Music Affect My Heartbeat?
Or rather, what type of music do I listen to when my heart's pumping? (The relationship could run in either direction.)

In [26]:
health = HealthAPI(ROOT)

In [30]:
health.help()


Available Features:
• load_heartbeats()
        


In [31]:
hb_df = health.load_heartbeats()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [32]:
print(f"We have {len(hb_df)} heartbeat data points available!")

We have 150927 heartbeat data points available!


In [33]:
hb_df.head()

Unnamed: 0,creationDate,startDate,endDate,value
0,2019-11-13 17:43:25+00:00,2019-11-13 17:43:18+00:00,2019-11-13 17:43:18+00:00,103
1,2019-11-13 17:48:49+00:00,2019-11-13 17:39:14+00:00,2019-11-13 17:39:14+00:00,76
2,2019-11-13 17:53:32+00:00,2019-11-13 17:48:36+00:00,2019-11-13 17:48:36+00:00,75
3,2019-11-13 17:57:15+00:00,2019-11-13 17:53:21+00:00,2019-11-13 17:53:21+00:00,81
4,2019-11-13 18:00:38+00:00,2019-11-13 18:00:37+00:00,2019-11-13 18:00:37+00:00,92


In [34]:
sp_df = pd.DataFrame.from_dict(spotify.load_streaming())

In [35]:
sp_df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2019-03-16 16:51,Cash Cash,Hero (feat. Christina Perri) - Deep Mix,60669
1,2019-03-17 05:17,Cash Cash,Hero (feat. Christina Perri) - Deep Mix,76252
2,2019-03-17 05:18,Cash Cash,Hero (feat. Christina Perri) - Deep Mix,81775
3,2019-03-17 05:22,Steve Void,Perfect Mess,216046
4,2019-03-17 05:26,San Holo,The Future - GOSLO Remix,274480


In [36]:
# Heart-rate samples recorded at the moment each stream ended, grouped by
# track name and by artist name.
track_hb = defaultdict(list)
artist_hb = defaultdict(list)

# Hoisted: the heartbeat interval columns do not change between iterations.
hb_start = hb_df["startDate"]
hb_end = hb_df["endDate"]

# iterrows() is a generator, so tqdm needs an explicit total to show progress.
for i, row in tqdm(sp_df.iterrows(), total=len(sp_df)):
    # 'endTime' carries no timezone; it is marked as UTC so it can be
    # compared with the heartbeat timestamps.
    dt = pytz.utc.localize(parse(row.get('endTime')))

    # Heartbeat samples whose [startDate, endDate] interval contains dt.
    filtered = hb_df[(hb_start <= dt) & (dt <= hb_end)]
    if filtered.empty:
        continue  # streams without a matching sample create no entry

    values = filtered['value'].tolist()
    track_hb[row.get('trackName')].extend(values)
    artist_hb[row.get('artistName')].extend(values)


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [37]:
def get_Xy(tracker):
    """Return the tracker's keys and their mean values, sorted by mean.

    Args:
        tracker: Mapping of name -> list of readings; each reading must be
            convertible with ``float``.

    Returns:
        ``(X, y)``: a tuple of names and a parallel tuple of mean readings,
        ordered from lowest to highest mean. Both are empty tuples when
        ``tracker`` is empty.
    """
    averages = [
        (name, np.mean([float(value) for value in readings]))
        for name, readings in tracker.items()
    ]
    if not averages:
        # zip(*[]) yields nothing to unpack, so handle the empty case here.
        return (), ()

    averages.sort(key=lambda pair: pair[1])
    X, y = zip(*averages)
    return X, y

def plot_hb(tracker, title):
    """Plot the mean reading of every tracker entry.

    The means go on the x-axis and the entry names on the y-axis, in the
    ascending-mean order produced by get_Xy.
    """
    names, averages = get_Xy(tracker)
    plot(averages, list(names), title)

# Results – The Final Heartbeat/Music Correlation
Do these make sense? Judge for yourself! The DataFrames at the bottom may provide a better visualization.

In [38]:
plot_hb(track_hb, 'Heartbeat by Tracks')
plot_hb(artist_hb, 'Heartbeat by Artists')

### Correlation of Tracks & Heartbeat

In [39]:
# Table of tracks with their average heartbeat, highest first.
X, y = get_Xy(track_hb)
track_df = (
    pd.DataFrame({'Track': list(X), 'Average Heartbeat': list(y)})
    .astype({'Average Heartbeat': 'float'})
    .sort_values('Average Heartbeat', ascending=False)
)
track_df

Unnamed: 0,Track,Average Heartbeat
429,Everywhere - 2017 Remaster,175.062
423,Carry on My Wayward Son - Brass Version,175.062
419,SOS (feat. Aloe Blacc),175.062
420,Just My Type,175.062
421,"thank u, next",175.062
...,...,...
4,FRIENDS,62.000
2,Dance Monkey,61.000
1,All You Need To Know (feat. Calle Lehmann),61.000
3,Guiding Light,61.000


### Correlation of Artists & Heartbeat

In [40]:
# Table of artists with their average heartbeat, highest first.
X, y = get_Xy(artist_hb)
artist_df = (
    pd.DataFrame({'Artist': list(X), 'Average Heartbeat': list(y)})
    .astype({'Average Heartbeat': 'float'})
    .sort_values('Average Heartbeat', ascending=False)
)
artist_df

Unnamed: 0,Artist,Average Heartbeat
283,BANNERS,175.062000
282,TOTO,175.062000
281,Lord Huron,175.062000
280,Ariana Grande,173.010333
277,Dimitri Vegas & Like Mike,168.907000
...,...,...
4,Sonu Nigam,67.000000
3,The Blue Notes,66.000000
2,Paul Cesar,64.000000
1,Mumford & Sons,61.000000
