# Analyzing My Spotify Streaming History

Spotify's "Spotify.me" feature (available at https://spotify.me/en) provides a snapshot of your Spotify listening history. Under GDPR, Spotify also allows you to export all of your streaming history (saved for as long as you've been a Spotify user). I downloaded my streaming history and ran an analysis of when I listen to music, what I listen to, and how it fits in with the rest of my life.

In [432]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [449]:
from loader import SpotifyAPI
from dateutil.parser import parse
from pytz import timezone
from datetime import timedelta
import pytz
from datetime import datetime
from collections import defaultdict
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import plotly
plotly.offline.init_notebook_mode(connected=True)

In [450]:
# Create the loader object imported from the local `loader` module.
spotify = SpotifyAPI()

In [451]:
# List what the loader exposes (see the "Available Features" output below).
spotify.help()


        Available Features:
        • load_searches (71 records)
        • load_streaming (22020 records)
        • load_tracks (773 records)
        


In [562]:
def range_axis(start_date, end_date):
    """Return sorted 'YYYY-MM-DD' labels from start_date through end_date.

    Steps forward one day at a time from ``start_date`` and emits a label for
    every step that is not later than ``end_date``. The result is empty when
    ``start_date`` is already past ``end_date``.
    """
    one_day = timedelta(days=1)
    labels = []
    cursor = start_date
    while cursor <= end_date:
        labels.append(cursor.strftime('%Y-%m-%d'))
        cursor = cursor + one_day
    labels.sort()
    return labels

def range_axis_months(start_date, end_date):
    """Return the distinct 'YYYY-MM' labels covered by the daily axis.

    Builds the daily labels with ``range_axis`` and keeps the first seven
    characters of each (year and month), de-duplicated and sorted by
    ``numpy.unique``.

    Returns:
        A NumPy array of unique month labels in ascending order.
    """
    # Imported locally: the notebook's import cell does not bring in numpy,
    # so a bare ``np`` reference here would raise NameError on a clean run.
    import numpy as np

    daily_labels = range_axis(start_date, end_date)
    # Drop the trailing "-DD" so only the year-month prefix remains.
    return np.unique([label[:7] for label in daily_labels])

def date_bucket(dt):
    """Format ``dt`` as its 'YYYY-MM-DD' calendar-day label."""
    day_format = "%Y-%m-%d"
    return dt.strftime(day_format)

def day_axis():
    """Return every minute of the day as a sorted list of 'HH:MM' labels.

    The labels run from '00:00' to '23:59': 1440 entries, each exactly once.
    They are generated directly rather than by walking 24 hours forward from
    the current time with an inclusive end point; that approach emitted the
    current minute twice and so put a duplicate label on the axis.
    """
    return [
        '{:02d}:{:02d}'.format(hour, minute)
        for hour in range(24)
        for minute in range(60)
    ]

def time_bucket(dt):
    """Format ``dt`` as its 'HH:MM' minute-of-day label."""
    minute_format = "%H:%M"
    return dt.strftime(minute_format)

def plot(X, y, title, xaxis='', yaxis=''):
    """Show a single line trace of ``y`` against ``X``.

    ``title`` is the figure title; ``xaxis`` and ``yaxis`` are the axis
    titles and default to empty strings.
    """
    trace = go.Scatter(x=X, y=y, line_shape='linear')
    layout = dict(
        title=title,
        yaxis_title=yaxis,
        xaxis_title=xaxis,
        font=dict(size=12),
    )
    fig = go.Figure(data=[trace])
    fig.update_layout(**layout)
    fig.show()

### Extracting Time-Relevant Information

In [476]:
actions = []


def _to_pacific(raw_timestamp):
    """Parse a timestamp string, treat it as UTC, and convert to US/Pacific.

    Any timezone information inside the string is ignored by the parser.
    """
    naive = parse(raw_timestamp, fuzzy=True, ignoretz=True)
    return pytz.utc.localize(naive).astimezone(timezone('US/Pacific'))


# Searches are timestamped by 'searchTime', streams by 'endTime'; both are
# stored as (timestamp, kind, record) tuples in one list.
for search in spotify.load_searches():
    dt = _to_pacific(search.get('searchTime'))
    actions.append((dt, 'search', search))

for track in spotify.load_streaming():
    dt = _to_pacific(track.get('endTime'))
    actions.append((dt, 'stream', track))

In [477]:
# Order all actions by their timestamp (the first element of each tuple).
actions = sorted(actions, key=lambda entry: entry[0])

In [478]:
# # Filter to Time in the USA for Testing Purposes (we don't have DateTime accurate yet)
# actions = list(filter(lambda a : a[0].year == 2019 and a[0].month < 12 and a[0].month > 8, actions))

### Analyze Historical Usage
How has my Spotify streaming frequency changed over time?

In [468]:
# Count actions per calendar day.
data = defaultdict(int)
for action in actions:
    data[date_bucket(action[0])] += 1

# One x-axis label per day between the first and last entries of `actions`;
# indexing the defaultdict yields 0 for days without any activity.
first_seen, last_seen = actions[0][0], actions[-1][0]
X = range_axis(first_seen, last_seen)
y = [data[day] for day in X]

plot(X, y, title='Spotify Streaming over All Time', xaxis='Time', yaxis='Count')

### Analyze Daily Usage
When, during the day, do I listen to Spotify?

In [469]:
# Count actions per minute-of-day label ('HH:MM'), pooled across all days.
data = defaultdict(int)
for action in actions:
    minute_label = time_bucket(action[0])
    data[minute_label] += 1

# Minutes with no activity read as 0 from the defaultdict.
X = day_axis()
y = [data[minute] for minute in X]
plot(X, y, title='Spotify Streaming over Day', xaxis='Time', yaxis='Count')

### Most Popular Tracks & Artists
What do I listen to the most?

In [480]:
# Load the streaming records into a DataFrame for aggregation.
streaming_records = spotify.load_streaming()
df = pd.DataFrame.from_dict(streaming_records)

In [482]:
# Total playtime (ms) per track name, most-played first. Only 'msPlayed' is
# selected before summing so the text columns are never aggregated; newer
# pandas releases no longer silently drop non-numeric columns in sum().
# NOTE(review): grouping is by track name alone, so same-named tracks by
# different artists are combined.
favorite_tracks = (
    df.groupby('trackName')[['msPlayed']]
    .sum()
    .sort_values('msPlayed', ascending=False)
)
favorite_tracks.head(15)

Unnamed: 0_level_0,msPlayed
trackName,Unnamed: 1_level_1
You & Me,11353946
Daylight,11202478
Beautiful Creatures (feat. MAX),10216498
A Thousand Years,9824923
Speechless (Full),9735994
Dreamer,9429627
Africa,8506608
Say Love,8503985
Agar Tum Saath Ho / Treat You Better,8391912
Sweet Caroline,8123963


In [472]:
# Total playtime (ms) per artist, most-played first. Only 'msPlayed' is
# selected before summing so the text columns are never aggregated; newer
# pandas releases no longer silently drop non-numeric columns in sum().
favorite_artists = (
    df.groupby('artistName')[['msPlayed']]
    .sum()
    .sort_values('msPlayed', ascending=False)
)
favorite_artists.head(15)

Unnamed: 0_level_0,msPlayed
artistName,Unnamed: 1_level_1
Penn Masala,76843387
Lauv,52911093
Coldplay,50489435
Alan Silvestri,44240044
James TW,35677742
Maroon 5,35513986
Avicii,35182648
ILLENIUM,34905130
Michael Giacchino,29406494
Hans Zimmer,28492686


## Comparison to All Music

Let's take a look at these on a plot. It appears that the difference between songs that I really enjoy and those that fit into the "general" category is striking; there's a sharp curve for both of these graphs.

In [473]:
# Total ms played for every artist, in the order of `favorite_artists`.
plot(
    favorite_artists.index,
    favorite_artists['msPlayed'],
    title="My Favorite Artists",
    xaxis='Artist Name',
    yaxis='ms played',
)

In [474]:
# Total ms played for every track, in the order of `favorite_tracks`.
plot(
    favorite_tracks.index,
    favorite_tracks['msPlayed'],
    title="My Favorite Tracks",
    yaxis='ms played',
)

## Top Tracks Over Time

In [584]:
# Keep only the rows for the 20 highest-ranked tracks and cut each 'endTime'
# down to its first 10 characters (the date part shown in the output below).
top_track_names = favorite_tracks.head(20).index.to_list()
top = df[df.trackName.isin(top_track_names)]
top = top.assign(
    endTime=lambda frame: frame['endTime'].apply(lambda stamp: stamp[:10])
)
top.head()

Unnamed: 0,artistName,endTime,msPlayed,trackName
41,Marshmello,2018-12-17,187132,Happier
66,ILLENIUM,2018-12-17,240508,Beautiful Creatures (feat. MAX)
99,ILLENIUM,2018-12-17,240508,Beautiful Creatures (feat. MAX)
129,James TW,2018-12-19,219440,Say Love
148,ILLENIUM,2018-12-19,240508,Beautiful Creatures (feat. MAX)


In [585]:
def get_data(tracker):
    """Build cumulative and per-month line traces for every tracked name.

    Args:
        tracker: Mapping of a display name (artist or track) to a dict of
            ``{day label: milliseconds played}``. The callers in this
            notebook use 'YYYY-MM-DD' day labels.

    Returns:
        A ``(data_cumulative, data_monthly)`` tuple of ``go.Scatter`` lists
        with one trace per name in each list. Cumulative traces are plotted
        against the daily axis and monthly traces against the monthly axis.

    Relies on the module-level ``X`` (daily labels) and ``X_months``
    (monthly labels) being defined before it is called.
    """
    data_cumulative = []
    data_monthly = []
    for name, ms_by_day in tracker.items():
        # Running total of playtime along the daily axis; days without an
        # entry contribute 0.
        y_cum = []
        cum_ms = 0
        for day in X:
            cum_ms += ms_by_day.get(day, 0)
            y_cum.append(cum_ms)

        # Fold each day into its 'YYYY-MM' prefix in one pass instead of
        # rescanning every day for every month on the axis.
        ms_by_month = defaultdict(int)
        for day, ms in ms_by_day.items():
            ms_by_month[day[:7]] += ms
        y_months = [ms_by_month.get(month, 0) for month in X_months]

        data_cumulative.append(go.Scatter(x=X, y=y_cum, name=name, line_shape='spline'))
        data_monthly.append(go.Scatter(x=X_months, y=y_months, name=name, line_shape='spline'))

    return data_cumulative, data_monthly

def plot(data, title):
    """Show a figure built from the traces in ``data`` under ``title``.

    Axis titles are fixed: 'Time' on x and 'Total Time Listened To' on y.
    """
    # NOTE: this rebinds the name of the earlier plot(X, y, title, ...) helper.
    layout = dict(
        title=title,
        yaxis_title='Total Time Listened To',
        xaxis_title='Time',
        font=dict(size=12),
    )
    fig = go.Figure(data=data)
    fig.update_layout(**layout)
    fig.show()

In [586]:
artist_tracker = defaultdict(dict)
track_tracker = defaultdict(dict)

# Accumulate milliseconds played per day for each artist and track; names are
# cut to their first 18 characters before being used as keys.
for _idx, row in top.iterrows():
    dt = row.get('endTime')
    artist = row.get('artistName')[:18]
    track = row.get('trackName')[:18]
    ms = int(row.get('msPlayed'))

    artist_days = artist_tracker[artist]
    artist_days[dt] = artist_days.get(dt, 0) + ms
    track_days = track_tracker[track]
    track_days[dt] = track_days.get(dt, 0) + ms

# Daily and monthly axes spanning the smallest to largest 'endTime' in `top`.
r = sorted(top['endTime'])
X = range_axis(parse(r[0]), parse(r[-1]))
X_months = range_axis_months(parse(r[0]), parse(r[-1]))

In [588]:
# Build the cumulative and monthly traces for artists and for tracks.
artists_cum, artists_mon = get_data(artist_tracker)
tracks_cum, tracks_mon = get_data(track_tracker)

plot(artists_cum, 'Artists over Time (Cumulative)')
plot(artists_mon, 'Artists over Time (Monthly)')
# Title names the cumulative series actually being plotted here.
plot(tracks_cum, 'Tracks over Time (Cumulative)')
plot(tracks_mon, 'Tracks over Time (Monthly)')