#### Import modules:

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#### Read data:

In [19]:
# The streaming history is split across two JSON files; read each part and stack
# them into a single frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True gives the combined frame a unique
# 0..n-1 index instead of repeating each file's own row labels.
history_files = [
    './MyData/StreamingHistory0.json',
    './MyData/StreamingHistory1.json',
]
df = pd.concat(
    [pd.read_json(path, orient='records') for path in history_files],
    ignore_index=True,
)

In [20]:
# Preview the first rows of the combined streaming history.
df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2020-12-16 00:01,Linkin Park,Iridescent (Version 2),239441
1,2020-12-16 00:05,The Stone Roses,She Bangs the Drums - Remastered,232357
2,2020-12-16 00:08,ABBA,Waterloo,169007
3,2020-12-16 00:12,David Bowie,Starman - 2012 Remaster,254317
4,2020-12-16 00:16,AC/DC,You Shook Me All Night Long,210002


Podcasts are mixed in with the data

In [21]:
# (rows, columns) of the combined frame, before any rows are filtered out.
df.shape

(14666, 4)

#### Process data:

In [22]:
# artistName values that identify podcast entries rather than music; rows whose
# artistName matches one of these exactly are dropped from the analysis.
podcast_filter = [
    'Armchair Expert with Dax Shepard',
    'Extremities',
    'F1: Beyond The Grid',
    'Fighter Pilot Podcast',
    'Football Cliches - A show about the language of football',
    "Harry Potter and the Philosopher's Stone",
    'Lions Led By Donkeys Podcast',
    'Phoebe Reads a Mystery',
    'Serial',
    'Talking Bull',
    'The Athletic FPL Podcast',
    'The Cricket Podcast',
    'The Steve Dangle Podcast',
    'Tifo Football Podcast',
    'WTF1 Podcast',
    "Well There's Your Problem"
]

# Keep only the rows whose artistName is NOT in the list. The explicit .copy()
# makes the result an independent frame, so the column assignments in the
# following cells do not raise pandas' SettingWithCopyWarning.
is_podcast = df['artistName'].isin(podcast_filter)
df = df[~is_podcast].copy()

In [23]:
# Parse the endTime column into pandas datetime values.
df['endTime'] = df['endTime'].pipe(pd.to_datetime)

In [24]:
# Calendar date of each play (time of day dropped), taken from endTime.
end_times = df['endTime']
df['date'] = end_times.dt.date

In [25]:
# Play duration in minutes: msPlayed is in milliseconds, 60,000 ms per minute.
df['minPlayed'] = df['msPlayed'].div(60_000)

In [26]:
# Preview the frame again now that the derived date and minPlayed columns exist.
df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,date,minPlayed
0,2020-12-16 00:01:00,Linkin Park,Iridescent (Version 2),239441,2020-12-16,3.990683
1,2020-12-16 00:05:00,The Stone Roses,She Bangs the Drums - Remastered,232357,2020-12-16,3.872617
2,2020-12-16 00:08:00,ABBA,Waterloo,169007,2020-12-16,2.816783
3,2020-12-16 00:12:00,David Bowie,Starman - 2012 Remaster,254317,2020-12-16,4.238617
4,2020-12-16 00:16:00,AC/DC,You Shook Me All Night Long,210002,2020-12-16,3.500033


In [27]:
# (rows, columns) after removing podcast rows and adding the derived columns.
df.shape

(14223, 6)

#### Analysis:

Total minutes:

In [28]:
# Sum of minPlayed over every remaining row, i.e. total listening time in minutes.
df['minPlayed'].sum()

41321.59218333333

Number of distinct artists (podcasts in `podcast_filter` have already been removed):

In [29]:
# Number of distinct artistName values; dropna=False so that a missing name,
# if there is one, is counted as its own value.
df['artistName'].nunique(dropna=False)

860

50 most-listened-to artists (by total minutes played):

In [30]:
# Total minutes played per artist, largest first, limited to the top 50.
# .sum() is called directly instead of .agg(sum): passing the Python builtin to
# agg emits a FutureWarning on recent pandas versions. .head(50) states the
# positional "first 50" intent explicitly.
(
    df.groupby('artistName')['minPlayed']
    .sum()
    .sort_values(ascending=False)
    .head(50)
)

artistName
Arctic Monkeys                 1328.461750
Red Hot Chili Peppers          1090.212033
The Strokes                     772.775900
Pink Floyd                      736.741333
Queen                           736.733950
The Local Train                 720.978433
Eagles                          703.357617
Electric Light Orchestra        480.857350
Led Zeppelin                    468.202350
Gov't Mule                      459.101833
Eric Clapton                    432.253500
Elton John                      414.407267
The Beatles                     411.234650
Dire Straits                    408.275117
Foo Fighters                    407.684500
Vulfpeck                        393.399267
The Rolling Stones              370.811567
AC/DC                           368.408950
alt-J                           364.845017
Billy Joel                      350.784933
Parikrama                       341.749800
David Gilmour                   337.898750
Lynyrd Skynyrd                  336.519083


In [40]:
# Total minutes played per calendar day, busiest days first.
# To rank days by number of plays instead, group by 'date' and use .size().
# .sum() is called directly instead of .agg(sum): passing the Python builtin to
# agg emits a FutureWarning on recent pandas versions.
# NOTE(review): the saved output of this cell tops out below 7 per day, which
# cannot be minutes given the ~41,000-minute overall total computed earlier;
# it looks stale. Restart the kernel and run all cells to refresh it.
(
    df.groupby('date')['minPlayed']
    .sum()
    .sort_values(ascending=False)
)

date
2021-07-14    6.913592
2021-06-08    5.850784
2021-07-21    5.794518
2021-08-16    5.568048
2021-11-12    5.493197
                ...   
2021-07-10    0.099211
2021-01-12    0.065082
2021-10-03    0.063100
2021-08-11    0.001557
2021-08-06    0.001142
Name: minPlayed, Length: 348, dtype: float64