# Spotify Data Project
https://github.com/jordanbean/Spotify-Data/blob/master/Spotify_Data.ipynb

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import json
from pprint import pprint

import plotly
import chart_studio.plotly as py
import plotly.graph_objs as go

# Allow up to 500 rows to be rendered when a long frame or series is displayed
pd.options.display.max_rows = 500

In [2]:
# Register the Chart Studio credentials kept in the separate `chart_secrets`
# module. Only the two names used here are imported, so nothing else from
# that module leaks into the notebook namespace.
import chart_studio
from chart_secrets import api_key, username

chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

In [4]:
# Load the streaming-history export (a JSON list of play records) and
# preview the first five records

with open('StreamingHistory0.json', encoding='utf8') as f:
    data = json.loads(f.read())

pprint(data[:5])

[{'artistName': 'Alphaville',
  'endTime': '2020-06-10 16:56',
  'msPlayed': 35538,
  'trackName': 'Forever Young'},
 {'artistName': 'Pink Floyd',
  'endTime': '2020-06-10 16:57',
  'msPlayed': 15956,
  'trackName': 'Time - 2011 Remastered Version'},
 {'artistName': 'Noir Désir',
  'endTime': '2020-06-10 16:57',
  'msPlayed': 55692,
  'trackName': "L'homme pressé"},
 {'artistName': 'Pink Floyd',
  'endTime': '2020-06-10 16:58',
  'msPlayed': 29094,
  'trackName': 'Shine On You Crazy Diamond, Pts. 6-9 - 2011 Remastered '
               'Version'},
 {'artistName': 'Pink Floyd',
  'endTime': '2020-06-10 16:58',
  'msPlayed': 34683,
  'trackName': 'Shine On You Crazy Diamond, Pts. 1-5 - 2011 Remastered '
               'Version'}]


In [5]:
# Convert from JSON to a data frame

def extract_json_value(column_name, data_source=None):
    
    """
    Collects the value stored under `column_name` in every streaming-history record.
    
    column_name: key to read from each record (for example 'artistName').
    data_source: list of record dicts; when omitted, the module-level `data`
                 loaded from the streaming-history file is used.
    
    Returns a list with one value per record, in record order. A record that
    lacks `column_name` raises KeyError.
    """
    
    if data_source is None:
        data_source = data
    
    return [record[column_name] for record in data_source]

# Build the frame in one step; column order matches the original export keys
spotify_data = pd.DataFrame({
    'artist_name': extract_json_value('artistName'),
    'end_time': extract_json_value('endTime'),
    'ms_played': extract_json_value('msPlayed'),
    'track_name': extract_json_value('trackName'),
})

In [6]:
# Inspect ten randomly chosen rows of the freshly built frame
spotify_data.sample(n=10)

Unnamed: 0,artist_name,end_time,ms_played,track_name
17,The White Stripes,2020-06-10 17:15,16643,300 M.P.H Torrential Outpour Blues
39,Klingande,2020-06-10 17:31,71943,Jubel
80,The White Stripes,2020-06-12 17:05,65428,300 M.P.H Torrential Outpour Blues
23,The White Stripes,2020-06-10 17:18,31219,Icky Thump
123,America,2020-06-18 21:56,252111,A Horse with No Name
12,Stevie Wonder,2020-06-10 17:13,35691,Faith
100,Garou,2020-06-18 20:05,211317,Sous le vent
75,Manau,2020-06-10 20:03,29916,La tribu de Dana
33,Chuck Berry,2020-06-10 17:27,6939,Johnny B. Goode
11,Taron Egerton,2020-06-10 17:12,47181,I'm Still Standing


In [7]:
# Summarize column dtypes and non-null counts of the converted frame
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   artist_name  129 non-null    object
 1   end_time     129 non-null    object
 2   ms_played    129 non-null    int64 
 3   track_name   129 non-null    object
dtypes: int64(1), object(3)
memory usage: 4.2+ KB


## Format columns and add variables

In [8]:
# Parse the end-of-play strings (e.g. '2020-06-10 16:56') into datetime values

spotify_data['end_time'] = pd.to_datetime(spotify_data.end_time)

In [9]:
# Derive seconds and minutes from the millisecond play time, then drop the
# raw millisecond column

spotify_data['seconds_played'] = spotify_data['ms_played'] / 1000
spotify_data['minutes_played'] = spotify_data['seconds_played'] / 60

spotify_data.drop(columns='ms_played', inplace=True)

In [10]:
# Inspect ten random rows to check the new duration columns
spotify_data.sample(n=10)

Unnamed: 0,artist_name,end_time,track_name,seconds_played,minutes_played
92,Eagles,2020-06-18 19:44:00,Hotel California - 2013 Remaster,5.892,0.0982
57,Daft Punk,2020-06-10 17:45:00,TRON Legacy (End Titles),8.183,0.136383
38,Adam Levine,2020-06-10 17:30:00,Lost Stars,39.681,0.66135
6,Pink Floyd,2020-06-10 17:07:00,Wish You Were Here - 2011 Remastered Version,219.19,3.653167
36,The White Stripes,2020-06-10 17:29:00,Icky Thump,21.186,0.3531
45,Guns N' Roses,2020-06-10 17:34:00,Knockin' On Heaven's Door,23.694,0.3949
77,The Dave Brubeck Quartet,2020-06-12 17:01:00,Take Five,33.125,0.552083
88,America,2020-06-12 17:12:00,A Horse with No Name,32.405,0.540083
126,De Palmas,2020-06-18 22:07:00,Sur la route,221.593,3.693217
7,Queen,2020-06-10 17:08:00,Bohemian Rhapsody - 2011 Mix,19.79,0.329833


## Most popular artists and songs

In [11]:
# Rank artists by how many plays they have and keep the 25 most played

most_popular_artists_by_count = (
    spotify_data
    .groupby(by='artist_name')['track_name']
    .count()
    .sort_values(ascending=False)
    .head(25)
)

print('The most played artists by count were: \n\n{}'.format(most_popular_artists_by_count))

The most played artists by count were: 

artist_name
The White Stripes           12
Bob Dylan                   10
Pink Floyd                   8
Queen                        5
Jack Johnson                 5
Red Hot Chili Peppers        4
The Dave Brubeck Quartet     4
Scorpions                    4
George Michael               3
Eagles                       3
Manau                        3
Maître Gims                  3
Daft Punk                    3
Francis Cabrel               3
Yves Jamait                  3
Taron Egerton                3
Alphaville                   3
Bill Withers                 3
Klingande                    2
Chuck Berry                  2
Elton John                   2
The Weeknd                   2
Georges Brassens             2
Guns N' Roses                2
America                      2
Name: track_name, dtype: int64


In [12]:
# Visualize the most popular artists with a standard bar chart

# The trace list gets its own name instead of `data`, so the streaming-history
# records that `extract_json_value` reads from the global `data` stay intact.
artist_bar_traces = [
    go.Bar(
        x=most_popular_artists_by_count.index,
        y=most_popular_artists_by_count,
        text=most_popular_artists_by_count,   # show the count on each bar
        textposition='auto',
        opacity=0.75,
    )
]

layout = go.Layout(
    title='Popularity of Artists by Count',
    yaxis=dict(
        # `title.font` replaces the deprecated `titlefont` axis attribute
        title=dict(text='Number of Times Played', font=dict(size=15)),
        gridcolor='rgb(255, 255, 255)',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
)

fig = go.Figure(data=artist_bar_traces, layout=layout)

py.iplot(fig, filename='popular_artists')

In [13]:
# Rank artists by total minutes played and keep the top 20

most_popular_artists_by_time = (
    spotify_data
    .groupby(by='artist_name')['minutes_played']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)

most_popular_artists_by_time

artist_name
Pink Floyd                  28.580017
Scorpions                   19.293383
Bob Dylan                   13.570633
Jack Johnson                10.056383
Francis Cabrel               6.870533
The Dave Brubeck Quartet     6.565417
Guns N' Roses                5.994900
David Bowie                  5.942267
Eagles                       5.926667
Klingande                    5.926367
George Michael               5.758283
Ibrahim Maalouf              5.630850
Manau                        5.416717
Queen                        5.191583
Alphaville                   5.137600
The White Stripes            4.747983
America                      4.741933
Red Hot Chili Peppers        4.678617
Maître Gims                  4.618367
Taron Egerton                4.600017
Name: minutes_played, dtype: float64

In [14]:
# Count plays per track title and keep the 20 most played.
# Counting is by title only, so identically titled tracks by different
# artists are combined into one entry.

most_popular_songs = (
    spotify_data['track_name']
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)

most_popular_songs

300 M.P.H Torrential Outpour Blues        8
Knockin' On Heaven's Door                 6
I'm Still Standing                        5
Icky Thump                                4
Faith                                     4
Californication                           4
Take Five                                 4
Hurricane                                 3
The Times They Are A-Changin'             3
Still Loving You                          3
Ok, Tu T'en Vas                           3
Ain't No Sunshine                         3
La tribu de Dana                          3
Shake It Off                              3
Forever Young                             3
Jubel                                     2
Space Oddity - 2015 Remaster              2
Have A Cigar - 2011 Remastered Version    2
A Horse with No Name                      2
Fly Me To The Moon - Remastered           2
Name: track_name, dtype: int64

## Time of day to listen

In [15]:
# Create time of day variable

def time_of_day(datetime_column, df=None):
    
    """
    Takes in a datetime column and returns the time of day that each datetime occurs.
    
    Hours 0-11 are 'morning', hours 12-16 are 'afternoon', and hours 17-23 are 'night'.
    
    datetime_column: name of the column in `df` that holds the timestamps.
    df: data frame to read from; when omitted, the module-level `spotify_data`
        is looked up at call time.
    
    Returns an ordered pandas Categorical (morning < afternoon < night) with
    one label per row of `df`.
    """
    
    if df is None:
        df = spotify_data
    
    # Noon itself (hour 12) belongs to the afternoon, hence the strict `< 12`
    labels = [
        'morning' if stamp.hour < 12 else 'afternoon' if stamp.hour < 17 else 'night'
        for stamp in df[datetime_column]
    ]
    
    return pd.Categorical(labels, categories=['morning', 'afternoon', 'night'], ordered=True)

In [16]:
from datetime import datetime, timezone
import time

def datetime_from_utc_to_local(utc_datetime):
    
    """
    Shifts a naive UTC datetime (or a column of them) by this machine's current UTC offset.
    
    The offset is measured once, at call time, and added to every value, so a
    timestamp from a different daylight-saving period receives the offset in
    effect now rather than the one in effect at that timestamp.
    
    utc_datetime: any value that supports `+ timedelta` (a datetime, a pandas Series of datetimes).
    
    Returns `utc_datetime` plus the local-minus-UTC offset.
    """
    now_timestamp = time.time()
    local_now = datetime.fromtimestamp(now_timestamp)
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; derive the
    # same naive UTC value from a timezone-aware one instead.
    utc_now = datetime.fromtimestamp(now_timestamp, tz=timezone.utc).replace(tzinfo=None)
    return utc_datetime + (local_now - utc_now)

In [17]:
## Shift the end times (treated as UTC) by this machine's local UTC offset,
## then label each play with its time of day

spotify_data['local_time'] = spotify_data['end_time'].pipe(datetime_from_utc_to_local)
spotify_data['local_time_of_day'] = time_of_day(datetime_column='local_time')

In [18]:
# Add the weekday name of each play as an ordered category (Monday first),
# so grouped output follows calendar order rather than alphabetical order

spotify_data['local_day_of_week'] = pd.Categorical(
    spotify_data['local_time'].dt.day_name(),
    categories=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'],
    ordered=True,
)

In [19]:
## Create a pivot table by time of day and day of week

# Total minutes played for every (day of week, time of day) pair.
# aggfunc='sum' replaces np.sum, which recent pandas versions warn about when
# passed as a callable. observed=False is spelled out so every weekday and
# time-of-day category is handled the same way regardless of the pandas default.
time_of_day_local_pivot = spotify_data.pivot_table(columns='local_time_of_day', index='local_day_of_week',
                                                   values='minutes_played', aggfunc='sum', observed=False)

start_date = spotify_data.local_time.min()
end_date = spotify_data.local_time.max()

# Whole days covered by the history, floored at one day so that a history
# shorter than 24 hours does not lead to a division by zero below.
days_covered = max((end_date - start_date).days, 1)
difference_in_weeks = days_covered / (7)

# Average minutes per week. Despite the `utc` in its name, this table is
# derived from the local-time columns.
time_of_day_utc_pivot = time_of_day_local_pivot.divide(difference_in_weeks)

In [20]:
# Heatmap of average weekly minutes: time of day on x, day of week on y
trace = go.Heatmap(z=time_of_day_utc_pivot.values,
                  x=time_of_day_utc_pivot.columns,
                  y=time_of_day_utc_pivot.index,
                  colorscale='Greens',
                  reversescale=True)
# The trace list is not stored in `data`: that global holds the
# streaming-history records that `extract_json_value` reads.
heatmap_traces = [trace]
py.iplot(heatmap_traces, filename='spotify_heatmap')

## Higher likelihood to play a song

In [21]:
# Load the saved-library export into `library_data`

with open('YourLibrary.json', encoding='utf8') as f:
    library_data = json.loads(f.read())

In [22]:
def extract_json_value_library(column_name, data_source=library_data):
    
    """
    Returns a list holding the `column_name` entry of every item stored under
    the 'tracks' key of `data_source`, in their original order.
    """
    
    values = []
    for track in data_source['tracks']:
        values.append(track[column_name])
    
    return values

# One column per library field, filled in the order artist, album, track
library_df = pd.DataFrame()
library_df['artist_name'] = extract_json_value_library(column_name='artist')
library_df['album'] = extract_json_value_library(column_name='album')
library_df['track_name'] = extract_json_value_library(column_name='track')

In [23]:
# Inspect ten randomly chosen library entries
library_df.sample(n=10)

Unnamed: 0,artist_name,album,track_name
78,David Bowie,Space Oddity [Space Oddity 40th Anniversary Ed...,The Prettiest Star - Stereo Version;2009 Remas...
45,David Bowie,Space Oddity [Space Oddity 40th Anniversary Ed...,"Ragazzo Solo, Ragazza Sola - Full Length Stere..."
49,Meghan Trainor,Title,I'll Be Home
11,Meghan Trainor,Title,Lips Are Movin
79,Maroon 5,V,Animals
29,Meghan Trainor,Title,What If I
44,Mr. Probz,Waves,Waves - Robin Schulz Radio Edit
52,Meghan Trainor,Title,Title - Acoustic
69,David Bowie,Space Oddity [Space Oddity 40th Anniversary Ed...,Wild Eyed Boy From Freecloud - Alternate Album...
8,White Lies,To Lose My Life ...,Farewell To The Fairground


In [24]:
# DataFrame.size is the number of cells (rows x columns), not the number of tracks
library_df.size

252

## Scatter Plot of Artist Plays

In [25]:
def time_of_day_scatter_df(df=None, time_of_day=None):
    
    """
    Summarizes plays per artist, optionally restricted to one time of day.
    
    df: frame with 'artist_name', 'track_name' and 'minutes_played' columns
        (plus 'local_time_of_day' when filtering); when omitted, the
        module-level `spotify_data` is looked up at call time.
    time_of_day: value of 'local_time_of_day' to keep; None (or any falsy
        value) keeps every row.
    
    Returns a data frame indexed by artist name with the columns
    'number_of_plays', 'minutes_played' and 'unique_songs_played'.
    """
    
    if df is None:
        df = spotify_data
    
    if time_of_day:
        df = df[df['local_time_of_day'] == time_of_day]
        
    grouped_artists = df.groupby(by='artist_name')
    
    # All three aggregates share the groupby index, so they align by artist
    scatter_df = pd.DataFrame({
        'number_of_plays': grouped_artists['track_name'].count(),
        'minutes_played': grouped_artists['minutes_played'].sum(),
        'unique_songs_played': grouped_artists['track_name'].nunique(),
    })
    
    return scatter_df

In [26]:
# Build the per-artist summary over the whole history (no time-of-day filter)
scatter_df = time_of_day_scatter_df()

In [27]:
# Preview the first rows of the per-artist summary
scatter_df.head()

Unnamed: 0_level_0,number_of_plays,minutes_played,unique_songs_played
artist_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adam Levine,1,0.66135,1
Alphaville,3,5.1376,1
America,2,4.741933,1
Audioslave,2,2.863283,1
Bill Withers,3,1.1117,1
