## Spotify - Setup & Cleaning

In [1]:
import os
import sys
import spotipy
import spotipy.util as util

from SpotifyToken import get_Token

import numpy as np
import pandas as pd
import json

To keep my Spotify credentials private, I have created a function `get_Token` in a separate script. I import the function and use it to access the Spotify API in this notebook. This also makes it easier to access the API in future projects.

In [2]:
# Module-level API client: playlistParser below reads this global for every request.
# Presumably an authenticated spotipy client built from the env credentials -- confirm in SpotifyToken.get_Token.
spotify = get_Token()

env: SPOTIPY_CLIENT_ID=cid
env: SPOTIPY_CLIENT_SECRET=secret
env: SPOTIPY_REDIRECT_URI=uri


In [3]:
#integer pitch classes in the audio-features payload -> note names
_KEY_NAMES = {0:'C', 1:'C#', 2:'D', 3:'D#', 4:'E', 5:'F',
              6:'F#', 7:'G', 8:'G#', 9:'A', 10:'A#', 11:'B'}

#integer mode flags in the audio-features payload -> mode names
_MODE_NAMES = {0:'minor', 1:'major'}

#flattened playlist columns (as named by pd.json_normalize) -> output names
_TRACK_RENAMES = {'track.uri':'track_uri', 'track.id':'track_id', 'track.name':'track',
                  'track.popularity':'popularity', 'track.explicit':'explicit',
                  'track.duration_ms':'duration_ms',
                  'track.album.release_date':'release_date'}

#columns kept from the playlist request
_MAIN_COLUMNS = ['artist_id', 'artist', 'artist_uri', 'track_uri', 'track_id', 'track',
                 'popularity', 'explicit', 'duration_ms', 'release_date']

#audio-feature fields that are not needed (duration_ms is already taken from the playlist data)
_FEATURE_DROPS = ['analysis_url', 'duration_ms', 'track_href', 'type', 'uri']

#column order of the returned dataframe
_OUTPUT_COLUMNS = ['artist_uri', 'artist_id', 'track_uri', 'track_id',
                   'artist', 'track', 'release_date', 'popularity', 'explicit',
                   'key', 'mode', 'time_signature', 'tempo', 'duration_s',
                   'energy', 'liveness', 'speechiness', 'valence',
                   'acousticness', 'loudness', 'danceability', 'instrumentalness']


def _playlist_track_frame(items):
    '''Flatten raw playlist items into one row per track, keeping only the first listed artist.'''

    #entries without a track payload or a track id cannot be matched to audio features, so skip them
    usable = [item for item in items
              if item and item.get('track') and item['track'].get('id')]

    if not usable:
        return pd.DataFrame(columns = _MAIN_COLUMNS)

    #json_normalize de-nests the dictionaries ('track.name', 'track.album.release_date', ...)
    #but leaves 'track.artists' as a column of lists
    main = pd.json_normalize(usable)

    #de-nesting the artist lists by hand: only the first artist of each track is kept
    lead_artists = []
    for artists in main['track.artists']:
        first = artists[0] if isinstance(artists, list) and artists else {}
        lead_artists.append({'artist_id': first.get('id'),
                             'artist': first.get('name'),
                             'artist_uri': first.get('uri')})
    lead_artists = pd.DataFrame(lead_artists, index = main.index)

    main = pd.concat([main.drop(columns = 'track.artists'), lead_artists], axis = 1)
    main = main.rename(columns = _TRACK_RENAMES)

    return main[_MAIN_COLUMNS]


def _audio_feature_frame(features):
    '''Build one row of audio features per track id, with key and mode as readable strings.'''

    #the response may be empty or contain None placeholders; keep only real payloads
    found = [feat for feat in (features or []) if feat]

    if not found:
        return pd.DataFrame(columns = ['track_id'])

    track_feat = pd.DataFrame(found).drop(columns = _FEATURE_DROPS, errors = 'ignore')
    track_feat = track_feat.rename(columns = {'id':'track_id'})

    #one feature row per track id, otherwise a track that appears twice in a playlist
    #would be multiplied by the merge
    track_feat = track_feat.drop_duplicates(subset = 'track_id')

    #values missing from the lookup tables are passed through unchanged
    return track_feat.assign(
        key = lambda df: df['key'].map(lambda key: _KEY_NAMES.get(key, key)),
        mode = lambda df: df['mode'].map(lambda mode: _MODE_NAMES.get(mode, mode)))


def playlistParser(uri_input):
    
    '''
    FUNCTION
    ----------
    - Requests the tracks of each playlist from the Spotify API (first 100 tracks per playlist)
    - Flattens the response, keeping basic track info and the first listed artist of each track
    - Requests the audio features of those tracks and joins them on the track id
    - Converts key and mode to readable strings and the duration from milliseconds to whole seconds
    
    - For list inputs:
        - Combines all playlists into a single dataframe
    - For string inputs:
        - Dataframe containing data for the single playlist input
    
    PARAMETER
    ----------
    uri_input | string or list of strings | A spotify playlist URI, or list of URI's. A URI can be obtained 
                                            through the online Spotify API, or manually copy and pasted from 
                                            the Spotify desktop app.
    
    
    RETURNS
    ----------
    A dataframe with one row per playlist entry, in playlist order, and a fresh 0..n-1 index.
    Column list:
    
    ~ ID info: artist_uri, artist_id, track_uri, track_id
    ~ Basic track info: artist, track, release_date, popularity, explicit
    ~ Audio features: key, mode, time_signature, tempo, duration_s, energy, liveness, speechiness,
                      valence, acousticness, loudness, danceability, instrumentalness
    
    Tracks for which no audio features are returned keep NaN in the audio feature columns.
    An empty list (or only empty playlists) gives an empty dataframe with the same columns.
    
    RAISES
    ----------
    TypeError  | uri_input is neither a string nor a list of strings
    ValueError | a URI does not contain 'spotify:playlist:'
    
    NOTES
    ----------
    Uses the module-level `spotify` client.
    NOTE(review): confirm the audio-features endpoint is available to the credentials in use.
    '''
    
    #a single URI string is wrapped in a list; a list of URIs is used as is
    if isinstance(uri_input, str):
        uri_list = [uri_input]
        
    elif isinstance(uri_input, list):
        uri_list = uri_input
    
    else:
        raise TypeError('Uri input must be of type string or list of strings')
        
    #checking every URI before any request is made, so a bad entry fails fast
    #(explicit exceptions instead of assert, which is skipped when python runs with -O)
    for uri in uri_list:
        
        if not isinstance(uri, str):
            raise TypeError('Uri input must be of type string or list of strings')
            
        if 'spotify:playlist:' not in uri:
            raise ValueError('Not a valid spotify playlist URI: {!r}'.format(uri))
    
    #one dataframe per playlist, concatenated once at the end
    playlist_frames = []
    
    for uri in uri_list:
        
        #requesting spotify playlist data from the spotify API, 100 track maximum
        response = spotify.playlist_tracks(uri, limit = 100)
        main = _playlist_track_frame(response.get('items') or [])
        
        if main.empty:
            continue
            
        #requesting audio features for each track in the playlist
        track_feat = _audio_feature_frame(spotify.audio_features(list(main['track_uri'])))
        
        #left merge: keeps playlist order and exactly one row per playlist entry
        if track_feat.empty:
            tracks = main.copy()
        else:
            tracks = main.merge(track_feat, on = 'track_id', how = 'left')
        
        #converting the duration_ms column to be in (whole) seconds
        tracks = tracks.rename(columns = {'duration_ms':'duration_s'})
        tracks['duration_s'] = (tracks['duration_s'] / 1000).astype(int)
        
        playlist_frames.append(tracks)
        
    if not playlist_frames:
        return pd.DataFrame(columns = _OUTPUT_COLUMNS)
    
    #returns the dataframe with columns in the documented order
    final_tracks = pd.concat(playlist_frames, ignore_index = True)
    return final_tracks.reindex(columns = _OUTPUT_COLUMNS)

The reason I bounce between using pd.json_normalize and the pd.concat method of de-nesting columns is primarily because json_normalize does not work properly with columns of list objects. pd.concat can handle both lists and dictionaries so it is more versatile, but not as concise as json_normalize. In fact, json_normalize also names the resulting columns after the original column they were nested within. EX: de-nesting 'artists' column -> 'artists.name', 'artists.followers'. vs. pd.concat that simply returns the regular column names. This can get confusing as many columns share the same name.

In [4]:
# Playlist URIs holding my top 100 songs from 2016-2019
my_uris = [
    'spotify:playlist:37i9dQZF1EtlAw8h3kJ9CQ',
    'spotify:playlist:37i9dQZF1Ejne7mgVu7fqi',
    'spotify:playlist:37i9dQZF1E9HGRdbGH3mZy',
    'spotify:playlist:37i9dQZF1CyLdx5Yd2zecw',
]

# One combined dataframe covering every playlist listed above
my_tracks = playlistParser(uri_input = my_uris)

In [5]:
# Preview the first rows of the combined personal playlists
my_tracks.head()

Unnamed: 0,artist_uri,artist_id,track_uri,track_id,artist,track,release_date,popularity,explicit,key,...,tempo,duration_s,energy,liveness,speechiness,valence,acousticness,loudness,danceability,instrumentalness
0,spotify:artist:1uXrhF4cZsmDQZDueF9uJT,1uXrhF4cZsmDQZDueF9uJT,spotify:track:0viLuvPHY9klTMQjftf7mr,0viLuvPHY9klTMQjftf7mr,dave rodgers,DEJA VU - Extended ver.,2013-05-29,51,False,G#,...,77.025,260,0.966,0.105,0.0626,0.808,0.102,-6.99,0.568,0.0
1,spotify:artist:5LXEAEGrpKQtpyCu2sZuWu,5LXEAEGrpKQtpyCu2sZuWu,spotify:track:2h47SG8bNphmicAll4H9RV,2h47SG8bNphmicAll4H9RV,Oingo Boingo,Dead Man's Party,1985-01-01,54,False,C#,...,161.261,381,0.863,0.246,0.0383,0.923,0.5,-10.984,0.568,0.000137
2,spotify:artist:4tYSBptyGeVyZsk8JC4JHZ,4tYSBptyGeVyZsk8JC4JHZ,spotify:track:1n8wr8tRHs5jmBxNWXedcn,1n8wr8tRHs5jmBxNWXedcn,Shoreline Mafia,"Bands (feat. Ohgeesy, Fenix Flexin & Master Kato)",2018-07-13,72,True,C#,...,99.964,174,0.385,0.142,0.435,0.488,0.371,-12.189,0.9,0.0
3,spotify:artist:5n3811iYeaFM01gSJMQuRH,5n3811iYeaFM01gSJMQuRH,spotify:track:3uA8gvGhBiiHtzU7f71t7y,3uA8gvGhBiiHtzU7f71t7y,Ceremony,Turn Away the Bad Thing,2019-06-26,15,False,A,...,174.042,242,0.733,0.106,0.0371,0.0668,6.3e-05,-9.107,0.368,0.866
4,spotify:artist:77mJc3M7ZT5oOVM7gNdXim,77mJc3M7ZT5oOVM7gNdXim,spotify:track:1zFbBrZmJr1Z49HYe84dBg,1zFbBrZmJr1Z49HYe84dBg,Her's,Harvey,2018-08-24,60,False,A,...,141.941,211,0.855,0.128,0.0372,0.717,0.0735,-6.153,0.588,0.815


In [6]:
# (rows, columns) before any duplicates are removed
my_tracks.shape

(400, 22)

In [7]:
# Remove repeated songs in two passes:
#   1. exact repeats of the same (track, artist) pair, keeping the first occurrence
#   2. a few hand-picked titles (alternate versions / remixes) that the exact match cannot catch
my_tracks = (
    my_tracks
    .drop_duplicates(subset = ['track', 'artist'])
    .loc[lambda frame: ~frame['track'].isin(['Only A Lad - 10" EP',
                                             'Only A Lad - 1988 Boingo Alive Version',
                                             'Boasty (feat. Idris Elba) - Kingdom 93 Remix'])]
)

In [8]:
# Per column: True if at least one value is missing
my_tracks.isna().any()

artist_uri          False
artist_id           False
track_uri           False
track_id            False
artist              False
track               False
release_date        False
popularity          False
explicit            False
key                 False
mode                False
time_signature      False
tempo               False
duration_s          False
energy              False
liveness            False
speechiness         False
valence             False
acousticness        False
loudness            False
danceability        False
instrumentalness    False
dtype: bool

In [9]:
# (rows, columns) after the duplicates were removed
my_tracks.shape

(369, 22)

In [10]:
# Preview the first rows again after de-duplication
my_tracks.head()

Unnamed: 0,artist_uri,artist_id,track_uri,track_id,artist,track,release_date,popularity,explicit,key,...,tempo,duration_s,energy,liveness,speechiness,valence,acousticness,loudness,danceability,instrumentalness
0,spotify:artist:1uXrhF4cZsmDQZDueF9uJT,1uXrhF4cZsmDQZDueF9uJT,spotify:track:0viLuvPHY9klTMQjftf7mr,0viLuvPHY9klTMQjftf7mr,dave rodgers,DEJA VU - Extended ver.,2013-05-29,51,False,G#,...,77.025,260,0.966,0.105,0.0626,0.808,0.102,-6.99,0.568,0.0
1,spotify:artist:5LXEAEGrpKQtpyCu2sZuWu,5LXEAEGrpKQtpyCu2sZuWu,spotify:track:2h47SG8bNphmicAll4H9RV,2h47SG8bNphmicAll4H9RV,Oingo Boingo,Dead Man's Party,1985-01-01,54,False,C#,...,161.261,381,0.863,0.246,0.0383,0.923,0.5,-10.984,0.568,0.000137
2,spotify:artist:4tYSBptyGeVyZsk8JC4JHZ,4tYSBptyGeVyZsk8JC4JHZ,spotify:track:1n8wr8tRHs5jmBxNWXedcn,1n8wr8tRHs5jmBxNWXedcn,Shoreline Mafia,"Bands (feat. Ohgeesy, Fenix Flexin & Master Kato)",2018-07-13,72,True,C#,...,99.964,174,0.385,0.142,0.435,0.488,0.371,-12.189,0.9,0.0
3,spotify:artist:5n3811iYeaFM01gSJMQuRH,5n3811iYeaFM01gSJMQuRH,spotify:track:3uA8gvGhBiiHtzU7f71t7y,3uA8gvGhBiiHtzU7f71t7y,Ceremony,Turn Away the Bad Thing,2019-06-26,15,False,A,...,174.042,242,0.733,0.106,0.0371,0.0668,6.3e-05,-9.107,0.368,0.866
4,spotify:artist:77mJc3M7ZT5oOVM7gNdXim,77mJc3M7ZT5oOVM7gNdXim,spotify:track:1zFbBrZmJr1Z49HYe84dBg,1zFbBrZmJr1Z49HYe84dBg,Her's,Harvey,2018-08-24,60,False,A,...,141.941,211,0.855,0.128,0.0372,0.717,0.0735,-6.153,0.588,0.815


I am not confident in the data under the "tempo" column. As you can see from the first entry, "Deja Vu" by Dave Rodgers,
the song is labeled as being 77 beats per minute. If you [listen](https://www.youtube.com/watch?v=wOEdpBUky5E) to the song, you'll quickly realize that this estimate is very wrong. In reality, the tempo is much closer to double that (roughly 154 BPM). This is a common mistake for BPM detectors to make: they lock onto every other beat and report half the true tempo.

Additionally, the key column can be subject to similar errors. A quick Google search for "Deja Vu Dave Rodgers Key" pulls up several results, with key estimations of F minor, G# major, and Bb minor. This is a little more forgivable, as the [circle of fifths](https://en.wikipedia.org/wiki/Circle_of_fifths) pairs each major key with its relative minor. For example, F minor and G# major are relative keys. [Relative keys](https://en.wikipedia.org/wiki/Relative_key) are keys that share all of the same notes but are arranged around a different tonic.

The reason for Bb minor's appearance in key estimators is likely the concept of [Dominant Keys](https://en.wikipedia.org/wiki/Dominant_(music)#Dominant_key). A dominant key is the key whose tonic is a perfect fifth above the tonic of the main key. In this case, Bb minor's tonic is Bb, making its dominant key F minor, since F is the dominant note of Bb minor and lies a perfect fifth above the tonic (Bb). A little music theory lesson for you!

If I were to standardize the key column by renaming all relative keys, we would lose information. Though keys such as F minor and G# major share all of the same notes, their arrangement and order can affect the musical motif and still produce different (yet complementary) movements. This is again due to concepts such as dominant keys, as F minor and G# major have different dominants.

Having explained that, it looks like Spotify's key estimation is in agreement with other online estimators, at least for Deja Vu. For this reason, I will simply proceed with caution when interpreting results from this column.

In [11]:
# Playlist URIs for the top spotify tracks from 2016-2019
top_uri = [
    'spotify:playlist:3eXx1NJL3Ncp2Ysvcsx4Vs',
    'spotify:playlist:37i9dQZF1DX1HUbZS4LEyL',
    'spotify:playlist:37i9dQZF1DX7Axsg3uaDZb',
    'spotify:playlist:0Vw1hrp1Q2tL08dGeS4qH0',
]

# One combined dataframe covering every playlist listed above
top_tracks = playlistParser(uri_input = top_uri)

In [12]:
# (rows, columns) before duplicate (track, artist) rows are removed
top_tracks.shape

(345, 22)

In [13]:
# Keep only the first occurrence of every (track, artist) pair
top_tracks = top_tracks.drop_duplicates(subset = ['track', 'artist'], keep = 'first')

In [14]:
# Per column: True if at least one value is missing
top_tracks.isna().any()

artist_uri          False
artist_id           False
track_uri           False
track_id            False
artist              False
track               False
release_date        False
popularity          False
explicit            False
key                 False
mode                False
time_signature      False
tempo               False
duration_s          False
energy              False
liveness            False
speechiness         False
valence             False
acousticness        False
loudness            False
danceability        False
instrumentalness    False
dtype: bool

In [15]:
# (rows, columns) after removing duplicate (track, artist) rows; nulls are only checked, not dropped
top_tracks.shape

(328, 22)

In [16]:
# Preview the first rows of the de-duplicated top-tracks data
top_tracks.head()

Unnamed: 0,artist_uri,artist_id,track_uri,track_id,artist,track,release_date,popularity,explicit,key,...,tempo,duration_s,energy,liveness,speechiness,valence,acousticness,loudness,danceability,instrumentalness
0,spotify:artist:6KImCVD70vtIoJWnq6nGn3,6KImCVD70vtIoJWnq6nGn3,spotify:track:1M4qEo4HE3PRaCOM7EXNJq,1M4qEo4HE3PRaCOM7EXNJq,Harry Styles,Adore You,2019-12-06,82,False,G#,...,99.048,207,0.771,0.102,0.0483,0.569,0.0237,-3.675,0.676,7e-06
1,spotify:artist:7nqlScm2smydSRl13eaP8E,7nqlScm2smydSRl13eaP8E,spotify:track:7C1trmcQQ5n5RNy4l6ziCv,7C1trmcQQ5n5RNy4l6ziCv,Nea,Some Say - Felix Jaehn Remix,2020-01-10,87,False,F#,...,120.03,186,0.7,0.174,0.0397,0.637,0.406,-5.591,0.682,0.0
2,spotify:artist:56mfhUDKa1vec6rSLZV5Eg,56mfhUDKa1vec6rSLZV5Eg,spotify:track:5T490vvoFNU6psep0NPmxs,5T490vvoFNU6psep0NPmxs,Jawsh 685,Savage Love (Laxed - Siren Beat),2020-06-09,87,False,D#,...,149.919,171,0.48,0.271,0.0688,0.723,0.243,-8.556,0.795,0.0
3,spotify:artist:7bXgB6jMjp9ATFy66eO08Z,7bXgB6jMjp9ATFy66eO08Z,spotify:track:1IIKrJVP1C9N7iPtG6eOsK,1IIKrJVP1C9N7iPtG6eOsK,Chris Brown,Go Crazy,2020-05-08,88,True,C,...,94.148,176,0.578,0.25,0.145,0.581,0.316,-8.932,0.755,0.0
4,spotify:artist:7c0XG5cIJTrrAgEC3ULPiq,7c0XG5cIJTrrAgEC3ULPiq,spotify:track:4JB0EAT4BkLShTyTt50FmO,4JB0EAT4BkLShTyTt50FmO,Ty Dolla $ign,"Ego Death (feat. Kanye West, FKA twigs & Skril...",2020-07-01,83,False,A#,...,111.039,231,0.615,0.654,0.132,0.321,0.061,-5.624,0.813,0.0


In [17]:
# Saving both cleaned dataframes as CSV files so they can be easily imported in other notebooks.
# index = False leaves the dataframe index out of the files.
my_tracks.to_csv('my_tracks.csv', index = False)
top_tracks.to_csv('top_tracks.csv', index = False)

#### Finished! Next notebook: EDA