# <strong>Analysing Spotify Song Data from 2009-2019</strong>

This project aims to analyse the differences in music from 2009 to 2019.

## <strong> 1. Collecting and Preprocessing Spotify Data </strong>

In [87]:
#importing pandas as pd
import numpy as np

#importing spotify libraries
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [88]:
cid ="b36dcee1d1c84d73bc2e088dbe2b2791" 
secret = "b44b14f096874401966de78ec0077cc6"

In [89]:
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [22]:
track_results = sp.search(q='year:2019', type='track', limit=50,offset=1)

In [193]:
# timeit library to measure the time needed to run this code
import timeit
start = timeit.default_timer()

# create empty lists where the results are going to be stored
artist_name = []
track_name = []
popularity = []
track_id = []
year=[]

for y in range(2009,2020):
    for i in range(0,10000,50):
        track_results = sp.search(q='year:'+str(y), type='track', limit=50,offset=i)
        for i, t in enumerate(track_results['tracks']['items']):
            artist_name.append(t['artists'][0]['name'])
            track_name.append(t['name'])
            track_id.append(t['id'])
            popularity.append(t['popularity'])
            year.append(y)  

stop = timeit.default_timer()
print ('Time to run this code (in seconds):', stop - start)

Time to run this code (in seconds): 1611.554714652004


In [196]:
print('number of elements in the track_id list:', len(track_id))

number of elements in the track_id list: 110000


In [197]:
import pandas as pd

df_tracks = pd.DataFrame({'artist_name':artist_name,'track_name':track_name,'track_id':track_id,'popularity':popularity, 'year':year})
print(df_tracks.shape)
df_tracks.head()

(110000, 5)


Unnamed: 0,artist_name,track_name,track_id,popularity,year
0,Miley Cyrus,Party In The U.S.A.,5Q0Nhxo0l2bP3pNjpGJwV1,72,2009
1,Cage The Elephant,Ain't No Rest for the Wicked,3Pzh926pXggbMe2ZpXyMV7,71,2009
2,Kid Cudi,Day 'N' Nite (nightmare),5FEXPoPnzueFJQCPRIrC3c,69,2009
3,Owl City,Fireflies,3DamFFqW32WihKkTVlwTYQ,75,2009
4,Kid Cudi,Pursuit Of Happiness (Nightmare),5iSEsR6NKjlC9SrIJkyL3k,67,2009


In [198]:
# check for duplicates
df_tracks[df_tracks.duplicated(subset=['artist_name','track_name'])].count()

artist_name    7726
track_name     7726
track_id       7726
popularity     7726
year           7726
dtype: int64

In [199]:
df_tracks.drop_duplicates(subset=['artist_name','track_name'], inplace=True)

In [200]:
# check for duplicates again
df_tracks[df_tracks.duplicated(subset=['artist_name','track_name'])].count()

artist_name    0
track_name     0
track_id       0
popularity     0
year           0
dtype: int64

In [201]:
df_tracks.shape

(102274, 5)

## <strong>2. Retrieving Track Audio Features</strong>

In [202]:
# again measuring the time
start = timeit.default_timer()

# empty list, batchsize and the counter for None results
rows = []
batchsize = 100
None_counter = 0

for i in range(0,len(df_tracks['track_id']),batchsize):
    batch = df_tracks['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)
    for i, t in enumerate(feature_results):
        if t == None:
            None_counter = None_counter + 1
        else:
            rows.append(t)
            
print('Number of tracks where no audio features were available:',None_counter)

stop = timeit.default_timer()
print ('Time to run this code (in seconds):',stop - start)

Number of tracks where no audio features were available: 49
Time to run this code (in seconds): 425.9274579789999


In [203]:
print('number of elements in the track_id list:', len(rows))

number of elements in the track_id list: 102225


In [204]:
df_audio_features = pd.DataFrame.from_dict(rows,orient='columns')
print("Shape of the dataset:", df_audio_features.shape)
df_audio_features.head()

Shape of the dataset: (102225, 18)


Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.00124,https://api.spotify.com/v1/audio-analysis/5Q0N...,0.454,202067,0.708,5Q0Nhxo0l2bP3pNjpGJwV1,0.000181,10,0.0679,-4.526,0,0.0675,192.18,4,https://api.spotify.com/v1/tracks/5Q0Nhxo0l2bP...,audio_features,spotify:track:5Q0Nhxo0l2bP3pNjpGJwV1,0.483
1,0.0438,https://api.spotify.com/v1/audio-analysis/3Pzh...,0.634,175493,0.849,3Pzh926pXggbMe2ZpXyMV7,0.0,0,0.363,-7.075,1,0.105,156.004,4,https://api.spotify.com/v1/tracks/3Pzh926pXggb...,audio_features,spotify:track:3Pzh926pXggbMe2ZpXyMV7,0.919
2,0.503,https://api.spotify.com/v1/audio-analysis/5FEX...,0.878,221267,0.446,5FEXPoPnzueFJQCPRIrC3c,6e-06,11,0.117,-6.362,0,0.0635,138.035,4,https://api.spotify.com/v1/tracks/5FEXPoPnzueF...,audio_features,spotify:track:5FEXPoPnzueFJQCPRIrC3c,0.805
3,0.0294,https://api.spotify.com/v1/audio-analysis/3Dam...,0.591,228347,0.649,3DamFFqW32WihKkTVlwTYQ,0.0,3,0.133,-6.72,1,0.0417,90.002,4,https://api.spotify.com/v1/tracks/3DamFFqW32Wi...,audio_features,spotify:track:3DamFFqW32WihKkTVlwTYQ,0.489
4,0.511,https://api.spotify.com/v1/audio-analysis/5iSE...,0.629,295293,0.622,5iSEsR6NKjlC9SrIJkyL3k,0.000174,0,0.383,-8.906,1,0.0404,115.293,4,https://api.spotify.com/v1/tracks/5iSEsR6NKjlC...,audio_features,spotify:track:5iSEsR6NKjlC9SrIJkyL3k,0.264


In [205]:
df_audio_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102225 entries, 0 to 102224
Data columns (total 18 columns):
acousticness        102225 non-null float64
analysis_url        102225 non-null object
danceability        102225 non-null float64
duration_ms         102225 non-null int64
energy              102225 non-null float64
id                  102225 non-null object
instrumentalness    102225 non-null float64
key                 102225 non-null int64
liveness            102225 non-null float64
loudness            102225 non-null float64
mode                102225 non-null int64
speechiness         102225 non-null float64
tempo               102225 non-null float64
time_signature      102225 non-null int64
track_href          102225 non-null object
type                102225 non-null object
uri                 102225 non-null object
valence             102225 non-null float64
dtypes: float64(9), int64(4), object(5)
memory usage: 14.0+ MB


## <strong>3. Merging Both Dataframes</strong>

In [206]:
columns_to_drop = ['analysis_url','track_href','type','uri']
df_audio_features.drop(columns_to_drop, axis=1,inplace=True)

df_audio_features.rename(columns={'id': 'track_id'}, inplace=True)

df_audio_features.shape

(102225, 14)

In [207]:
# merge both dataframes
# the 'inner' method will make sure that we only keep track IDs present in both datasets
df = pd.merge(df_tracks,df_audio_features,on='track_id',how='inner')
print("Shape of the dataset:", df_audio_features.shape)
df.head()

Shape of the dataset: (102225, 14)


Unnamed: 0,artist_name,track_name,track_id,popularity,year,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Miley Cyrus,Party In The U.S.A.,5Q0Nhxo0l2bP3pNjpGJwV1,72,2009,0.00124,0.454,202067,0.708,0.000181,10,0.0679,-4.526,0,0.0675,192.18,4,0.483
1,Cage The Elephant,Ain't No Rest for the Wicked,3Pzh926pXggbMe2ZpXyMV7,71,2009,0.0438,0.634,175493,0.849,0.0,0,0.363,-7.075,1,0.105,156.004,4,0.919
2,Kid Cudi,Day 'N' Nite (nightmare),5FEXPoPnzueFJQCPRIrC3c,69,2009,0.503,0.878,221267,0.446,6e-06,11,0.117,-6.362,0,0.0635,138.035,4,0.805
3,Owl City,Fireflies,3DamFFqW32WihKkTVlwTYQ,75,2009,0.0294,0.591,228347,0.649,0.0,3,0.133,-6.72,1,0.0417,90.002,4,0.489
4,Kid Cudi,Pursuit Of Happiness (Nightmare),5iSEsR6NKjlC9SrIJkyL3k,67,2009,0.511,0.629,295293,0.622,0.000174,0,0.383,-8.906,1,0.0404,115.293,4,0.264


In [209]:
df.to_csv('SpotifyAudioFeatures2009_2019.csv')

## <strong>4. Visualising Differences Over the Decade</strong>

In [235]:
import plotly.plotly as py
import plotly.graph_objs as go

def boxplots(feature):
    year2009 = go.Box(
        y=df[df['year']==2009][feature],name="2009"
    )
    year2010 = go.Box(
        y=df[df['year']==2010][feature],name="2010"
    )
    year2011 = go.Box(
        y=df[df['year']==2011][feature],name="2011"
    )
    year2012 = go.Box(
        y=df[df['year']==2012][feature],name="2012"
    )
    year2013 = go.Box(
        y=df[df['year']==2013][feature],name="2013"
    )
    year2014 = go.Box(
        y=df[df['year']==2014][feature],name="2014"
    )
    year2015 = go.Box(
        y=df[df['year']==2015][feature],name="2015"
    )
    year2016 = go.Box(
        y=df[df['year']==2016][feature],name="2016"
    )
    year2017 = go.Box(
        y=df[df['year']==2017][feature],name="2017"
    )
    year2018 = go.Box(
        y=df[df['year']==2018][feature],name="2018"
    )
    year2019 = go.Box(
        y=df[df['year']==2019][feature],name="2019"
    )

    data = [year2009, year2010, year2011, year2012, year2013, year2014, year2015, year2016, year2017, year2018, year2019]
    layout = go.Layout(
    yaxis=dict(title=feature),xaxis=dict(tickangle=90)
    )
    fig = go.Figure(data=data, layout=layout)
    return fig


In [236]:
py.iplot(boxplots('loudness'))

In [237]:
py.iplot(boxplots('acousticness'))

In [238]:
py.iplot(boxplots('danceability'))

In [239]:
py.iplot(boxplots('energy'))

In [240]:
py.iplot(boxplots('instrumentalness'))

In [241]:
py.iplot(boxplots('liveness'))

In [242]:
py.iplot(boxplots('speechiness'))