# <strong>Retrieving Spotify Song Data from 2019</strong>

In [1]:
#importing spotify libraries
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
cid ="xx" 
secret = "xx"

In [3]:
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Retrieving Track Data

In [4]:
# timeit library to measure the time needed to run this code
import timeit
start = timeit.default_timer()

# create empty lists where the results are going to be stored
artist_name = []
track_name = []
popularity = []
track_id = []

In [5]:
for i in range(0,10000,50):
    track_results = sp.search(q='year:2019', type='track', limit=50,offset=i)
    for i, t in enumerate(track_results['tracks']['items']):
        artist_name.append(t['artists'][0]['name'])
        track_name.append(t['name'])
        track_id.append(t['id'])
        popularity.append(t['popularity'])
      

stop = timeit.default_timer()
print ('Time to run this code (in seconds):', stop - start)

Time to run this code (in seconds): 162.18516139500025


In [6]:
print('number of elements in the track_id list:', len(track_id))

number of elements in the track_id list: 10000


In [7]:
import pandas as pd

df_tracks = pd.DataFrame({'artist_name':artist_name,'track_name':track_name,'track_id':track_id,'popularity':popularity})
print(df_tracks.shape)
df_tracks.head()

(10000, 4)


Unnamed: 0,artist_name,track_name,track_id,popularity
0,Shawn Mendes,Señorita,0TK2YIli7K1leLovkQiNik,100
1,Drake,Money In The Grave (Drake ft. Rick Ross),5ry2OE6R2zPQFDO85XkgRb,92
2,Lizzo,Truth Hurts,5qmq61DAAOUaW8AUo8xKhh,90
3,Lil Nas X,Panini,6fTt0CH2t0mdeB2N9XFG5r,91
4,Billie Eilish,bad guy,2Fxmhks0bxGSBdJ92vM42m,96


In [8]:
df_tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
artist_name    10000 non-null object
track_name     10000 non-null object
track_id       10000 non-null object
popularity     10000 non-null int64
dtypes: int64(1), object(3)
memory usage: 312.6+ KB


In [9]:
# check for duplicates
df_tracks[df_tracks.duplicated(subset=['artist_name','track_name'])].count()

artist_name    1295
track_name     1295
track_id       1295
popularity     1295
dtype: int64

In [10]:
df_tracks.drop_duplicates(subset=['artist_name','track_name'], inplace=True)

In [11]:
# check for duplicates again
df_tracks[df_tracks.duplicated(subset=['artist_name','track_name'])].count()

artist_name    0
track_name     0
track_id       0
popularity     0
dtype: int64

In [12]:
df_tracks.shape

(8705, 4)

## Retrieving Track Audio Features

In [13]:
# again measuring the time
start = timeit.default_timer()

# empty list, batchsize and the counter for None results
rows = []
batchsize = 100
None_counter = 0

for i in range(0,len(df_tracks['track_id']),batchsize):
    batch = df_tracks['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)
    for i, t in enumerate(feature_results):
        if t == None:
            None_counter = None_counter + 1
        else:
            rows.append(t)
            
print('Number of tracks where no audio features were available:',None_counter)

stop = timeit.default_timer()
print ('Time to run this code (in seconds):',stop - start)

Number of tracks where no audio features were available: 204
Time to run this code (in seconds): 26.998793757000385


In [14]:
print('number of elements in the track_id list:', len(rows))

number of elements in the track_id list: 8501


In [15]:
df_audio_features = pd.DataFrame.from_dict(rows,orient='columns')
print("Shape of the dataset:", df_audio_features.shape)
df_audio_features.head()

Shape of the dataset: (8501, 18)


Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.037,https://api.spotify.com/v1/audio-analysis/0TK2...,0.759,190960,0.54,0TK2YIli7K1leLovkQiNik,0.0,9,0.0945,-6.039,0,0.0287,116.947,4,https://api.spotify.com/v1/tracks/0TK2YIli7K1l...,audio_features,spotify:track:0TK2YIli7K1leLovkQiNik,0.75
1,0.101,https://api.spotify.com/v1/audio-analysis/5ry2...,0.831,205427,0.502,5ry2OE6R2zPQFDO85XkgRb,0.0,10,0.122,-4.045,0,0.046,100.541,4,https://api.spotify.com/v1/tracks/5ry2OE6R2zPQ...,audio_features,spotify:track:5ry2OE6R2zPQFDO85XkgRb,0.101
2,0.11,https://api.spotify.com/v1/audio-analysis/5qmq...,0.715,173325,0.624,5qmq61DAAOUaW8AUo8xKhh,0.0,4,0.123,-3.046,0,0.114,158.087,4,https://api.spotify.com/v1/tracks/5qmq61DAAOUa...,audio_features,spotify:track:5qmq61DAAOUaW8AUo8xKhh,0.412
3,0.342,https://api.spotify.com/v1/audio-analysis/6fTt...,0.703,114893,0.594,6fTt0CH2t0mdeB2N9XFG5r,0.0,5,0.123,-6.146,0,0.0752,153.848,4,https://api.spotify.com/v1/tracks/6fTt0CH2t0md...,audio_features,spotify:track:6fTt0CH2t0mdeB2N9XFG5r,0.475
4,0.328,https://api.spotify.com/v1/audio-analysis/2Fxm...,0.701,194088,0.425,2Fxmhks0bxGSBdJ92vM42m,0.13,7,0.1,-10.965,1,0.375,135.128,4,https://api.spotify.com/v1/tracks/2Fxmhks0bxGS...,audio_features,spotify:track:2Fxmhks0bxGSBdJ92vM42m,0.562


In [16]:
df_audio_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8501 entries, 0 to 8500
Data columns (total 18 columns):
acousticness        8501 non-null float64
analysis_url        8501 non-null object
danceability        8501 non-null float64
duration_ms         8501 non-null int64
energy              8501 non-null float64
id                  8501 non-null object
instrumentalness    8501 non-null float64
key                 8501 non-null int64
liveness            8501 non-null float64
loudness            8501 non-null float64
mode                8501 non-null int64
speechiness         8501 non-null float64
tempo               8501 non-null float64
time_signature      8501 non-null int64
track_href          8501 non-null object
type                8501 non-null object
uri                 8501 non-null object
valence             8501 non-null float64
dtypes: float64(9), int64(4), object(5)
memory usage: 1.2+ MB


## Merging Both Dataframes

In [17]:
columns_to_drop = ['analysis_url','track_href','type','uri']
df_audio_features.drop(columns_to_drop, axis=1,inplace=True)

df_audio_features.rename(columns={'id': 'track_id'}, inplace=True)

df_audio_features.shape

(8501, 14)

In [18]:
# merge both dataframes
# the 'inner' method will make sure that we only keep track IDs present in both datasets
df = pd.merge(df_tracks,df_audio_features,on='track_id',how='inner')
print("Shape of the dataset:", df_audio_features.shape)
df.head()

Shape of the dataset: (8501, 14)


Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Shawn Mendes,Señorita,0TK2YIli7K1leLovkQiNik,100,0.037,0.759,190960,0.54,0.0,9,0.0945,-6.039,0,0.0287,116.947,4,0.75
1,Drake,Money In The Grave (Drake ft. Rick Ross),5ry2OE6R2zPQFDO85XkgRb,92,0.101,0.831,205427,0.502,0.0,10,0.122,-4.045,0,0.046,100.541,4,0.101
2,Lizzo,Truth Hurts,5qmq61DAAOUaW8AUo8xKhh,90,0.11,0.715,173325,0.624,0.0,4,0.123,-3.046,0,0.114,158.087,4,0.412
3,Lil Nas X,Panini,6fTt0CH2t0mdeB2N9XFG5r,91,0.342,0.703,114893,0.594,0.0,5,0.123,-6.146,0,0.0752,153.848,4,0.475
4,Billie Eilish,bad guy,2Fxmhks0bxGSBdJ92vM42m,96,0.328,0.701,194088,0.425,0.13,7,0.1,-10.965,1,0.375,135.128,4,0.562


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8501 entries, 0 to 8500
Data columns (total 17 columns):
artist_name         8501 non-null object
track_name          8501 non-null object
track_id            8501 non-null object
popularity          8501 non-null int64
acousticness        8501 non-null float64
danceability        8501 non-null float64
duration_ms         8501 non-null int64
energy              8501 non-null float64
instrumentalness    8501 non-null float64
key                 8501 non-null int64
liveness            8501 non-null float64
loudness            8501 non-null float64
mode                8501 non-null int64
speechiness         8501 non-null float64
tempo               8501 non-null float64
time_signature      8501 non-null int64
valence             8501 non-null float64
dtypes: float64(9), int64(5), object(3)
memory usage: 1.2+ MB


In [20]:
df[df.duplicated(subset=['artist_name','track_name'],keep=False)]

Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence


## Export Dataframe to CSV File

In [21]:
df.to_csv('SpotifyAudioFeatures2019.csv')