# Song Analysis Using Spotify API

In [1]:
#import libraries
import sys
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup as bs
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

In [2]:
#Spotify authorization scope
scope = 'user-library-read'

In [3]:
#Spotify API credentials
%store -r spotify_cid
cid = spotify_cid
%store -r spotify_secret
secret = spotify_secret

In [4]:
#connect to Spotify through wrapper Spotipy
client_cred = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_cred)

### Functions

In [5]:
def get_track_info(track,artist):
    '''
    function returns dictionary with track's info (including audio features)
    parameters:
        track-->str
        artist-->str
    '''
    #search Spotify API for general song info
    info_json = sp.search(q='artist:' + artist + ' track:' + track)

    #create dictionary with song info
    info = {'track':info_json['tracks']['items'][0]['name'],
            'artist':info_json['tracks']['items'][0]['album']['artists'][0]['name'],
            'track_id':info_json['tracks']['items'][0]['id'],
            'release_date':info_json['tracks']['items'][0]['album']['release_date']}
            
    #add audio features to dictionary
    audio_info = sp.audio_features(info['track_id'])[0]
    info['dance'] = audio_info['danceability']
    info['energy'] = audio_info['energy']
    info['loud'] = audio_info['loudness']
    info['speech'] = audio_info['speechiness']
    info['acoust'] = audio_info['acousticness']
    info['live'] = audio_info['liveness']
    info['valence'] = audio_info['valence']
    info['tempo'] = audio_info['tempo']
    
    return info

In [6]:
def get_sim_songs(id_list):
    '''
    function returns list of tup (track, artist) of songs similar to seed tracks
    parameters:
        id_list-->list of song ids (str) for seed tracks
    '''
    #find similar songs (using Spotify's bart algorithm)
    song_recs = sp.recommendations(limit=100,seed_tracks=[i for i in id_list])['tracks']
    
    #create list of tup (track,artist)
    sim_songs = [(s['name'],s['artists'][0]['name']) for s in song_recs]
    
    return sim_songs

In [7]:
def get_df(track_list):
    '''
    function returns dataframe with audio features for available songs
    parameters:
        track_list-->list of tup (track, artist) of songs
    '''
    d = []
    for track, artist in track_list:
        try:
            d.append(get_track_info(track.lower().replace("'",""),
                                    artist.lower().replace("'","")))
        except:
            print(track + ' by ' + artist + ' is not available')
            
    return pd.DataFrame(data=d)

In [23]:
def rank_by_features(df_seed,df_rec):
    '''
    function returns df_rec sorted by similarity to seed tracks
    parameters:
        df_seed-->dataframe with seed tracks
        df_rec-->dataframe with songs recommended by Spotify API
    '''
    #calculate average values in seed tracks
    row_df = pd.DataFrame(data=[df_seed.mean(axis=0)],index=['seed_avg'])
    
    #calculate Euclidean distance with seed averages
    df_rec['sim_score'] = 1-pairwise_distances(df_rec.drop(columns=['track','track_id','artist','release_date']),
                                               row_df.to_numpy()[0,None],
                                               metric='euclidean')
   
    #sort dataframe by similarity to seed averages
    return df_rec.sort_values(by='sim_score',ascending=False)

### Information on Seed Tracks

The lists below contain the top five tracks for each of country, R&B/hip-hop, and rock/alternative as of the week of May 15, 2021, based on Billboard Top 100 charts.

In [8]:
#country seed tracks
cty_songs = [('Forever After All','Luke Combs'),
             ('The Good Ones','Gabby Barrett'),
             ('Made for You','Jake Owen'),
             ('Hell of a View','Eric Church'),
             ('Breaking Up Was Easy in the 90s','Sam Hunt')]

#create dataframe with track information
df_cty = get_df(cty_songs)

In [9]:
#R&B/hip-hop seed tracks
rb_songs = [('Leave the Door Open','Bruno Mars'),
            ('Peaches (feat. Daniel Caesar & Giveon)','Justin Bieber'),
            ('Rapstar','Polo G'),
            ('Astronaut in the Ocean','Masked Wolf'),
            ('Up','Cardi B')]

#create dataframe with track information
df_rb = get_df(rb_songs)

In [10]:
#rock/alternative seed tracks
rock_songs = [('Without You','The Kid LAROI'),
              ('Your Power','Billie Eilish'),
              ("My Ex's Best Friend",'Machine Gun Kelly'),
              ('Mood','24kGoldn'),
              ('Therefore I Am','Billie Eilish')]

#create dataframe with track information
df_rock = get_df(rock_songs)

### Song Recommendations

Use get_sim_songs function to find song recommendations for respective genres and create dataframes with song information for recommended songs.

In [11]:
#country
sim_cty = get_sim_songs(df_cty['track_id'])
df_sim_cty = get_df(sim_cty)

In [12]:
#R&B/hip-hop
sim_rb = get_sim_songs(df_rb['track_id'])
df_sim_rb = get_df(sim_rb)

Often by Sickick is not available


In [13]:
#rock/alternative
sim_rock = get_sim_songs(df_rock['track_id'])
df_sim_rock = get_df(sim_rock)

Intro by Kenndog is not available


### Song Similarity with Euclidean Distance

In [24]:
d = rank_by_features(df_cty,df_sim_cty)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 9 while Y.shape[1] == 8

In [22]:
d

Unnamed: 0,track,artist,track_id,release_date,dance,energy,loud,speech,acoust,live,valence,tempo,sim_score
0,God Gave Me You,Blake Shelton,0w9LJae3sVlZlH2CnxTInF,2011-07-11,0.483,0.844,-5.043,0.0314,0.00481,0.4050,0.543,151.977,-37.192441
1,Details,Billy Currington,4MCPNnJkkIzNGCCBE9XVhX,2019-07-19,0.678,0.714,-4.030,0.1110,0.01640,0.2980,0.868,139.991,-25.263325
2,Canadian Summer,Dean Brody,27Iw98HmmdY56bFQBxZW50,2020-11-18,0.516,0.893,-2.957,0.0451,0.05740,0.1200,0.744,122.005,-7.715697
3,My Side of the Fence,Dan + Shay,3fHgqkRp331r3W0WtUGBGr,2018-06-22,0.502,0.575,-5.585,0.0268,0.38400,0.2330,0.246,147.893,-33.099022
4,Record Year,Eric Church,5fSPbm5lcwtqwXkeQQswW8,2015-11-11,0.589,0.641,-8.120,0.0838,0.20500,0.0864,0.570,160.031,-45.291265
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Big Gangsta,Kevin Gates,2HYvkdq6lpFXkVz9Tfqi8r,2019-05-31,0.879,0.574,-5.602,0.3910,0.51900,0.1440,0.346,144.911,-30.120660
96,Never Break Heart,Eric Church,3vmFm0a4LNxzX0PYGLaxh4,2021-04-16,0.535,0.505,-7.585,0.0296,0.27300,0.1070,0.332,148.069,-33.317580
97,Forever After All,Luke Combs,6IBcOGPsniK3Pso1wHIhew,2020-10-23,0.487,0.650,-5.195,0.0253,0.19100,0.0933,0.456,151.964,-37.173907
98,Forever Like That,Ben Rector,7wF4asYZw8cAHEljYd2Wid,2013-08-20,0.592,0.304,-9.833,0.0298,0.56400,0.1070,0.307,114.720,-3.115091


In [53]:
df_cty1 = df_cty.drop(columns=['track_id','artist','release_date']).set_index('track',drop=True)
df_cty1

Unnamed: 0_level_0,dance,energy,loud,speech,acoust,live,valence,tempo
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Forever After All,0.487,0.65,-5.195,0.0253,0.191,0.0933,0.456,151.964
The Good Ones,0.519,0.552,-5.023,0.0259,0.18,0.149,0.331,89.957
Made For You,0.581,0.441,-6.829,0.0268,0.77,0.111,0.337,82.125
Hell Of A View,0.689,0.582,-6.778,0.0236,0.119,0.101,0.883,99.021
Breaking Up Was Easy In The 90's,0.562,0.649,-5.4,0.0494,0.231,0.341,0.376,145.913


In [54]:
dist_out = 1-pairwise_distances(df_cty1, metric="euclidean")

In [55]:
dist_out

array([[  1.        , -61.00747624, -68.86099134, -51.96886011,
         -5.06070855],
       [-61.00747624,   1.        ,  -7.0602485 ,  -8.25076512,
        -54.95774625],
       [-68.86099134,  -7.0602485 ,   1.        , -15.91836249,
        -62.80705339],
       [-51.96886011,  -8.25076512, -15.91836249,   1.        ,
        -45.91595704],
       [ -5.06070855, -54.95774625, -62.80705339, -45.91595704,
          1.        ]])

In [56]:
song_group = pd.DataFrame(dist_out).iloc[:, list(range(len(df_cty)))]

In [57]:
song_group["sum"] = song_group.sum(axis=1)

In [58]:
song_group

Unnamed: 0,0,1,2,3,4,sum
0,1.0,-61.007476,-68.860991,-51.96886,-5.060709,-185.898036
1,-61.007476,1.0,-7.060248,-8.250765,-54.957746,-130.276236
2,-68.860991,-7.060248,1.0,-15.918362,-62.807053,-153.646656
3,-51.96886,-8.250765,-15.918362,1.0,-45.915957,-121.053945
4,-5.060709,-54.957746,-62.807053,-45.915957,1.0,-167.741465


In [59]:
top_sim_songs = list(song_group.sort_values('sum', ascending = False).index)


In [61]:
top_sim_index = [i for i in top_sim_songs if i not in list(range(len(df_cty))) ]

In [63]:
top_sim_songs

[3, 1, 2, 4, 0]

### Using ranked songs to find more songs similar to seed tracks

In [None]:
rank = 0
while len(sim_songs) < 500:
    x = sp.recommendations(limit = 100,seed_tracks = [sim_songs[top_sim_index[rank]][2]])['tracks']
    for num in range(len(x)):
        track_name = x[num]['name']
        track_id = x[num]['id']
        artist = x[num]['artists'][0]['name']
        release_date = x[num]['album']['release_date']
        if [track_name,artist, track_id, release_date] not in sim_songs:
            sim_songs.append([track_name,artist, track_id, release_date])
    rank +=1
    print(rank, len(sim_songs))

In [None]:
songs_for_genius = pd.DataFrame(sim_songs)

In [None]:
songs_for_genius