# Popular Music Analysis
Looking at Billboard Hot 100 songs between 1950 - 2018

For this jupyter notebook, we will be collecting songs along with their associated audio feature from Spotify

# Parsing JSON files related to Billboard Top 100

- go through each JSON file, and extract Billboard Top 100 data for each year, and store into a Pandas DataFrame

In [None]:
years = range(1950,2016)

with open('data/years/1950.json') as data_file:
    data = json.load(data_file)
    
df = pd.DataFrame(data)

for year in years:
    with open('data/years/' + str(year) + '.json') as data_file:
        data = json.load(data_file)
    
    #df = df.append(data,ignore_index=True,verify_integrity=True)
    if year == 1950:
        df = pd.DataFrame(data)
    else:
        df = df.append(data,ignore_index=True,verify_integrity=True)

#df

In [None]:
df.tail(5)

In [None]:
# pickle billboard Hot 100 tracks between 1950 - 2015
df.to_pickle("data/billboard_tracks.pkl")

# Using pandas to pull the Billboard Hot 100 for 2016, 2017, and 2018 from Wikipedia.com

In [None]:
for year in range(2016,2019):

    # pull billboard Hot 100 charts from Wikipedia
    billboard_data = pd.read_html('https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_' + \
                                  str(year),header=0)
    
    # store web scraped data into a pandas dataframe
    df_temp = pd.DataFrame(data=billboard_data[0].loc[:,['No.','Title','Artist(s)']])
    
    # rename the columns to match original dataset
    df_temp.rename(columns={'No.':'pos','Title':'title','Artist(s)':'artist'},inplace=True)
    
    # insert year column
    df_temp['year'] = pd.Series(int(year),index=df_temp.index)
    
    if year == 2016:
        df_wiki = df_temp
    else:
        df_wiki = df_wiki.append(df_temp,ignore_index=True,verify_integrity=True)
        
    # remove quotations(") from song titles
    df_wiki['title'] = df_wiki['title'].str.replace('"','')
    
    # update songs manually to get lyrics
    df_wiki.iloc[68,1] = 'Down In the DM'
    df_wiki.iloc[87,1] = 'Watch Me (Whip / Nae Nae)'
    df_wiki.iloc[125,2] = 'Zayn & Taylor Swift'
    df_wiki.iloc[127,1] = 'ISpy'
    df_wiki.iloc[198,1] = 'Look At Me'
    df_wiki.iloc[218,2] = 'Khalid & Normani'
    df_wiki.iloc[239,2] = 'The Weeknd & Kendrick Lamar'
    df_wiki.iloc[246,2] = 'Kendrick Lamar & SZA'
    df_wiki.iloc[249,1] = 'Love.'
    df_wiki.iloc[278,2] = 'Jay Rock, Kendrick Lamar, Future & James Blake'
    df_wiki.iloc[289,2] = 'Nicky Jam & J Balvin'
    df_wiki.iloc[292,1] = 'Dura (Remix)'


In [None]:
df_wiki.head()

In [None]:
df_wiki.loc[df_wiki['title'] == "Dura (Remix)"]

In [None]:
# manually update 
df_wiki.iloc[292,1] = 'Dura (Remix)'

In [None]:
# songs without lyrics from PyLyrics: 256, 262, 280, 290

## helper function to get tags associated with each artist + count dupes in lyrics

In [None]:
import musicbrainzngs as mb

mb.set_useragent("billboard-top-100-lyrics", "1.0", "schaich.kevin@gmail.com")

def get_tags(artist):
    artist_id = mb.search_artists(artist)['artist-list'][0]['id']
    tags = mb.get_artist_by_id(artist_id, includes=["tags"])['artist']["tag-list"]

    return [tag['name'] for tag in tags]

def count_dupes(lyrics):
    list = lyrics.split("\n")
    count = 0
    seen = set()
    for item in list:
        if item in seen:
            count += 1
        else:
            seen.add(item)
    return count

## obtain + clean features for Billboard Hot 100 songs between 2016 and 2018

In [None]:
from PyLyrics import *
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textstat.textstat import textstat as ts
import re

billboard_attr_list = []
sent_analyzer = SentimentIntensityAnalyzer()

for index, row in df_wiki.iterrows():
    
    data = {}
    artist = row['artist']
    title = row['title']

    # clean up artist / title data
    artist = re.sub(r' feat..*', r'', artist)
    artist = re.sub(r' ft..*', r'', artist)
    artist = re.sub(r' featuring..*', r'', artist)
    artist = re.sub(r' and..*',r'',artist)
    artist = re.sub(r',..*',r'',artist)

    # update artist name in dataframe
    df_wiki.loc[index,['artist']] = artist

    if index == 278:
        df_wiki.iloc[278,2] = 'Jay Rock, Kendrick Lamar, Future & James Blake'
        artist = 'Jay Rock, Kendrick Lamar, Future & James Blake'

    # getting genre tags per artist
    try: 
        data['tags'] = get_tags(artist)
        #print('get_tags works')
    except:
        try:
            #artist_name = 'Meghan Trainor'
            query = "artist:" + artist
            search_results = spotify_object.search(query,limit=1,type='artist')

            # get the genre
            data['tags'] = search_results['artists']['items'][0]['genres']
            #print('spotify API works')
        except:
            print('Issue with getting tags for artist: ' + str(artist))


    # PyLyrics library for finding lyrics
    try: 
        lyrics = PyLyrics.getLyrics(artist,title) #Print the lyrics directly
        lyrics_repl = lyrics.replace("\n",". ")
        data['lyrics'] = lyrics
        
        # sentiment analysis with VADER
        data['sentiment'] = sent_analyzer.polarity_scores(lyrics_repl)
        
        # functions that rate the lyric complexity
        data['f_k_grade'] = ts.flesch_kincaid_grade(lyrics_repl)
        data['flesch_index'] = ts.flesch_reading_ease(lyrics_repl)
        data['fog_index'] = ts.gunning_fog(lyrics_repl)
        data['difficult_words'] = ts.difficult_words(lyrics_repl)
        data['num_syllables'] = ts.syllable_count(lyrics_repl)
        data['num_words'] = ts.lexicon_count(lyrics_repl, True)
        data['num_lines'] = ts.sentence_count(lyrics_repl)
        
        # count dupes in lyrics
        data['num_dupes'] = count_dupes(lyrics)
        
    except:
        print('Issue with getting lyrics for track: ' + str(title) + ' by ' + str(artist))

    billboard_attr_list.append(data)

df_3 = pd.DataFrame(billboard_attr_list)
df_3 = pd.concat([df_wiki,df_3],axis=1,sort=None)


# Pickle DataFrame of Billboard Hot 100 songs between 2016 and 2018

In [None]:
# pickle billboard hot 100 for years 2016 - 2018
#df_3.to_pickle("data/billboard_tracks_2016_2018.pkl")

In [None]:
df_3.loc[df_3['lyrics'].isna()]

## connecting to Spotify API with spotipy

In [24]:
import os
import sys
import json
from json.decoder import JSONDecodeError
import webbrowser
import spotipy
import spotipy.util as util
from pprint import pprint

# visualization libraries
import seaborn as sns  
import matplotlib.pyplot as plt
from matplotlib import style
% matplotlib inline

import pandas as pd
import numpy as np

import fuzzyset

from collections import Counter

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# User ID: stevay?si=yNi65NanRZeIRXlu0Gjobw

In [2]:
username = sys.argv[1]
username

'-f'

In [186]:
try: 
    token = util.prompt_for_user_token(username,
                                       scope=None,
                                       client_id = '39cc01d4b4a544ebad4e813e7190e606',
                                       client_secret = '21e5aa26f17247bcae6d6aeba252a49f',
                                       redirect_uri='http://google.com/')
except:
    os.remove(f".cache-{username}")
    token = util.prompt_for_user_token(username,
                                       scope=None,
                                       client_id = '39cc01d4b4a544ebad4e813e7190e606',
                                       client_secret = '21e5aa26f17247bcae6d6aeba252a49f',
                                       redirect_uri='http://google.com/')
    

# create spotify object
spotify_object = spotipy.Spotify(auth=token)



            User authentication requires interaction with your
            web browser. Once you enter your credentials and
            give authorization, you will be redirected to
            a url.  Paste that url you were directed to to
            complete the authorization.

        
Opened https://accounts.spotify.com/authorize?client_id=39cc01d4b4a544ebad4e813e7190e606&response_type=code&redirect_uri=http%3A%2F%2Fgoogle.com%2F in your browser


Enter the URL you were redirected to: https://www.google.com/?code=AQDCH5TnMjw4SPA0km0UJUnbB7LVTK7rurcbJJnW-5MXub1nUXDz40kyODKfUA9Cjx89e4zkOQyZYeHg7j-SJ8v5Ytur5k9fidr1X4BWogz3PYh_6Gyp3G2Xmy_hIGAthSlIYbgX3ha7exFYh_94z6djJPViKyrDNhIqjSeGTxUsVXjrIMcQAxy9




In [None]:
# useful code to print JSON from spotify
# print(json.dumps(VARIABLE, sort_keys=True,indent=4))

# Helper function to search for specific tracks of artists on Spotify and pull features of interest (e.g. track_id)


In [390]:
def get_spotify_track_id(df):
    '''given a DataFrame with columns 'artist' and 'title', use Spotify API to look up:
    'track_name'
    'track_id'
    'track_preview_url'
    'track_album_cover_art'
    
    '''
    
    
    # iterate through list of artists/track names in dataset
    spotify_list_data = []

    for index, row in df.iterrows():
        #print(row['artist'] + ' - ' + row['title'])
        data = {}

        # define query
        query = 'artist:' + str(row['artist'] + ' track:' + str(row['title'].replace("'",'')))

        # search with Spotipy / Spotify API / spotify_object
        search_result = spotify_object.search(query, limit=5,type='track')

        if len(search_result['tracks']['items']) == 0:
            data['track_name'] = None
            data['track_id'] = None
            data['track_preview_url'] = None
            data['track_album_cover_art'] = None

        else:
            # list to store tracks from search results
            temp_list = []

            # iterate through all results to find the right track
            for i in range(len(search_result['tracks']['items'])):

                temp_list.append(search_result['tracks']['items'][i]['name'])

            # initiate new fuzzy match set
            fz = fuzzyset.FuzzySet()

            # Create a list of terms we would like to match against in a fuzzy way
            for i in temp_list:

                fz.add(i)

            # find index to pull results from Spotify
            correct_index = temp_list.index(fz.get(str(row['title']))[0][1])

            # save results
            data['track_name'] = search_result['tracks']['items'][correct_index]['name']
            data['track_id'] = search_result['tracks']['items'][correct_index]['id']
            data['track_preview_url'] = search_result['tracks']['items'][correct_index]['preview_url']
            data['track_album_cover_art'] = search_result['tracks']['items'][correct_index]['album']['images'][0]['url']

        # append to list
        spotify_list_data.append(data)


    # create new DataFrame based on results from Spotify
    df_temp = pd.DataFrame(spotify_list_data)
    
    return df_temp
    

In [246]:
df_2 = get_spotify_track_id(df) # get spotify track IDs for Billboard Hot 100 songs between 1950 - 2015
df_4 = get_spotify_track_id(df_3) # get spotify track IDs for Billboard Hot 100 songs between 2016- 2018

retrying ...4secs
retrying ...1secs


In [409]:
# missing tracks
df_4.loc[df_4['track_id'].isna()]

Unnamed: 0,track_album_cover_art,track_id,track_name,track_preview_url
70,,,,


# Use below to save pickle files

In [355]:
# pickle spotify track IDs
#df_2.to_pickle("data/spotify_track_ids.pkl")
#df_4.to_pickle("data/spotify_track_ids_2016_2018.pkl")

# Use below to load all the relevant pickle files

In [470]:
# load spotify track IDs (1950 - 2015)
df_2 = pd.read_pickle("data/spotify_track_ids.pkl")

# load billboard Hot 100 tracks (1950 - 2015)
df = pd.read_pickle("data/billboard_tracks.pkl")

# load billboard Hot 100 tracks (2016 - 2018)
df_3 = pd.read_pickle("data/billboard_tracks_2016_2018.pkl")

# load spotify track IDs (2016 - 2018)
df_4 = pd.read_pickle("data/spotify_track_ids_2016_2018.pkl")

# Use below code to manually update df_2 with spotify data

In [229]:
len(df_2.loc[df_2['track_name'].isna()]) # 33 songs not on Spotify

33

In [709]:
df.iloc[78,:]

artist                                                Nervous Norvus
difficult_words                                                   50
f_k_grade                                                        4.1
flesch_index                                                   80.28
fog_index                                                        5.2
lyrics             TRANSFUSION\nNervous Norvus\n\nZZZZZZOOOOOOOOO...
num_dupes                                                         20
num_lines                                                         40
num_syllables                                                  479.7
num_words                                                        345
pos                                                               47
sentiment          {'neg': 0.057, 'neu': 0.914, 'pos': 0.029, 'co...
tags                                            [death by cirrhosis]
title                                                    Transfusion
year                              

In [198]:
df_2.iloc[78,:]

track_album_cover_art    None
track_id                 None
track_name               None
track_preview_url        None
Name: 78, dtype: object

In [696]:
# update df_2 with helper function below

update_track(78,0,df_2)

In [674]:
# helper function to update dataframes manually
def update_track(row,index,df):
    '''
    Purpose of this helper function is to update a specified DataFrame with data pulled via the Spotify API.
    
    parameters: 
    row = which row in DataFrame to update
    index = the specific item in the Spotify API search result to update the DataFrame with
    df = the DataFrame to update
    '''

    # get the track name
    df.iloc[row,2] = search_results_test['tracks']['items'][index]['name']

    # get the track ID
    df.iloc[row,1] = search_results_test['tracks']['items'][index]['id']

    # preview url
    df.iloc[row,3] = search_results_test['tracks']['items'][index]['preview_url']

    # album cover art
    df.iloc[row,0] = search_results_test['tracks']['items'][index]['album']['images'][0]['url']

## Script to gather Spotify Audio Features for each track

In [188]:
#track_audio_features_list = []
#counter = 0
track_ids = []
#track_features_dict = {} 

for index, row in df_2.iterrows():
    #print(row['track_id'])
    
    track_ids.append(row['track_id'])
    
    if len(track_ids) == 50 or index == (len(df_2)-1):
        track_features = spotify_object.audio_features(tracks=[track_ids])
        track_features_df_temp = pd.DataFrame(track_features)
        df_3.append()
        
        if index == 49:
            df_3 = track_features_df_temp
        else:
            df_3 = df_3.append(track_features_df_temp,ignore_index=False,verify_integrity=True)
        track_ids = []

# Use below to manually search track content in Spotify

In [None]:
# use 98\u00ba for 98 Degrees...

In [342]:
artist_name = 'Nicky Jam'
track_name = "X"

In [528]:
#track_name = '4gB7HrYHbJVJ5RFOjxmoq4'
#query = "artist:" + artist_name + " track:" + track_name # define filters
#query = " track:" + track_name # define filters
search_results_test = spotify_object.search(query,limit=5,type='track')

In [529]:
len(search_results_test['tracks']['items'])

0

In [527]:
print(json.dumps(search_results_test,sort_keys=True,indent=4))

{
    "tracks": {
        "href": "https://api.spotify.com/v1/search?query=spotify%3Atrack%3A4gB7HrYHbJVJ5RFOjxmoq4&type=track&market=US&offset=0&limit=5",
        "items": [],
        "limit": 5,
        "next": null,
        "offset": 0,
        "previous": null,
        "total": 0
    }
}


In [346]:
# get the track name

search_results_test['tracks']['items'][1]['name']

'X'

In [326]:
# get the track ID

search_results_test['tracks']['items'][0]['id']

'6KBYefIoo7KydImq1uUQlL'

In [327]:
# preview url

search_results_test['tracks']['items'][0]['preview_url']

'https://p.scdn.co/mp3-preview/cab6e91217590b37cc254585f3f951c5a5280375?cid=39cc01d4b4a544ebad4e813e7190e606'

In [304]:
# album cover art

search_results_test['tracks']['items'][0]['album']['images'][0]['url']

'https://i.scdn.co/image/9c1ba089336e2b67a7bb8d385bada87481852ede'

# combine data frames together with pandas '.concat' function

Combine the following:
- df + df_2
- df_3 + df_4


In [385]:
df_final = pd.concat([df,df_2],axis=1,sort=None)
df_half_2 = pd.concat([df_3,df_4],axis=1,sort=None)
df_final = df_final.append(df_half_2,ignore_index=True,verify_integrity=True)

# pickle the final dataframe
#df_final.to_pickle("data/billboard_tracks_1950_2018_no_audio_features.pkl")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


# Load pickle file of all songs without Spotify audio features

In [471]:
# load spotify track IDs (1950 - 2018)
df_final = pd.read_pickle("data/billboard_tracks_1950_2018_no_audio_features.pkl")

# songs missing from Spotify (34 songs total)

In [474]:
# songs that are missing from Spotify (34 songs total)
df_final.loc[df_final['track_id'].isna(),:]

Unnamed: 0,artist,difficult_words,f_k_grade,flesch_index,fog_index,lyrics,num_dupes,num_lines,num_syllables,num_words,pos,sentiment,tags,title,track_album_cover_art,track_id,track_name,track_preview_url,year
78,Nervous Norvus,50.0,4.1,80.28,5.2,TRANSFUSION\nNervous Norvus\n\nZZZZZZOOOOOOOOO...,20.0,40.0,479.7,345.0,47,"{'neg': 0.057, 'neu': 0.914, 'pos': 0.029, 'co...",[death by cirrhosis],Transfusion,,,,,1956
91,Jane Powell,2.0,0.9,99.23,4.4,"I give to you and you give to me\nTrue love, t...",1.0,8.0,60.3,52.0,86,"{'neg': 0.0, 'neu': 0.648, 'pos': 0.352, 'comp...","[actress, dancer, singer]",True Love,,,,,1956
180,Kathy Linden,6.0,2.1,90.77,4.4,"Bomp, bomp, bomp, bomp-bomp-bomp\nBomp, bomp, ...",17.0,22.0,184.5,140.0,90,"{'neg': 0.0, 'neu': 0.93, 'pos': 0.07, 'compou...",[goodbye jimmy],Billy,,,,,1958
391,Ray Charles,5.0,1.3,103.63,6.0,Take these chains from my heart and set me fre...,10.0,17.0,204.3,181.0,92,"{'neg': 0.056, 'neu': 0.632, 'pos': 0.312, 'co...","[american, blues, classic pop and rock, countr...",Take These Chains From My Heart,,,,,1963
411,Dave Clark Five,13.0,1.3,98.21,4.8,You say that you love (Say you love me)\nAll o...,27.0,40.0,342.9,280.0,23,"{'neg': 0.022, 'neu': 0.71, 'pos': 0.268, 'com...","[british, english, uk]",Glad All Over,,,,,1964
431,Dave Clark Five,4.0,1.3,98.21,4.8,"It's right that I should care about you,\nAnd ...",8.0,16.0,140.4,116.0,63,"{'neg': 0.023, 'neu': 0.727, 'pos': 0.25, 'com...","[british, english, uk]",Because,,,,,1964
433,Dave Clark Five,8.0,0.5,105.66,5.2,"Whoa, can't you see that she's mine\nWe've bee...",14.0,29.0,283.5,251.0,67,"{'neg': 0.084, 'neu': 0.805, 'pos': 0.111, 'co...","[british, english, uk]",Can't You See That She's Mine,,,,,1964
483,Dave Clark Five,3.0,-3.0,125.62,4.0,"Here they come again, mmmm-mm-mm\nCatch us if ...",20.0,24.0,129.6,137.0,54,"{'neg': 0.039, 'neu': 0.936, 'pos': 0.025, 'co...","[british, english, uk]",Catch Us If You Can,,,,,1965
501,Dave Clark Five,7.0,2.9,94.15,6.4,Come on (come on let me show you where it's at...,18.0,28.0,355.5,309.0,80,"{'neg': 0.007, 'neu': 0.896, 'pos': 0.097, 'co...","[british, english, uk]",I Like It Like That,,,,,1965
603,Tremeloes,9.0,1.3,98.21,4.8,"Oh, don't it hurt deep inside \nTo see someone...",7.0,20.0,183.6,155.0,45,"{'neg': 0.099, 'neu': 0.82, 'pos': 0.081, 'com...","[british, classic pop and rock, english, uk]",Silence Is Golden,,,,,1967


# songs missing lyrics (4 songs)

In [477]:
df_final.loc[df_final['lyrics'].isna()]

Unnamed: 0,artist,difficult_words,f_k_grade,flesch_index,fog_index,lyrics,num_dupes,num_lines,num_syllables,num_words,pos,sentiment,tags,title,track_album_cover_art,track_id,track_name,track_preview_url,year
4284,Rich the Kid,,,,,,,,,,57,,"[atl hip hop, hip hop, pop, pop rap, rap, sout...",Plug Walk,https://i.scdn.co/image/4a05d4f5bec8e872222f9a...,1ZAyjvIk9YiD76yYy0TEG6,Plug Walk,,2018
4290,YG,,,,,,,,,,63,,"[drake, hip-hop, rap, rap us, yg]",Big Bank,https://i.scdn.co/image/f8e9e3b5009829666d3e7b...,2iedgVVNKPzzAkeDBVQ6sR,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",,2018
4308,Nio García,,,,,,,,,,81,,"[reggaeton flow, tropical]",Te Boté,https://i.scdn.co/image/bc4e0238d504046c887b30...,4X5V0XWXYbjlq4yBuPiYfA,Te Bote,https://p.scdn.co/mp3-preview/4f10f76774297e35...,2018
4318,YoungBoy Never Broke Again,,,,,,,,,,91,,[trap music],Outside Today,https://i.scdn.co/image/b512dfac0b66cec52865a2...,3sA7HKGzcKTVscdiTCrWpX,Outside Today,https://p.scdn.co/mp3-preview/e862d38519014ce8...,2018


# Songs missing tags (2 songs)

In [478]:
df_final.loc[df_final['tags'].isna()]

Unnamed: 0,artist,difficult_words,f_k_grade,flesch_index,fog_index,lyrics,num_dupes,num_lines,num_syllables,num_words,pos,sentiment,tags,title,track_album_cover_art,track_id,track_name,track_preview_url,year
4246,Khalid & Normani,12.0,1.7,97.2,4.42,"Sorry if it's hard to catch my vibe, mmm\nI ne...",18.0,49.0,463.0,392.0,19,"{'neg': 0.153, 'neu': 0.626, 'pos': 0.221, 'co...",,Love Lies,https://i.scdn.co/image/ea0c58be347de0c1ea9a2c...,45Egmo7icyopuzJN0oMEdk,Love Lies (with Normani),https://p.scdn.co/mp3-preview/d53d5678b946219b...,2018
4301,ASAP Ferg,37.0,1.7,97.09,6.52,"Yeah\n\nRide with the mob, alhamdulillah\nChec...",29.0,56.0,562.0,451.0,74,"{'neg': 0.154, 'neu': 0.796, 'pos': 0.049, 'co...",,Plain Jane,https://i.scdn.co/image/e0d2d77ca43c5ea0f89b25...,4dVpf9jZjcORqGTLUaeYj9,Plain Jane,https://p.scdn.co/mp3-preview/044999082327cc7c...,2018


# Remove songs missing lyrics

In [487]:
df_final_2 = df_final.drop([4284,4290,4308,4318])
df_final_2.reset_index(inplace=True, drop=True)

# Pickle DataFrame (without Spotify audio features)

In [491]:
# pickle the final dataframe, removing songs without lyrics
df_final_2.to_pickle("data/billboard_tracks_1950_2018_no_audio_features_v2.pkl")

In [3]:
# load dataframe, without audio features, but no missing lyrics
df_final_2 = pd.read_pickle("data/billboard_tracks_1950_2018_no_audio_features_v2.pkl")

# get Spotify audio features for all songs

Skip songs without a Spotify track_id = will throw an error

In [536]:
# Script to get Spotify audio features for each track

#track_audio_features_list = []
#counter = 0
track_ids = []
#track_features_dict = {} 

for index, row in df_final_2.iterrows():
    #print(row['track_id'])
    
    if row['track_id']:
        track_ids.append(row['track_id'])

        if len(track_ids) == 50 or index == (len(df_final_2)-1):
            track_features = spotify_object.audio_features(tracks=track_ids)
            track_features_df_temp = pd.DataFrame(track_features)
            #df_3.append()

            if index == 49:
                df_audio_features = track_features_df_temp
            else:
                df_audio_features = df_audio_features.append(track_features_df_temp,ignore_index=True, \
                                                             verify_integrity=True)
            track_ids = []
        


In [543]:
df_audio_features.columns

Index(['acousticness', 'analysis_url', 'danceability', 'duration_ms', 'energy',
       'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'track_href', 'type', 'uri',
       'valence'],
      dtype='object')

# insert tracks that are missing Spotify audio features, but have other important features (e.g. lyrics)

In [554]:
df_audio_features_updated = df_audio_features

# iterate through tracks that do not have Spotify audio features
for index, row in df_final_2.loc[df_final_2['track_id'].isna(),:].iterrows():
    
    # set Spotify audio features as 'None'
    line = pd.DataFrame({"acousticness": None, "analysis_url": None, "danceability":None,
                     "duration_ms":None,"energy":None,'id':None,'instrumentalness':None,
                     'key':None,'liveness':None,'loudness':None,'mode':None,'speechiness':None,
                     'tempo':None,'time_signature':None,'track_href':None,'type':None,'uri':None,
                     'valence':None}, index=[index])
    
    # insert song track in its designated index / row
    df_audio_features_updated = pd.concat([df_audio_features_updated[:index], line, df_audio_features_updated[index:]]).reset_index(drop=True)

In [555]:
df_audio_features_updated

Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.903000,https://api.spotify.com/v1/audio-analysis/3k5y...,0.214,207573,0.1940,3k5ycyXX5qsCjLd7R2vphp,0.000001,1,0.1070,-15.636,1,0.0325,86.198,3,https://api.spotify.com/v1/tracks/3k5ycyXX5qsC...,audio_features,spotify:track:3k5ycyXX5qsCjLd7R2vphp,0.3390
1,0.930000,https://api.spotify.com/v1/audio-analysis/6vSK...,0.387,198000,0.0958,6vSKLIxqiYllqDd2VcGVHv,0.000000,8,0.1700,-12.278,1,0.0391,167.122,5,https://api.spotify.com/v1/tracks/6vSKLIxqiYll...,audio_features,spotify:track:6vSKLIxqiYllqDd2VcGVHv,0.2240
2,0.971000,https://api.spotify.com/v1/audio-analysis/0788...,0.475,185000,0.1390,0788TdvEjGtjFVb6CTFdJH,0.000173,5,0.1280,-12.619,1,0.0333,86.156,3,https://api.spotify.com/v1/tracks/0788TdvEjGtj...,audio_features,spotify:track:0788TdvEjGtjFVb6CTFdJH,0.3740
3,0.795000,https://api.spotify.com/v1/audio-analysis/1v52...,0.703,156987,0.1970,1v52YkW75eN4RJ2EyDdQVI,0.000000,7,0.0982,-11.312,1,0.0327,82.114,3,https://api.spotify.com/v1/tracks/1v52YkW75eN4...,audio_features,spotify:track:1v52YkW75eN4RJ2EyDdQVI,0.4690
4,0.886000,https://api.spotify.com/v1/audio-analysis/2BNP...,0.608,194333,0.1970,2BNPXyUhq5WFpAO1Ff5yEg,0.000001,8,0.1140,-14.830,1,0.0446,123.357,4,https://api.spotify.com/v1/tracks/2BNPXyUhq5WF...,audio_features,spotify:track:2BNPXyUhq5WFpAO1Ff5yEg,0.6390
5,0.910000,https://api.spotify.com/v1/audio-analysis/12WJ...,0.645,174040,0.4300,12WJhNxX6EYC2rAlfVOW4d,0.000000,1,0.5610,-11.916,1,0.0444,132.643,4,https://api.spotify.com/v1/tracks/12WJhNxX6EYC...,audio_features,spotify:track:12WJhNxX6EYC2rAlfVOW4d,0.9440
6,0.896000,https://api.spotify.com/v1/audio-analysis/28IB...,0.778,160750,0.2910,28IBsttOK2aG9Cb4QtqhMr,0.000514,2,0.1470,-12.703,1,0.0390,107.730,3,https://api.spotify.com/v1/tracks/28IBsttOK2aG...,audio_features,spotify:track:28IBsttOK2aG9Cb4QtqhMr,0.6950
7,0.286000,https://api.spotify.com/v1/audio-analysis/4c5S...,0.574,172520,0.5000,4c5SQiPpwZKPs1YXm7ms0u,0.000002,11,0.1610,-9.870,1,0.0354,77.461,4,https://api.spotify.com/v1/tracks/4c5SQiPpwZKP...,audio_features,spotify:track:4c5SQiPpwZKPs1YXm7ms0u,0.8590
8,0.923000,https://api.spotify.com/v1/audio-analysis/44ZA...,0.191,205707,0.2020,44ZAlAAn0bAEzUJgZky04H,0.000362,8,0.2890,-14.450,1,0.0289,92.091,1,https://api.spotify.com/v1/tracks/44ZAlAAn0bAE...,audio_features,spotify:track:44ZAlAAn0bAEzUJgZky04H,0.1810
9,0.940000,https://api.spotify.com/v1/audio-analysis/1hVE...,0.248,242040,0.2200,1hVEHu6SemMvlYAP5MWkDj,0.001970,7,0.1090,-10.768,0,0.0299,71.377,4,https://api.spotify.com/v1/tracks/1hVEHu6SemMv...,audio_features,spotify:track:1hVEHu6SemMvlYAP5MWkDj,0.1910


# concatenate remaining dataframes (df_audio_features_updated + df_final_2)

In [None]:
df_final_set = pd.concat([df_final_2,df_audio_features_updated],axis=1,sort=None)

# Pickle the DataFrame (without Spotify audio features)

In [3]:
# pickle the final dataframe, removing songs without lyrics
#df_final_set.to_pickle("data/billboard_tracks_1950_2018_FINAL.pkl")

# load dataframe, without audio features, but no missing lyrics
df_final_set = pd.read_pickle("data/billboard_tracks_1950_2018_FINAL.pkl")

In [4]:
df_final_set.tail()

Unnamed: 0,artist,difficult_words,f_k_grade,flesch_index,fog_index,lyrics,num_dupes,num_lines,num_syllables,num_words,...,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
4319,Luke Combs,16.0,2.1,96.08,5.32,Are you sitting at home all alone trying to fa...,22.0,42.0,466.0,382.0,...,0.191,-6.16,0,0.0311,144.075,4,https://api.spotify.com/v1/tracks/4gB7HrYHbJVJ...,audio_features,spotify:track:4gB7HrYHbJVJ5RFOjxmoq4,0.527
4320,Rae Sremmurd,58.0,2.2,95.98,5.97,"Yeah, yeah, yeah\nMike WiLL Made-It\nYeah, yea...",39.0,110.0,1179.0,1014.0,...,0.118,-4.75,0,0.15,173.948,4,https://api.spotify.com/v1/tracks/1BuZAIO8WZpa...,audio_features,spotify:track:1BuZAIO8WZpavWVbbq3Lci,0.584
4321,Dua Lipa,8.0,0.4,105.96,3.81,You call me all friendly\nTellin' me how much ...,33.0,57.0,483.0,441.0,...,0.0824,-5.975,1,0.0943,97.028,4,https://api.spotify.com/v1/tracks/76cy1WJvNGJT...,audio_features,spotify:track:76cy1WJvNGJTj78UqeA5zr,0.51
4322,J Balvin,41.0,3.3,82.2,8.58,Si el ritmo te lleva a mover la cabeza\nYa emp...,23.0,44.0,374.0,267.0,...,0.128,-4.818,0,0.0584,104.959,4,https://api.spotify.com/v1/tracks/4ipnJyDU3Lq1...,audio_features,spotify:track:4ipnJyDU3Lq15qBAYNqlqK,0.308
4323,Imagine Dragons,16.0,1.8,96.89,4.89,First things first\nI'mma say all the words in...,31.0,49.0,498.0,407.0,...,0.081,-4.374,0,0.128,124.949,4,https://api.spotify.com/v1/tracks/0pqnGHJpmpxL...,audio_features,spotify:track:0pqnGHJpmpxLKifKRmU6WP,0.666


# Columns to add to final dataset
Includes the following columns:
- **sentiment_compound** - the compound score per VADER sentiment analysis
- **decade** - which decade the song track is associated with (e.g. 1960s)
- **agg_genre** - 1 of 15 genres the song track is associated with (e.g. rock)

In [5]:
# add a column for compound sentiment analysis (negative: < -0.05; positive: > +0.05)
sent_list = []
for i in df_final_set['sentiment']:
    sent_list.append(i['compound'])

df_final_set['sentiment_compound'] = pd.Series(sent_list,index=df_final_set.index)


# add a column to indicate decades (e.g. 1950s, 1960s, etc.)
decade_list = []
for i in df_final_set['year']:
    #print(i)
    decade_list.append(str(i)[2:3]+'0s')

df_final_set['decade'] = pd.Series(decade_list,index=df_final_set.index)

### creating 15 aggregated genres

In [None]:
aggregate_genres = [{"rock": ["symphonic rock", "jazz-rock", "heartland rock", "rap rock", "garage rock", "folk-rock", "roots rock", "adult alternative pop rock", "rock roll", "punk rock", "arena rock", "pop-rock", "glam rock", "southern rock", "indie rock", "funk rock", "country rock", "piano rock", "art rock", "rockabilly", "acoustic rock", "progressive rock", "folk rock", "psychedelic rock", "rock & roll", "blues rock", "alternative rock", "rock and roll", "soft rock", "rock and indie", "hard rock", "pop/rock", "pop rock", "rock", "classic pop and rock", "psychedelic", "british psychedelia", "punk", "metal", "heavy metal"]},
{"alternative/indie": ["adult alternative pop rock", "alternative rock", "alternative metal", "alternative", "lo-fi indie", "indie", "indie folk", "indietronica", "indie pop", "indie rock", "rock and indie"]},
{"electronic/dance": ["dance and electronica", "electro house", "electronic", "electropop", "progressive house", "hip house", "house", "eurodance", "dancehall", "dance", "trap"]},
{"soul": ["psychedelic soul", "deep soul", "neo-soul", "neo soul", "southern soul", "smooth soul", "blue-eyed soul", "soul and reggae", "soul"]},
{"classical/soundtrack": ["classical", "orchestral", "film soundtrack", "composer"]},
{"pop": ["country-pop", "latin pop", "classical pop", "pop-metal", "orchestral pop", "instrumental pop", "indie pop", "sophisti-pop", "pop punk", "pop reggae", "britpop", "traditional pop", "power pop", "sunshine pop", "baroque pop", "synthpop", "art pop", "teen pop", "psychedelic pop", "folk pop", "country pop", "pop rap", "pop soul", "pop and chart", "dance-pop", "pop", "top 40"]},
{"hip-hop/rnb": ["conscious hip hop", "east coast hip hop", "hardcore hip hop", "west coast hip hop", "hiphop", "southern hip hop", "hip-hop", "hip hop", "hip hop rnb and dance hall", "contemporary r b", "gangsta rap", "rapper", "rap", "rhythm and blues", "contemporary rnb", "contemporary r&b", "rnb", "rhythm & blues","r&b", "blues"]},
{"disco": ["disco"]},
{"swing":  ["swing"]},
{"folk": ["contemporary folk", "folk"]},
{"country": ["country rock", "country-pop", "country pop", "contemporary country", "country"]},
{"jazz": ["vocal jazz", "jazz", "jazz-rock"]},
{"religious": ["christian", "christmas music", "gospel"]},
{"blues": ["delta blues", "rock blues", "urban blues", "electric blues", "acoustic blues", "soul blues", "country blues", "jump blues", "classic rock. blues rock", "jazz and blues", "piano blues", "british blues", "british rhythm & blues", "rhythm and blues", "blues", "blues rock", "rhythm & blues"]},
{"reggae": ["reggae fusion", "roots reggae", "reggaeton", "pop reggae", "reggae", "soul and reggae"]}]

### use cosine similarity to find aggregate genre per track
There are 15 aggregated genres curated; based on each track's genre tags, we want to fuzzy match the track to 1 of the 15 aggregated genre tags

In [None]:
import math
from collections import Counter

def counter_cosine_similarity(c1, c2):
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    return dotprod / (magA * magB)

# use below functions if you care about length of lists (do not think this is important)

def length_similarity(c1, c2):
    lenc1 = sum(c1.values())
    lenc2 = sum(c2.values())
    return min(lenc1, lenc2) / float(max(lenc1, lenc2))

def similarity_score(l1, l2):
    c1, c2 = Counter(l1), Counter(l2)
    return length_similarity(c1, c2) * counter_cosine_similarity(c1, c2)

### create a series that assigns an 'aggregated genre' to each song track

In [None]:
list_of_genres = []

# iterate through each track
for track_tags in df_final_set['tags']:
    
    sim_counter = 0
    final_genre = 'N/A'
    
    if type(track_tags) is not float:
        if (len(track_tags) != 0):

            # iterate through each aggregated genre
            for genre in aggregate_genres:

                # pull genres associated with each aggregated genre
                for agg_genre, genres in genre.items():

                    # calculate cosine similarity between track tags vs. genres associated with aggregated genre
                    sim_temp = counter_cosine_similarity(Counter(track_tags), Counter(genres))

                    # if cosine similarity value is greater than counter, then update
                    if sim_counter < sim_temp:
                        sim_counter = sim_temp
                        final_genre = agg_genre

    # add aggregated genre to list
    list_of_genres.append(final_genre)

### creating column 'agg_genre' in DataFrame

In [9]:
df_final_set['agg_genre'] = pd.Series(list_of_genres,index=df_final_set.index)

# Save/Load final dataframe here!

In [8]:
# pickle the final dataframe, removing songs without lyrics
#df_final_set.to_pickle("data/billboard_tracks_1950_2018_FINAL_v2.pkl")


# load dataframe, without audio features, but no missing lyrics
df_final_set = pd.read_pickle("data/billboard_tracks_1950_2018_FINAL_v2.pkl")