In [None]:
# We collect the past year-end Billboard Hot 100 charts so we can analyze what makes a song
# a "hit". Specifically, we look at the lyrics of the top hits and run sentiment analysis on them to see whether
# there is a correlation between lyric sentiment and popularity. For example, are the top songs typically positive songs?
# From there we can try to predict which songs are likely to make it onto the top charts.

#Lags forecasting model


In [None]:
import pandas as pd
# Load the local track dataset into the working frame `df`.
df = pd.read_csv(filepath_or_buffer='tracks.csv')


In [None]:
pip install lyricsgenius

Collecting lyricsgenius
[?25l  Downloading https://files.pythonhosted.org/packages/0d/32/be32f6922f70fd1b9900b50b228f6585cd60a96bdf03589df738f627d388/lyricsgenius-3.0.1-py3-none-any.whl (59kB)
[K     |█████▌                          | 10kB 15.3MB/s eta 0:00:01[K     |███████████                     | 20kB 17.6MB/s eta 0:00:01[K     |████████████████▌               | 30kB 11.1MB/s eta 0:00:01[K     |██████████████████████          | 40kB 9.5MB/s eta 0:00:01[K     |███████████████████████████▋    | 51kB 8.2MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 4.9MB/s 
Installing collected packages: lyricsgenius
Successfully installed lyricsgenius-3.0.1


In [None]:
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lyricsgenius import Genius #using this library 
from requests import Timeout

# The Genius API token is read from the GENIUS_ACCESS_TOKEN environment variable so it is
# never stored in the notebook (KeyError if the variable is not set). The token that used
# to be written here was committed in clear text and should be rotated.
genius = Genius(os.environ['GENIUS_ACCESS_TOKEN'])


# function to get lyrics of song based on artist and title (artist and title extracted in tophits(year))
def getLyrics(artist, title):
  """Fetch a song's lyrics from Genius with bracketed section markers removed.

  Args:
    artist: Artist credit as scraped from Billboard, e.g. "A Featuring B" or "A & B".
    title: Song title.

  Returns:
    The lyrics with "[...]" markers stripped, or "" when Genius returns no match
    (an empty string rather than None, because None breaks the pandas import).

  Raises:
    requests.Timeout: if the search times out again after the client was recreated.
    KeyError: if the client has to be recreated and GENIUS_ACCESS_TOKEN is not set.
  """
  # Genius seems to list a song under the first credited artist, so cut the credit
  # at "feat" and then at "&", and drop the whitespace the cut leaves behind.
  primary_artist = artist
  for separator in ("feat", "&"):
    cut = primary_artist.lower().find(separator)
    if cut != -1:
      primary_artist = primary_artist[:cut]
  primary_artist = primary_artist.strip()

  global genius
  try:
    song = genius.search_song(title, primary_artist)
  except Timeout:
    # Timeouts appear after a few dozen requests; a fresh client recovers.
    # The token comes from the environment instead of being written in the notebook.
    genius = Genius(os.environ['GENIUS_ACCESS_TOKEN'])
    print ("Genius session refreshed...")
    song = genius.search_song(title, primary_artist)

  if song is None:
    # Still report the miss on the console, which helps when debugging the scrape.
    print(artist + "(" + primary_artist + "):" + title + ": Lyrics not found")
    return ""

  # Same pattern as before; re.MULTILINE was dropped because it only affects ^ and $.
  return re.sub("\\[.*\\]", "", song.lyrics)

# function that was used in class to get sentiment analysis that we will use to run lyrics through 
def getSentiment(password,text):
    """Score the document sentiment of `text` with Watson Natural Language Understanding.

    Args:
        password: API key, sent as the basic-auth password for the user "apikey".
        text: Text to analyze (the song lyrics).

    Returns:
        data['sentiment']['document']['score'] from the JSON response, or "" when
        `text` is empty or the service answers with a non-200 status.
    """
    # No lyrics means there is nothing to send to the service.
    if text == "":
        return ""

    payload = {
        'features': {'emotion': {}, 'sentiment': {}},
        'version' : '2020-08-01',
        'text'    : text
    }
    resp = requests.post(
        "https://gateway.watsonplatform.net/natural-language-understanding/api/v1/analyze",
        json=payload,
        auth=("apikey", password),
    )

    if resp.status_code == 200:
        return resp.json()['sentiment']['document']['score']

    # Print the status and message for debugging and use the same empty marker as "no lyrics".
    print("WatsonPlatform error: " + str(resp.status_code) + "(" + resp.text + ")")
    return ""

# function to return top 100 songs and their artists at the end of a given year
# data stored as a dictionary first, later as a pandas data frame


def tophits(year, password=None):
  """Scrape the Billboard year-end Hot 100 for `year` and attach lyrics, sentiment and emotion scores.

  Args:
    year: Chart year as a string, e.g. '2020' (it is concatenated into the URL).
    password: Watson API key passed on to getSentiment. Defaults to the
      WATSON_API_KEY environment variable.

  Returns:
    pandas.DataFrame with one row per chart entry and the columns title, artist,
    lyrics, sentiment_analysis, sadness, joy, fear, disgust, anger. Score cells
    hold "" where no value could be computed.

  Raises:
    ValueError: if the page does not contain the 'chart-details' container.
  """
  if password is None:
    password = os.environ.get('WATSON_API_KEY', '')

  print("Getting hits for " + year)
  emotion_names = ['sadness', 'joy', 'fear', 'disgust', 'anger']
  # Named `chart` so the dict no longer shadows the function's own name.
  chart = {'title':[], 'artist' :[], 'lyrics':[], 'sentiment_analysis':[]}
  for name in emotion_names:
    chart[name] = []

  web_html = requests.get("https://www.billboard.com/charts/year-end/" + year + "/hot-100-songs")
  soup = BeautifulSoup(web_html.text, 'html.parser')

  table = soup.find('div', {'class':'chart-details'})
  if table is None:
    raise ValueError("Billboard chart container not found for " + year + " (HTTP " + str(web_html.status_code) + ")")

  for item in table.find_all('div', {'class': 'ye-chart-item__text'}):
    title = item.find('div', {'class': 'ye-chart-item__title'}).text.strip('\n').strip()
    artist = item.find('div', {'class': 'ye-chart-item__artist'}).text.strip('\n').strip()
    lyrics = getLyrics(artist, title)

    # getSentiment takes (password, text); it used to be called with the lyrics only.
    sentiment_analysis = getSentiment(password, lyrics)

    # getEmotion may answer with a plain string when it has no result; indexing that
    # string position by position would raise IndexError, so normalise it first.
    emotion = getEmotion(lyrics)
    if not isinstance(emotion, (list, tuple)) or len(emotion) != len(emotion_names):
      emotion = [""] * len(emotion_names)

    chart['title'].append(title)
    chart['artist'].append(artist)
    chart['lyrics'].append(lyrics)
    chart['sentiment_analysis'].append(sentiment_analysis)
    for name, value in zip(emotion_names, emotion):
      chart[name].append(value)

  return pd.DataFrame(chart)

In [None]:
def getEmotion(text):
    """Score five emotions in `text` with Watson Natural Language Understanding.

    Args:
        text: Text to analyze (the song lyrics).

    Returns:
        A list [sadness, joy, fear, disgust, anger] read from
        data['emotion']['document']['emotion'] in the JSON response. When `text`
        is empty or the service answers with a non-200 status, a list of five
        empty strings is returned, so callers can always index positions 0..4
        (the function used to return " " or "" in those cases).
    """
    # No lyrics means there is nothing to send to the service.
    if text == "":
        return [""] * 5

    endpoint = "https://gateway.watsonplatform.net/natural-language-understanding/api/v1/analyze"

    username = "apikey"
    # Real key comes from the environment; '***' is the redacted placeholder kept as fallback.
    password = os.environ.get('WATSON_API_KEY', '***')

    body = {
        'features': {'emotion': {}, 'sentiment': {}},
        'version' : '2018-11-16',
        'text'    : text,
        'language': 'en'
    }

    resp = requests.post(endpoint, json=body, auth=(username, password))
    # On a non-OK response print the code and message for debugging.
    if resp.status_code != 200:
        print("WatsonPlatform error: " + str(resp.status_code) + "(" + resp.text + ")")
        return [""] * 5

    emotion = resp.json()['emotion']['document']['emotion']
    return [emotion[name] for name in ('sadness', 'joy', 'fear', 'disgust', 'anger')]

In [None]:
# Calling the tophits function 

In [None]:
# Year-end chart for 2020 (network-bound: lookups are made for every song).
df2020 = tophits(year='2020')

In [None]:
# Year-end chart for 2019 (network-bound: lookups are made for every song).
df2019 = tophits(year='2019')

In [None]:
# Year-end chart for 2018 (network-bound: lookups are made for every song).
df2018 = tophits(year='2018')

In [None]:
# Year-end chart for 2017 (network-bound: lookups are made for every song).
df2017 = tophits(year='2017')

In [None]:
# Year-end chart for 2016 (network-bound: lookups are made for every song).
df2016 = tophits(year='2016')

In [None]:
# Year-end chart for 2015 (network-bound: lookups are made for every song).
df2015 = tophits(year='2015')

In [None]:
# Year-end chart for 2014 (network-bound: lookups are made for every song).
df2014 = tophits(year='2014')

In [None]:
# Year-end chart for 2013 (network-bound: lookups are made for every song).
df2013 = tophits(year='2013')

In [None]:
# Year-end chart for 2012 (network-bound: lookups are made for every song).
df2012 = tophits(year='2012')

In [None]:
# Year-end chart for 2011 (network-bound: lookups are made for every song).
df2011 = tophits(year='2011')

In [None]:
# Stack the ten yearly charts, newest first, into one frame.
frames = [df2020,df2019,df2018,df2017,df2016,df2015,df2014,df2013,df2012,df2011]

# pandas is imported under the alias `pd`; the bare name `pandas` is not bound in the
# cells shown, so `pandas.concat` raised NameError.
completedf = pd.concat(frames)

In [None]:
def get_info(track_id, TOKEN): 
    """Request the Spotify Web API track object for `track_id` and return the decoded JSON.

    `TOKEN` is sent as a bearer token. Note that a later cell defines another
    `get_info` (audio-features endpoint), which replaces this one once it runs.
    """
    response = requests.get(
        'https://api.spotify.com/v1/tracks/' + track_id,
        headers={
            'Accept' : 'application/json',
            'Content-Type': 'application/json',
            'Authorization' : "Bearer " + TOKEN,
        },
    )
    return response.json()
  

In [None]:
import requests
import json
import pandas as pd

#Returns a list of spotify_ids
def get_id(song_name,release_date, artist, TOKEN): 
    """Search Spotify for `song_name` and return the ids of the matching tracks.

    Args:
        song_name: Title used as the search query.
        release_date: Must equal the album's `release_date` string exactly.
        artist: Must equal the name of the track's first listed artist exactly.
        TOKEN: Spotify bearer token.

    Returns:
        List of track ids (possibly empty) whose first artist and album release
        date both match.

    Raises:
        KeyError: if the response JSON has no ['tracks']['items'].
    """
    headers = {
        'Accept' : 'application/json',
        'Content-Type': 'application/json',
        'Authorization' : "Bearer " + TOKEN
    }
    # Let requests encode the query: titles containing '&', '#' or '?' corrupted the
    # hand-concatenated query string.
    response = requests.get(
        'https://api.spotify.com/v1/search',
        params={'q': str(song_name), 'type': 'track'},
        headers=headers,
    )
    tracks = response.json()['tracks']['items']
    return [
        track["id"]
        for track in tracks
        if track["artists"][0]['name'] == artist and track["album"]["release_date"] == release_date
    ]

#Returns Audio Features from Spotify API
def get_info(track_id, TOKEN): 
    """Request the Spotify audio-features object for `track_id` and return the decoded JSON.

    `TOKEN` is sent as a bearer token.
    """
    request_headers = {
        'Accept' : 'application/json',
        'Content-Type': 'application/json',
        'Authorization' : "Bearer " + TOKEN,
    }
    features_url = 'https://api.spotify.com/v1/audio-features/' + track_id
    return requests.get(features_url, headers=request_headers).json()

#Returns Audio Analysis APIs from Spotify 
def audio_analysis (track_id, TOKEN): 
    """Request the Spotify audio-analysis object for `track_id` and return the decoded JSON.

    `TOKEN` is sent as a bearer token.
    """
    request_headers = {
        'Accept' : 'application/json',
        'Content-Type': 'application/json',
        'Authorization' : "Bearer " + TOKEN,
    }
    analysis_url = 'https://api.spotify.com/v1/audio-analysis/' + track_id
    return requests.get(analysis_url, headers=request_headers).json()


# Spotify bearer token. Taken from the SPOTIFY_TOKEN environment variable when it is set,
# so the real value never has to be pasted into the notebook; '***' is the redacted placeholder.
TOKEN = os.environ.get('SPOTIFY_TOKEN', '***')




[]

In [None]:
# importing to sql DB

# Install the SQLAlchemy library if it is not installed
!sudo apt-get install python3-dev libmysqlclient-dev > /dev/null
!pip install mysqlclient > /dev/null
!sudo pip3 install -U sql_magic > /dev/null
!pip install psycopg2-binary > /dev/null



In [None]:
from sqlalchemy import create_engine
# Build the MySQL connection URL and the engine used by every SQL cell below.
# `encoding='utf8mb4'` used to be passed to str.format() here; the template has no
# {encoding} field, so it was silently ignored and has been removed.
# NOTE(review): it suggests utf8mb4 was intended for `charset` — confirm before changing the URL.
conn_string = 'mysql://{user}:{password}@{host}:{port}/{db}?charset=utf8'.format(
    user='escAPIng_from_new_york',  # user needs to change
    password='***',
    host = '***',
    port=3306,
    db='musicconsulting',
)
engine = create_engine(conn_string)
engine 

In [None]:
# Load (or reload) the sql_magic extension and configure it to use the connection
# stored in the notebook variable named `engine`.
%reload_ext sql_magic
%config SQL.conn_name = 'engine'

In [None]:
# Select the `musicconsulting` schema on the server.
# NOTE(review): the connection URL already names this database, so this is presumably
# redundant — confirm. Engine.execute() is the legacy API (see the LegacyCursorResult below).
engine.execute('USE musicconsulting')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f4c8be00b90>

In [None]:
# Convert the sentiment column to numbers. pandas is imported as `pd`; the bare name
# `pandas` is not bound in the cells shown, so `pandas.to_numeric` raised NameError.
completedf['sentiment_analysis'] = pd.to_numeric(completedf['sentiment_analysis'])
# completedf['lyrics'] = completedf['lyrics'].encode('utf-8',errors = 'replace')

In [None]:
# Drop the index column left over from a CSV round trip. errors='ignore' keeps the cell
# re-runnable: without it a second run, or a frame that never had the column, raises KeyError.
df2014 = df2014.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# df2014.to_sql(name='billboard2014', # name the table "billboard"
#                    con=engine, # use the connection to MySQL created earlier
#                    if_exists='replace', # if the table is already there, replace it
#                    index=False # do not write the index column in the database)
# )

In [None]:
#importing data into the sql database

# completedf.to_sql(name='billboard', # name the table "billboard"
#                    con=engine, # use the connection to MySQL created earlier
#                    if_exists='replace', # if the table is already there, replace it
# )


In [None]:
import pandas as pd
from sqlalchemy import text

# Build one `billboard` table out of the ten yearly tables (2020 down to 2011).
# CREATE TABLE ... SELECT returns no rows, so it cannot go through pd.read_sql, which
# builds a DataFrame from a result set. It is executed as a plain statement inside a
# transaction instead.
union_sql = "\n    UNION ALL\n".join(
    "    SELECT *\n    FROM billboard" + str(chart_year) for chart_year in range(2020, 2010, -1)
)
query = "\nCREATE TABLE billboard\nSELECT *\nFROM\n(\n" + union_sql + ")a\n"

with engine.begin() as connection:
    connection.execute(text(query))


In [None]:
# Read the combined chart table back into a DataFrame.
query = '''SELECT * FROM billboard'''
billboard = pd.read_sql(sql=query, con=engine)


Unnamed: 0,title,artist,lyrics,sentiment_analysis
0,Blinding Lights,The Weeknd,"b""\nYeah\n\n\nI've been tryna call\nI've been ...",-0.882600
1,Circles,Post Malone,"b""\nOh, oh, oh\nOh,\xe2\x80\x8a oh, oh\nOh,\xe...",-0.634431
2,The Box,Roddy Ricch,"b""\nPullin' out the coupe at the lot\nTold 'em...",-0.976443
3,Don't Start Now,Dua Lipa,b'\nIf you don\'t wanna see me\n\n\nDid a full...,-0.820467
4,Rockstar,DaBaby Featuring Roddy Ricch,"b'\nWoo, woo\nI pull up like\nHow you pull up,...",-0.568297
...,...,...,...,...
893,Are You Gonna Kiss Me Or Not,Thompson Square,\nWe were sitting up there on your momma's roo...,0.422095
894,Animal,Neon Trees,\nHere we go again\nI kinda wanna be more than...,-0.825462
895,You And Tequila,Kenny Chesney Featuring Grace Potter,"\nBaby, here I am again\nKicking dust in the c...",-0.899881
896,Colder Weather,Zac Brown Band,\nShe'd trade Colorado if he'd take her with h...,-0.564249


In [None]:
#Additional column for release dates

#Returns a list of spotify_ids
# Spotify bearer token: SPOTIFY_TOKEN from the environment when set, otherwise the redacted placeholder.
TOKEN = os.environ.get('SPOTIFY_TOKEN', '***')
def get_id(song_name, artist, TOKEN): 
    """Search Spotify for `song_name` and return the ids of the exactly matching tracks.

    Args:
        song_name: Title used as the search query; must equal the track name exactly.
        artist: Must equal the name of the track's first listed artist exactly.
        TOKEN: Spotify bearer token.

    Returns:
        List of track ids (possibly empty).

    Raises:
        KeyError: if the response JSON has no ['tracks']['items'].
    """
    headers = {
        'Accept' : 'application/json',
        'Content-Type': 'application/json',
        'Authorization' : "Bearer " + TOKEN
    }
    # Let requests encode the query: titles containing '&', '#' or '?' corrupted the
    # hand-concatenated query string.
    response = requests.get(
        'https://api.spotify.com/v1/search',
        params={'q': str(song_name), 'type': 'track'},
        headers=headers,
    )
    # Local names no longer shadow the builtin `id` or reuse it for two different things.
    items = response.json()['tracks']['items']
    return [
        track['id']
        for track in items
        if track['name'] == song_name and track['artists'][0]['name'] == artist
    ]


# Try the lookup on the first chart row before running it over the whole table.
first_row = billboard.iloc[0]
song_name = first_row.title #songname
artist = first_row.artist #artists

id = get_id(song_name, artist, TOKEN=TOKEN)
id



In [None]:
import pandas as pd

# Only the two columns needed for the Spotify lookup.
query1 = """
SELECT title, artist FROM billboard
"""
df = pd.read_sql(sql=query1, con=engine)
df

Unnamed: 0,title,artist
0,Blinding Lights,The Weeknd
1,Circles,Post Malone
2,The Box,Roddy Ricch
3,Don't Start Now,Dua Lipa
4,Rockstar,DaBaby Featuring Roddy Ricch
...,...,...
893,Are You Gonna Kiss Me Or Not,Thompson Square
894,Animal,Neon Trees
895,You And Tequila,Kenny Chesney Featuring Grace Potter
896,Colder Weather,Zac Brown Band


In [None]:
# Look up the Spotify ids of every chart entry; `ids` gets one list per row of `df`.
ids = []
for i, (song_name, artist) in enumerate(zip(df['title'], df['artist'])):
  id = get_id(song_name, artist, TOKEN = TOKEN)
  ids.append(id)
ids



In [None]:
# Load the saved 2011 Spotify-id file.
df2011 = pd.read_csv(filepath_or_buffer='spotify_id2011.csv')

In [None]:
import pandas as pd 
import ast
import re
from lyricsgenius import Genius #using this library
from requests import Timeout
from multiprocessing import Process, Pool

# The Genius API token is read from the GENIUS_ACCESS_TOKEN environment variable so it is
# never stored in the notebook (KeyError if the variable is not set).
genius = Genius(os.environ['GENIUS_ACCESS_TOKEN'])

# function to get lyrics of song based on artist and title (artist and title extracted in tophits(year))
def getLyrics(artist, title):
    """Fetch a song's lyrics from Genius with bracketed section markers removed.

    Args:
        artist: Artist credit, possibly of the form "A feat. B" or "A & B".
        title: Song title.

    Returns:
        The lyrics with "[...]" markers stripped, or "" when Genius returns no
        match (an empty string rather than None, because None breaks the pandas import).

    Raises:
        requests.Timeout: if the search times out again after the client was recreated.
        KeyError: if the client has to be recreated and GENIUS_ACCESS_TOKEN is not set.
    """
    # Genius seems to list a song under the first credited artist, so cut the credit
    # at "feat" and then at "&", and drop the whitespace the cut leaves behind.
    primary_artist = artist
    for separator in ("feat", "&"):
        cut = primary_artist.lower().find(separator)
        if cut != -1:
            primary_artist = primary_artist[:cut]
    primary_artist = primary_artist.strip()

    global genius
    try:
        song = genius.search_song(title, primary_artist)
    except Timeout:
        # Timeouts appear after a few dozen requests; a fresh client recovers.
        # The token comes from the environment instead of being written in the notebook.
        genius = Genius(os.environ['GENIUS_ACCESS_TOKEN'])
        print ("Genius session refreshed...")
        song = genius.search_song(title, primary_artist)

    if song is None:
        # Still report the miss on the console, which helps when debugging.
        print(artist + "(" + primary_artist + "):" + title + ": Lyrics not found")
        return ""

    # Same pattern as before; re.MULTILINE was dropped because it only affects ^ and $.
    return re.sub("\\[.*\\]", "", song.lyrics)

# Watson basic-auth credentials. The key is taken from the WATSON_API_KEY environment
# variable when it is set; '***' is the redacted placeholder kept as fallback.
username = "apikey"
password = os.environ.get('WATSON_API_KEY', '***')
def getSentiment(username,password,text):
    """Score the document sentiment of `text` with Watson Natural Language Understanding.

    The request is a GET whose second positional argument (the analysis options
    and the text) is passed to requests as the query parameters.

    Args:
        username: Basic-auth user name.
        password: Basic-auth password (the API key).
        text: Text to analyze (the song lyrics).

    Returns:
        data['sentiment']['document']['score'] from the JSON response, or "" when
        `text` is empty or the service answers with a non-200 status.
    """
    # No lyrics means there is nothing to send to the service.
    if text == "":
        return ""

    query = {
        'features': {'emotion': {}, 'sentiment': {}},
        'version' : '2020-08-01',
        'text'    : text,
    }
    resp = requests.get(
        "https://api.us-south.natural-language-understanding.watson.cloud.ibm.com/instances/6a2e1e87-7d1f-4c77-a353-a1ad953fdcf5/v1/analyze",
        query,
        auth=(username, password),
    )

    if resp.status_code == 200:
        return resp.json()['sentiment']['document']['score']

    # Print the status and message for debugging and use the same empty marker as "no lyrics".
    print("WatsonPlatform error: " + str(resp.status_code) + "(" + resp.text + ")")
    return ""

def get_out_data_row(record):
    """Build one output row (lyrics, sentiment and audio features) for a track.

    Args:
        record: An (index, row) pair as yielded by DataFrame.iterrows(). The row
            must provide 'artists' (a string holding a Python list literal),
            'name' and the audio-feature columns copied below.

    Returns:
        A dict with the keys sentiment_analysis, artist, title, lyrics and the nine
        audio features, or None when no lyrics were found or the lyrics are 5000
        characters or longer.
    """
    row = record[1]
    # The artist list is stored as a string; the first entry is treated as the main artist.
    artist = ast.literal_eval(row['artists'])[0]
    title = row['name']

    lyrics = getLyrics(artist, title)
    if lyrics == "" or len(lyrics) >= 5000:
        return None

    try:
        score = getSentiment(username=username, password=password, text=lyrics)
    except Exception as error:
        # Keep the row with an empty score, but report why it is missing. The previous bare
        # `except:` hid every failure and also swallowed KeyboardInterrupt/SystemExit.
        print("Sentiment lookup failed for " + str(title) + ": " + repr(error))
        score = ''

    out_row = {
        'sentiment_analysis': score,
        'artist': artist,
        'title': title,
        'lyrics': lyrics,
    }
    for feature in ('danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                    'instrumentalness', 'liveness', 'valence', 'tempo'):
        out_row[feature] = row[feature]
    return out_row


In [None]:
import pandas as pd

In [None]:
# df = pd.read_csv('/content/drive/MyDrive/spotify_1922-2020_data.csv')

In [None]:
# Parse the release dates (expected as year/month/day) into datetimes.
df['release_date'] = pd.to_datetime(arg=df['release_date'], format='%Y/%m/%d')

In [None]:
# All tracks released during 2010 (both bounds inclusive).
released_in_2010 = df['release_date'].between('2010/1/1', '2010/12/31')
df2010 = df.loc[released_in_2010]
df2010

In [None]:
df2000=df.loc[(df['release_date']>='2000/1/1') & (df['release_date']<='2000/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2000sample=df2000.sample(n=min(2000, len(df2000)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2000sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2000_data_with_lyrics.csv")
    # !cp spotify_2000_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2001=df.loc[(df['release_date']>='2001/1/1') & (df['release_date']<='2001/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2001sample=df2001.sample(n=min(2000, len(df2001)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2001sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2001_data_with_lyrics.csv")
    # !cp spotify_2001_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2002=df.loc[(df['release_date']>='2002/1/1') & (df['release_date']<='2002/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2002sample=df2002.sample(n=min(2000, len(df2002)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2002sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2002_data_with_lyrics.csv")
    # !cp spotify_2002_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2003=df.loc[(df['release_date']>='2003/1/1') & (df['release_date']<='2003/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2003sample=df2003.sample(n=min(2000, len(df2003)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2003sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2003_data_with_lyrics.csv")
    # !cp spotify_2003_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2004=df.loc[(df['release_date']>='2004/1/1') & (df['release_date']<='2004/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2004sample=df2004.sample(n=min(2000, len(df2004)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2004sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2004_data_with_lyrics.csv")
    # !cp spotify_2004_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2005=df.loc[(df['release_date']>='2005/1/1') & (df['release_date']<='2005/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2005sample=df2005.sample(n=min(2000, len(df2005)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2005sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2005_data_with_lyrics.csv")
    # !cp spotify_2005_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2006=df.loc[(df['release_date']>='2006/1/1') & (df['release_date']<='2006/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2006sample=df2006.sample(n=min(2000, len(df2006)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2006sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2006_data_with_lyrics.csv")
    # !cp spotify_2006_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2007=df.loc[(df['release_date']>='2007/1/1') & (df['release_date']<='2007/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2007sample=df2007.sample(n=min(2000, len(df2007)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2007sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2007_data_with_lyrics.csv")
    # !cp spotify_2007_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2008=df.loc[(df['release_date']>='2008/1/1') & (df['release_date']<='2008/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2008sample=df2008.sample(n=min(2000, len(df2008)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2008sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2008_data_with_lyrics.csv")
    # !cp spotify_2008_data_with_lyrics.csv "drive/My Drive/"

In [None]:
# Inspect the sentiment column of the saved 2007 file.
df2 = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/spotify_2007_data_with_lyrics.csv')
# df2[df2['sentiment_analysis'].isnull()==True]
df2.loc[:, 'sentiment_analysis']

In [None]:
df2009=df.loc[(df['release_date']>='2009/1/1') & (df['release_date']<='2009/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2009sample=df2009.sample(n=min(2000, len(df2009)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2009sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2009_data_with_lyrics.csv")
    # !cp spotify_2009_data_with_lyrics.csv "drive/My Drive/"

In [None]:
# Load the saved 2011 file so its sentiment scores can be recomputed.
df1 = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/spotify_2011_data_with_lyrics.csv')
# df1[df1['sentiment_analysis'].isnull()==True]

In [None]:
df2010=df.loc[(df['release_date']>='2010/1/1') & (df['release_date']<='2010/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2010sample=df2010.sample(n=min(2000, len(df2010)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2010sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2010_data_with_lyrics.csv")
    # !cp spotify_2010_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2011=df.loc[(df['release_date']>='2011/1/1') & (df['release_date']<='2011/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2011sample=df2011.sample(n=min(2000, len(df2011)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2011sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2011_data_with_lyrics.csv")
    # !cp spotify_2011_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2012=df.loc[(df['release_date']>='2012/1/1') & (df['release_date']<='2012/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2012sample=df2012.sample(n=min(2000, len(df2012)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2012sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2012_data_with_lyrics.csv")
    # !cp spotify_2012_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2013=df.loc[(df['release_date']>='2013/1/1') & (df['release_date']<='2013/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2013sample=df2013.sample(n=min(2000, len(df2013)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2013sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2013_data_with_lyrics.csv")
    # !cp spotify_2013_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2014=df.loc[(df['release_date']>='2014/1/1') & (df['release_date']<='2014/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2014sample=df2014.sample(n=min(2000, len(df2014)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2014sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2014_data_with_lyrics.csv")
    # !cp spotify_2014_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2015=df.loc[(df['release_date']>='2015/1/1') & (df['release_date']<='2015/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2015sample=df2015.sample(n=min(2000, len(df2015)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2015sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2015_data_with_lyrics.csv")
    # !cp spotify_2015_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2016=df.loc[(df['release_date']>='2016/1/1') & (df['release_date']<='2016/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2016sample=df2016.sample(n=min(2000, len(df2016)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2016sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2016_data_with_lyrics.csv")
    # !cp spotify_2016_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2017=df.loc[(df['release_date']>='2017/1/1') & (df['release_date']<='2017/12/31')]
# Sample from the 2017 frame: this line used to sample df2010 (copy/paste slip), so the
# "2017" file was filled with 2010 tracks. The sample is also capped at the rows
# available, because sample(n=2000) raises ValueError for a smaller year.
df2017sample=df2017.sample(n=min(2000, len(df2017)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2017sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2017_data_with_lyrics.csv")
    # !cp spotify_2017_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2018=df.loc[(df['release_date']>='2018/1/1') & (df['release_date']<='2018/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2018sample=df2018.sample(n=min(2000, len(df2018)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2018sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2018_data_with_lyrics.csv")
    # !cp spotify_2018_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2019=df.loc[(df['release_date']>='2019/1/1') & (df['release_date']<='2019/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2019sample=df2019.sample(n=min(2000, len(df2019)))

if __name__ == '__main__':
    # Gather the data in 10 worker processes; the `with` block shuts the pool down
    # afterwards (it used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2019sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2019_data_with_lyrics.csv")
    # !cp spotify_2019_data_with_lyrics.csv "drive/My Drive/"

In [None]:
df2020=df.loc[(df['release_date']>='2020/1/1') & (df['release_date']<='2020/12/31')]
# Cap the sample at the rows available: sample(n=2000) raises ValueError for a smaller year.
df2020sample=df2020.sample(n=min(2000, len(df2020)))

if __name__ == '__main__':
    # Map over the 2020 sample: this used to iterate df2010sample (copy/paste slip), so the
    # "2020" file was filled with 2010 tracks. The `with` block also shuts the 10 worker
    # processes down afterwards (the pool used to be left open).
    with Pool(processes=10) as pool:
        output = [x for x in pool.map(get_out_data_row, df2020sample.iterrows()) if x is not None] # filtering out empty results

    out_df = pd.DataFrame(output)
    out_df.to_csv("spotify_2020_data_with_lyrics.csv")
    # !cp spotify_2020_data_with_lyrics.csv "drive/My Drive/"

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd 
# Load the combined file for the chosen year range from Drive.
year='1981-2000'
df = pd.read_csv(f'/content/drive/MyDrive/Project Programming/spotify_{year}_data_with_lyrics.csv')
df

In [None]:
# Pull the lyrics column of df1 out as a plain Python list.
lyrics = df1['lyrics'].tolist()
lyrics #lyrics is in list

In [None]:
# Watson basic-auth credentials. The key is taken from the WATSON_API_KEY environment
# variable when it is set; '***' is the redacted placeholder kept as fallback.
username = "apikey"
password = os.environ.get('WATSON_API_KEY', '***')
def getSentiment(username,password,text):
    """Score the document sentiment of `text` with Watson Natural Language Understanding.

    The request is a GET whose second positional argument (the analysis options
    and the text) is passed to requests as the query parameters.

    Args:
        username: Basic-auth user name.
        password: Basic-auth password (the API key).
        text: Text to analyze (the song lyrics).

    Returns:
        data['sentiment']['document']['score'] from the JSON response, or "" when
        `text` is empty or the service answers with a non-200 status.
    """
    # No lyrics means there is nothing to send to the service.
    if text == "":
        return ""

    query = {
        'features': {'emotion': {}, 'sentiment': {}},
        'version' : '2020-08-01',
        'text'    : text,
    }
    resp = requests.get(
        "https://api.us-south.natural-language-understanding.watson.cloud.ibm.com/instances/6a2e1e87-7d1f-4c77-a353-a1ad953fdcf5/v1/analyze",
        query,
        auth=(username, password),
    )

    if resp.status_code == 200:
        return resp.json()['sentiment']['document']['score']

    # Print the status and message for debugging and use the same empty marker as "no lyrics".
    print("WatsonPlatform error: " + str(resp.status_code) + "(" + resp.text + ")")
    return ""

In [None]:
import requests
# Score every lyric; `scores` stays aligned with `lyrics` (one entry per lyric).
scores=[]
songs = 0
for lyric in lyrics:
  try: 
    score=getSentiment(username=username, password=password, text=lyric)
  except Exception as error:
    # Record an empty score for this lyric. `score` used to keep the previous song's value
    # (or be undefined on the first iteration), so the print below showed the wrong score or
    # raised NameError. The bare `except:` also swallowed KeyboardInterrupt.
    print("Sentiment lookup failed: " + repr(error))
    score = ''
  scores.append(score)
  print(f'{score}: {lyric}')
scores


In [None]:
# Write the list `scores` into the sentiment column of df1.
# NOTE(review): assumes `scores` was built from df1's own lyrics, one entry per row — confirm.
df1['sentiment_analysis']=scores
# df1

Unnamed: 0.1,Unnamed: 0,sentiment_analysis,artist,title,lyrics,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0,0.829528,Liquid Soul,Hypnotic Energy,"Listen, can you hear it?\nThe music\nI can her...",0.657,0.820,-6.796,0.0669,0.03220,0.911000,0.0632,0.0574,134.988
1,1,-0.713173,Gym Class Heroes,Ass Back Home (feat. Neon Hitch),"\nOh, it's so sexy, yo\n\n\nI don't know where...",0.716,0.838,-4.289,0.0513,0.13400,0.000000,0.1480,0.6460,130.034
2,2,,Erin,Vanha sydän,"Vanha, vanha sydämeni, älä mulle laula\n\nVanh...",0.469,0.456,-6.874,0.0273,0.69200,0.000000,0.1170,0.2020,137.874
3,3,0.787667,Benjamin Francis Leftwich,Atlas Hands,"\nTake me to the docks, there's a ship without...",0.630,0.316,-11.885,0.0293,0.89800,0.002170,0.0616,0.4770,82.529
4,4,0.974078,Cassius,I <3 U SO,Oooh I love you so\nBut why I loved you\nI'll ...,0.126,0.798,-5.620,0.0424,0.00793,0.237000,0.2210,0.1360,188.718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202,1202,-0.563769,Havasi,The Storm,\nSpring break at Lake Havasu\nShe matriculate...,0.350,0.654,-11.562,0.0350,0.39600,0.942000,0.1090,0.1230,107.749
1203,1203,0.997011,Scorpions,Still Loving You,"\nTime, it needs time\nTo win back your love a...",0.282,0.605,-4.916,0.0294,0.00470,0.001440,0.1030,0.0783,103.929
1204,1204,-0.574395,ConeCrewDiretoria,Chama os Mulekes,"\nChama os muleke, eu tô chamado, convocado, e...",0.589,0.803,-4.460,0.3370,0.35700,0.000000,0.1270,0.5230,88.105
1205,1205,-0.62169,Toby Keith,Made in America,\nMy old man's that old man\nSpent his life li...,0.497,0.701,-5.996,0.0306,0.05070,0.000000,0.1030,0.3760,172.126


In [None]:
# Remove the index column that the CSV round trip added.
df1 = df1.drop(columns=['Unnamed: 0'])
df1

Unnamed: 0,sentiment_analysis,artist,title,lyrics,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.829528,Liquid Soul,Hypnotic Energy,"Listen, can you hear it?\nThe music\nI can her...",0.657,0.820,-6.796,0.0669,0.03220,0.911000,0.0632,0.0574,134.988
1,-0.713173,Gym Class Heroes,Ass Back Home (feat. Neon Hitch),"\nOh, it's so sexy, yo\n\n\nI don't know where...",0.716,0.838,-4.289,0.0513,0.13400,0.000000,0.1480,0.6460,130.034
2,,Erin,Vanha sydän,"Vanha, vanha sydämeni, älä mulle laula\n\nVanh...",0.469,0.456,-6.874,0.0273,0.69200,0.000000,0.1170,0.2020,137.874
3,0.787667,Benjamin Francis Leftwich,Atlas Hands,"\nTake me to the docks, there's a ship without...",0.630,0.316,-11.885,0.0293,0.89800,0.002170,0.0616,0.4770,82.529
4,0.974078,Cassius,I <3 U SO,Oooh I love you so\nBut why I loved you\nI'll ...,0.126,0.798,-5.620,0.0424,0.00793,0.237000,0.2210,0.1360,188.718
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202,-0.563769,Havasi,The Storm,\nSpring break at Lake Havasu\nShe matriculate...,0.350,0.654,-11.562,0.0350,0.39600,0.942000,0.1090,0.1230,107.749
1203,0.997011,Scorpions,Still Loving You,"\nTime, it needs time\nTo win back your love a...",0.282,0.605,-4.916,0.0294,0.00470,0.001440,0.1030,0.0783,103.929
1204,-0.574395,ConeCrewDiretoria,Chama os Mulekes,"\nChama os muleke, eu tô chamado, convocado, e...",0.589,0.803,-4.460,0.3370,0.35700,0.000000,0.1270,0.5230,88.105
1205,-0.62169,Toby Keith,Made in America,\nMy old man's that old man\nSpent his life li...,0.497,0.701,-5.996,0.0306,0.05070,0.000000,0.1030,0.3760,172.126


In [None]:
# Save the re-scored 2011 data next to the notebook.
df1.to_csv(path_or_buf="spotify_2011_data_with_lyrics.csv")
# !cp spotify_2011_data_with_lyrics.csv "drive/My Drive/"

In [None]:
# Save the year-range frame under the matching file name.
df.to_csv(f'spotify_{year}_data_with_lyrics.csv')