<a href="https://colab.research.google.com/github/sessex/billboard-sentiment-analysis/blob/master/data-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Billboard Hot 100 Sentiment Analysis**
Sentiment analysis of the lyrics of Billboard Hot 100 songs from 1965 to 2015. NLP was used to evaluate the emotion of each word in a song and to assign each song an emotion rating: the proportion of words associated with a given emotion relative to the song's total word count. The results were explored and visualized using d3.js at the link below.

Final visualization: https://observablehq.com/d/9f91ebecdcb70699


# **Import tools & files**

In [None]:
import pandas as pd
import nltk 
nltk.download('punkt')
from nltk import tokenize
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

import numpy as np

from tqdm import tqdm_notebook as tqdm
from tqdm import trange

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Load the Billboard lyrics CSV, decoding the file as Latin-1.
allBillboardHot100 = pd.read_csv(
    'billboard_lyrics_1964-2015.csv',
    encoding='latin-1',
)

In [None]:
# Preview the first rows to check which columns were loaded.
allBillboardHot100.head()

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0


# **Create dataframe that calculates sentiment values for each song based on lyrics**




In [None]:
def text_emotion(df, column, lexicon_path='NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'):
  """Count emotion-lexicon hits for every row of a text column.

  Each entry of ``df[column]`` is split with ``word_tokenize``; every token is
  lower-cased, stemmed with the English Snowball stemmer and looked up in the
  lexicon. For each token found, the lexicon's association values are added to
  that row's per-emotion totals.

  Args:
    df: DataFrame holding the text to score. It is not modified.
    column: Name of the column in ``df`` that contains the text.
    lexicon_path: Path to the tab-separated lexicon file whose rows are
      ``word``, ``emotion``, ``association``. Defaults to the file name this
      notebook uses.

  Returns:
    A new DataFrame with the columns of ``df`` followed by one integer column
    per emotion in the lexicon, holding the summed associations for that row.
    Rows whose text is missing (NaN/None) get zeros in every emotion column.
  """
  emolex_df = pd.read_csv(lexicon_path,
                          names=["words", "emotion", "association"],
                          sep='\t')
  # One row per lexicon word, one column per emotion. A word/emotion pair that
  # is absent from the file is treated as "no association" so that the totals
  # stay integers.
  emolex_words = (emolex_df
                  .pivot(index='words', columns='emotion', values='association')
                  .fillna(0)
                  .astype('int64'))
  emotions = emolex_words.columns

  # Plain dict lookup per token instead of filtering the whole lexicon frame
  # for every single word.
  word_scores = dict(zip(emolex_words.index, emolex_words.to_numpy()))

  stemmer = SnowballStemmer("english")

  # Totals are accumulated by row position, so the result does not depend on
  # the index labels of ``df`` being unique.
  counts = np.zeros((len(df), len(emotions)), dtype='int64')

  for row_pos, lyrics in enumerate(tqdm(df[column], total=len(df))):
    if pd.isna(lyrics):
      # Missing text: leave the zeros rather than scoring the string "nan".
      continue
    for token in word_tokenize(str(lyrics)):
      # NOTE(review): tokens are stemmed before the lookup while the lexicon
      # words are used exactly as read from the file, so a stem that differs
      # from its dictionary form will not match -- confirm this is intended.
      scores = word_scores.get(stemmer.stem(token.lower()))
      if scores is not None:
        counts[row_pos] += scores

  emo_df = pd.DataFrame(counts, index=df.index, columns=emotions)
  return pd.concat([df, emo_df], axis=1)

In [None]:
# Append per-emotion hit counts, computed from each song's lyrics.
billboard_df = text_emotion(df=allBillboardHot100, column='Lyrics')

HBox(children=(IntProgress(value=0, max=5100), HTML(value='')))




In [None]:
# Check that the emotion count columns were appended to the song data.
billboard_df.head()

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0,2,1,2,0,0,2,2,0,0,0
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0,1,3,1,0,8,2,15,1,1,0
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0,0,0,0,0,0,0,0,0,0,0
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0,3,3,0,5,0,5,0,6,0,0
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0,0,2,0,0,10,7,10,5,2,2


# **Normalize sentiment values by song's word count**

In [None]:
# Number of tokens in each song's lyrics; used as the denominator when the
# emotion counts are normalized.
billboard_df['word_count'] = billboard_df['Lyrics'].map(
    lambda lyrics: len(tokenize.word_tokenize(str(lyrics)))
)

In [None]:
# Check the new word_count column next to the raw emotion counts.
billboard_df.head()

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,word_count
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0,2,1,2,0,0,2,2,0,0,0,125
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0,1,3,1,0,8,2,15,1,1,0,205
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0,0,0,0,0,0,0,0,0,0,0,0
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0,3,3,0,5,0,5,0,6,0,0,152
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0,0,2,0,0,10,7,10,5,2,2,232


In [None]:
# Names of the emotion/sentiment columns to normalize.
emotions = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy',
    'negative', 'positive', 'sadness', 'surprise', 'trust',
]

In [None]:
# Turn raw hit counts into per-word proportions. Songs with a word count of 0
# end up as NaN (0 / 0). The columns are overwritten, so run this cell once:
# running it again divides the already-normalized values a second time.
billboard_df[emotions] = billboard_df[emotions].div(billboard_df['word_count'], axis=0)

In [None]:
# Inspect the normalized values; a song with word_count 0 shows NaN here.
billboard_df.head()

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,word_count
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0,0.016,0.008,0.016,0.0,0.0,0.016,0.016,0.0,0.0,0.0,125
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0,0.004878,0.014634,0.004878,0.0,0.039024,0.009756,0.073171,0.004878,0.004878,0.0,205
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0,,,,,,,,,,,0
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0,0.019737,0.019737,0.0,0.032895,0.0,0.032895,0.0,0.039474,0.0,0.0,152
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0,0.0,0.008621,0.0,0.0,0.043103,0.030172,0.043103,0.021552,0.008621,0.008621,232


# **Clean results**

In [None]:
# Mark songs without usable lyrics as missing so that a later dropna() removes them.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the selection billboard_df['Lyrics'] is chained assignment and is not
# guaranteed to update billboard_df itself.
# The pattern matches empty AND whitespace-only lyrics.
# NOTE(review): the blank-lyrics row has word_count 0, which suggests its text
# is whitespace rather than an exact '' -- confirm against the raw CSV.
billboard_df['Lyrics'] = billboard_df['Lyrics'].replace(r'^\s*$', np.nan, regex=True)
billboard_df.head()

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,word_count
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0,0.016,0.008,0.016,0.0,0.0,0.016,0.016,0.0,0.0,0.0,125
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0,0.004878,0.014634,0.004878,0.0,0.039024,0.009756,0.073171,0.004878,0.004878,0.0,205
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0,0.019737,0.019737,0.0,0.032895,0.0,0.032895,0.0,0.039474,0.0,0.0,152
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0,0.0,0.008621,0.0,0.0,0.043103,0.030172,0.043103,0.021552,0.008621,0.008621,232
5,6,downtown,petula clark,1965,when youre alone and life is making you lonel...,1.0,0.008333,0.025,0.008333,0.0125,0.008333,0.058333,0.016667,0.020833,0.004167,0.008333,240


In [None]:
# Remove every row that has a missing value in any column (for example, songs
# whose emotion scores are NaN).
billboard_df = billboard_df.dropna()

# **Export results**

In [None]:
# Mount Google Drive at the relative path 'drive' so the exported CSV can be
# copied into it. The google.colab package is provided by the Colab runtime,
# so this cell is expected to run only there.
from google.colab import drive
drive.mount('drive')

In [None]:
# Write the cleaned, normalized results to a CSV in the working directory.
# The DataFrame index is written too, because to_csv keeps its default index=True.
billboard_df.to_csv('billboard_sentiment_analysis.csv')
# Shell escape: copy the exported file into the mounted Drive folder.
!cp billboard_sentiment_analysis.csv "drive/My Drive/"

# **Additional: Compare songs ranked #1**

In [None]:
# Keep only the songs ranked #1 among the rows that survived cleaning, then
# display them.
rankOne_df = billboard_df.loc[billboard_df['Rank'] == 1]
rankOne_df

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,word_count
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0,0.016,0.008,0.016,0.0,0.0,0.016,0.016,0.0,0.0,0.0,125
200,1,to sir with love,lulu,1967,those school girl days of telling tales and b...,1.0,0.007143,0.028571,0.007143,0.007143,0.021429,0.021429,0.064286,0.007143,0.0,0.042857,140
300,1,hey jude,the beatles,1968,hey jude dont make it bad take a sad song and ...,3.0,0.007117,0.010676,0.010676,0.014235,0.003559,0.021352,0.014235,0.010676,0.0,0.010676,281
400,1,sugar sugar,the archies,1969,sugar honey honey you are my candy girl and y...,1.0,0.0,0.045226,0.0,0.0,0.055276,0.0,0.180905,0.0,0.045226,0.035176,199
500,1,bridge over troubled water,simon garfunkel,1970,when youre weary feeling small when tears are ...,3.0,0.006135,0.018405,0.006135,0.01227,0.02454,0.02454,0.03681,0.02454,0.0,0.02454,163
600,1,joy to the world,three dog night,1971,jeremiah was a bullfrog was a good friend of ...,1.0,0.0,0.01145,0.030534,0.003817,0.09542,0.034351,0.129771,0.022901,0.007634,0.015267,262
700,1,the first time ever i saw your face,roberta flack,1972,the first time ever i saw your face i thought...,1.0,0.014706,0.073529,0.0,0.014706,0.073529,0.014706,0.080882,0.029412,0.029412,0.036765,136
800,1,tie a yellow ribbon round the ole oak tree,tony orlando and dawn,1973,im comin home ive done my time now ive got to ...,3.0,0.130137,0.119863,0.068493,0.013699,0.109589,0.027397,0.160959,0.010274,0.05137,0.05137,292
900,1,the way we were,barbra streisand,1974,memories light the corners of my mind misty w...,1.0,0.0,0.023077,0.0,0.007692,0.015385,0.007692,0.015385,0.007692,0.015385,0.015385,130
1000,1,love will keep us together,captain tennille,1975,love love will keep us together think of me ba...,3.0,0.017241,0.021552,0.021552,0.017241,0.060345,0.025862,0.064655,0.017241,0.008621,0.008621,232
