In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import seaborn as sns
import string
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Apply seaborn's default theme to the figures drawn in this notebook.
sns.set()
# Render matplotlib figures inline, below the cell that draws them.
%matplotlib inline
# Single VADER analyzer instance, reused for every lyric line scored further down.
# NOTE(review): presumably needs the NLTK 'vader_lexicon' resource to be downloaded beforehand — confirm.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Read the raw song table, then keep only the rows that actually have lyrics
# (i.e. drop instrumentals). The index is rebuilt so rows are labelled 0..n-1.
songdata = pd.read_csv('data/top10songslyrics.csv')
has_lyrics = songdata['Lyrics'].notnull()
lyricaldata = songdata[has_lyrics].reset_index(drop=True)

In [None]:
# Pre-processing to prepare for sentiment analysis: rewrite slang and clipped
# spellings ("gonna", "nothin'", "'em", ...) into their standard forms.

# (pattern, replacement) pairs, applied in this order to every song.
# Each pattern is anchored so it only fires on the slang form itself:
#   * word boundaries keep e.g. "wannabe" from becoming "want tobe";
#   * the clipped "-in'" rule needs a letter before it and no letter after the
#     apostrophe, so "nothin'" -> "nothing" while "ain't" and "rain's" are left
#     alone. This replaces the earlier "in'" -> "ing" followed by
#     "ingt" -> "in't" round trip, which also rewrote words that genuinely
#     contain "ingt" (e.g. "Washington" -> "Washin'ton").
slang_expansions = [
    (re.compile(r"\b[Gg]onna\b"), 'going to'),
    (re.compile(r"\b[Ww]anna\b"), 'want to'),
    (re.compile(r"\b[Yy]'all\b"), 'you all'),
    (re.compile(r"'cause\b"), 'because'),
    (re.compile(r"\b[Gg]otta\b"), 'have to'),
    (re.compile(r"\bgon'"), 'going to'),
    (re.compile(r"(?<=\w)in'(?!\w)"), 'ing'),
    (re.compile(r"'em\b"), 'them'),
]


def expand_slang(lyrics):
    """Return `lyrics` with every slang pattern above replaced by its expansion."""
    for pattern, replacement in slang_expansions:
        lyrics = pattern.sub(replacement, lyrics)
    return lyrics


# One vectorised assignment instead of a row-by-row .loc loop.
lyricaldata['Lyrics'] = lyricaldata['Lyrics'].map(expand_slang)

In [None]:
# Recompute each song's unique/total word counts from the cleaned lyrics.

# Built once and reused for every song (previously rebuilt on each iteration).
port_stem = PorterStemmer()
# string.punctuation contains regex metacharacters (\ ] ^ -). Without escaping,
# the sequence "\]" inside the character class is read as an escaped bracket and
# the backslash itself is silently left out of the set.
punctuation_only = re.compile('[' + re.escape(string.punctuation) + ']+')

unique_word_counts = []
total_word_counts = []
for my_text in lyricaldata['Lyrics']:
    # Lower-case the tokens and drop the ones that are just punctuation.
    token_text = [
        token.lower()
        for token in word_tokenize(my_text)
        if not punctuation_only.fullmatch(token)
    ]
    # Stemming folds e.g. "stop", "stopping" and "stops" into one unique word.
    stem_text = {port_stem.stem(token) for token in token_text}
    unique_word_counts.append(len(stem_text))
    total_word_counts.append(len(token_text))

# Assign whole columns at once rather than one cell at a time.
lyricaldata['Unique Words'] = unique_word_counts
lyricaldata['Total Words'] = total_word_counts

In [None]:
# Average number of times each (stemmed) word is used in a song: total / unique.
lyricaldata['Uniqueness'] = lyricaldata['Total Words'] / lyricaldata['Unique Words']

# Per-song sentiment: the mean VADER pos/neu/neg share over the lines in which
# VADER detected any sentiment at all (lines scored as fully neutral are skipped).
#
# The lyrics are deliberately NOT rewritten here. An earlier version re-applied
# the "in'" -> "ing" substitution at this point, which turned "ain't" back into
# "aingt" right before scoring and left that corruption in the 'Lyrics' column
# for the later word-frequency cells.
positive_scores = []
neutral_scores = []
negative_scores = []
trueneutral = []  # Positions of songs in which no line carried any sentiment.

for i, lyrics in enumerate(lyricaldata['Lyrics']):
    # Split lyrics by line for sentiment analysis: each token runs from the
    # first word character on a line up to the line break.
    token_text = regexp_tokenize(lyrics, pattern=r'\w[^\n]*')
    line_scores = [analyzer.polarity_scores(sentence) for sentence in token_text]
    line_scores = [vs for vs in line_scores if vs['neu'] != 1]
    length = len(line_scores)

    if length == 0:
        # This only happens if Vader can't detect sentiments in any line of the song.
        print('Song ' + lyricaldata.loc[i,'Title'] + ' at position ' + str(i) + ' returned neutral.')
        trueneutral.append(i)
        # NOTE(review): such songs keep 0 for all three scores and therefore still
        # take part in any later averages over these columns — confirm that this
        # is intended, or exclude the rows listed in `trueneutral` downstream.
        positive_scores.append(0.0)
        neutral_scores.append(0.0)
        negative_scores.append(0.0)
        continue

    positive_scores.append(sum(float(vs['pos']) for vs in line_scores) / length)
    neutral_scores.append(sum(float(vs['neu']) for vs in line_scores) / length)
    negative_scores.append(sum(float(vs['neg']) for vs in line_scores) / length)

# Whole-column float assignment: avoids writing floats one cell at a time into
# columns that were initialised with the integer 0.
lyricaldata['Positive'] = positive_scores
lyricaldata['Neutral'] = neutral_scores
lyricaldata['Negative'] = negative_scores

In [None]:
# Build a dataframe with one row per year holding the average uniqueness and the
# average positive / neutral / negative sentiment of that year's songs, weighted
# by 'Number of weeks in top ten'. A song that spent three weeks in the top ten
# therefore counts three times, which approximates averaging over every
# individual weekly chart.

metric_columns = ['Uniqueness', 'Positive', 'Neutral', 'Negative']
weeks_in_top_ten = lyricaldata['Number of weeks in top ten']


def weighted_mean(values, weights):
    """Return sum(values * weights) / sum(weights), or NaN when the weights sum to 0.

    Missing values are not skipped (skipna=False): a NaN in either input gives a
    NaN result instead of a silently re-weighted mean.
    """
    total_weight = weights.sum(skipna=False)
    if total_weight == 0:
        # A year without any songs: report "no data" rather than dividing by zero.
        return float('nan')
    return (values * weights).sum(skipna=False) / total_weight


years = list(range(lyricaldata['Year'].min(), lyricaldata['Year'].max() + 1))
yearly_means = {column: [] for column in metric_columns}
for year in years:
    in_year = lyricaldata['Year'] == year
    year_weights = weeks_in_top_ten[in_year]
    for column in metric_columns:
        yearly_means[column].append(
            weighted_mean(lyricaldata.loc[in_year, column], year_weights)
        )

weightedmeandata = pd.DataFrame(yearly_means, index=years)
# The per-metric lists are kept under their previous names as well.
unqmeans, posmeans, neumeans, negmeans = (yearly_means[column] for column in metric_columns)
weightedmeandata.head()

In [None]:
# Sentiment-over-time plot: cubic regression fits of the yearly weighted
# positive and negative sentiment scores.

sentdata = weightedmeandata.drop(columns=['Uniqueness', 'Neutral'])
sentdata['Year'] = sentdata.index
# Right-hand edge of the data. It positions the line labels and bounds the guide
# lines, so the figure keeps working when the dataset ends in a year other than 2019.
last_year = int(sentdata['Year'].max())

positive_color = '#4c72FF'
negative_color = '#F03600'

sns.set()
with sns.axes_style('white'):
    g = sns.FacetGrid(sentdata, height=4, aspect=1.5)
sns.despine(left=True, bottom=True)
g.map(sns.regplot, 'Year', 'Positive', color=positive_color, truncate=True, order=3)
g.map(sns.regplot, 'Year', 'Negative', color=negative_color, truncate=True, order=3)
plt.ylabel('Sentiment Value', fontsize=18)
plt.title('Sentiment Analysis on Lyrics of All Top 10 Songs', fontsize=18)
plt.xlabel('')
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Faint vertical guide every five years, spanning the plotted value range.
plt.vlines(
    x=range(1960, last_year + 1, 5),
    ymin=weightedmeandata['Negative'].min(),
    ymax=weightedmeandata['Positive'].max(),
    alpha=0.1,
)
plt.figtext(
    0.99, 0.01,
    'Generated with Vader Sentiment Analysis\nDoes not include lyrics with no sentiment detected',
    fontsize=10, horizontalalignment='right', fontname='Arial',
)
# Label each series just to the right of its final data point.
plt.annotate('Positive', xy=(last_year + 0.5, sentdata.loc[last_year, 'Positive'] + 0.005), fontsize=15, color=positive_color)
plt.annotate('Negative', xy=(last_year + 0.5, sentdata.loc[last_year, 'Negative'] - 0.005), fontsize=15, color=negative_color)
g.savefig('visualizations/sentiment.png', dpi=160)

In [None]:
# Word repetition plot: yearly weighted 'Uniqueness' with a linear regression fit.

sns.set()
sns.set_style('white')

year_axis = list(weightedmeandata.index)
plot = sns.regplot(x=year_axis, y='Uniqueness', data=weightedmeandata)
sns.despine(left=True, bottom=True)

# Titles and axis labels.
plt.title('Repetitiveness of Top 10 Songs Over Time', fontsize=15)
plt.xlabel('', fontsize=13)
plt.ylabel('Avg Repetitions per Word', fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Footnote in the bottom-right corner of the figure.
plt.figtext(
    0.99, 0.01,
    'Songs are grouped by year and weighted by how long they stayed in the top 10',
    fontsize=9.5, horizontalalignment='right', fontname='Arial',
)
plt.tight_layout()
plt.show()
plot.get_figure().savefig('visualizations/weightedrepetition.png', dpi=160)

In [None]:
# Count how many songs in the dataset belong to each year. The first year in the
# data (1958) is skipped because the Billboard Hot 100 only existed
# August-December of that year.

first_counted_year = lyricaldata['Year'].min() + 1
final_year = lyricaldata['Year'].max()

years = list(range(first_counted_year, final_year + 1))
top10songs = [int((lyricaldata['Year'] == year).sum()) for year in years]
top10songsdf = pd.DataFrame({'Number of Top 10 Songs': top10songs}, index=years)

In [None]:
# Top 10 Songs Per Year plot

sns.set()
sns.set_style('white')

line_color = '#4c72FF'
# Year marked with footnote [1] on the chart.
methodology_change_year = 1991
songs_in_marked_year = top10songsdf.loc[methodology_change_year, 'Number of Top 10 Songs']

# Pass the colour to lineplot directly. The earlier
# `palette=sns.set_palette([...])` handed lineplot `None` (set_palette returns
# nothing) and only coloured the line by changing seaborn's global palette.
plot = sns.lineplot(data=top10songsdf, palette=[line_color])
plot.get_legend().remove()
sns.despine(left=True, bottom=True)

# Mark the annotated year with a dot and its footnote reference.
plt.scatter(methodology_change_year, songs_in_marked_year, color='black', s=20)
plt.annotate('[1]', xy=(methodology_change_year + 0.5, songs_in_marked_year), fontsize=10)

plt.xlabel('', fontsize=13)
plt.ylabel('Songs', fontsize=13)
plt.title('Songs Reaching the Top 10 Each Year', fontsize=14, fontname='Helvetica')
plt.xticks(fontsize=12, fontname='Arial')
plt.yticks(fontsize=12, fontname='Arial')
# Footnote date corrected: November has 30 days, so "November 31st" cannot be right.
plt.figtext(
    0.99, 0.01,
    '[1] November 30th, 1991: Billboard Hot 100 begins new data tracking method',
    fontsize=9, horizontalalignment='right', fontname='Arial',
)
plot.get_figure().savefig('visualizations/top10songsperyear.png', dpi=160)
plt.show()

In [None]:
# This cell and the three after it build, show and save the word cloud.
# First step: pour every song's lyrics into one long string, with line breaks
# turned into spaces.

all_lyrics = ' '.join(lyricaldata['Lyrics'])
fulltext = re.sub(r'(\n|\r)', ' ', all_lyrics)

In [None]:
# Stop words: the wordcloud defaults plus filler syllables common in lyrics.
lyric_fillers = ['got','oh','ooh','na','la','doo','now']
stopwords = set(STOPWORDS)
stopwords.update(lyric_fillers)

# Build the cloud from the lower-cased lyrics; collocations=False turns off
# wordcloud's word-pair (bigram) entries.
cloud_builder = WordCloud(
    stopwords=stopwords,
    width=1920,
    height=1080,
    max_words=300,
    background_color='white',
    max_font_size=600,
    colormap='viridis',
    collocations=False,
)
wordcloud = cloud_builder.generate(fulltext.lower())

In [None]:
# Preview the word cloud inline, without axes or ticks around the image.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Write the generated word cloud image to the visualizations folder.
wordcloud.to_file("visualizations/overall_wordcloud.png")

In [None]:
# Frequency table of every space-separated, lower-cased token across all lyrics.
# Splitting on a single space keeps empty-string tokens wherever two separators
# were adjacent, so '' shows up as a "word" in this table.
lowercase_tokens = fulltext.lower().split(' ')
token_frequencies = pd.Series(lowercase_tokens).value_counts()
wordcounts = token_frequencies.to_frame()
print(wordcounts)

In [None]:
# Now we're going to generate the barplot showing word frequency distribution.
# This cell prepares its data: the word counts with stop words removed.

stopwords = set(STOPWORDS)
stopwords.update(['got','oh','ooh','na','la','doo','now'])

# Expose the word (currently the index) as a regular column next to its count.
counted_words = wordcounts.assign(Word=wordcounts.index)

# Filter on the word itself. The earlier merge-based filter joined on the only
# column both frames shared — the count — so it dropped any word whose count
# happened to equal a stop word's count, and could duplicate rows.
is_stopword = counted_words['Word'].isin(stopwords)
# split(' ') yields empty-string tokens; drop them explicitly. The earlier code
# discarded the first row by position instead, presumably for this reason.
is_empty_token = counted_words['Word'] == ''

# Result: columns [count, 'Word'], most frequent first, with a fresh 0..n-1 index.
wordcounts = counted_words[~is_stopword & ~is_empty_token].reset_index(drop=True)

In [None]:
# Give the two columns readable names for plotting (count column first, then the word).
wordcounts.columns = ['Count','Word']
# Display the column labels to confirm the rename.
wordcounts.columns

In [None]:
# Bar plot of the 15 most frequent words that are not stop words.

sns.set()
# matplotlib 3.6 renamed its bundled 'seaborn' style sheet to 'seaborn-v0_8' and
# later releases dropped the old name, so use whichever one is installed.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

plot = sns.barplot(x='Word', y='Count', data=wordcounts[0:15])
# Restyle the existing tick labels only. Hard-coding the y labels as
# 0, 2000, ..., 14000 would mislabel the axis whenever matplotlib picks
# different tick positions for the data.
plt.xticks(rotation=45, fontsize=13)
plt.yticks(fontsize=13)
plt.xlabel('Word', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.tight_layout()
# Save before showing, in the same order as the songs-per-year plot.
plot.get_figure().savefig('visualizations/top10songswordsbarplot.png', dpi=160)
plt.show()