# Sentiment Analysis

In [136]:
import numpy as np 
import pandas as pd 
import re
import nltk 
from nltk.sentiment import SentimentIntensityAnalyzer
%matplotlib inline

import altair as alt

In [230]:
# Disable Altair's default limit on the number of rows a chart's data may
# contain, so the full data frames can be plotted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

#### Loading Data

In [96]:
# Load the per-genre "final" data frames (country, rock, rb) from CSV.
# These are the frames whose 'words' column is scored further down.
df_cty_all = pd.read_csv('../Data/df_cty_final.csv')
df_rock_all = pd.read_csv('../Data/df_rock_final.csv')
df_rb_all = pd.read_csv('../Data/df_rb_final.csv')

In [74]:
# Load the per-genre n-gram data frames; every missing cell is replaced with
# the placeholder string 'NA' so that downstream string handling never sees
# NaN.
# NOTE(review): the rb path below has no '.csv' extension, unlike the other
# two -- confirm that this is really the file's name on disk.
df_cty = pd.read_csv('../Data/df_for_cty_ngrams.csv').fillna('NA')
df_rock = pd.read_csv('../Data/df_for_rock_ngrams.csv').fillna('NA')
df_rb = pd.read_csv('../Data/df_for_rb_ngrams').fillna('NA')

### Functions

Find all substrings that match the regex. The pattern used here returns any word or number, and can also return tokens that contain certain special characters.

In [38]:
# Compiled once: a token is any run of non-whitespace characters that both
# starts and ends on a word boundary (so surrounding punctuation is trimmed).
_TOKEN_RE = re.compile(r'\b\S+\b')


def remove_stop(line):
    """Split *line* into lower-cased, scrubbed tokens.

    Despite its name, this function does not apply a stop-word list; it only
    tokenizes and cleans.

    Each token is lower-cased and then scrubbed of a few literal sequences:
    a backslash followed by ``u`` becomes a space, the substrings ``205f``
    and ``2005`` are deleted, and any remaining backslashes are deleted
    (presumably residue of escaped ``\\u205f`` / ``\\u2005`` space
    characters -- confirm against the raw data).  Note that this also deletes
    ``2005`` when it appears as an ordinary number.

    Args:
        line: The text to tokenize.

    Returns:
        A list of cleaned tokens in their original order.  Tokens that are
        empty after scrubbing are dropped.
    """
    tokens = []
    for raw in _TOKEN_RE.findall(line):
        cleaned = (
            raw.lower()
            .replace('\\u', ' ')
            .replace("205f", '')
            .replace("2005", '')
            .replace("\\", '')
        )
        # Filter on the cleaned token: the regex never yields an empty match,
        # so only the scrubbing above can produce an empty string.
        if cleaned:
            tokens.append(cleaned)
    return tokens

### Sentiment Intensity Analyzer

Returns the negative (neg), neutral (neu), positive (pos), and compound sentiment scores for a corpus of words.

In [89]:
def get_sentiment(df, col):
    """Score every row of ``df[col]`` with NLTK's SentimentIntensityAnalyzer.

    For each cell, leading and trailing ``[`` / ``]`` characters are stripped
    (presumably the cell is a stringified list -- confirm against the CSVs),
    the text is tokenized with ``remove_stop``, re-joined with single spaces
    and passed to ``polarity_scores``.

    Args:
        df: Data frame holding the text.
        col: Name of the column to score.  Every cell must be a string; a
            non-string cell (e.g. NaN) fails on ``.strip``.

    Returns:
        A new DataFrame with the columns ``neg``, ``neu``, ``pos`` and
        ``compound``, one row per row of *df* in the same order, with a fresh
        default index.
    """
    sia = SentimentIntensityAnalyzer()
    scores = {'neg': [], 'neu': [], 'pos': [], 'compound': []}

    # Iterate the column's values directly instead of indexing by position.
    for raw_text in df[col]:
        text = ' '.join(remove_stop(raw_text.strip('][')))
        for key, value in sia.polarity_scores(text).items():
            scores[key].append(value)

    return pd.DataFrame(scores)

In [111]:
# Summary statistics of the sentiment scores computed from the 'words'
# column of df_rb_all.
get_sentiment(df_rb_all , 'words').describe()

Unnamed: 0,neg,neu,pos,compound
count,1004.0,1004.0,1004.0,1004.0
mean,0.159663,0.604218,0.236122,0.29004
std,0.108938,0.135197,0.119651,0.877743
min,0.0,0.146,0.0,-0.9999
25%,0.078,0.523,0.153,-0.895075
50%,0.14,0.602,0.218,0.9413
75%,0.219,0.688,0.30825,0.993525
max,0.715,1.0,0.747,1.0


In [110]:
# Summary statistics of the sentiment scores computed from the 'words'
# column of df_rock_all.
get_sentiment(df_rock_all , 'words').describe()

Unnamed: 0,neg,neu,pos,compound
count,1066.0,1066.0,1066.0,1066.0
mean,0.164172,0.591174,0.244656,0.329583
std,0.104154,0.137673,0.119618,0.843333
min,0.0,0.118,0.0,-0.9999
25%,0.08725,0.497,0.155,-0.770025
50%,0.1485,0.585,0.2345,0.93375
75%,0.22375,0.678,0.318,0.9911
max,0.59,1.0,0.808,0.9998


In [109]:
# Summary statistics of the sentiment scores computed from the 'words'
# column of df_cty_all.
get_sentiment(df_cty_all , 'words').describe()

Unnamed: 0,neg,neu,pos,compound
count,1015.0,1015.0,1015.0,1015.0
mean,0.115074,0.631357,0.253572,0.609056
std,0.084524,0.126109,0.11749,0.681318
min,0.0,0.251,0.0,-0.9999
25%,0.0505,0.553,0.1665,0.68765
50%,0.099,0.64,0.236,0.9731
75%,0.163,0.726,0.3315,0.99295
max,0.55,0.972,0.715,1.0


# Graphing the Sentiment of song segments for each Genre

### Hip-Hop/Rap

In [186]:
# RB: mean 'neg' / 'pos' score for each song segment, in song order.
# NOTE(review): 'Chorus' is requested three times below, so Chorus1-3 are all
# computed from the same df_rb['Chorus'] column -- confirm whether the later
# choruses live in separate columns.
features_for_analysis = ['Intro', 'Chorus', 'Verse 1', 'Chorus', 'Verse 2', 'Chorus']
segment_frames = []
num = 0
for position, segment in enumerate(features_for_analysis):
    scores = get_sentiment(df_rb, segment)[['neg', 'pos']]
    if position % 2 != 0:
        # Odd positions are the choruses; number them so that each one gets
        # its own label (Chorus1, Chorus2, Chorus3).
        num += 1
        label = segment + str(num)
    else:
        label = segment
    # .assign returns a new frame, so nothing is written onto a column subset.
    segment_frames.append(scores.assign(structure=label))

# Concatenate once instead of growing a frame inside the loop: this avoids
# re-copying the accumulated rows on every pass and avoids concatenating onto
# an empty placeholder frame.
r = pd.concat(segment_frames)

x_rb = r.groupby('structure', as_index=False).mean()

In [220]:
# Line chart of the mean positive score per segment, with the x-axis ordered
# by position in the song.
rb = (
    alt.Chart(x_rb)
    .mark_line(size=5, opacity=0.5)
    .encode(
        x=alt.X(
            'structure:N',
            sort=['Intro', 'Chorus1', 'Verse 1', 'Chorus2', 'Verse 2', 'Chorus3'],
            title=None,
        ),
        y='pos:Q',
    )
    .properties(width=750, height=100)
)

### Rock

In [225]:
# Rock: mean 'neg' / 'pos' score for each song segment, in song order.
# NOTE(review): both 'Pre-Chorus' entries keep the same label, so they fall
# into a single group below; repeated segment names are also read from the
# same df_rock column each time -- confirm this is intended.
features_for_analysis = ['Verse 1', 'Pre-Chorus', 'Chorus', 'Verse 2',
                         'Pre-Chorus', 'Chorus', 'Bridge', 'Chorus']
# Positions of the choruses, which are numbered to get distinct labels.
watch_list = [2, 5, 7]
segment_frames = []
num = 0
for position, segment in enumerate(features_for_analysis):
    scores = get_sentiment(df_rock, segment)[['neg', 'pos']]
    if position in watch_list:
        num += 1
        label = segment + str(num)
    else:
        label = segment
    # .assign returns a new frame, so nothing is written onto a column subset.
    segment_frames.append(scores.assign(structure=label))

# Concatenate once instead of growing a frame inside the loop: this avoids
# re-copying the accumulated rows on every pass and avoids concatenating onto
# an empty placeholder frame.
r = pd.concat(segment_frames)

x_rock = r.groupby('structure', as_index=False).mean()

In [226]:
# Line chart of the mean positive score per segment, with the x-axis ordered
# by position in the song.
rock = (
    alt.Chart(x_rock)
    .mark_line(size=5, opacity=0.5, color='#a24857')
    .encode(
        x=alt.X(
            'structure:N',
            sort=[
                'Verse 1', 'Pre-Chorus', 'Chorus1', 'Verse 2',
                'Pre-Chorus', 'Chorus2', 'Bridge', 'Chorus3',
            ],
            title=None,
        ),
        y='pos:Q',
    )
    .properties(width=750, height=100)
)

### Country

In [227]:
# Country: mean 'neg' / 'pos' score for each song segment, in song order.
# NOTE(review): 'Chorus' is requested three times below, so Chorus1-3 are all
# computed from the same df_cty['Chorus'] column -- confirm whether the later
# choruses live in separate columns.
features_for_analysis = ['Verse 1', 'Chorus', 'Verse 2', 'Chorus', 'Bridge', 'Chorus']
# Positions of the choruses, which are numbered to get distinct labels.
watch_list = [1, 3, 5]
segment_frames = []
num = 0
for position, segment in enumerate(features_for_analysis):
    scores = get_sentiment(df_cty, segment)[['neg', 'pos']]
    if position in watch_list:
        num += 1
        label = segment + str(num)
    else:
        label = segment
    # .assign returns a new frame, so nothing is written onto a column subset.
    segment_frames.append(scores.assign(structure=label))

# Concatenate once instead of growing a frame inside the loop: this avoids
# re-copying the accumulated rows on every pass and avoids concatenating onto
# an empty placeholder frame.
r = pd.concat(segment_frames)

x_cty = r.groupby('structure', as_index=False).mean()

In [228]:
# Line chart of the mean positive score per segment, with the x-axis ordered
# by position in the song.
cty = (
    alt.Chart(x_cty)
    .mark_line(size=5, opacity=0.5, color='#D2B48C')
    .encode(
        x=alt.X(
            'structure:N',
            sort=['Verse 1', 'Chorus1', 'Verse 2', 'Chorus2', 'Bridge', 'Chorus3'],
            title=None,
        ),
        y='pos:Q',
    )
    .properties(width=750, height=100)
)

In [229]:
# Stack the three genre charts vertically (rock, rb, country) so they can be
# compared.
alt.vconcat(rock , rb , cty)