# Sentiment Analyzer for News Quotes

1. Read cleaned data into a pandas dataframe
2. Pass the quotes and the non-quoted text from each article into the sentiment analyzer
3. Save output into new columns 'negative', 'neutral', 'positive', 'compound'
4. Save output into a new Excel workbook with three sheets: one for quotes, one for non-quotes, and one comparing their compound scores

In [1]:
# Run this cell only when the data lives on Google Drive and the notebook runs in Colab.
from google.colab import drive

# Mount Drive at /content/drive; the Excel paths used later in this notebook point under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [3]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Extracting Data from Excel Files

In [43]:
# path to the cleaned input workbook (quotes_input.xlsx); adjust to your own Drive layout
fp = '/content/drive/My Drive/evaluation_quotes/quotes_input.xlsx'

id_cols = ["text_id", "text_name"]
quote_cols = ["quote", "speaker", "verb"]
non_quote_cols = ["non_quoted_text"]

# parse the workbook once, rather than once per dataframe, keeping only the needed columns
input_df = pd.read_excel(fp, usecols=id_cols + quote_cols + non_quote_cols)

# independent copies, so adding score columns to one frame never touches the other
quotes_df = input_df[id_cols + quote_cols].copy()

non_quotes_df = input_df[id_cols + non_quote_cols].copy()

In [44]:
# give both dataframes empty float columns that will later hold the VADER scores
score_columns = ('negative', 'neutral', 'positive', 'compound')

for frame in (quotes_df, non_quotes_df):
  for score_column in score_columns:
    # an empty float Series aligns on the frame's index, producing an all-NaN column
    frame[score_column] = pd.Series(dtype='float')

## Running Quotes through VADER

Vader Documentation: https://github.com/cjhutto/vaderSentiment

Guide to using Vader: https://medium.com/@rslavanyageetha/vader-a-comprehensive-guide-to-sentiment-analysis-in-python-c4f1868b0d2e

VADER's `polarity_scores` returns a dictionary of sentiment scores for a given
text with four keys: `neg`, `neu`, `pos`, and `compound`. The negative, neutral,
and positive scores each range from 0 to 1 and give the proportion of the text
that falls into that category, so together they sum to approximately 1. The
compound score is a single normalized measure of overall sentiment, ranging
from -1 (most negative) through 0 (neutral) to 1 (most positive).


In [34]:
# helper function to extract scores for each story
# args: dataframe, column name of text to be analyzed as a string
def get_sentiment_score(df, col, analyzer=None):
  """Score each row's text with VADER and write the scores into ``df`` in place.

  Args:
    df: DataFrame to annotate. The columns 'negative', 'neutral', 'positive'
      and 'compound' are written for every row that has text in ``col``.
    col: name of the column holding the text to analyze.
    analyzer: optional object exposing ``polarity_scores(text)``. When omitted,
      a new ``SentimentIntensityAnalyzer`` is created. Pass one in to reuse the
      same analyzer across several calls.

  Returns:
    None. ``df`` is modified in place.
  """
  # build the analyzer once per call instead of once per row
  if analyzer is None:
    analyzer = SentimentIntensityAnalyzer()

  for index, text in df[col].items():
    # empty cells are missing values, not text; leave their score columns untouched
    if pd.isna(text):
      continue

    score = analyzer.polarity_scores(str(text))

    df.loc[index, 'negative'] = score['neg']
    df.loc[index, 'neutral'] = score['neu']
    df.loc[index, 'positive'] = score['pos']
    df.loc[index, 'compound'] = score['compound']

In [45]:
# score the quoted text; fills the negative/neutral/positive/compound columns of quotes_df in place
get_sentiment_score(quotes_df, 'quote')

In [46]:
# preview the first rows of quotes_df, now including the score columns
quotes_df.head()

Unnamed: 0,text_id,text_name,quote,speaker,verb,negative,neutral,positive,compound
0,01aceb45444212877ad3c6b8a340ac85,2021_02_05_ShaniaOBrien,In the official statement made this morning\nt...,the NUS\nthe NUS\nJackie Chen from the SA Labo...,said\nsaid\nreporting\ncontinues\nnan\nreached,0.175,0.781,0.044,-0.9531
1,054c82651b895adb42592c3b55b04fde,2021_10_10_MaxShanahan,"that ""when casuals do claim the actual hours t...",Staff\nA spokesperson for the USyd Casuals Net...,told\ntelling\ntold\ncriticised\nnan\nhighligh...,0.059,0.872,0.069,0.4393
2,0740ab6bebf7c4c8575f950bfce8d8a8,2021_05_02_ClaireOllivain,it's pretty obvious that there's no threat her...,The security\nThe security\nEAG member Holly H...,said\nsaid\nsaid\ntold\ntold\nsaid\nsaid\ntold...,0.073,0.86,0.067,0.0776
3,07d7f15966bdf625f5358fbb179b5033,2021_11_27_MarlowHurst_ShaniaOBrien_SamuelGarrett,that they were now available in the bathrooms ...,Mills\n2021 Sydney University Dramatic Society...,reported\nsaid\nindicated\nnoted\naccording to...,0.04,0.893,0.067,0.5423
4,0a7f70b8d6612b7964adca2db0ae0242,2022_05_11_RileyVaughan,The University of Sydney Union has been using ...,USU President Prudence Wilkins-Wheat\nnan,told\nnan,0.0,0.953,0.047,0.4019


In [47]:
# score the non-quoted text; fills the negative/neutral/positive/compound columns of non_quotes_df in place
get_sentiment_score(non_quotes_df, 'non_quoted_text')

In [48]:
# preview the first rows of non_quotes_df, now including the score columns
non_quotes_df.head()

Unnamed: 0,text_id,text_name,non_quoted_text,negative,neutral,positive,compound
0,01aceb45444212877ad3c6b8a340ac85,2021_02_05_ShaniaOBrien,"US condemns ""horrific"" assault on internationa...",0.205,0.789,0.006,-0.9888
1,054c82651b895adb42592c3b55b04fde,2021_10_10_MaxShanahan,"fter USyd's denial, Fair Work Ombudsman issues...",0.022,0.9,0.078,0.875
2,0740ab6bebf7c4c8575f950bfce8d8a8,2021_05_02_ClaireOllivain,ensions escalate at UTSSA; President calls sec...,0.096,0.856,0.048,-0.9805
3,07d7f15966bdf625f5358fbb179b5033,2021_11_27_MarlowHurst_ShaniaOBrien_SamuelGarrett,SU Board Meeting: Honourary Secretary resigns\...,0.056,0.831,0.113,0.9973
4,0a7f70b8d6612b7964adca2db0ae0242,2022_05_11_RileyVaughan,REAKING: USU election loophole allows voter fr...,0.043,0.879,0.078,0.925


In [49]:
# third dataframe: compound scores of quoted and non-quoted text side by side,
# with rows matched on text_id and text_name
key_cols = ['text_id', 'text_name']

scores_comp_df = (
    quotes_df[key_cols + ['compound']]
    .merge(non_quotes_df[key_cols + ['compound']],
           on=key_cols,
           suffixes=('_quotes', '_non_quotes'))
    .rename(columns={'compound_quotes': 'quote_score',
                     'compound_non_quotes': 'non_quote_score'})
)

scores_comp_df.head()

Unnamed: 0,text_id,text_name,quote_score,non_quote_score
0,01aceb45444212877ad3c6b8a340ac85,2021_02_05_ShaniaOBrien,-0.9531,-0.9888
1,054c82651b895adb42592c3b55b04fde,2021_10_10_MaxShanahan,0.4393,0.875
2,0740ab6bebf7c4c8575f950bfce8d8a8,2021_05_02_ClaireOllivain,0.0776,-0.9805
3,07d7f15966bdf625f5358fbb179b5033,2021_11_27_MarlowHurst_ShaniaOBrien_SamuelGarrett,0.5423,0.9973
4,0a7f70b8d6612b7964adca2db0ae0242,2022_05_11_RileyVaughan,0.4019,0.925


## Write Output to New Workbook

In [13]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/159.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m153.6/159.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.0


In [50]:
# path of the workbook to create; adjust to your own Drive layout
# NOTE(review): the file name is 'quote_sentiment.xlsx' (singular) - confirm that is the intended spelling
output = '/content/drive/My Drive/evaluation_quotes/quote_sentiment.xlsx'

# the context manager saves and closes the workbook on exit, even if one of the writes fails
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
  # write each dataframe to its own worksheet
  quotes_df.to_excel(writer, sheet_name="quotes", index=False)
  non_quotes_df.to_excel(writer, sheet_name="non_quotes", index=False)
  scores_comp_df.to_excel(writer, sheet_name="scores_comp", index=False)