# Sentiment Analyzer for News Quotes

1. Read cleaned data into a pandas dataframe
2. Pass quotes from each article into sentiment analyzer
3. Save output into new columns 'negative', 'neutral', 'positive', 'compound'
4. Save output into a new Excel workbook with one sheet each for quotes, non-quotes, speakers, verbs, the combined scores, and a summary

In [None]:
# run this code if connecting to a Google drive
# (Colab only: mounts the drive at '/content/drive', which is the root of the
# Excel input/output paths used further down in this notebook)
from google.colab import drive

drive.mount('/content/drive')

In [None]:
!pip install vaderSentiment

In [3]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Extracting Data from Excel Files

In [233]:
# replace with quotes_input.xlsx
fp = '/content/drive/My Drive/evaluation_quotes/CBC_qt_output/2023_12_dec_CBC_news_clean_new.xlsx'

# key columns shared by every dataframe, and the four text columns to be scored
id_cols = ["text_id", "text_name"]
text_cols = ["quote", "non_quoted_text", "speaker", "verb"]

# parse the workbook once (instead of once per text column) and slice it;
# .copy() makes each slice an independent dataframe so that adding the score
# columns later does not touch the shared frame or trigger SettingWithCopyWarning
clean_df = pd.read_excel(fp, usecols=id_cols + text_cols)

quotes_df = clean_df[id_cols + ["quote"]].copy()

non_quotes_df = clean_df[id_cols + ["non_quoted_text"]].copy()

speakers_df = clean_df[id_cols + ["speaker"]].copy()

verbs_df = clean_df[id_cols + ["verb"]].copy()

In [234]:
# add new (empty, float) score columns to every dataframe
# one loop replaces the sixteen copy-pasted assignments; the columns are
# created in the same order as before: positive, negative, neutral, compound
score_columns = ['positive', 'negative', 'neutral', 'compound']

for frame in (quotes_df, non_quotes_df, speakers_df, verbs_df):
    for score_column in score_columns:
        # an empty float Series aligns to the frame's index, giving an all-NaN column
        frame[score_column] = pd.Series(dtype='float')

## Running Quotes through VADER

Vader Documentation: https://github.com/cjhutto/vaderSentiment

Guide to using Vader: https://medium.com/@rslavanyageetha/vader-a-comprehensive-guide-to-sentiment-analysis-in-python-c4f1868b0d2e

Vader sentiment analyzer returns a dictionary of sentiment intensity scores for
a particular text input with the following sentiments: negative, neutral,
positive, and compound for overall sentiment intensity. The negative, neutral,
and positive scores have a value from 0 to 1 and compound scores have a
value from -1 to 1, with -1 indicating entirely negative, 0 indicating
entirely neutral, and 1 indicating entirely positive.


In [30]:
# helper function to extract scores for each story
# args: dataframe, column name of text to be analyzed as a string
def get_sentiment_score(df, col, analyzer=None):
  """Score every text in ``df[col]`` and store the results in ``df`` (in place).

  The columns 'positive', 'negative', 'neutral' and 'compound' are written
  (created if absent) from the 'pos', 'neg', 'neu' and 'compound' entries of
  the dictionary returned by ``analyzer.polarity_scores``.

  Rows whose text is not a string (e.g. an empty Excel cell read as NaN) are
  not passed to the analyzer; their four scores are left as NaN.

  Args:
    df: pandas DataFrame holding the text to analyze; modified in place.
    col: name of the column in ``df`` that contains the text.
    analyzer: optional object exposing ``polarity_scores(text) -> dict``.
      When omitted, a ``SentimentIntensityAnalyzer`` is created.

  Returns:
    None.
  """
  # the analyzer does not depend on the row, so build it once rather than per row
  if analyzer is None:
    analyzer = SentimentIntensityAnalyzer()

  # output column name -> key in the polarity_scores dictionary
  score_keys = {'positive': 'pos', 'negative': 'neg',
                'neutral': 'neu', 'compound': 'compound'}
  collected = {column: [] for column in score_keys}

  # iterate over the values positionally; this avoids chained label lookups
  # (df[col][index]), which break when index labels are duplicated
  for text in df[col]:
    score = analyzer.polarity_scores(text) if isinstance(text, str) else None
    for column, key in score_keys.items():
      collected[column].append(np.nan if score is None else score[key])

  # assign whole columns at once; a plain array is assigned by position
  for column, values in collected.items():
    df[column] = np.asarray(values, dtype='float64')

In [235]:
# score each of the four dataframes on its own text column
frames_and_text_columns = (
    (quotes_df, 'quote'),
    (non_quotes_df, 'non_quoted_text'),
    (speakers_df, 'speaker'),
    (verbs_df, 'verb'),
)

for frame, text_column in frames_and_text_columns:
    get_sentiment_score(frame, text_column)

In [236]:
quotes_df.head()

Unnamed: 0,text_id,text_name,quote,positive,negative,neutral,compound
0,001b0621b7d3f51059f7916f8e7667d4,6578feec58b434ef00050084,that reports of Vancouver police denying acces...,0.042,0.152,0.806,-0.9327
1,0047ea8b7faf501b9b6ddb8763f1c8b5,6578876058b434ef0001648e,that the federal government will begin a consu...,0.0,0.056,0.944,-0.5994
2,005daa355633d0162360c026c3b67079,65718d7e58b434ef00c88628,he believes airlines are dropping base fares a...,0.061,0.058,0.881,0.6617
3,0065db8f249888574dffa4da85775cd5,6570eb2258b434ef00c361fa,Moore was working on the top of one of the hom...,0.09,0.026,0.884,0.9756
4,0070093b109fa990775e758fd1b3b0cc,6569aaf258b434ef00879ae6,"his bid would have cost $3.5 million per year,...",0.102,0.027,0.871,0.9371


In [237]:
non_quotes_df.head()

Unnamed: 0,text_id,text_name,non_quoted_text,positive,negative,neutral,compound
0,001b0621b7d3f51059f7916f8e7667d4,6578feec58b434ef00050084,British Columbia's human rights commissioner h...,0.105,0.0,0.895,0.6059
1,0047ea8b7faf501b9b6ddb8763f1c8b5,6578876058b434ef0001648e,The federal government is planning to launch a...,0.0,0.0,1.0,0.0
2,005daa355633d0162360c026c3b67079,65718d7e58b434ef00c88628,Can you fly round-trip from Toronto to Lisbon ...,0.063,0.04,0.897,0.8901
3,0065db8f249888574dffa4da85775cd5,6570eb2258b434ef00c361fa,Jurors in a coroner's inquest on Monday heard ...,0.06,0.054,0.886,0.4299
4,0070093b109fa990775e758fd1b3b0cc,6569aaf258b434ef00879ae6,Dr. Todd Young owns virtual health clinic Medi...,0.146,0.026,0.828,0.9935


In [238]:
speakers_df.head()

Unnamed: 0,text_id,text_name,speaker,positive,negative,neutral,compound
0,001b0621b7d3f51059f7916f8e7667d4,6578feec58b434ef00050084,Kasari Govender\nGovender\nShe\nVancouver Poli...,0.0,0.0,1.0,0.0
1,0047ea8b7faf501b9b6ddb8763f1c8b5,6578876058b434ef0001648e,Housing Minister Sean Fraser\nFraser\nFraser\n...,0.0,0.0,1.0,0.0
2,005daa355633d0162360c026c3b67079,65718d7e58b434ef00c88628,"Jeff Verman, CEO of Plus Travel Group,\nVerman...",0.0,0.024,0.976,-0.296
3,0065db8f249888574dffa4da85775cd5,6570eb2258b434ef00c361fa,"Cyr\nCyr\nCyr\nTim Tompkins, owner of TNT Insu...",0.103,0.0,0.897,0.6808
4,0070093b109fa990775e758fd1b3b0cc,6569aaf258b434ef00879ae6,Young\nYoung\nYoung\nHe\nhe\nspokesperson Trac...,0.0,0.0,1.0,0.0


In [239]:
verbs_df.head()

Unnamed: 0,text_id,text_name,verb,positive,negative,neutral,compound
0,001b0621b7d3f51059f7916f8e7667d4,6578feec58b434ef00050084,says\nsays\nsays\nsaid\nsaid,0.0,0.0,1.0,0.0
1,0047ea8b7faf501b9b6ddb8763f1c8b5,6578876058b434ef0001648e,announced\nsaid\nsaid\nsays,0.0,0.0,1.0,0.0
2,005daa355633d0162360c026c3b67079,65718d7e58b434ef00c88628,said\nsaid\nsaid\nsays\nsaid\ntold\nsays\nsaid...,0.0,0.0,1.0,0.0
3,0065db8f249888574dffa4da85775cd5,6570eb2258b434ef00c361fa,said\nsaid\nsaid\nsaid\nsaid\ntestified\nsaid\...,0.0,0.0,1.0,0.0
4,0070093b109fa990775e758fd1b3b0cc,6569aaf258b434ef00879ae6,told\nsaid\nsays\nsaid\nsaid\nsaid\nsaid\nadde...,0.362,0.0,0.638,0.6249


In [240]:
# combine quote and non-quote scores into one dataframe, matched on
# text_id/text_name, so the two can be compared side by side
score_cols = ['text_id', 'text_name', 'positive', 'negative', 'compound']

# short names for the suffixed columns produced by the merge
short_names = {
    'positive_quotes': 'quote_pos',
    'negative_quotes': 'quote_neg',
    'compound_quotes': 'quote_comp',
    'positive_non_quotes': 'non_quote_pos',
    'negative_non_quotes': 'non_quote_neg',
    'compound_non_quotes': 'non_quote_comp',
}

scores_all_df = pd.merge(
    quotes_df[score_cols],
    non_quotes_df[score_cols],
    on=['text_id', 'text_name'],
    suffixes=('_quotes', '_non_quotes'),
).rename(columns=short_names)

In [241]:
scores_all_df.head()

Unnamed: 0,text_id,text_name,quote_pos,quote_neg,quote_comp,non_quote_pos,non_quote_neg,non_quote_comp
0,001b0621b7d3f51059f7916f8e7667d4,6578feec58b434ef00050084,0.042,0.152,-0.9327,0.105,0.0,0.6059
1,0047ea8b7faf501b9b6ddb8763f1c8b5,6578876058b434ef0001648e,0.0,0.056,-0.5994,0.0,0.0,0.0
2,005daa355633d0162360c026c3b67079,65718d7e58b434ef00c88628,0.061,0.058,0.6617,0.063,0.04,0.8901
3,0065db8f249888574dffa4da85775cd5,6570eb2258b434ef00c361fa,0.09,0.026,0.9756,0.06,0.054,0.4299
4,0070093b109fa990775e758fd1b3b0cc,6569aaf258b434ef00879ae6,0.102,0.027,0.9371,0.146,0.026,0.9935


### Summary Calculations
Add the following columns to the dataframe:
* quote_prop_difference: positive score - negative score
* quote_prop_direction: if it leans positive (>0), negative (<0), or neutral (=0)
* same for non-quotes: non_quote_prop_difference and non_quote_prop_direction

In [242]:
# proportion difference = positive score minus negative score,
# placed right after each group's compound column
quote_diff = scores_all_df['quote_pos'].sub(scores_all_df['quote_neg'])
scores_all_df.insert(5, 'quote_prop_difference', quote_diff)

non_quote_diff = scores_all_df['non_quote_pos'].sub(scores_all_df['non_quote_neg'])
scores_all_df.insert(9, 'non_quote_prop_difference', non_quote_diff)

In [243]:
scores_all_df.head()

Unnamed: 0,text_id,text_name,quote_pos,quote_neg,quote_comp,quote_prop_difference,non_quote_pos,non_quote_neg,non_quote_comp,non_quote_prop_difference
0,001b0621b7d3f51059f7916f8e7667d4,6578feec58b434ef00050084,0.042,0.152,-0.9327,-0.11,0.105,0.0,0.6059,0.105
1,0047ea8b7faf501b9b6ddb8763f1c8b5,6578876058b434ef0001648e,0.0,0.056,-0.5994,-0.056,0.0,0.0,0.0,0.0
2,005daa355633d0162360c026c3b67079,65718d7e58b434ef00c88628,0.061,0.058,0.6617,0.003,0.063,0.04,0.8901,0.023
3,0065db8f249888574dffa4da85775cd5,6570eb2258b434ef00c361fa,0.09,0.026,0.9756,0.064,0.06,0.054,0.4299,0.006
4,0070093b109fa990775e758fd1b3b0cc,6569aaf258b434ef00879ae6,0.102,0.027,0.9371,0.075,0.146,0.026,0.9935,0.12


In [244]:
# determine sentiment direction
# labels are matched positionally with the three conditions in each list below
direction = ['positive', 'negative', 'neutral']

quote_conds = [
    scores_all_df.quote_prop_difference > 0,
    scores_all_df.quote_prop_difference < 0,
    scores_all_df.quote_prop_difference == 0,
]

# np.select fills rows where no condition holds with `default`, which is the
# integer 0 unless specified; recent NumPy releases refuse to combine that
# integer with string choices and raise a TypeError. Pass a string default.
# The default is only reached when the difference is NaN.
quote_prop_direction = np.select(quote_conds, direction, default='unknown')

# place the label right after quote_prop_difference
scores_all_df.insert(6, 'quote_prop_direction', quote_prop_direction)

non_quote_conds = [
    scores_all_df.non_quote_prop_difference > 0,
    scores_all_df.non_quote_prop_difference < 0,
    scores_all_df.non_quote_prop_difference == 0,
]

scores_all_df['non_quote_prop_direction'] = np.select(non_quote_conds, direction, default='unknown')

In [245]:
scores_all_df.head()

Unnamed: 0,text_id,text_name,quote_pos,quote_neg,quote_comp,quote_prop_difference,quote_prop_direction,non_quote_pos,non_quote_neg,non_quote_comp,non_quote_prop_difference,non_quote_prop_direction
0,001b0621b7d3f51059f7916f8e7667d4,6578feec58b434ef00050084,0.042,0.152,-0.9327,-0.11,negative,0.105,0.0,0.6059,0.105,positive
1,0047ea8b7faf501b9b6ddb8763f1c8b5,6578876058b434ef0001648e,0.0,0.056,-0.5994,-0.056,negative,0.0,0.0,0.0,0.0,neutral
2,005daa355633d0162360c026c3b67079,65718d7e58b434ef00c88628,0.061,0.058,0.6617,0.003,positive,0.063,0.04,0.8901,0.023,positive
3,0065db8f249888574dffa4da85775cd5,6570eb2258b434ef00c361fa,0.09,0.026,0.9756,0.064,positive,0.06,0.054,0.4299,0.006,positive
4,0070093b109fa990775e758fd1b3b0cc,6569aaf258b434ef00879ae6,0.102,0.027,0.9371,0.075,positive,0.146,0.026,0.9935,0.12,positive


## Aggregate Data
Add a new sheet (dataframe) called 'summary' with the following columns:
* Row labels (first column): quote, non_quote, speaker, verb
* Headers:
  * compound_max: max value of compound scores
  * compound_min: min value of compound scores
  * compound_avg: average value of compound scores
  * pos_comp_count: number of compound scores that lean positive (exclude everything in the range [-0.05, 0.05])
  * neg_comp_count: number of compound scores that lean negative
  * avg_prop_diff: average value of proportion difference
  * pos_prop_direction_count: number of texts whose proportion difference is positive
  * neg_prop_direction_count: number of texts whose proportion difference is negative
  * avg_pos_prop_difference: average proportion difference over the texts with a positive difference
  * avg_neg_prop_difference: average proportion difference over the texts with a negative difference
  * no_polarity_comp_avg: average of the absolute values of the compound scores (polarity ignored)

In [19]:
# helper function that builds one summary row for a category
# df1: the category's own dataframe (must have a 'compound' column)
# df2: the scores-all dataframe (read only when category is not '')
# category: column prefix in df2 ('quote' or 'non_quote'), or '' for none
def aggregate_data(df1, df2, category):
  """Return the list of summary statistics for one category.

  The first six entries come from df1['compound']: max, min, mean, mean of
  absolute values, count above 0.05 and count below -0.05.

  When ``category`` is not the empty string, five more entries are read from
  df2's '<category>_prop_difference' and '<category>_prop_direction' columns:
  mean difference, number of 'positive' labels, number of 'negative' labels,
  mean of the positive differences and mean of the negative differences.
  When ``category`` is '', those five entries are the placeholder value 0.
  """
  compound = df1['compound']

  stats = [
      compound.max(),
      compound.min(),
      compound.mean(),
      compound.abs().mean(),
      # scores inside [-0.05, 0.05] count as neither positive nor negative
      compound[compound > 0.05].count(),
      compound[compound < -0.05].count(),
  ]

  # categories without proportion columns get zero placeholders
  if category == '':
    return stats + [0, 0, 0, 0, 0]

  differences = df2[category + '_prop_difference']
  stats.append(differences.mean())

  labels = df2[category + '_prop_direction']
  stats.append(labels[labels == 'positive'].count())
  stats.append(labels[labels == 'negative'].count())

  stats.append(differences[differences > 0].mean())
  stats.append(differences[differences < 0].mean())

  return stats

In [246]:
# create summary dataframe: one row per category, one column per statistic

# row and column headers
row_labels = ['quote', 'non_quote', 'speaker', 'verb']
col_labels = ['compound_max', 'compound_min', 'compound_avg', 'no_polarity_comp_avg',
              'pos_comp_count', 'neg_comp_count', 'avg_prop_diff',
              'pos_prop_direction_count', 'neg_prop_direction_count',
              'avg_pos_prop_difference', 'avg_neg_prop_difference']

# rows are listed in the same order as row_labels; speakers and verbs pass ''
# as the category, so their proportion statistics are the 0 placeholders
data_summary = [
    aggregate_data(quotes_df, scores_all_df, 'quote'),
    aggregate_data(non_quotes_df, scores_all_df, 'non_quote'),
    aggregate_data(speakers_df, scores_all_df, ''),
    aggregate_data(verbs_df, scores_all_df, ''),
]

summary_df = pd.DataFrame(data_summary, index=row_labels, columns=col_labels)

In [247]:
summary_df.head()

Unnamed: 0,compound_max,compound_min,compound_avg,no_polarity_comp_avg,pos_comp_count,neg_comp_count,avg_prop_diff,pos_prop_direction_count,neg_prop_direction_count,avg_pos_prop_difference,avg_neg_prop_difference
quote,0.9994,-0.9996,0.257418,0.766441,2181,1217,0.027849,2216,1205,0.085558,-0.075991
non_quote,0.9999,-0.9999,0.164688,0.794369,2039,1415,0.009751,2057,1412,0.058409,-0.060783
speaker,0.9781,-0.969,0.066639,0.183614,917,430,0.0,0,0,0.0,0.0
verb,0.8555,-0.9747,-0.008579,0.07064,317,344,0.0,0,0,0.0,0.0


## Write Output to New Workbook

In [248]:
# map urls to file names
metadata = '/content/drive/My Drive/evaluation_quotes/metadata/2023_12_dec_CBC_news_metadata.xlsx'

# load only the filename/url pair and rename 'filename' to 'text_name',
# the key column name used by the other dataframes
metadata_df = (
    pd.read_excel(metadata, usecols=["filename", "url"])
    .rename(columns={'filename': 'text_name'})
)

metadata_df.head()

Unnamed: 0,text_name,url
0,6569e97958b434ef00898608,https://subscriptions.cbc.ca/newsletter_static...
1,65692bb658b434ef0082ff04,https://www.cbc.ca/news/canada/calgary/gift-ca...
2,6569248a58b434ef0082b088,https://www.cbc.ca/news/canada/british-columbi...
3,6569252c58b434ef0082c1c3,https://www.cbc.ca/news/canada/toronto/ago-rev...
4,656924e558b434ef0082ba26,https://www.cbc.ca/news/canada/nova-scotia/sea...


In [249]:
# attach the url of each text by matching on text_name; a left join keeps
# every scored row even when the metadata has no entry for it
url_lookup = metadata_df[['text_name', 'url']]

new_quotes_df = pd.merge(quotes_df, url_lookup, on='text_name', how='left')

new_non_quotes_df = pd.merge(non_quotes_df, url_lookup, on='text_name', how='left')

In [250]:
new_quotes_df.head()

Unnamed: 0,text_id,text_name,quote,positive,negative,neutral,compound,url
0,001b0621b7d3f51059f7916f8e7667d4,6578feec58b434ef00050084,that reports of Vancouver police denying acces...,0.042,0.152,0.806,-0.9327,https://www.cbc.ca/news/canada/british-columbi...
1,0047ea8b7faf501b9b6ddb8763f1c8b5,6578876058b434ef0001648e,that the federal government will begin a consu...,0.0,0.056,0.944,-0.5994,https://www.cbc.ca/news/politics/canada-home-d...
2,005daa355633d0162360c026c3b67079,65718d7e58b434ef00c88628,he believes airlines are dropping base fares a...,0.061,0.058,0.881,0.6617,https://www.cbc.ca/news/canada/toronto/air-tra...
3,0065db8f249888574dffa4da85775cd5,6570eb2258b434ef00c361fa,Moore was working on the top of one of the hom...,0.09,0.026,0.884,0.9756,https://www.cbc.ca/news/canada/new-brunswick/d...
4,0070093b109fa990775e758fd1b3b0cc,6569aaf258b434ef00879ae6,"his bid would have cost $3.5 million per year,...",0.102,0.027,0.871,0.9371,https://www.cbc.ca/news/canada/newfoundland-la...


In [251]:
new_non_quotes_df.head()

Unnamed: 0,text_id,text_name,non_quoted_text,positive,negative,neutral,compound,url
0,001b0621b7d3f51059f7916f8e7667d4,6578feec58b434ef00050084,British Columbia's human rights commissioner h...,0.105,0.0,0.895,0.6059,https://www.cbc.ca/news/canada/british-columbi...
1,0047ea8b7faf501b9b6ddb8763f1c8b5,6578876058b434ef0001648e,The federal government is planning to launch a...,0.0,0.0,1.0,0.0,https://www.cbc.ca/news/politics/canada-home-d...
2,005daa355633d0162360c026c3b67079,65718d7e58b434ef00c88628,Can you fly round-trip from Toronto to Lisbon ...,0.063,0.04,0.897,0.8901,https://www.cbc.ca/news/canada/toronto/air-tra...
3,0065db8f249888574dffa4da85775cd5,6570eb2258b434ef00c361fa,Jurors in a coroner's inquest on Monday heard ...,0.06,0.054,0.886,0.4299,https://www.cbc.ca/news/canada/new-brunswick/d...
4,0070093b109fa990775e758fd1b3b0cc,6569aaf258b434ef00879ae6,Dr. Todd Young owns virtual health clinic Medi...,0.146,0.026,0.828,0.9935,https://www.cbc.ca/news/canada/newfoundland-la...


In [26]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.0


In [252]:
# replace with 'quotes_sentiment.xlsx'
output = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_12_dec_CBC_news_sentiment.xlsx'

# create excel writer object to initialize new workbook
# the with-block closes the writer (and so saves/releases the output file)
# even if one of the to_excel calls raises
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
    # write dataframes to different worksheets
    new_quotes_df.to_excel(writer, sheet_name="quotes", index=False)
    new_non_quotes_df.to_excel(writer, sheet_name="non_quotes", index=False)
    speakers_df.to_excel(writer, sheet_name="speakers", index=False)
    verbs_df.to_excel(writer, sheet_name="verbs", index=False)
    scores_all_df.to_excel(writer, sheet_name="scores_all", index=False)
    # the summary keeps its index because the row labels are the category names
    summary_df.to_excel(writer, sheet_name="summary")