# Sentiment Analyzer for News Quotes

1. Read cleaned data into pandas dataframes (quotes, non-quoted text, speakers, and verbs)
2. Pass the text from each article into the sentiment analyzer
3. Save the scores into new columns 'positive', 'negative', 'neutral', 'compound'
4. Save output into a new Excel workbook with four sheets: 'quotes', 'non_quotes', 'scores_all', and 'summary'

In [None]:
# run this code if connecting to a Google drive
# (Colab-only import; mounts the drive at /content/drive, which is the prefix
# of the input and output workbook paths used later in this notebook)
from google.colab import drive

drive.mount('/content/drive')

In [None]:
!pip install vaderSentiment

In [None]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Extracting Data from Excel Files

In [None]:
# replace with quotes_input.xlsx
fp = '/content/drive/My Drive/evaluation_quotes/CBC_qt_output/2023_01_jan_CBC_news_clean.xlsx'

# Parse the workbook once and slice it per category: calling read_excel
# separately for every dataframe repeats the (slow) Excel parse four times.
id_cols = ["text_id", "text_name"]
text_cols = ["quote", "non_quoted_text", "speaker", "verb"]
source_df = pd.read_excel(fp, usecols=id_cols + text_cols)

# Boolean column masks keep each frame's columns in workbook order (the same
# order read_excel(usecols=...) returns), and .copy() gives every category an
# independent frame so adding score columns later never touches source_df.
quotes_df = source_df.loc[:, source_df.columns.isin(id_cols + ["quote"])].copy()

non_quotes_df = source_df.loc[:, source_df.columns.isin(id_cols + ["non_quoted_text"])].copy()

speakers_df = source_df.loc[:, source_df.columns.isin(id_cols + ["speaker"])].copy()

verbs_df = source_df.loc[:, source_df.columns.isin(id_cols + ["verb"])].copy()

In [None]:
# add the four (initially empty) float score columns to every dataframe
score_columns = ('positive', 'negative', 'neutral', 'compound')

for frame in (quotes_df, non_quotes_df, speakers_df, verbs_df):
  for score_column in score_columns:
    # an empty float Series shares no index labels with the frame, so the
    # new column starts out as all-NaN floats
    frame[score_column] = pd.Series(dtype='float')

## Running Quotes through VADER

Vader Documentation: https://github.com/cjhutto/vaderSentiment

Guide to using Vader: https://medium.com/@rslavanyageetha/vader-a-comprehensive-guide-to-sentiment-analysis-in-python-c4f1868b0d2e

Vader sentiment analyzer returns a dictionary of sentiment intensity scores for
a particular text input with the following sentiments: negative, neutral,
positive, and compound for overall sentiment intensity. The negative, neutral,
and positive scores have a value from 0 to 1 and compound scores have a
value from -1 to 1, with -1 indicating entirely negative, 0 indicating
entirely neutral, and 1 indicating entirely positive.


In [None]:
# helper function to extract scores for each story
# args: dataframe, column name of text to be analyzed as a string,
#       optional pre-built analyzer to reuse
def get_sentiment_score(df, col, analyzer=None):
  """Score every text in ``df[col]`` and store the results on ``df`` in place.

  Writes the analyzer's 'pos', 'neg', 'neu' and 'compound' scores into the
  'positive', 'negative', 'neutral' and 'compound' columns of ``df``
  (creating them if they do not exist yet).

  Args:
    df: dataframe holding the text column; modified in place.
    col: name of the column whose text is analyzed.
    analyzer: object with a ``polarity_scores(text)`` method returning a dict
      with the keys 'pos', 'neg', 'neu' and 'compound'. Defaults to a new
      ``SentimentIntensityAnalyzer``; pass one in to reuse it across calls.

  Returns:
    None.
  """
  # Build the analyzer once per call instead of once per row.
  if analyzer is None:
    analyzer = SentimentIntensityAnalyzer()

  # Missing cells (e.g. an article with no quote) are not text and cannot be
  # scored; they keep NaN scores instead of raising a TypeError.
  scores = [
      None if pd.isna(text) else analyzer.polarity_scores(str(text))
      for text in df[col]
  ]

  score_keys = (('positive', 'pos'), ('negative', 'neg'),
                ('neutral', 'neu'), ('compound', 'compound'))
  for out_col, key in score_keys:
    values = [float('nan') if score is None else score[key] for score in scores]
    # Assign whole columns at once (positionally matched through df.index)
    # rather than writing cell by cell with chained indexing.
    df[out_col] = pd.Series(values, index=df.index, dtype='float')

In [None]:
# get sentiment scores for all four dataframes:
# each dataframe is paired with the name of its text column
scoring_jobs = (
    (quotes_df, 'quote'),
    (non_quotes_df, 'non_quoted_text'),
    (speakers_df, 'speaker'),
    (verbs_df, 'verb'),
)

for frame, text_col in scoring_jobs:
  get_sentiment_score(frame, text_col)

In [None]:
# preview the first rows of the quotes dataframe with its score columns
quotes_df.head()

In [None]:
# preview the first rows of the non-quoted-text dataframe with its score columns
non_quotes_df.head()

In [None]:
# preview the first rows of the speakers dataframe with its score columns
speakers_df.head()

In [None]:
# preview the first rows of the verbs dataframe with its score columns
verbs_df.head()

In [None]:
# create a third dataframe to directly compare scores between quotes and non quotes:
# one row per (text_id, text_name) pair present in both dataframes
merge_keys = ['text_id', 'text_name']
compared_cols = merge_keys + ['positive', 'negative', 'compound']

merged_scores = quotes_df[compared_cols].merge(
    non_quotes_df[compared_cols],
    on=merge_keys,
    suffixes=('_quotes', '_non_quotes'),
)

# shorten the suffixed score column names produced by the merge
short_names = {
    'positive_quotes': 'quote_pos',
    'negative_quotes': 'quote_neg',
    'compound_quotes': 'quote_comp',
    'positive_non_quotes': 'non_quote_pos',
    'negative_non_quotes': 'non_quote_neg',
    'compound_non_quotes': 'non_quote_comp',
}
scores_all_df = merged_scores.rename(columns=short_names)

In [None]:
# preview the merged quote / non-quote scores
scores_all_df.head()

### Summary Calculations
Add the following columns to the dataframe:
* quote_prop_difference: positive score - negative score
* quote_prop_direction: if it leans positive (>0), negative (<0), or neutral (=0)
* same for non-quotes: non_quote_prop_difference and non_quote_prop_direction

In [None]:
# calculate score differences (positive minus negative) for quotes and non_quotes
quote_gap = scores_all_df['quote_pos'] - scores_all_df['quote_neg']
non_quote_gap = scores_all_df['non_quote_pos'] - scores_all_df['non_quote_neg']

# position 5 is directly after quote_comp; once that column is in place,
# position 9 is directly after non_quote_comp (the end of the dataframe)
scores_all_df.insert(5, 'quote_prop_difference', quote_gap)
scores_all_df.insert(9, 'non_quote_prop_difference', non_quote_gap)

In [None]:
# preview the scores with the new *_prop_difference columns
scores_all_df.head()

In [None]:
# determine sentiment direction
direction = ['positive', 'negative', 'neutral']

# Label for rows that match none of the conditions (the difference is NaN).
# np.select's implicit default is the integer 0, which NumPy 2 cannot combine
# with string choices (it raises a TypeError), so a string default is passed
# explicitly.
no_direction = 'unknown'

quote_conds = [
    scores_all_df.quote_prop_difference > 0,
    scores_all_df.quote_prop_difference < 0,
    scores_all_df.quote_prop_difference == 0,
]

quote_prop_direction = np.select(quote_conds, direction, default=no_direction)

# position 6 places the label right after quote_prop_difference (position 5)
scores_all_df.insert(6, 'quote_prop_direction', quote_prop_direction)

non_quote_conds = [
    scores_all_df.non_quote_prop_difference > 0,
    scores_all_df.non_quote_prop_difference < 0,
    scores_all_df.non_quote_prop_difference == 0,
]

# appended as the last column, right after non_quote_prop_difference
scores_all_df['non_quote_prop_direction'] = np.select(non_quote_conds, direction,
                                                      default=no_direction)

In [None]:
# preview the scores with the new *_prop_direction columns
scores_all_df.head()

## Aggregate Data
Add a new sheet (dataframe) called 'summary' with the following columns:
* Row labels (first column): quote, non_quote, speaker, verb
* Headers:
  * compound_max: max value of compound scores
  * compound_min: min value of compound scores
  * compound_avg: average value of compound scores
  * pos_comp_count: number of compound scores that lean positive (exclude everything in the range [-0.05, 0.05])
  * neg_comp_count: number of compound scores that lean negative
  * avg_prop_diff: average value of proportion difference
  * pos_prop_direction_count: number of texts whose proportion difference leans positive
  * neg_prop_direction_count: number of texts whose proportion difference leans negative
  * avg_pos_prop_difference: average proportion difference over the texts with a positive difference
  * avg_neg_prop_difference: average proportion difference over the texts with a negative difference
  * Note: the five proportion columns (avg_prop_diff onward) are set to 0 for the speaker and verb rows

In [None]:
# helper function that builds one summary row for a category
# df1: the category's own dataframe (must have a 'compound' column)
# df2: the scores all dataframe
# category: 'quote' / 'non_quote', or '' when df2 has no columns for it
def aggregate_data(df1, df2, category):
  """Return the list of ten summary values for one category.

  The first five values come from ``df1['compound']``: max, min, mean, and
  the counts of scores above 0.05 and below -0.05. The last five come from
  the ``<category>_prop_difference`` / ``<category>_prop_direction`` columns
  of ``df2``: mean difference, counts of 'positive' and 'negative'
  directions, and the mean of the positive and of the negative differences.
  When ``category`` is the empty string those five values are all 0.
  """
  compound = df1['compound']
  stats = [
      compound.max(),
      compound.min(),
      compound.mean(),
      # scores inside [-0.05, 0.05] count as neither positive nor negative
      compound[compound > 0.05].count(),
      compound[compound < -0.05].count(),
  ]

  # no proportion columns for this category: pad with zeros
  if category == '':
    return stats + [0, 0, 0, 0, 0]

  difference = df2[category + '_prop_difference']
  leaning = df2[category + '_prop_direction']

  stats.extend([
      difference.mean(),
      leaning[leaning == 'positive'].count(),
      leaning[leaning == 'negative'].count(),
      difference[difference > 0].mean(),
      difference[difference < 0].mean(),
  ])
  return stats

In [None]:
# row and column headers
row_labels = ['quote', 'non_quote', 'speaker', 'verb']
col_labels = ['compound_max', 'compound_min', 'compound_avg',
              'pos_comp_count', 'neg_comp_count', 'avg_prop_diff',
              'pos_prop_direction_count', 'neg_prop_direction_count',
              'avg_pos_prop_difference', 'avg_neg_prop_difference']

# one (dataframe, category) pair per summary row, in row_labels order;
# speakers and verbs pass '' because scores_all_df has no columns for them
summary_sources = [
    (quotes_df, 'quote'),
    (non_quotes_df, 'non_quote'),
    (speakers_df, ''),
    (verbs_df, ''),
]

# create summary dataframe
data_summary = [aggregate_data(frame, scores_all_df, category)
                for frame, category in summary_sources]

summary_df = pd.DataFrame(data_summary, index=row_labels, columns=col_labels)

In [None]:
# preview the summary table (one row per category)
summary_df.head()

## Write Output to New Workbook

In [None]:
!pip install xlsxwriter

In [None]:
# replace with 'quotes_sentiment.xlsx'
output = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_01_jan_CBC_news_sentiment.xlsx'

# create excel writer object to initialize new workbook;
# the with-block closes the writer (and so the output file) on exit,
# even if one of the to_excel calls raises
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
  # write dataframes to different worksheets
  quotes_df.to_excel(writer, sheet_name="quotes", index=False)
  non_quotes_df.to_excel(writer, sheet_name="non_quotes", index=False)
  scores_all_df.to_excel(writer, sheet_name="scores_all", index=False)
  # the summary keeps its index: the row labels name each category
  summary_df.to_excel(writer, sheet_name="summary")