In [1]:
# run this code to connect to google drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install xlsxwriter # for writing multiple sheets to an excel file

Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/159.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m153.6/159.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.0


In [3]:
import pandas as pd
import re

Script for aggregating data from the summary sheets across all the months.

Each sheet should contain the summary sheet of the month named 'month', with the aggregate data as the sheet at the end labelled 'all'.

The file itself should be called 'CBC_news_sentiment_summary.xlsx'

In [4]:
# get summary from each month
jan = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_01_jan_CBC_news_sentiment.xlsx'
feb = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_02_feb_CBC_news_sentiment.xlsx'
mar = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_03_mar_CBC_news_sentiment.xlsx'
apr = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_04_apr_CBC_news_sentiment.xlsx'
may = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_05_may_CBC_news_sentiment.xlsx'
jun = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_06_jun_CBC_news_sentiment.xlsx'
jul = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_07_jul_CBC_news_sentiment.xlsx'
aug = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_08_aug_CBC_news_sentiment.xlsx'
sep = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_09_sep_CBC_news_sentiment.xlsx'
oct = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_10_oct_CBC_news_sentiment.xlsx'
nov = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_11_nov_CBC_news_sentiment.xlsx'
dec = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/2023_12_dec_CBC_news_sentiment.xlsx'

jan_df = pd.read_excel(jan, sheet_name='summary', index_col=0)
feb_df = pd.read_excel(feb, sheet_name='summary', index_col=0)
mar_df = pd.read_excel(mar, sheet_name='summary', index_col=0)
apr_df = pd.read_excel(apr, sheet_name='summary', index_col=0)
may_df = pd.read_excel(may, sheet_name='summary', index_col=0)
jun_df = pd.read_excel(jun, sheet_name='summary', index_col=0)
jul_df = pd.read_excel(jul, sheet_name='summary', index_col=0)
aug_df = pd.read_excel(aug, sheet_name='summary', index_col=0)
sep_df = pd.read_excel(sep, sheet_name='summary', index_col=0)
oct_df = pd.read_excel(aug, sheet_name='summary', index_col=0)
nov_df = pd.read_excel(nov, sheet_name='summary', index_col=0)
dec_df = pd.read_excel(dec, sheet_name='summary', index_col=0)

In [5]:
jan_df.head()

Unnamed: 0,compound_max,compound_min,compound_avg,no_polarity_comp_avg,pos_comp_count,neg_comp_count,avg_prop_diff,pos_prop_direction_count,neg_prop_direction_count,avg_pos_prop_difference,avg_neg_prop_difference
quote,0.999,-0.999,0.264394,0.758353,2639,1425,0.029789,2687,1425,0.085919,-0.073877
non_quote,1.0,-0.9995,0.155418,0.795217,2414,1731,0.007646,2435,1733,0.057766,-0.062566
speaker,0.9854,-0.9545,0.059527,0.175481,1065,517,0.0,0,0,0.0,0.0
verb,0.7717,-0.9702,0.003021,0.066017,417,357,0.0,0,0,0.0,0.0


For the summary columns:
* **compound_max** - take average of all compound_max columns across each month; represents overall max compound score found for the category (quotes, non-quotes, speaker, verb)
* **compound_min** - take average of all compound_min columns across each month; represents overall min compound score found for the category
* **compound_avg** - take average of all compound_avg columns across each month; represents overall average compound score for the category
* **no_polarity_comp_avg** - take average of all no_polarity_comp_avg columns across each month; represents overall average intensity of the compound scores regardless of positive/negative sentiment
* **pos_comp_count** - take sum of all pos_comp_count columns across each month; represents the overall total number of quotes/non-quotes that leaned positive, exluding everything under 0.05
* **neg_comp_count** - take sum of all neg_comp_count columns across each month; represents the overall total number of quotes/non-quotes that leaned negative, excluding everything above -0.05
* **avg_prop_diff** - take average of all the proportional difference in sentiment for quotes/non-quotes across all months; proportional differences are calculated by taking the difference between the positive and negative sentiment scores assigned to the quotes/non-quotes, where a positive prop_diff represents an overall positive leaning and a negative prop_diff represents an overall negative leaning. this may or may not line up with the compound score, as vader uses other weighings (including the neutral score) when calculating the compound score
* **pos_prop_direction_count** - take sum of all proportional differences greater than 0; counts all texts with a positive proportional difference
* **neg_prop_direction_count** - take sum of all proportional differences less than 0; counts all texts with a negative proportional difference
* **avg_pos_prop_difference** - take average of positive proportional differences; measures intensity of positive leaning differences
* **avg_neg_prop_difference** - take average of negative proportional differences; measures intensity of negative leaning differences

In [6]:
# sum dataframes together
months = [jan_df, feb_df, mar_df, apr_df, may_df, jun_df, jul_df, aug_df, sep_df, oct_df, nov_df, dec_df]

sum_df = months[0]
for month in months[1:]:
  sum_df = sum_df.add(month)

# take the average of certain columns
avg_cols = ['compound_max', 'compound_min', 'compound_avg', 'no_polarity_comp_avg', 'avg_prop_diff', 'avg_pos_prop_difference', 'avg_neg_prop_difference']

for col in avg_cols:
  sum_df[col] = sum_df[col] / 12

In [7]:
sum_df

Unnamed: 0,compound_max,compound_min,compound_avg,no_polarity_comp_avg,pos_comp_count,neg_comp_count,avg_prop_diff,pos_prop_direction_count,neg_prop_direction_count,avg_pos_prop_difference,avg_neg_prop_difference
quote,0.999258,-0.999275,0.269928,0.758211,30461,16321,0.02904,30914,16206,0.084959,-0.075079
non_quote,0.999925,-0.999633,0.160355,0.792944,27962,19648,0.00849,28294,19610,0.057409,-0.061805
speaker,0.988042,-0.974575,0.061284,0.185275,12379,6388,0.0,0,0,0.0,0.0
verb,0.832408,-0.944308,-0.002612,0.068597,4603,4473,0.0,0,0,0.0,0.0


In [8]:
# write results to new excel sheet
output = '/content/drive/My Drive/evaluation_quotes/CBC_sentiment_output/sentiment_summary.xlsx'

writer = pd.ExcelWriter(output, engine='xlsxwriter')

sheets = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
i = 0

for month in months:
  month.to_excel(writer, sheet_name=sheets[i])
  i += 1

sum_df.to_excel(writer, sheet_name='aggregate_all')

writer.close()