# Count quotes and words for the first table

## Count total number of quotes

These counts come from the "CBC_qt_output/Quotes_new" directory. This is the last stage at which individual quotes can be counted: in the sentiment output, the quotes for each article are merged into one document.

In [1]:
import pandas as pd
import numpy as np
import json
from pandas import json_normalize
import ast
from ast import literal_eval
import os
import glob
import re
import matplotlib.pyplot as plt

In [2]:
# Folder searched below for the monthly 2023 quote workbooks (*.xlsx).
directory = 'C:/Maite/MOD/projects/Monika_Bednarek/Evaluation_quotes/Data/CBC_qt_output/Quotes_new'

In [6]:
def _rows_in_full_sheet(xlsx_path):
    """Return the number of data rows in the 'full' sheet of one workbook."""
    return len(pd.read_excel(xlsx_path, sheet_name='full'))


# Every 2023 workbook in `directory`, in whatever order glob returns them.
excel_files = glob.glob(os.path.join(directory, '2023*.xlsx'))

# One record per workbook that could be read: file name + row count.
results = []

for xlsx_path in excel_files:
    try:
        row_total = _rows_in_full_sheet(xlsx_path)
    except Exception as e:
        # Report the failure (e.g., sheet not found) and move on to the next workbook.
        print(f"Could not process {xlsx_path}: {e}")
        continue

    results.append({'file': os.path.basename(xlsx_path), 'rows_in_full': row_total})

In [7]:
# Turn the per-file records collected in `results` into a table
# (columns: 'file', 'rows_in_full').
results_df = pd.DataFrame(results)

# Bare expression: the notebook renders the table as the cell output.
results_df

Unnamed: 0,file,rows_in_full
0,2023_01_jan_CBC_news_qt_new.xlsx,55161
1,2023_03_mar_CBC_news_qt_new.xlsx,60157
2,2023_04_apr_CBC_news_qt_new.xlsx,51383
3,2023_05_may_CBC_news_qt_new.xlsx,58306
4,2023_07_jul_CBC_news_qt_new.xlsx,51754
5,2023_08_aug_CBC_news_qt_new.xlsx,51011
6,2023_09_sep_CBC_news_qt_new.xlsx,52230
7,2023_10_oct_CBC_news_qt_new.xlsx,55202
8,2023_11_nov_CBC_news_qt_new.xlsx,55528
9,2023_12_dec_CBC_news_qt_new.xlsx,47561


In [8]:
# Work only on the per-file rows, so that re-running this cell never sums
# (or duplicates) a 'Total' row added by a previous run.
monthly_df = results_df[results_df['file'] != 'Total']

# Calculate the total by summing the numeric column
total_row = monthly_df[['rows_in_full']].sum()

# Label the total row in the 'file' column
total_row['file'] = 'Total'

# DataFrame.append was removed in pandas 2.0, so build a one-row frame and
# concatenate it instead. Going through a dict lets pandas infer an integer
# dtype for the count rather than keeping the Series' object dtype.
results_df = pd.concat(
    [monthly_df, pd.DataFrame([total_row.to_dict()])],
    ignore_index=True,
)

# Display the updated DataFrame
results_df

Unnamed: 0,file,rows_in_full
0,2023_01_jan_CBC_news_qt_new.xlsx,55161
1,2023_03_mar_CBC_news_qt_new.xlsx,60157
2,2023_04_apr_CBC_news_qt_new.xlsx,51383
3,2023_05_may_CBC_news_qt_new.xlsx,58306
4,2023_07_jul_CBC_news_qt_new.xlsx,51754
5,2023_08_aug_CBC_news_qt_new.xlsx,51011
6,2023_09_sep_CBC_news_qt_new.xlsx,52230
7,2023_10_oct_CBC_news_qt_new.xlsx,55202
8,2023_11_nov_CBC_news_qt_new.xlsx,55528
9,2023_12_dec_CBC_news_qt_new.xlsx,47561


## Count words in quotes and non quotes

Input dir is sentiment_output. We count the words in the respective tabs/columns of each Excel file, i.e., for each month:

* quotes tab
* quote column
* non-quotes tab
* non-quoted text

In [21]:
# Switch the working directory so the text files below can be opened by bare file name.
os.chdir(r'C:\Maite\MOD\projects\Monika_Bednarek\Evaluation_quotes\Data\CBC_output_corpus')

This one counts with the \n

In [23]:
# Load the February quote text; every element of `lines` keeps its trailing '\n'.
with open('2023_02_feb_CBC_news_sentiment_quote_text.txt', 'r', encoding='utf-8') as handle:
    lines = handle.readlines()

# One row per line of the file, plus the number of whitespace-separated tokens in it.
df = pd.DataFrame(lines, columns=['text'])
df['word_count'] = df['text'].map(lambda text: len(text.split()))

# Grand total over all lines.
total_words = df['word_count'].sum()

print(f"Total number of words: {total_words}")

Total number of words: 895688


This one without the \n

In [25]:
# Same file as above, but with surrounding whitespace (including the trailing
# newline) stripped from each line before counting. A line yielded by the file
# ends at its first '\n', so stripping already removes every newline.
with open('2023_02_feb_CBC_news_sentiment_quote_text.txt', 'r', encoding='utf-8') as handle:
    lines = [raw_line.strip() for raw_line in handle]

# One row per line of the file, plus the number of whitespace-separated tokens in it.
df = pd.DataFrame(lines, columns=['text'])
df['word_count'] = df['text'].map(lambda text: len(text.split()))

# Grand total over all lines.
total_words = df['word_count'].sum()

print(f"Total number of words: {total_words}")

Total number of words: 895688


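From this, I conclude that the trailing \n makes no difference to the count: `str.split()` with no arguments splits on any whitespace, including \n, so a newline never produces an extra word.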

In [15]:
# Plain-text dump of the per-line table (columns 'text' and 'word_count').
print(df)

                                                   text  word_count
0     Wednesday's order from the 6th U.S. Circuit Co...         182
1     that thousands of people were travelling acros...         146
2     13 state landmarks — including One World Trade...          94
3     it's going to take a lot more than talk to put...         617
4     We're in a situation where we have members say...          26
...                                                 ...         ...
4211  that while more government money is needed, th...         807
4212  "A dollar invested today, and making sure you ...         314
4213  he had accepted the resignation of Bryce Phill...          78
4214  she felt things were going well right up until...         543
4215  the heaviest snow will be centred in the north...          43

[4216 rows x 2 columns]


In [18]:
# Inspect the full, untruncated text of the third row (row position 2, first column).
df.iloc[2,0]

'13 state landmarks — including One World Trade Center, Empire State Plaza and Niagara Falls — would be illuminated Monday evening in red, black and green"Amidst the recent rise in bigotry, division and hate across this country, it is more important than ever that we remember Dr. Martin Luther King Jr.\'s teachings on justice, peace, equality and loveMarching bands, dance teams, local politicians and activists made their way along a route that "traces the steps" of King during visits to Miami,"the sacred right to vote""one of the country\'s oldest and largest""I am the dream,"'

So now I'll proceed to do this for the quotes vs. non-quotes in the dir

In [28]:
# Folder whose .txt files are listed and word-counted by the cells below.
directory = 'C:/Maite/MOD/projects/Monika_Bednarek/Evaluation_quotes/Data/CBC_output_corpus'

In [30]:
# Show the current working directory (IPython automagic for %pwd).
pwd

'C:\\Maite\\MOD\\projects\\Monika_Bednarek\\Evaluation_quotes\\Data\\CBC_output_corpus'

In [34]:
# Step 2: Collect every .txt file whose name contains "_quote_".
# NOTE: "_non_quote_" also contains the substring "_quote_", so this list holds
# BOTH the quote files and the non-quote files. The two groups are separated
# inside the loop below so that a quote-only figure is available as well.
quotes_files = [f for f in os.listdir(directory) if '_quote_' in f.lower() and f.endswith('.txt')]

# Running totals: all matched files, and the subset that are not "_non_quote_" files
total_word_count = 0
quote_only_word_count = 0

# Step 3: Process each file
for file_name in quotes_files:
    file_path = os.path.join(directory, file_name)

    # Stream the file line by line and count whitespace-separated tokens;
    # this avoids building a list of lines and a DataFrame per file.
    with open(file_path, 'r', encoding='utf-8') as file:
        words_in_file = sum(len(line.split()) for line in file)

    total_word_count += words_in_file

    # Only files without the "_non_quote_" marker contribute to the quote-only total
    if '_non_quote_' not in file_name.lower():
        quote_only_word_count += words_in_file

# Step 4: Print the combined total, then the quote-only subtotal
print(f"Total number of words in all 'quotes' related text files: {total_word_count}")
print(f"Number of words in quote-only files (excluding '_non_quote_' files): {quote_only_word_count}")


Total number of words in all 'quotes' related text files: 28188065


This is actually the total number of everything, because the `'_quote_'` filter also matches the `_non_quote_` file names, so both kinds of file are counted.

In [35]:
# Step 2: Keep only the .txt files whose (lower-cased) name carries the
# "_non_quote_" marker. Despite its name, `quotes_files` holds the non-quote files here.
quotes_files = [
    name
    for name in os.listdir(directory)
    if name.endswith('.txt') and '_non_quote_' in name.lower()
]

# Running total of words over all selected files
total_word_count = 0

# Step 3: Count the words file by file
for file_name in quotes_files:
    file_path = os.path.join(directory, file_name)

    # Each line of the file becomes one row of the 'text' column
    with open(file_path, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()
    df = pd.DataFrame(lines, columns=['text'])

    # Whitespace-separated tokens per line, summed into the running total
    df['word_count'] = df['text'].map(lambda text: len(text.split()))
    total_word_count += df['word_count'].sum()

# Step 4: Report the total
print(f"Total number of words in all 'non quotes' related text files: {total_word_count}")


Total number of words in all 'non quotes' related text files: 17314398


## Now, I'm counting the original corpus words, because the quotes vs. non-quotes excludes verbs and speakers

In [36]:
# Folder whose .xlsx workbooks are scanned below for the 'body' column.
directory = 'C:/Maite/MOD/projects/Monika_Bednarek/Evaluation_quotes/Data/CBC_input_correct_encoding'

In [37]:
# Step 1: Every Excel workbook in the input directory
xlsx_files = [f for f in os.listdir(directory) if f.endswith('.xlsx')]

# Step 2: Column whose text is counted
column_name = 'body'

# Step 3: Running total over all workbooks and sheets
total_word_count = 0

for file_name in xlsx_files:
    file_path = os.path.join(directory, file_name)

    # Open the workbook once for all of its sheets; the context manager closes
    # the underlying file handle as soon as the sheets have been read.
    with pd.ExcelFile(file_path) as xlsx_file:

        # Step 4: Process each sheet in the Excel file
        for sheet_name in xlsx_file.sheet_names:
            df = xlsx_file.parse(sheet_name)

            # Step 5: Skip sheets that do not have the column
            if column_name not in df.columns:
                continue

            # Step 6: Whitespace-separated tokens per non-missing cell. The counts
            # go into their own Series so the sheet's text column is not overwritten.
            word_counts = df[column_name].dropna().map(lambda cell: len(str(cell).split()))

            # int() keeps the running total a plain integer even when the sheet
            # has no non-missing cells in the column.
            total_word_count += int(word_counts.sum())

# Step 7: Print the total word count
print(f"Total number of words in column '{column_name}' across all xlsx files: {total_word_count}")


Total number of words in column 'body' across all xlsx files: 26772464
