In [1]:
from pathlib import Path
from typing import Optional

import pandas as pd
from quote_extractor import QuoteExtractor, run_pool

In [2]:
class DatasetConfig:
    """Paths and column settings describing one dataset workbook.

    Attributes:
        input_path: Path of the Excel workbook to read.
        output_path: Path of the Excel workbook the results are written to.
        quote_annotations_path: Path associated with the quote annotations
            file; it is only stored here, the class does not read or write it.
        sheet_name: Name of the worksheet to load from ``input_path``.
        id_col: Name of the column that identifies each row.
        usecols: Names of the columns to load, or ``None`` to apply no
            column restriction.
    """

    input_path: str
    output_path: str
    quote_annotations_path: str
    sheet_name: str
    id_col: str
    usecols: Optional[list[str]]

    def __init__(
        self,
        input_path: str,
        output_path: str,
        quote_annotations_path: str,
        sheet_name: str,
        id_col: str,
        usecols: Optional[list[str]] = None,
    ):
        """Store the given settings unchanged on the instance.

        ``usecols`` may be omitted (or passed as ``None``) when no column
        subset is wanted.
        """
        self.input_path = input_path
        self.output_path = output_path
        self.quote_annotations_path = quote_annotations_path
        self.sheet_name = sheet_name
        self.id_col = id_col
        self.usecols = usecols

In [3]:
# Settings for the Fakespeak-ENG workbook; only the listed columns are loaded.
fakespeak_config = DatasetConfig(
    # Alternative input location (currently disabled):
    # "/content/drive/My Drive/fake_news_over_time/Fakespeak_ENG_modified.xlsx"
    input_path="./data/Fakespeak-ENG/Fakespeak-ENG modified.xlsx",
    output_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_quotes.xlsx",
    quote_annotations_path="./data/Fakespeak-ENG/Analysis_output/quote_annotations.json",
    sheet_name="Working",
    id_col="ID",
    usecols=[
        "ID",
        "combinedLabel",
        "originalTextType",
        "originalBodyText",
        "originalDateYear",
    ],
)

# Settings for the MisInfoText workbook; usecols=None puts no restriction on
# the columns that are loaded.
misinfotext_config = DatasetConfig(
    input_path="./data/MisInfoText/PolitiFact_original_modified.xlsx",
    output_path="./data/MisInfoText/Analysis_output/MisInfoText_quotes.xlsx",
    quote_annotations_path="./data/MisInfoText/Analysis_output/quote_annotations.json",
    sheet_name="Working",
    id_col="factcheckURL",
    usecols=None,
)

In [4]:
# Select the dataset configuration every later cell operates on.
# Later cells refer to the columns "originalBodyText", "originalTextType" and
# "originalDateYear" by name, so the selected workbook must provide them.
using_dataset = fakespeak_config

In [5]:
# Read the configured worksheet (restricted to the configured columns, if any)
# of the selected dataset into a DataFrame and display it.
dataset_df = pd.read_excel(
    using_dataset.input_path,
    sheet_name=using_dataset.sheet_name,
    usecols=using_dataset.usecols,
)
dataset_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019
...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,Pants on Fire,Social media,A great lesson in Optics 101: The Monroe Doctr...,2023
2957,Politifact_Pants on Fire_Social media_231170,Pants on Fire,Social media,“One of these Joe’s is not like the other… one...,2023
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020
2959,Politifact_Pants on Fire_Social media_635418,Pants on Fire,Social media,She collapsed when she saw jfk jr. as she was ...,2021


In [6]:
# TODO: decide whether multi-line journalism quotes need special handling,
# i.e. quotes where every line opens with a quotation mark but only the
# last line carries the closing quotation mark.
#
# NOTE(review): the two arguments look like a spaCy model name and the path of
# a quote-verb list file -- confirm against the quote_extractor module.
quote_extractor = QuoteExtractor("en_core_web_lg", "./quote_verb_list.txt")

  import pkg_resources


In [None]:
# Run the quote extractor once per row, passing the row's identifier and its
# body text; the per-row results are collected in `quote_annotations`.
quote_annotations = dataset_df.apply(
    lambda record: quote_extractor.run(
        record[using_dataset.id_col],
        record["originalBodyText"],
    ),
    axis="columns",
)

In [None]:
# For every row keep two parallel lists: the quote texts and their token
# counts, taken from the "quote" and "quote_token_count" entries of each
# annotation.
dataset_df["quotes"] = [
    [annotation["quote"] for annotation in row_annotations]
    for row_annotations in quote_annotations
]
dataset_df["quote_lengths"] = [
    [annotation["quote_token_count"] for annotation in row_annotations]
    for row_annotations in quote_annotations
]
dataset_df

In [None]:
# One row per extracted quote: explode the two parallel list columns together,
# give them singular names, and drop rows that ended up without a quote.
all_quotes_df = (
    dataset_df
    .explode(["quotes", "quote_lengths"])
    .rename(columns={"quotes": "quote", "quote_lengths": "quote_length"})
    .dropna(subset=["quote"])
)
all_quotes_df

In [None]:
# Group the quotes by publication year and materialise one DataFrame per year.
grouped_by_year = all_quotes_df.groupby("originalDateYear")
# `years` is the groups mapping; iterating it yields the year keys.
years = grouped_by_year.groups
years_dfs = list(map(grouped_by_year.get_group, years))
years_dfs[0].head()

In [None]:
# Number of non-null "quote" values in each year group.
num_quotes_per_year = grouped_by_year["quote"].count()
num_quotes_per_year

In [None]:
# One row of descriptive statistics of quote length per year; the row labels
# are the year keys, in the same order as `years_dfs`.
quote_length_summary_df = pd.DataFrame(
    [
        year_df["quote_length"].convert_dtypes().describe()
        for year_df in years_dfs
    ],
    index=pd.Index(years, name="year"),
)
quote_length_summary_df

In [None]:
# Keep only the quotes whose source text type is "News and blog" or
# "Social media".
only_news_blog_social_media_df = all_quotes_df[
    all_quotes_df["originalTextType"].isin(["News and blog", "Social media"])
]
only_news_blog_social_media_df

In [None]:
# Same per-year grouping as for all quotes, restricted to the
# news/blog/social-media subset.
grouped_by_year_news_blog_social_media = only_news_blog_social_media_df.groupby(
    "originalDateYear"
)
# Iterating the groups mapping yields the year keys.
years_news_blog_social_media = grouped_by_year_news_blog_social_media.groups
years_news_blog_social_media_dfs = list(
    map(
        grouped_by_year_news_blog_social_media.get_group,
        years_news_blog_social_media,
    )
)
years_news_blog_social_media_dfs[0].head()

In [None]:
# Number of non-null "quote" values in each year group of the
# news/blog/social-media subset.
num_quotes_per_year_news_blog_social_media = grouped_by_year_news_blog_social_media["quote"].count()
num_quotes_per_year_news_blog_social_media

In [None]:
# One row of descriptive statistics of quote length per year for the
# news/blog/social-media subset; row labels are the year keys.
quote_length_summary_news_blog_social_media_df = pd.DataFrame(
    [
        year_df["quote_length"].convert_dtypes().describe()
        for year_df in years_news_blog_social_media_dfs
    ],
    index=pd.Index(years_news_blog_social_media, name="year"),
)
quote_length_summary_news_blog_social_media_df

In [None]:
# Export the results for all quotes to the configured output workbook:
# one sheet per year (quotes sorted by length), plus the per-year quote
# counts and the per-year quote-length summary.

# Make sure the target directory exists so that opening the workbook cannot
# fail merely because the folder has not been created yet.
Path(using_dataset.output_path).parent.mkdir(parents=True, exist_ok=True)

# The context manager closes the workbook even if one of the writes raises,
# so the file handle is never left open.
with pd.ExcelWriter(using_dataset.output_path, engine="xlsxwriter") as writer:
    for df, year in zip(years_dfs, years):
        df.sort_values(by="quote_length").to_excel(
            writer,
            sheet_name=str(year),
            index=False,
            columns=["quote", "quote_length", "originalDateYear"],
        )

    num_quotes_per_year.to_excel(writer, sheet_name="Number of quotes")
    quote_length_summary_df.to_excel(writer, sheet_name="Quote length summary")

In [None]:
# Derive the output path for the news/blog/social-media subset by placing the
# file in a "news_blog_and_social_media" folder next to the original output.
output_path = using_dataset.output_path
output_path_split = output_path.split("/")
# Slice-assign just before the last component (the file name).
output_path_split[-1:-1] = ["news_blog_and_social_media"]
output_path_news_blog_social_media = "/".join(output_path_split)
output_path_news_blog_social_media

In [None]:
# Export the results for the news/blog/social-media subset: one sheet per
# year (quotes sorted by length), plus the per-year quote counts and the
# per-year quote-length summary.

# The target path points into a derived sub-folder; create it if it is
# missing so that opening the workbook does not fail.
Path(output_path_news_blog_social_media).parent.mkdir(parents=True, exist_ok=True)

# The context manager closes the workbook even if one of the writes raises,
# so the file handle is never left open.
with pd.ExcelWriter(output_path_news_blog_social_media, engine="xlsxwriter") as writer:
    for df, year in zip(years_news_blog_social_media_dfs, years_news_blog_social_media):
        df.sort_values(by="quote_length").to_excel(
            writer,
            sheet_name=str(year),
            index=False,
            columns=["quote", "quote_length", "originalDateYear"],
        )

    num_quotes_per_year_news_blog_social_media.to_excel(
        writer, sheet_name="Number of quotes"
    )
    quote_length_summary_news_blog_social_media_df.to_excel(
        writer, sheet_name="Quote length summary"
    )