In [65]:
import pandas as pd
from quote_extractor import QuoteExtractor
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type

In [66]:
# Per-dataset configs: each base config is extended with the list of columns
# written to the output workbook (the dataset's id column plus the two
# quote columns produced later in this notebook).
fakespeak_config = BASE_FAKESPEAK_CONFIG | dict(
    save_cols=[BASE_FAKESPEAK_CONFIG["id_col"], "quote", "quote_length"],
)

misinfotext_config = BASE_MISINFOTEXT_CONFIG | dict(
    save_cols=[BASE_MISINFOTEXT_CONFIG["id_col"], "quote", "quote_length"],
)

In [67]:
# Select the dataset configuration that every later cell reads from.
# Point this at misinfotext_config instead to process the other dataset.
using_dataset = fakespeak_config

In [68]:
# Load only the configured sheet and columns of the input workbook.
dataset_df = pd.read_excel(
    io=using_dataset["input_path"],
    sheet_name=using_dataset["sheet_name"],
    usecols=using_dataset["usecols"],
)
dataset_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019


In [69]:
# Build the quote extractor. The first argument looks like a spaCy model name
# and the second a path to a list of quoting verbs — TODO confirm against
# QuoteExtractor's signature in quote_extractor.py.
quote_extractor = QuoteExtractor("en_core_web_lg", "./config/quote_verb_list.txt")

In [70]:
# Run extraction over every row: the id column and the text column are passed
# as two parallel Series. The result is later iterated once per document, with
# each element being a collection of dicts that carry "quote" and
# "quote_token_count" keys.
quote_annotations = quote_extractor.run_multiple(
    dataset_df[using_dataset["id_col"]],
    dataset_df[using_dataset["text_col"]],
)

Preprocessing texts...
Creating spacy docs...
Extracting quotes...
Done extracting quotes


In [71]:
# Attach two parallel list-valued columns: for each document, the text of
# every extracted quote and the matching token count, in the same order.
# NOTE(review): this relies on quote_annotations being in the same row order
# as dataset_df — confirm against QuoteExtractor.run_multiple.
dataset_df["quotes"] = [
    [annotation["quote"] for annotation in doc_annotations]
    for doc_annotations in quote_annotations
]
dataset_df["quote_lengths"] = [
    [annotation["quote_token_count"] for annotation in doc_annotations]
    for doc_annotations in quote_annotations
]
dataset_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quotes,quote_lengths
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019,[],[]
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"[""why should American citizens be responsible ...",[18]
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019,[],[]
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019,[],[]
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019,[],[]
...,...,...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,Pants on Fire,Social media,A great lesson in Optics 101: The Monroe Doctr...,2023,[that the United States would not tolerate fur...,"[12, 13]"
2957,Politifact_Pants on Fire_Social media_231170,Pants on Fire,Social media,“One of these Joe’s is not like the other… one...,2023,"[""One of these Joe’s is not like the other… on...",[21]
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"[were never needed.\n , they were cured with, ...","[5, 4, 18, 12, 13, 15, 51, 42, 14]"
2959,Politifact_Pants on Fire_Social media_635418,Pants on Fire,Social media,She collapsed when she saw jfk jr. as she was ...,2021,[],[]


In [72]:
# One row per extracted quote: explode both list columns together, give them
# singular names, and drop the rows of documents that had no quotes (their
# empty lists explode to a missing value).
all_quotes_df = (
    dataset_df
    .explode(["quotes", "quote_lengths"])
    .rename(columns={"quotes": "quote", "quote_lengths": "quote_length"})
    .dropna(subset=["quote"])
)
all_quotes_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"""why should American citizens be responsible t...",18
13,Politifact_FALSE_Social media_169258,False,Social media,According to the 2016 Annual Survey of School ...,2019,", U.S. Census Bureau, West Virginia spends mor...",26
14,Politifact_FALSE_Social media_19711,False,Social media,.@realDonaldTrump on people asking for asylum ...,2019,"""These aren't people. These are animals.""",11
18,Politifact_FALSE_Social media_60177,False,Social media,Thank you North Carolina. You showed up in mas...,2019,that the state fund our public school legislat...,9
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11
...,...,...,...,...,...,...,...
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,", treatment in ICUs is useless if thromboembol...",13
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,", inflammation induces thrombosis through a co...",15
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"""Thanks to 50 autopsies performed on patients ...",51
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"""If we ventilate a lung where blood does not c...",42


In [73]:
# Split the per-quote rows by the configured year column. The result is
# unpacked as (group labels, per-group DataFrames); presumably the two are in
# matching order — verify in helpers.get_groups.
years, years_dfs = get_groups(all_quotes_df, using_dataset["year_col"])
years_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"""why should American citizens be responsible t...",18
13,Politifact_FALSE_Social media_169258,False,Social media,According to the 2016 Annual Survey of School ...,2019,", U.S. Census Bureau, West Virginia spends mor...",26
14,Politifact_FALSE_Social media_19711,False,Social media,.@realDonaldTrump on people asking for asylum ...,2019,"""These aren't people. These are animals.""",11
18,Politifact_FALSE_Social media_60177,False,Social media,Thank you North Carolina. You showed up in mas...,2019,that the state fund our public school legislat...,9
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11


In [74]:
# Same split as for years, but keyed on the configured text-type column.
# Presumably labels and frames are in matching order — verify in
# helpers.get_groups.
types, types_dfs = get_groups(all_quotes_df, using_dataset["type_col"])
types_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,quote_length
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,"""He’s an answer to our problems. We need to ge...",55
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,"""Donald is funny, playful, and colorful, but m...",65
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,"""I am very disappointed at the talk show hosts...",65
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,""" to all of them. To all that are criticizing ...",33


In [75]:
def get_quote_length_summary_df(dfs: list[pd.DataFrame], labels=None):
    """Summarise the ``quote_length`` column of each frame with ``describe()``.

    Args:
        dfs: Frames to summarise, one per group. Each must have a
            ``quote_length`` column.
        labels: Row labels for the summary, one per frame in ``dfs`` and in
            the same order. When omitted, the notebook-level ``years``
            variable is used, which is the original behaviour; pass the
            labels explicitly to avoid depending on whatever ``years`` holds
            in the kernel at call time.

    Returns:
        A DataFrame with one row per frame, indexed by ``labels`` (index name
        ``"year"``), whose columns are the statistics from ``describe()``.
    """
    if labels is None:
        # Backward-compatible fallback to the module-level `years`.
        labels = years

    return pd.DataFrame(
        # convert_dtypes() turns the object-typed column left by explode()
        # into a numeric dtype so describe() reports numeric statistics.
        [df["quote_length"].convert_dtypes().describe() for df in dfs],
        index=pd.Index(data=labels, name="year"),
    )

In [76]:
def save_years(writer: pd.ExcelWriter, years: list[int], years_dfs: list[pd.DataFrame]):
    """Write one sheet per year plus a "Summary" sheet to ``writer``.

    Args:
        writer: Open Excel writer that receives the sheets. It is not closed
            here; the caller owns it.
        years: Labels used as sheet names (converted with ``str``).
        years_dfs: Frames paired positionally with ``years``.

    Each year sheet holds only the columns listed in
    ``using_dataset["save_cols"]`` and omits the index. The summary sheet is
    produced by ``get_quote_length_summary_df(years_dfs)``.
    """
    for sheet_label, year_df in zip(years, years_dfs):
        year_df.to_excel(
            writer,
            sheet_name=str(sheet_label),
            columns=using_dataset["save_cols"],
            index=False,
        )

    # The summary is written last, so it appears after the per-year sheets.
    summary_df = get_quote_length_summary_df(years_dfs)
    summary_df.to_excel(writer, sheet_name="Summary")

In [77]:
output_path = make_output_path(using_dataset, "quotes")

# Use the writer as a context manager so the workbook is closed (and the file
# handle released) even if save_years raises part-way through.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

In [78]:
# Write one workbook per text type, each with the same per-year layout.
# The loop variable is named `text_type` so the builtin `type` is not shadowed
# for the rest of the kernel session.
for text_type, type_df in zip(types, types_dfs):
    # `years` is deliberately rebound at notebook level here:
    # get_quote_length_summary_df falls back to the global `years` for its row
    # labels, so it must match the frames being saved in this iteration.
    years, years_dfs = get_groups(type_df, using_dataset["year_col"])

    output_path = make_output_path_for_type(using_dataset, text_type, "quotes")

    # Context manager guarantees the workbook is closed even if saving fails.
    with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
        save_years(writer, years, years_dfs)