In [1]:
import pandas as pd
import spacy
from quote_extractor import QuoteExtractor
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type

In [2]:
# Local handles for the two dataset configurations imported from dataset_config.
# Only one of them is selected as the active dataset for a given run.
fakespeak_config = BASE_FAKESPEAK_CONFIG
misinfotext_config = BASE_MISINFOTEXT_CONFIG

In [3]:
# Active dataset configuration: later cells read the input path, sheet name and
# column names (id, text, year, type) from this mapping.
# NOTE(review): presumably switched to `misinfotext_config` to run the same
# analysis on the other dataset — confirm the two configs share the same keys.
using_dataset = fakespeak_config

In [4]:
# Read only the configured sheet and columns of the input workbook.
read_options = {
    "sheet_name": using_dataset["sheet_name"],
    "usecols": using_dataset["usecols"],
}
dataset_df = pd.read_excel(using_dataset["input_path"], **read_options)

# Preview the first rows to confirm the expected columns were loaded.
dataset_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019


In [5]:
# Arguments for the project's QuoteExtractor.
# NOTE(review): presumably a spaCy model name and a path to a list of quoting verbs — confirm against quote_extractor.py.
model_name = "en_core_web_lg"
quote_verbs_path = "./config/quote_verb_list.txt"
quote_extractor = QuoteExtractor(model_name, quote_verbs_path)

  import pkg_resources


In [6]:
# Run quote extraction over every article: ids and texts are passed as
# parallel columns taken from the loaded dataset.
article_ids = dataset_df[using_dataset["id_col"]]
article_texts = dataset_df[using_dataset["text_col"]]
quote_annotations = quote_extractor.run_multiple(article_ids, article_texts)

Preprocessing texts...
Creating spacy docs...
Extracting quotes...
Done extracting quotes


The resulting columns will contain a list of quotes (and the below stats) for each article.
- `quotes_num_words`: the number of words in each quote
- `proportions_of_total_words`: the number of words divided by total article words for each quote
- `proportion_quote_words_to_total_words`: the sum of number of words in quotes in an article divided by total words in that article, i.e. "how much of this article is made up of quotes?"

A word is any token that is neither whitespace nor punctuation.

In [7]:
def _collect_quote_field(field: str) -> list[list]:
    """Return, for every article, the list of `field` values of its extracted quotes."""
    return [
        [quote_info[field] for quote_info in article_quotes]
        for article_quotes in quote_annotations
    ]


# One list per article; articles without quotes get an empty list.
dataset_df["quotes"] = _collect_quote_field("quote")
dataset_df["quotes_num_words"] = _collect_quote_field("quote_word_count")
dataset_df["proportions_of_total_words"] = _collect_quote_field("proportion_of_total_words")

# Article-level share of quoted words: the per-quote proportions added together
# (an empty list sums to 0).
per_quote_proportions = dataset_df["proportions_of_total_words"]
dataset_df["proportion_quote_words_to_total_words"] = per_quote_proportions.apply(sum)

dataset_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quotes,quotes_num_words,proportions_of_total_words,proportion_quote_words_to_total_words
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019,[],[],[],0.0
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"[""why should American citizens be responsible ...",[14],[0.32558139534883723],0.325581
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019,[],[],[],0.0
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019,[],[],[],0.0
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019,[],[],[],0.0


Create a new DF that has one quote on each row.

In [8]:
# List-valued columns to unpack in parallel, mapped to their per-quote (singular) names.
column_renames = {
    "quotes": "quote",
    "quotes_num_words": "num_words",
    "proportions_of_total_words": "proportion_of_total_words",
}
exploded_df = dataset_df.explode(list(column_renames)).rename(columns=column_renames)

# Articles with no quotes explode into a single row with a missing quote; drop those rows.
has_quote = exploded_df["quote"].notna()
all_quotes_df = exploded_df[has_quote]

all_quotes_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,num_words,proportion_of_total_words,proportion_quote_words_to_total_words
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"""why should American citizens be responsible t...",14,0.325581,0.325581
13,Politifact_FALSE_Social media_169258,False,Social media,According to the 2016 Annual Survey of School ...,2019,", U.S. Census Bureau, West Virginia spends mor...",23,0.69697,0.69697
14,Politifact_FALSE_Social media_19711,False,Social media,.@realDonaldTrump on people asking for asylum ...,2019,"""These aren't people. These are animals.""",7,0.538462,0.538462
18,Politifact_FALSE_Social media_60177,False,Social media,Thank you North Carolina. You showed up in mas...,2019,that the state fund our public school legislat...,9,0.409091,0.409091
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11,0.032164,0.725146


In [9]:
# Split the per-quote frame by the dataset's year column.
# NOTE(review): get_groups is defined in helpers; presumably it returns the group
# keys and a parallel list of per-group DataFrames — confirm ordering there.
years, years_dfs = get_groups(all_quotes_df, using_dataset["year_col"])
# Preview the first year's group.
years_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,num_words,proportion_of_total_words,proportion_quote_words_to_total_words
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"""why should American citizens be responsible t...",14,0.325581,0.325581
13,Politifact_FALSE_Social media_169258,False,Social media,According to the 2016 Annual Survey of School ...,2019,", U.S. Census Bureau, West Virginia spends mor...",23,0.69697,0.69697
14,Politifact_FALSE_Social media_19711,False,Social media,.@realDonaldTrump on people asking for asylum ...,2019,"""These aren't people. These are animals.""",7,0.538462,0.538462
18,Politifact_FALSE_Social media_60177,False,Social media,Thank you North Carolina. You showed up in mas...,2019,that the state fund our public school legislat...,9,0.409091,0.409091
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11,0.032164,0.725146


In [10]:
# Split the per-quote frame by the dataset's text-type column.
# NOTE(review): get_groups is defined in helpers; presumably it returns the group
# keys and a parallel list of per-group DataFrames — confirm ordering there.
types, types_dfs = get_groups(all_quotes_df, using_dataset["type_col"])
# Preview the first type's group.
types_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,num_words,proportion_of_total_words,proportion_quote_words_to_total_words
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11,0.032164,0.725146
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,"""He’s an answer to our problems. We need to ge...",47,0.137427,0.725146
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,"""Donald is funny, playful, and colorful, but m...",52,0.152047,0.725146
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,"""I am very disappointed at the talk show hosts...",53,0.154971,0.725146
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,""" to all of them. To all that are criticizing ...",28,0.081871,0.725146


## Writing dataframes to excel spreadsheet
There are three parts to the spreadsheet:
- The `year` sheets contain one quote on each line
    - Column `proportion_of_total_words` represents the number of words in the individual quote divided by the total number of words in its corresponding article
- The `year_proportion` sheets contain one article on each line
    - Column `proportion_quote_words_to_total_words` represents the total number of quote words in the article divided by the total number of words in that article, i.e. "how much of this article is made up of quotes?"
- The `Proportion Summary` sheet shows summary statistics for each year's `proportion_quote_words_to_total_words`

In [11]:
def get_quote_proportion_summary_df(dfs: list[pd.DataFrame], year_labels=None):
    """Build a summary-statistics table with one row per DataFrame in ``dfs``.

    For each frame, the ``proportion_quote_words_to_total_words`` column is
    passed through ``convert_dtypes()`` and summarised with ``describe()``.

    Args:
        dfs: Frames that each contain a
            ``proportion_quote_words_to_total_words`` column.
        year_labels: Row labels, one per frame in ``dfs`` and in the same
            order. When omitted, the notebook-level ``years`` variable is
            used, which is what this function did before the parameter was
            added. Prefer passing the labels explicitly so the result does not
            depend on which cell last rebound ``years``.

    Returns:
        A DataFrame indexed by ``year`` whose rows are the ``describe()``
        statistics of each frame's proportion column.
    """
    if year_labels is None:
        # Backward-compatible fallback: read the global `years` at call time.
        year_labels = years

    per_frame_stats = [
        df["proportion_quote_words_to_total_words"].convert_dtypes().describe()
        for df in dfs
    ]
    return pd.DataFrame(
        per_frame_stats,
        index=pd.Index(data=year_labels, name="year"),
    )

In [12]:
def save_years(writer: pd.ExcelWriter, years: list[int], years_dfs: list[pd.DataFrame]):
    """Write the per-year quote sheets and the proportion summary to ``writer``.

    For every ``(year, df)`` pair, two sheets are written:

    - ``<year>``: every row of ``df`` (id, quote, word count, per-quote proportion).
    - ``<year>_proportion``: the first row per id, with the article-level
      ``proportion_quote_words_to_total_words``.

    Finally a ``Proportion Summary`` sheet is written from the de-duplicated
    per-year frames via ``get_quote_proportion_summary_df``.

    Args:
        writer: Open Excel writer that receives all sheets; it is not closed here.
        years: Labels used to name the sheets, parallel to ``years_dfs``.
        years_dfs: One frame per entry in ``years``.
    """
    id_column = using_dataset["id_col"]
    quote_sheet_columns = [id_column, "quote", "num_words", "proportion_of_total_words"]
    proportion_sheet_columns = [id_column, "proportion_quote_words_to_total_words"]

    per_article_dfs: list[pd.DataFrame] = []

    for year, year_df in zip(years, years_dfs):
        sheet_label = str(year)

        # Sheet 1: one row per quote.
        year_df.to_excel(
            writer,
            sheet_name=sheet_label,
            index=False,
            columns=quote_sheet_columns,
        )

        # Sheet 2: one row per id. The article-level proportion is repeated on
        # every quote row, so keeping the first row per id is enough.
        per_article_df = year_df.drop_duplicates(subset=id_column)
        per_article_df.to_excel(
            writer,
            sheet_name=f"{sheet_label}_proportion",
            index=False,
            columns=proportion_sheet_columns,
        )

        per_article_dfs.append(per_article_df)

    summary_df = get_quote_proportion_summary_df(per_article_dfs)
    summary_df.to_excel(writer, sheet_name="Proportion Summary")

In [13]:
output_path = make_output_path(using_dataset, "quotes")

# Use the writer as a context manager so the workbook is closed even if
# writing one of the sheets raises.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

In [14]:
# Write one workbook per text type, each with the same per-year layout.
# The loop variable is named `text_type` so the builtin `type` is not shadowed
# for the rest of the kernel session.
for text_type, type_df in zip(types, types_dfs):
    # NOTE: this deliberately rebinds the notebook-level `years` / `years_dfs`.
    # get_quote_proportion_summary_df reads the global `years` to label the
    # summary rows, so it must match the groups being written for this type.
    years, years_dfs = get_groups(type_df, using_dataset["year_col"])

    output_path = make_output_path_for_type(using_dataset, text_type, "quotes")

    # The context manager closes the workbook even if writing a sheet raises.
    with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
        save_years(writer, years, years_dfs)