In [1]:
import pandas as pd
import spacy
from quote_extractor import QuoteExtractor
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type

In [2]:
# Bind the imported base configs to notebook-local names. These are plain
# assignments (no copy), so each name refers to the same object as the import.
fakespeak_config = BASE_FAKESPEAK_CONFIG
misinfotext_config = BASE_MISINFOTEXT_CONFIG

In [3]:
# Choose the dataset config that every later cell reads through `using_dataset`.
using_dataset = misinfotext_config

In [4]:
# Load only the configured sheet and columns from the dataset workbook.
workbook_path = using_dataset["input_path"]
read_options = {option: using_dataset[option] for option in ("sheet_name", "usecols")}
dataset_df = pd.read_excel(workbook_path, **read_options)
dataset_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017


In [5]:
# Build the extractor used by the extraction cell below.
# NOTE(review): the arguments look like a spaCy model name and the path to a
# quote-verb list — confirm against QuoteExtractor's signature.
quote_extractor = QuoteExtractor("en_core_web_lg", "./config/quote_verb_list.txt")

  import pkg_resources


In [6]:
# Run quote extraction over the whole dataset, passing the id column and the
# text column side by side. Later cells assume the result holds one list of
# quote records per row of `dataset_df`, in the same order as the rows.
quote_annotations = quote_extractor.run_multiple(
    dataset_df[using_dataset["id_col"]], 
    dataset_df[using_dataset["text_col"]]
)

Preprocessing texts...
Creating spacy docs...
Extracting quotes...
Done extracting quotes


The resulting columns will contain a list of quotes (and the below stats) for each article.
- `quotes_num_words`: the number of words in each quote
- `proportions_of_total_words`: the number of words divided by total article words for each quote
- `proportion_quote_words_to_total_words`: the sum of number of words in quotes in an article divided by total words in that article, i.e. "how much of this article is made up of quotes?"

A word is counted as any token that is neither whitespace nor punctuation.

In [7]:
# Spread each per-quote field into its own list-valued column: the outer list is
# assigned row by row, the inner list holds one value per quote in that article.
quote_field_for_column = {
    "quotes": "quote",
    "quotes_num_words": "quote_word_count",
    "proportions_of_total_words": "proportion_of_total_words",
}
for column_name, field_name in quote_field_for_column.items():
    dataset_df[column_name] = [
        [record[field_name] for record in article_records]
        for article_records in quote_annotations
    ]

# Adding up an article's per-quote proportions gives its overall quoted share.
dataset_df["proportion_quote_words_to_total_words"] = dataset_df["proportions_of_total_words"].apply(sum)

dataset_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quotes,quotes_num_words,proportions_of_total_words,proportion_quote_words_to_total_words
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"[, abandoned pet rates have sky-rocketed in Te...","[16, 20, 15, 41, 9]","[0.032719836400818, 0.0408997955010225, 0.0306...",0.206544
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,[that deaths in marijuana-related car crashes ...,"[14, 57]","[0.07446808510638298, 0.30319148936170215]",0.37766
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,[Organized crime gangs are buying hundreds or ...,"[21, 4, 30, 8, 23, 13, 19]","[0.059322033898305086, 0.011299435028248588, 0...",0.333333
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,[hes going to raise their gas taxes to the hig...,"[22, 15, 13, 15, 28, 18, 45]","[0.04932735426008968, 0.033632286995515695, 0....",0.349776
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,[],[],[],0.0


Create a new DF that has one quote on each row.

In [8]:
# Explode the three parallel list columns together so each quote gets its own
# row, then switch the per-quote columns to singular names.
per_quote_column_names = {
    "quotes": "quote",
    "quotes_num_words": "num_words",
    "proportions_of_total_words": "proportion_of_total_words",
}
exploded_df = dataset_df.explode(list(per_quote_column_names)).rename(columns=per_quote_column_names)

# An article with an empty quote list explodes to one row with a missing quote;
# keep only the rows that actually carry a quote.
has_quote = exploded_df["quote"].notna()
all_quotes_df = exploded_df[has_quote]

all_quotes_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quote,num_words,proportion_of_total_words,proportion_quote_words_to_total_words
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", abandoned pet rates have sky-rocketed in Tex...",16,0.03272,0.206544
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", pet owners within Texas, Arizona, and Missou...",20,0.0409,0.206544
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"a state funded, mandatory ‘pet registration’ p...",15,0.030675,0.206544
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", an incentive program may be implemented to e...",41,0.083845,0.206544
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"""domestic animal the size of a cat or larger""",9,0.018405,0.206544


In [9]:
# Split the per-quote table by the configured year column.
# NOTE(review): `get_groups` is used below as returning the group labels and the
# per-group frames in matching order (they are zipped together) — confirm in helpers.
years, years_dfs = get_groups(all_quotes_df, using_dataset["year_col"])
# Preview the first year's group.
years_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quote,num_words,proportion_of_total_words,proportion_quote_words_to_total_words
428,http://www.politifact.com/truth-o-meter/statem...,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,that the surge of troops in Iraq was 'working,9,0.071429,0.777778
428,http://www.politifact.com/truth-o-meter/statem...,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,the surge 'has failed' and that we should 'beg...,15,0.119048,0.777778
428,http://www.politifact.com/truth-o-meter/statem...,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,"""The fact that the New York senator can revers...",74,0.587302,0.777778


In [10]:
# Split the per-quote table by the configured text-type column, using the same
# helper as the year split.
types, types_dfs = get_groups(all_quotes_df, using_dataset["type_col"])
# Preview the first type's group.
types_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quote,num_words,proportion_of_total_words,proportion_quote_words_to_total_words
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", abandoned pet rates have sky-rocketed in Tex...",16,0.03272,0.206544
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", pet owners within Texas, Arizona, and Missou...",20,0.0409,0.206544
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"a state funded, mandatory ‘pet registration’ p...",15,0.030675,0.206544
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", an incentive program may be implemented to e...",41,0.083845,0.206544
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"""domestic animal the size of a cat or larger""",9,0.018405,0.206544


Calculate the ratio of the number of quotes to the number of articles for each year, i.e. the average number of quotes per article.

In [11]:
def get_proportions_num_quotes_to_num_articles_ratios(df: pd.DataFrame, year_col=None):
    """Compute, for each year, the number of quotes divided by the number of articles.

    Args:
        df: Article-level frame (one row per article) with a ``"quotes"`` column
            holding a list of quotes per article, plus the year column.
        year_col: Name of the year column. When omitted, the notebook-level
            ``using_dataset["year_col"]`` is used, which keeps existing
            one-argument calls working.

    Returns:
        A DataFrame with one row per year, sorted by year, and the columns
        ``"year"`` and ``"proportion_num_quotes_to_num_articles"``. Articles
        with an empty quote list still count toward the denominator.
    """
    if year_col is None:
        year_col = using_dataset["year_col"]

    # Built explicitly as int64 so an empty frame still yields a numeric series.
    quotes_per_article = pd.Series(
        [len(quotes_list) for quotes_list in df["quotes"]],
        index=df.index,
        dtype="int64",
    )

    # Group by the raw year values (positional) so no index alignment is involved.
    by_year = quotes_per_article.groupby(df[year_col].to_numpy())
    ratio_by_year = by_year.sum() / by_year.count()

    return pd.DataFrame({
        "year": ratio_by_year.index.to_numpy(),
        "proportion_num_quotes_to_num_articles": ratio_by_year.to_numpy(),
    })

In [12]:
# Computed from `dataset_df` (one row per article) rather than the exploded quote
# table, so articles without any quotes still count toward each year's denominator.
proportions_df = get_proportions_num_quotes_to_num_articles_ratios(dataset_df)
proportions_df

Unnamed: 0,year,proportion_num_quotes_to_num_articles
0,2007,3.0
1,2008,9.8
2,2009,6.176471
3,2010,4.217391
4,2011,3.954545
5,2012,5.714286
6,2013,3.327869
7,2014,3.911765
8,2015,3.714286
9,2016,3.252747


## Writing dataframes to excel spreadsheet
There are three parts to the spreadsheet:
- The `year` sheets contain one quote on each line
    - Column `proportion_of_total_words` represents the number of words in the individual quote divided by the total number of words in its corresponding article
- The `year_proportion` sheets contain one article on each line
    - Column `proportion_quote_words_to_total_words` represents the sum of the quote words divided by the total number of words in the article, i.e. "how much of this article is made up of quotes?"
- The `Proportion Summary` sheet shows summary statistics (the output of pandas `describe()`) for each year's `proportion_quote_words_to_total_words`

In [13]:
def get_quote_proportion_summary_df(dfs: list[pd.DataFrame], year_labels=None):
    """Summarise ``proportion_quote_words_to_total_words`` for each frame.

    Args:
        dfs: One frame per year, each with a
            ``"proportion_quote_words_to_total_words"`` column.
        year_labels: Labels for the rows of the result, one per frame and in the
            same order as ``dfs``. When omitted, the notebook-level ``years`` is
            used, as before; callers then have to make sure that global matches
            ``dfs`` at call time.

    Returns:
        A DataFrame indexed by ``"year"`` whose columns are the statistics
        produced by ``Series.describe()``.

    Raises:
        ValueError: If the number of labels differs from the number of frames.
    """
    if year_labels is None:
        # Backward-compatible fallback to the notebook global.
        year_labels = years

    year_labels = list(year_labels)
    if len(year_labels) != len(dfs):
        raise ValueError(
            f"Got {len(year_labels)} year labels for {len(dfs)} dataframes; they must match one-to-one."
        )

    return pd.DataFrame(
        [df["proportion_quote_words_to_total_words"].convert_dtypes().describe() for df in dfs],
        index=pd.Index(data=year_labels, name="year")
    )

In [14]:
def save_years(writer: pd.ExcelWriter, years: list[int], years_dfs: list[pd.DataFrame]):
    """Write the per-year quote sheets, per-year article sheets and the summary sheet.

    For every ``(year, frame)`` pair two sheets are written to ``writer``:

    * ``"<year>"``: every row of the frame, restricted to the id column,
      ``quote``, ``num_words`` and ``proportion_of_total_words``.
    * ``"<year>_proportion"``: the frame with duplicate ids dropped, restricted to
      the id column and ``proportion_quote_words_to_total_words``.

    Finally a ``"Proportion Summary"`` sheet is written from the de-duplicated frames.

    NOTE(review): ``get_quote_proportion_summary_df`` is called with the frames
    only, so the summary's year index is not taken from this function's ``years``
    argument; callers need the notebook-level ``years`` to match.

    Args:
        writer: Open Excel writer that receives the sheets; it is not closed here.
        years: Year labels, paired positionally with ``years_dfs``.
        years_dfs: One per-quote frame per year.
    """
    per_article_dfs: list[pd.DataFrame] = []

    for year, year_quotes_df in zip(years, years_dfs):
        id_col = using_dataset["id_col"]
        sheet_prefix = str(year)

        # One row per quote.
        quote_columns = [id_col, "quote", "num_words", "proportion_of_total_words"]
        year_quotes_df.to_excel(writer, sheet_name=sheet_prefix, index=False, columns=quote_columns)

        # One row per article id, carrying the article-level quoted share.
        per_article_df = year_quotes_df.drop_duplicates(subset=id_col)
        article_columns = [id_col, "proportion_quote_words_to_total_words"]
        per_article_df.to_excel(
            writer,
            sheet_name=sheet_prefix + "_proportion",
            index=False,
            columns=article_columns,
        )
        per_article_dfs.append(per_article_df)

    summary_df = get_quote_proportion_summary_df(per_article_dfs)
    summary_df.to_excel(writer, sheet_name="Proportion Summary")

In [15]:
# Write the workbook covering the whole dataset: the per-year sheets from
# `save_years`, plus the quotes-per-article ratios.
output_path = make_output_path(using_dataset, "quotes")

# The context manager closes the writer even if one of the writes raises.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)
    # `sheet_name` is passed by keyword: handing it over positionally is what
    # produced the warning recorded under this cell.
    proportions_df.to_excel(writer, sheet_name="Num Quotes Proportions", index=False)

  proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)


In [16]:
# Write one workbook per text type, with the same layout as the whole-dataset workbook.
# The loop variable is `text_type` so the builtin `type` is not shadowed.
for text_type, type_quotes_df in zip(types, types_dfs):
    # Deliberately rebinds the notebook-level `years` / `years_dfs`:
    # `get_quote_proportion_summary_df` (called by `save_years`) takes its index
    # from the notebook-level `years`, so it has to describe this type's groups.
    years, years_dfs = get_groups(type_quotes_df, using_dataset["year_col"])

    output_path = make_output_path_for_type(using_dataset, text_type, "quotes")

    # Ratios come from the article-level frame so that articles without quotes
    # still count. A separate name keeps the dataset-wide `proportions_df` intact.
    full_type_df = dataset_df[dataset_df[using_dataset["type_col"]] == text_type]
    type_proportions_df = get_proportions_num_quotes_to_num_articles_ratios(full_type_df)

    # The context manager closes the writer even if one of the writes raises.
    with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
        save_years(writer, years, years_dfs)
        # Keyword `sheet_name` avoids the positional-argument warning from `to_excel`.
        type_proportions_df.to_excel(writer, sheet_name="Num Quotes Proportions", index=False)

  proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)
  proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)
  proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)
