In [1]:
import pandas as pd
import spacy
from quote_extractor import QuoteExtractor
from helpers import load_data, get_groups

In [2]:
dataset_df = load_data()
dataset_df.head()

Unnamed: 0,id,text,headline,text_type,year
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016
1,http://www.politifact.com/california/statement...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016
2,http://www.politifact.com/california/statement...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017
3,http://www.politifact.com/california/statement...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017
4,http://www.politifact.com/california/statement...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017


In [3]:
quote_extractor = QuoteExtractor("en_core_web_lg", "./config/quote_verb_list.txt")

  import pkg_resources


In [4]:
quote_annotations = quote_extractor.run_multiple(
    dataset_df["id"], 
    dataset_df["text"]
)

Preprocessing texts...
Creating spacy docs...
Extracting quotes...
Done extracting quotes


The resulting columns will contain a list of quotes (and the below stats) for each article.
- `quotes_num_words`: the number of words in each quote
- `proportions_of_total_words`: the number of words divided by total article words for each quote
- `proportion_quote_words_to_total_words`: the sum of number of words in quotes in an article divided by total words in that article, i.e. "how much of this article is made up of quotes?"

We count a word to be a token that is not space or punctuation.

In [5]:
dataset_df["quotes"] = [[doc["quote"] for doc in docs] 
                        for docs in quote_annotations]
dataset_df["quotes_num_words"] = [[doc["quote_word_count"] for doc in docs] 
                        for docs in quote_annotations]
dataset_df["proportions_of_total_words"] = [[doc["proportion_of_total_words"] for doc in docs] 
                        for docs in quote_annotations]
dataset_df["proportion_quote_words_to_total_words"] = dataset_df["proportions_of_total_words"].apply(sum)

dataset_df.head()

Unnamed: 0,id,text,headline,text_type,year,quotes,quotes_num_words,proportions_of_total_words,proportion_quote_words_to_total_words
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,"[, abandoned pet rates have sky-rocketed in Te...","[16, 20, 15, 41, 9]","[0.032719836400818, 0.0408997955010225, 0.0306...",0.206544
1,http://www.politifact.com/california/statement...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016,[that deaths in marijuana-related car crashes ...,"[14, 57]","[0.07446808510638298, 0.30319148936170215]",0.37766
2,http://www.politifact.com/california/statement...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017,[Organized crime gangs are buying hundreds or ...,"[21, 4, 30, 8, 23, 13, 19]","[0.059322033898305086, 0.011299435028248588, 0...",0.333333
3,http://www.politifact.com/california/statement...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017,[hes going to raise their gas taxes to the hig...,"[22, 15, 13, 15, 28, 18, 45]","[0.04932735426008968, 0.033632286995515695, 0....",0.349776
4,http://www.politifact.com/california/statement...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017,[],[],[],0.0


Create a new DF that has one quote on each row.

In [6]:
all_quotes_df = dataset_df.explode(["quotes", "quotes_num_words", "proportions_of_total_words"])\
    .rename(columns={"quotes": "quote"})\
    .rename(columns={"quotes_num_words": "num_words"})\
    .rename(columns={"proportions_of_total_words": "proportion_of_total_words"})
all_quotes_df = all_quotes_df[all_quotes_df["quote"].notna()]

all_quotes_df.head()

Unnamed: 0,id,text,headline,text_type,year,quote,num_words,proportion_of_total_words,proportion_quote_words_to_total_words
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,", abandoned pet rates have sky-rocketed in Tex...",16,0.03272,0.206544
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,", pet owners within Texas, Arizona, and Missou...",20,0.0409,0.206544
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,"a state funded, mandatory ‘pet registration’ p...",15,0.030675,0.206544
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,", an incentive program may be implemented to e...",41,0.083845,0.206544
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,"""domestic animal the size of a cat or larger""",9,0.018405,0.206544


In [7]:
years, years_dfs = get_groups(all_quotes_df, "year")
years_dfs[0].head()

Unnamed: 0,id,text,headline,text_type,year,quote,num_words,proportion_of_total_words,proportion_quote_words_to_total_words
433,http://www.politifact.com/truth-o-meter/statem...,"Washington, D.C., Mar 25 - In response to sugg...",Bachmann Demands Truth: Will Obama Administrat...,Press release,2009,"""Yesterday, during a Financial Services Commit...",29,0.113725,0.329412
433,http://www.politifact.com/truth-o-meter/statem...,"Washington, D.C., Mar 25 - In response to sugg...",Bachmann Demands Truth: Will Obama Administrat...,Press release,2009,he was open to supporting it,6,0.023529,0.329412
433,http://www.politifact.com/truth-o-meter/statem...,"Washington, D.C., Mar 25 - In response to sugg...",Bachmann Demands Truth: Will Obama Administrat...,Press release,2009,the President is principally responsible for t...,13,0.05098,0.329412
433,http://www.politifact.com/truth-o-meter/statem...,"Washington, D.C., Mar 25 - In response to sugg...",Bachmann Demands Truth: Will Obama Administrat...,Press release,2009,"""And President Obama gave the nation the same ...",36,0.141176,0.329412
434,http://www.politifact.com/truth-o-meter/statem...,When most Americans talk about the need for he...,Taxpayer-Funded Abortion Is Not Health-Care Re...,News and blog,2009,health-care legislation should include expande...,9,0.010405,0.068208


In [8]:
types, types_dfs = get_groups(all_quotes_df, "text_type")
types_dfs[0].head()

Unnamed: 0,id,text,headline,text_type,year,quote,num_words,proportion_of_total_words,proportion_quote_words_to_total_words
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,", abandoned pet rates have sky-rocketed in Tex...",16,0.03272,0.206544
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,", pet owners within Texas, Arizona, and Missou...",20,0.0409,0.206544
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,"a state funded, mandatory ‘pet registration’ p...",15,0.030675,0.206544
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,", an incentive program may be implemented to e...",41,0.083845,0.206544
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,"""domestic animal the size of a cat or larger""",9,0.018405,0.206544


Calculate proportion of number of quotes to number of articles per year

In [9]:
def get_proportions_num_quotes_to_num_articles_ratios(df: pd.DataFrame):
    grouped_by_year = df.groupby("year")
    years = grouped_by_year.groups.keys()

    def loop():
        for year in grouped_by_year.groups:
            year_df = grouped_by_year.get_group(year)
            yield year_df["quotes"].apply(lambda quotes_list: len(quotes_list)).sum() / year_df.shape[0]
    
    return pd.DataFrame({
        "year": years,
        "proportion_num_quotes_to_num_articles": loop()
    })

In [10]:
proportions_df = get_proportions_num_quotes_to_num_articles_ratios(dataset_df)
proportions_df

Unnamed: 0,year,proportion_num_quotes_to_num_articles
0,2009,6.176471
1,2010,4.217391
2,2011,3.954545
3,2012,5.714286
4,2013,3.327869
5,2014,3.911765
6,2015,3.714286
7,2016,3.252747
8,2017,2.910314
9,2018,3.491803


## Writing dataframes to excel spreadsheet
There are three parts to the spreadsheet:
- The `year` sheets contain one quote on each line
    - Column `proportion_of_total_words` represents the number of words in the individual quote divided by the total number of quotes in the doc for its corresponding article
- The `year_proportion` sheets contain one article on each line
    - Column `proportion_quote_words_to_total_words` represents the sum of the quote words divided by the total number of quotes in the doc, i.e. "how much of this article is made up of quotes?"
- The `Proportion_Summary` sheet shows summary statistics for each year's `proportion_quote_words_to_total_words`

In [11]:
def get_quote_proportion_summary_df(dfs: list[pd.DataFrame]):
    return pd.DataFrame(
        [df["proportion_quote_words_to_total_words"].convert_dtypes().describe() for df in dfs],
        index=pd.Index(data=years, name="year")
    )

In [12]:
def save_years(writer: pd.ExcelWriter, years: list[int], years_dfs: list[pd.DataFrame]):
    proportion_dfs: list[pd.DataFrame] = []
    
    for year, df in zip(years, years_dfs):
        # Save individual quotes
        df.to_excel(
            writer,
            sheet_name=str(year),
            index=False,
            columns=["id", "quote", "num_words", "proportion_of_total_words", "year"]
        )

        # Save proportion of quote words to total words in separate sheet
        proportion_df = df.drop_duplicates(subset="id")
        proportion_df.to_excel(
            writer,
            sheet_name=f"{str(year)}_proportion",
            index=False,
            columns=["id", "proportion_quote_words_to_total_words", "year"]
        )

        proportion_dfs.append(proportion_df)

    get_quote_proportion_summary_df(proportion_dfs).to_excel(writer, sheet_name="Proportion Summary")

In [13]:
writer = pd.ExcelWriter("./output/quotes.xlsx", engine="xlsxwriter")

save_years(writer, years, years_dfs)
proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)

writer.close()

  proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)


In [14]:
for type, df in zip(types, types_dfs):
    years, years_dfs = get_groups(df, "year")
    
    type_str = str(type).lower().replace(" ", "_")

    writer = pd.ExcelWriter(f"./output/{type_str}/quotes_{type_str}.xlsx", engine="xlsxwriter")
    
    save_years(writer, years, years_dfs)
    
    full_type_df = dataset_df[dataset_df["text_type"] == type]
    proportions_df = get_proportions_num_quotes_to_num_articles_ratios(full_type_df)
    proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)

    writer.close()

  proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)
  proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)
  proportions_df.to_excel(writer, "Num Quotes Proportions", index=False)
