In [1]:
import pandas as pd
import spacy
from spacy.tokens.doc import Doc
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type, is_word

In [2]:
# Notebook-local aliases for the two base dataset configurations imported from
# dataset_config; one of them is selected as `using_dataset` in the next cell.
fakespeak_config = BASE_FAKESPEAK_CONFIG
misinfotext_config = BASE_MISINFOTEXT_CONFIG

In [3]:
# Dataset configuration analyzed by this run; every later cell reads its
# paths and column names from here. Swap in `fakespeak_config` to process
# the other dataset.
using_dataset = misinfotext_config

In [4]:
# Read only the configured worksheet and columns of the dataset workbook,
# then preview the first rows to confirm the load.
dataset_df = pd.read_excel(
    io=using_dataset["input_path"],
    sheet_name=using_dataset["sheet_name"],
    usecols=using_dataset["usecols"],
)
dataset_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017


In [5]:
# Load the spaCy pipeline used to tokenize the article text. The
# "en_core_web_md" pipeline package must already be installed in the
# kernel's environment.
nlp = spacy.load("en_core_web_md")

  import pkg_resources


In [6]:
# Parse every article with spaCy in a single batched pass.
# Blank spreadsheet cells are read by pandas as NaN (a float), which nlp.pipe
# rejects, so missing or non-string values are normalized to strings first.
# An empty text simply produces an empty Doc.
dataset_df["doc"] = list(
    nlp.pipe(dataset_df[using_dataset["text_col"]].fillna("").astype(str))
)
dataset_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"(Residents, of, multiple, states, will, be, as..."
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,"(Sacramento, ,, CA, -, United, States, Senator..."
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,"(We, should, anticipate, black, and, gray, mar..."
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,"(As, a, ballot, initiative, calling, for, repe..."
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,"(WASHINGTON, ,, DC, , The, House, of, Represe..."


In [7]:
def get_proportion_upper_to_alpha_tokens(doc: Doc):
    """Return the share of a document's alphabetic word tokens that are all-caps.

    Args:
        doc: Parsed spaCy document to score.

    Returns:
        A float in [0, 1]: the number of upper-case word tokens divided by the
        number of alphabetic word tokens. Returns 0.0 when the document has no
        alphabetic word tokens, and 1.0 when the upper-case count exceeds the
        alphabetic count.
    """
    # `is_word` is the project's helper from `helpers`.
    # NOTE(review): its exact filtering rule is not visible in this notebook.
    word_tokens = [token for token in doc if is_word(token)]

    num_upper_tokens = sum(1 for token in word_tokens if token.is_upper)
    num_alpha_tokens = sum(1 for token in word_tokens if token.is_alpha)

    # Sometimes spacy counts non-alpha tokens as uppercase (e.g. WW3).
    # If the proportion would exceed 1, then it's reasonable to treat
    # the whole text as uppercase.
    if num_upper_tokens > num_alpha_tokens:
        return 1.0

    # Some articles don't have any words at all (e.g. just a link).
    # Guard the division explicitly instead of catching every exception,
    # so genuine errors (and KeyboardInterrupt) are no longer swallowed.
    if num_alpha_tokens == 0:
        return 0.0

    return num_upper_tokens / num_alpha_tokens

In [9]:
# Score every parsed article, then rank the articles so the most heavily
# capitalized ones come first.
dataset_df["proportion_upper_to_alpha_tokens"] = (
    dataset_df["doc"].apply(get_proportion_upper_to_alpha_tokens)
)
dataset_df = dataset_df.sort_values(
    by="proportion_upper_to_alpha_tokens",
    ascending=False,
)

dataset_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,proportion_upper_to_alpha_tokens
506,http://www.politifact.com/truth-o-meter/statem...,https://www.instagram.com/p/BCYF4O6mhex/,"""FIRST THEY IGNORE YOU, THEN THEY LAUGH AT YOU...",,Social media,2016-02-29,2016,"("", FIRST, THEY, IGNORE, YOU, ,, THEN, THEY, L...",1.0
183,http://www.politifact.com/punditfact/statement...,https://x.com/cbrandonellis/status/89526051024...,BILL CLINTON GAVE NORTH KOREA $5 BILLION AND T...,,Social media,2017-08-09,2017,"(BILL, CLINTON, GAVE, NORTH, KOREA, $, 5, BILL...",1.0
502,http://www.politifact.com/truth-o-meter/statem...,https://www.facebook.com/LiberalsAreCool/posts...,TUITION-FREE PUBLIC UNIVERSITIES AND COLLEGES ...,,Social media,2016-06-02,2016,"(TUITION, -, FREE, PUBLIC, UNIVERSITIES, AND, ...",1.0
514,http://www.politifact.com/truth-o-meter/statem...,https://www.facebook.com/OccupyDemocrats/photo...,HAS TAKEN LESS VACATION DAYS THAN ANY OTHER PR...,,Social media,2016-08-20,2016,"(HAS, TAKEN, LESS, VACATION, DAYS, THAN, ANY, ...",1.0
634,http://www.politifact.com/wisconsin/statements...,http://www.wisn.com/politics/chris-larson-on-c...,Chris Larson on Chris Abele: 'He finally showe...,Chris Larson on Chris Abele: 'He finally showe...,Press release,2016-04-06,2016,"(Chris, Larson, on, Chris, Abele, :, ', He, fi...",0.810526


In [10]:
# Split the scored articles by the configured year column.
# NOTE(review): get_groups is a project helper; presumably it returns the
# group keys and the matching per-group DataFrames in parallel order —
# confirm against helpers.
years, years_dfs = get_groups(dataset_df, using_dataset["year_col"])
# Preview the first group as a sanity check.
years_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,proportion_upper_to_alpha_tokens
428,http://www.politifact.com/truth-o-meter/statem...,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,"("", On, Monday, ,, Senator, Clinton, told, an,...",0.008


In [11]:
# Split the scored articles by the configured text-type column.
# NOTE(review): get_groups is a project helper; presumably it returns the
# group keys and the matching per-group DataFrames in parallel order —
# confirm against helpers.
types, types_dfs = get_groups(dataset_df, using_dataset["type_col"])
# Preview the first group as a sanity check.
types_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,proportion_upper_to_alpha_tokens
220,http://www.politifact.com/punditfact/statement...,http://usanewstoday.org/index.php/2017/01/02/c...,If this is true it will mean an opening for an...,CONFIRMED ! JUSTICE RUTH BADER GINSBURG IS RES...,News and blog,2017-01-02,2017,"(If, this, is, true, it, will, mean, an, openi...",0.612963
394,http://www.politifact.com/texas/statements/201...,https://truethevote.org/news/attny-gen-eric-ho...,Holder is against Texas Plans for Redistrictin...,Attny Gen. Eric Holder is Coming to Austin - W...,News and blog,2011-12-12,2011,"(Holder, is, against, Texas, Plans, for, Redis...",0.483516
325,http://www.politifact.com/punditfact/statement...,https://internationalflashnews.blogspot.com/20...,"AMID AN INTERVIEW ON CNN’S “AC360,” FORMER PRE...",MITT ROMNEY: ‘TRUMP HAS SOME VERY SERIOUS EMOT...,News and blog,2018-06-06,2018,"(AMID, AN, INTERVIEW, ON, CNN, ’S, “, AC360, ,...",0.172414
167,http://www.politifact.com/punditfact/statement...,http://dailysnark.com/harambe-dead-gorilla-got...,You may asked what the Unites States did to na...,"Harambe, A Dead Gorilla, Got Over 15,000 Votes...",News and blog,2016-11-08,2016,"(You, may, asked, what, the, Unites, States, d...",0.120773
211,http://www.politifact.com/punditfact/statement...,https://donaldtrumppotus45.com/2017/01/26/brea...,"For years, the debate has raged over whether o...",BREAKING! President Trump Makes ENGLISH The OF...,News and blog,2017-01-29,2017,"(For, years, ,, the, debate, has, raged, over,...",0.096154


In [12]:
def get_summary_df(dfs: list[pd.DataFrame], years: list[int]):
    """Build a table of descriptive statistics, one row per year.

    Args:
        dfs: DataFrames that each contain a
            "proportion_upper_to_alpha_tokens" column.
        years: Row labels, one per entry of `dfs`, in the same order.

    Returns:
        A DataFrame indexed by `years` (index name "year") whose columns are
        the statistics produced by `Series.describe()` for each frame.
    """
    per_year_stats = []
    for frame in dfs:
        per_year_stats.append(frame["proportion_upper_to_alpha_tokens"].describe())

    year_index = pd.Index(data=years, name="year")
    return pd.DataFrame(per_year_stats, index=year_index)

## Writing dataframes to Excel spreadsheets

In [13]:
# Columns written to each per-year sheet by save_years: the configured id,
# text and type columns plus the computed all-caps proportion.
save_cols = [
    using_dataset["id_col"], 
    using_dataset["text_col"], 
    using_dataset["type_col"], 
    "proportion_upper_to_alpha_tokens"
]

def save_years(writer: pd.ExcelWriter, years: list[int], dfs: list[pd.DataFrame]):
    """Write one sheet per year plus a summary sheet to an open workbook.

    Args:
        writer: Open Excel writer that receives the sheets. It is not closed
            here; the caller owns it.
        years: Year labels, one per entry of `dfs`, used as the sheet names.
        dfs: Per-year DataFrames, in the same order as `years`.
    """
    # Each year's sheet keeps only the module-level `save_cols` columns and
    # omits the DataFrame index.
    for sheet_label, year_df in zip(map(str, years), dfs):
        year_df.to_excel(
            writer,
            sheet_name=sheet_label,
            columns=save_cols,
            index=False,
        )

    # The final sheet holds the descriptive statistics for every year.
    summary_df = get_summary_df(dfs, years)
    summary_df.to_excel(writer, sheet_name="Proportion Summary")

In [14]:
# Write the per-year sheets and the summary for the whole dataset.
output_path = make_output_path(using_dataset, "all_caps")

# The context manager saves and closes the workbook even if writing a sheet
# raises, so the file handle is never left open.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

In [15]:
# Write one workbook per text type, each with its own per-year sheets and
# summary. Loop-specific names are used so the builtin `type` is not shadowed
# and the notebook-level `years`, `years_dfs` and `output_path` from the
# earlier cells are not overwritten (which would make re-running those cells
# write the wrong data).
for text_type, type_df in zip(types, types_dfs):
    type_years, type_years_dfs = get_groups(type_df, using_dataset["year_col"])

    type_output_path = make_output_path_for_type(using_dataset, text_type, "all_caps")

    # The context manager saves and closes each workbook even if writing fails.
    with pd.ExcelWriter(type_output_path, engine="xlsxwriter") as writer:
        save_years(writer, type_years, type_years_dfs)