In [1]:
from pathlib import Path

import pandas as pd
import spacy
from spacy.tokens.doc import Doc

from helpers import load_data, get_groups, is_word

In [2]:
# Load the article dataset through the project helper. The preview below shows
# the columns this notebook relies on: id, text, headline, text_type and year.
dataset_df = load_data()
# Bare last expression so the notebook renders the first rows as a table.
dataset_df.head()

Unnamed: 0,id,text,headline,text_type,year
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016
1,http://www.politifact.com/california/statement...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016
2,http://www.politifact.com/california/statement...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017
3,http://www.politifact.com/california/statement...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017
4,http://www.politifact.com/california/statement...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017


In [3]:
# Load the spaCy pipeline named "en_core_web_md"; it is used below to tokenize
# every article text.
nlp = spacy.load("en_core_web_md")

  import pkg_resources


In [4]:
# Run every article text through the spaCy pipeline (batched via nlp.pipe) and
# keep the parsed Doc objects in a new "doc" column, one per row.
# NOTE(review): this adds a column to dataset_df in place, so the preview shown
# by the load cell above no longer reflects the frame's current columns.
dataset_df["doc"] = list(nlp.pipe(dataset_df["text"]))

In [5]:
def get_proportion_upper_to_alpha_tokens(doc: Doc) -> float:
    """Return the share of a document's alphabetic word tokens that are upper-case.

    Only tokens accepted by the project helper ``is_word`` are considered.
    Among those, the number of tokens flagged ``is_upper`` is divided by the
    number of tokens flagged ``is_alpha``.

    Args:
        doc: A parsed spaCy ``Doc``.

    Returns:
        A float in ``[0.0, 1.0]``. ``1.0`` when there are more upper-case
        tokens than alphabetic tokens, ``0.0`` when the document has no
        upper-case and no alphabetic word tokens.
    """
    word_tokens = [token for token in doc if is_word(token)]

    num_upper_tokens = sum(1 for token in word_tokens if token.is_upper)
    num_alpha_tokens = sum(1 for token in word_tokens if token.is_alpha)

    # Sometimes spaCy counts non-alpha tokens as uppercase (e.g. WW3).
    # If the proportion would exceed 1, then it's reasonable to treat the
    # whole text as uppercase. This check deliberately comes before the
    # zero guard so a text made only of such tokens still scores 1.
    if num_upper_tokens > num_alpha_tokens:
        return 1.0

    # Some articles don't have any words at all (e.g. just a link). Guard the
    # division explicitly instead of hiding it behind a bare ``except``, which
    # would also swallow unrelated errors (and KeyboardInterrupt).
    if num_alpha_tokens == 0:
        return 0.0

    return num_upper_tokens / num_alpha_tokens

In [6]:
# Score every parsed document with its upper-case proportion and store the
# result in a new column.
dataset_df["proportion_upper_to_alpha_tokens"] = dataset_df["doc"].apply(get_proportion_upper_to_alpha_tokens)
# Sort descending so the most heavily upper-cased texts come first; the name
# dataset_df is rebound to the sorted frame (original index labels are kept).
dataset_df = dataset_df.sort_values("proportion_upper_to_alpha_tokens", ascending=False)

# Preview the highest-scoring rows.
dataset_df.head()

Unnamed: 0,id,text,headline,text_type,year,doc,proportion_upper_to_alpha_tokens
2960,Politifact_Pants on Fire_Social media_621529,ANYBODY ELSE FIND IT FUNNY THAT ISRAEL WAS ATT...,,Social media,2023,"(ANYBODY, ELSE, FIND, IT, FUNNY, THAT, ISRAEL,...",1.0
1437,Politifact_FALSE_Social media_314046,ITS IMPOSSIBLE TO RIGHTLY GOVERN A\nNATION WIT...,,Social media,2019,"(ITS, IMPOSSIBLE, TO, RIGHTLY, GOVERN, A, \n, ...",1.0
1528,Politifact_FALSE_Social media_50740,PRESIDENT TRUMP IS ADDRESSING TO THE NATION TO...,,Social media,2023,"(PRESIDENT, TRUMP, IS, ADDRESSING, TO, THE, NA...",1.0
1523,Politifact_FALSE_Social media_843029,"IF THE SHOT ACTUALLY\nSAVED LIVES, THEY'D BE P...",,Social media,2021,"(IF, THE, SHOT, ACTUALLY, \n, SAVED, LIVES, ,,...",1.0
1522,Politifact_FALSE_Social media_727906,JOHN F KENNEDY WAS PLANNING TO END THE FEDERAL...,,Social media,2021,"(JOHN, F, KENNEDY, WAS, PLANNING, TO, END, THE...",1.0


In [7]:
# Split the scored dataset by the "year" column.
# NOTE(review): get_groups is a project helper; it presumably returns the group
# keys and a parallel list of per-group DataFrames -- confirm in helpers.
years, years_dfs = get_groups(dataset_df, "year")
# Preview the first group's rows.
years_dfs[0].head()

Unnamed: 0,id,text,headline,text_type,year,doc,proportion_upper_to_alpha_tokens
433,http://www.politifact.com/truth-o-meter/statem...,"Washington, D.C., Mar 25 - In response to sugg...",Bachmann Demands Truth: Will Obama Administrat...,Press release,2009,"(Washington, ,, D.C., ,, Mar, 25, -, In, respo...",0.036585
450,http://www.politifact.com/truth-o-meter/statem...,"""There's an inherent conflict of interest....T...",AARP: Helping Seniors Or Helping Itself?\nAdmi...,Press release,2009,"("", There, 's, an, inherent, conflict, of, int...",0.033195
447,http://www.politifact.com/truth-o-meter/statem...,"Global Warming: Given rising voter anger, memb...",\nThe Euro-Meddlers,News and blog,2009,"(Global, Warming, :, Given, rising, voter, ang...",0.020408
435,http://www.politifact.com/truth-o-meter/statem...,A number of people in the news analysis busine...,One of these things is not like the other,News and blog,2009,"(A, number, of, people, in, the, news, analysi...",0.017316
443,http://www.politifact.com/truth-o-meter/statem...,"For weeks, President Obama has told us that hi...",Miller Newsletter on Stimulus,Press release,2009,"(For, weeks, ,, President, Obama, has, told, u...",0.013587


In [8]:
# Split the scored dataset by the "text_type" column.
# NOTE(review): get_groups is a project helper; it presumably returns the group
# keys and a parallel list of per-group DataFrames -- confirm in helpers.
types, types_dfs = get_groups(dataset_df, "text_type")
# Preview the first group's rows.
types_dfs[0].head()

Unnamed: 0,id,text,headline,text_type,year,doc,proportion_upper_to_alpha_tokens
220,http://www.politifact.com/punditfact/statement...,If this is true it will mean an opening for an...,CONFIRMED ! JUSTICE RUTH BADER GINSBURG IS RES...,News and blog,2017,"(If, this, is, true, it, will, mean, an, openi...",0.612963
394,http://www.politifact.com/texas/statements/201...,Holder is against Texas Plans for Redistrictin...,Attny Gen. Eric Holder is Coming to Austin - W...,News and blog,2011,"(Holder, is, against, Texas, Plans, for, Redis...",0.483516
325,http://www.politifact.com/punditfact/statement...,"AMID AN INTERVIEW ON CNN’S “AC360,” FORMER PRE...",MITT ROMNEY: ‘TRUMP HAS SOME VERY SERIOUS EMOT...,News and blog,2018,"(AMID, AN, INTERVIEW, ON, CNN, ’S, “, AC360, ,...",0.172414
699,Politifact_FALSE_News and blog_701529,"This is BIG NEWS.\nAfter December 31, 2021, th...",HUGE. CDC Withdraws Use of PCR Test for COVID ...,News and blog,2021,"(This, is, BIG, NEWS, ., \n, After, December, ...",0.150685
1114,Politifact_FALSE_News and blog_576637,Citizens in **MANY** areas of the U.S.A. are s...,UPDATED **AGAIN ** 8:12 PM EDT -- TANKS & TROO...,News and blog,2023,"(Citizens, in, *, *, MANY, *, *, areas, of, th...",0.135065


In [9]:
def get_summary_df(dfs: list[pd.DataFrame], years: list[int]):
    """Build a table of descriptive statistics, one row per year.

    Args:
        dfs: DataFrames that each contain a
            ``proportion_upper_to_alpha_tokens`` column.
        years: Labels for the rows of the result, in the same order as ``dfs``.

    Returns:
        A DataFrame whose index is named ``"year"`` and whose columns are the
        statistics produced by ``Series.describe()`` for each frame's
        proportion column.
    """
    per_year_stats = []
    for frame in dfs:
        proportions = frame["proportion_upper_to_alpha_tokens"]
        per_year_stats.append(proportions.describe())

    year_index = pd.Index(data=years, name="year")
    return pd.DataFrame(per_year_stats, index=year_index)

## Writing DataFrames to Excel spreadsheets

In [12]:
def save_years(writer: pd.ExcelWriter, years: list[int], dfs: list[pd.DataFrame]):
    """Write one worksheet per year plus a summary worksheet to ``writer``.

    Args:
        writer: An open Excel writer; this function neither saves nor closes it.
        years: Year labels, in the same order as ``dfs``.
        dfs: One DataFrame per year, each containing the ``id``, ``year`` and
            ``proportion_upper_to_alpha_tokens`` columns.
    """
    exported_columns = ["id", "year", "proportion_upper_to_alpha_tokens"]

    # One sheet per year, named after the year, without the DataFrame index.
    for year, year_df in zip(years, dfs):
        sheet = str(year)
        year_df.to_excel(writer, sheet_name=sheet, index=False, columns=exported_columns)

    # Final sheet: descriptive statistics of the proportion for every year.
    summary_df = get_summary_df(dfs, years)
    summary_df.to_excel(writer, sheet_name="Proportion Summary")

In [13]:
# Write the per-year sheets for the whole dataset to ./output/all_caps.xlsx.
# Make sure the target directory exists so a fresh checkout can run this cell.
Path("./output").mkdir(parents=True, exist_ok=True)

# The context manager saves and closes the workbook even if save_years raises,
# so no half-open file handle is left behind in the kernel.
with pd.ExcelWriter("./output/all_caps.xlsx", engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

In [14]:
# Write one workbook per text type, each with its own per-year sheets.
# Loop variables use dedicated names so the builtin `type` is not shadowed and
# the notebook-level `years` / `years_dfs` (used by the whole-dataset export
# cell) are not overwritten -- re-running that cell afterwards stays correct.
for text_type, type_df in zip(types, types_dfs):
    type_years, type_years_dfs = get_groups(type_df, "year")

    # File-system friendly name, e.g. "News and blog" -> "news_and_blog".
    type_str = str(text_type).lower().replace(" ", "_")

    # Each type gets its own sub-directory; create it if it is missing.
    type_dir = Path(f"./output/{type_str}")
    type_dir.mkdir(parents=True, exist_ok=True)

    # The context manager saves and closes the workbook even on error.
    with pd.ExcelWriter(type_dir / f"all_caps_{type_str}.xlsx", engine="xlsxwriter") as writer:
        save_years(writer, type_years, type_years_dfs)