In [1]:
import pandas as pd
import spacy
from spacy.tokens.doc import Doc
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type, is_word

In [2]:
# Local aliases for the two base dataset configurations imported from
# dataset_config; one of them is chosen as the active dataset in the next cell.
fakespeak_config = BASE_FAKESPEAK_CONFIG
misinfotext_config = BASE_MISINFOTEXT_CONFIG

In [3]:
# Active dataset configuration: every later cell reads its paths and column
# names from this mapping. Presumably misinfotext_config can be assigned here
# instead to run the same analysis on the other dataset — verify its keys match.
using_dataset = fakespeak_config

In [4]:
# Read only the configured sheet and columns of the input workbook.
dataset_df = pd.read_excel(
    using_dataset["input_path"],
    sheet_name=using_dataset["sheet_name"],
    usecols=using_dataset["usecols"],
)

# 2007 and 2008 contain too little data to be useful, so drop every row
# whose year is either of them.
dataset_df = dataset_df[
    ~(
        (dataset_df[using_dataset["year_col"]] == 2007)
        | (dataset_df[using_dataset["year_col"]] == 2008)
    )
]

dataset_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019


In [5]:
# Load the installed spaCy pipeline package "en_core_web_md"; it is used below
# to turn each text into a spaCy Doc.
nlp = spacy.load("en_core_web_md")

  import pkg_resources


In [6]:
# Run every text through the spaCy pipeline and store the resulting Doc objects
# in a new "doc" column (nlp.pipe consumes the texts as one stream instead of
# one nlp(...) call per row).
dataset_df["doc"] = list(nlp.pipe(dataset_df[using_dataset["text_col"]]))
dataset_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,doc
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019,"(Mexico, is, paying, for, the, Wall, through, ..."
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"(Chuck, Schumer, :, "", why, should, American, ..."
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019,"(Billions, of, dollars, are, sent, to, the, St..."
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019,"(If, 50, Billion, $, $, were, set, aside, to, ..."
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019,"(Huge@#CD, 9, news, ., \n, @ncsbe, \n , sent, ..."


In [7]:
def get_proportion_upper_to_alpha_tokens(doc: Doc) -> float:
    """Return the ratio of uppercase word tokens to alphabetic word tokens.

    Only tokens accepted by ``is_word`` are considered. Among those, tokens
    whose ``is_upper`` flag is set are counted as uppercase and tokens whose
    ``is_alpha`` flag is set are counted as alphabetic.

    Args:
        doc: The parsed document whose tokens are inspected.

    Returns:
        ``num_upper / num_alpha``, except:
        * ``1.0`` when there are more uppercase than alphabetic tokens, and
        * ``0.0`` when the document has no alphabetic word tokens at all.
    """
    word_tokens = [token for token in doc if is_word(token)]

    num_upper_tokens = sum(1 for token in word_tokens if token.is_upper)
    num_alpha_tokens = sum(1 for token in word_tokens if token.is_alpha)

    # Sometimes spacy counts non-alpha tokens as uppercase (e.g. WW3).
    # If the proportion would exceed 1, then it's reasonable to treat
    # the whole text as uppercase.
    if num_upper_tokens > num_alpha_tokens:
        return 1.0

    # Some articles don't have any words at all (e.g. just a link), which
    # would make the denominator zero. Handle that case explicitly instead of
    # catching every exception, so genuine errors are not silently reported
    # as a proportion of 0.
    if num_alpha_tokens == 0:
        return 0.0

    return num_upper_tokens / num_alpha_tokens

In [8]:
# Score every parsed document, then order the rows from the highest to the
# lowest proportion so the preview below shows the most uppercase-heavy texts.
dataset_df["proportion_upper_to_alpha_tokens"] = dataset_df["doc"].apply(get_proportion_upper_to_alpha_tokens)
dataset_df = dataset_df.sort_values("proportion_upper_to_alpha_tokens", ascending=False)

dataset_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,doc,proportion_upper_to_alpha_tokens
2960,Politifact_Pants on Fire_Social media_621529,Pants on Fire,Social media,ANYBODY ELSE FIND IT FUNNY THAT ISRAEL WAS ATT...,2023,"(ANYBODY, ELSE, FIND, IT, FUNNY, THAT, ISRAEL,...",1.0
593,Politifact_FALSE_Social media_404153,False,Social media,PFIZER LAB IN MADRID ON FIRE,2021,"(PFIZER, LAB, IN, MADRID, ON, FIRE)",1.0
1852,Politifact_FALSE_Social media_949849,False,Social media,"IN 2010, MAXINE WATERS STEERED $12 MILLION IN\...",2019,"(IN, 2010, ,, MAXINE, WATERS, STEERED, $, 12, ...",1.0
330,Politifact_FALSE_Social media_240337,False,Social media,MORE POLL WORKERS CAUGHT FILLING OUT BALLOTS I...,2020,"(MORE, POLL, WORKERS, CAUGHT, FILLING, OUT, BA...",1.0
591,Politifact_FALSE_Social media_813786,False,Social media,YOU WILL NEVER TRUST A CELEBRITY AGAIN AFTER W...,2021,"(YOU, WILL, NEVER, TRUST, A, CELEBRITY, AGAIN,...",1.0


In [9]:
# Group the scored dataset by the year column. Judging by how the result is
# unpacked and indexed, get_groups returns the group labels together with a
# matching sequence of per-group DataFrames — confirm against helpers.get_groups.
years, years_dfs = get_groups(dataset_df, using_dataset["year_col"])
years_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,doc,proportion_upper_to_alpha_tokens
1852,Politifact_FALSE_Social media_949849,False,Social media,"IN 2010, MAXINE WATERS STEERED $12 MILLION IN\...",2019,"(IN, 2010, ,, MAXINE, WATERS, STEERED, $, 12, ...",1.0
1552,Politifact_FALSE_Social media_762742,False,Social media,TWAS BANNED FROM TELEVISION FOR BEING TOO VIDL...,2019,"(TWAS, BANNED, FROM, TELEVISION, FOR, BEING, T...",1.0
2301,Politifact_Mostly False_Social media_158446,Mostly False,Social media,"MORE THAN 100,000 DACA APPLICANTS HAVE BEEN AR...",2019,"(MORE, THAN, 100,000, DACA, APPLICANTS, HAVE, ...",1.0
1807,Politifact_FALSE_Social media_213672,False,Social media,THIS PHOTO WAS TAKEN AT THE\n1924 DEMOCRATIC N...,2019,"(THIS, PHOTO, WAS, TAKEN, AT, THE, \n, 1924, D...",1.0
2864,Politifact_Pants on Fire_Social media_79335,Pants on Fire,Social media,THIS IS JACK GORDON. HE WAS CHELSEA CLINTONS\n...,2019,"(THIS, IS, JACK, GORDON, ., HE, WAS, CHELSEA, ...",1.0


In [10]:
# Same grouping as for the years, but keyed on the text-type column; the
# per-type frames are exported to their own files further down.
types, types_dfs = get_groups(dataset_df, using_dataset["type_col"])
types_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,doc,proportion_upper_to_alpha_tokens
699,Politifact_FALSE_News and blog_701529,False,News and blog,"This is BIG NEWS.\nAfter December 31, 2021, th...",2021,"(This, is, BIG, NEWS, ., \n, After, December, ...",0.150685
1114,Politifact_FALSE_News and blog_576637,False,News and blog,Citizens in **MANY** areas of the U.S.A. are s...,2023,"(Citizens, in, *, *, MANY, *, *, areas, of, th...",0.135065
748,Politifact_FALSE_News and blog_536995,False,News and blog,The United States is under attack from within;...,2022,"(The, United, States, is, under, attack, from,...",0.12522
2577,Politifact_Pants on Fire_News and blog_964743,Pants on Fire,News and blog,A whole lot of people have a lot of explaining...,2021,"(A, whole, lot, of, people, have, a, lot, of, ...",0.125
724,Politifact_FALSE_News and blog_238924,False,News and blog,Peer Reviewed Medical Papers Submitted To Vari...,2022,"(Peer, Reviewed, Medical, Papers, Submitted, T...",0.113949


In [11]:
def get_summary_df(dfs: list[pd.DataFrame], years: list[int]):
    """Build a table of descriptive statistics, one row per year.

    Args:
        dfs: Frames that each contain a "proportion_upper_to_alpha_tokens"
            column; one frame per entry in ``years``, in the same order.
        years: Labels used as the index of the result.

    Returns:
        A DataFrame indexed by ``years`` (index name "year") whose columns are
        the statistics produced by ``Series.describe()`` for each frame.
    """
    per_year_stats = []
    for frame in dfs:
        per_year_stats.append(frame["proportion_upper_to_alpha_tokens"].describe())

    year_index = pd.Index(data=years, name="year")
    return pd.DataFrame(per_year_stats, index=year_index)

## Writing the DataFrames to an Excel spreadsheet

In [12]:
# Columns written to each per-year sheet by save_years: the configured ID, text
# and type columns plus the computed proportion. The parsed "doc" column is
# not included.
save_cols = [
    using_dataset["id_col"], 
    using_dataset["text_col"], 
    using_dataset["type_col"], 
    "proportion_upper_to_alpha_tokens"
]

def save_years(writer: pd.ExcelWriter, years: list[int], dfs: list[pd.DataFrame]):
    """Write one sheet per year, followed by a summary sheet, to ``writer``.

    Each frame in ``dfs`` is written to a sheet named after the matching entry
    in ``years``, restricted to the columns listed in ``save_cols`` and without
    its index. The statistics from ``get_summary_df`` are then written to a
    sheet called "Proportion Summary".

    The writer is not closed here; that is left to the caller.

    Args:
        writer: Open Excel writer that receives the sheets.
        years: Year labels, in the same order as ``dfs``.
        dfs: One DataFrame per year.
    """
    for label, frame in zip(years, dfs):
        sheet = str(label)
        frame.to_excel(writer, sheet_name=sheet, columns=save_cols, index=False)

    summary = get_summary_df(dfs, years)
    summary.to_excel(writer, sheet_name="Proportion Summary")

In [13]:
output_path = make_output_path(using_dataset, "all_caps")

# Use the writer as a context manager so it is always closed, even if writing
# one of the sheets raises part-way through.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

Christine Grady, RN, Ph.D., and her husband, Dr. Anthony Fauci (left). She heads the Dept. of Bioethics at NIH (National Institute of Health).

https://www.cc.nih.gov/meet-our-doctors/cgrady.html

https://pubmed.ncbi.nlm.nih.gov/30157295/
To get Americans vaccinated.
Why does the federal government need a secret plan?
Because half of the American citizens refuse to take vaccines.
https://apnews.com/article/dacdc8bc428dd4df6511bfa259cfec44
According to Francis Collins, M.D.
He heads the National Institute of Health (NIH).
https://www.cnn.com/world/live-news/coronavirus-pandemic-10-23-20-intl/h_ba1397022dc57efb7b26a221ca07bfef
The solution?
A secret plan to drug public drinking water with psych drugs.
The federal government can do that?
If it is “for the public good.”
Who determines the public good?
Christine Grady, RN, Ph.D.?
She is Dr. Anthony Fauci’s wife.
Or their boss?
Dr. Francis Collins, head of the National Institutes of Health (NIH).
Or Congress?

Francis Collins, M.D., is in th

In [None]:
# Export one output file per entry in `types`, each split into per-year sheets.
# Loop-specific names are used on purpose: the builtin `type` is not shadowed,
# and the dataset-wide `years`, `years_dfs`, `output_path` and `writer` from the
# earlier cells are not overwritten, so those cells can be re-run afterwards
# with the same result.
for text_type, type_df in zip(types, types_dfs):
    type_years, type_years_dfs = get_groups(type_df, using_dataset["year_col"])

    type_output_path = make_output_path_for_type(using_dataset, text_type, "all_caps")

    # The context manager closes the writer even if save_years raises.
    with pd.ExcelWriter(type_output_path, engine="xlsxwriter") as type_writer:
        save_years(type_writer, type_years, type_years_dfs)

Christine Grady, RN, Ph.D., and her husband, Dr. Anthony Fauci (left). She heads the Dept. of Bioethics at NIH (National Institute of Health).

https://www.cc.nih.gov/meet-our-doctors/cgrady.html

https://pubmed.ncbi.nlm.nih.gov/30157295/
To get Americans vaccinated.
Why does the federal government need a secret plan?
Because half of the American citizens refuse to take vaccines.
https://apnews.com/article/dacdc8bc428dd4df6511bfa259cfec44
According to Francis Collins, M.D.
He heads the National Institute of Health (NIH).
https://www.cnn.com/world/live-news/coronavirus-pandemic-10-23-20-intl/h_ba1397022dc57efb7b26a221ca07bfef
The solution?
A secret plan to drug public drinking water with psych drugs.
The federal government can do that?
If it is “for the public good.”
Who determines the public good?
Christine Grady, RN, Ph.D.?
She is Dr. Anthony Fauci’s wife.
Or their boss?
Dr. Francis Collins, head of the National Institutes of Health (NIH).
Or Congress?

Francis Collins, M.D., is in th

: 