In [26]:
import pandas as pd
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type

In [27]:
# Per-dataset configs: each base config extended with "save_cols", the columns
# written to every exported sheet (id column, text column, and the derived
# "text_length" column).
fakespeak_config = BASE_FAKESPEAK_CONFIG | {
    "save_cols": [BASE_FAKESPEAK_CONFIG["id_col"], BASE_FAKESPEAK_CONFIG["text_col"], "text_length"]
}

# Both columns must come from the MisInfoText base config; taking the text
# column from the FakeSpeak config would export the wrong column (or fail)
# whenever the two datasets name their text column differently.
misinfotext_config = BASE_MISINFOTEXT_CONFIG | {
    "save_cols": [BASE_MISINFOTEXT_CONFIG["id_col"], BASE_MISINFOTEXT_CONFIG["text_col"], "text_length"]
}

In [28]:
# Dataset configuration read by the cells below (input path, sheet name, and
# the text/year/type/save column names). Assign fakespeak_config instead to
# point the notebook at the other dataset.
using_dataset = misinfotext_config

In [None]:
# Read the configured sheet of the selected dataset's workbook, restricted to
# the columns listed in the config.
dataset_df = pd.read_excel(
    io=using_dataset["input_path"],
    sheet_name=using_dataset["sheet_name"],
    usecols=using_dataset["usecols"],
)

# Preview the first rows as this cell's output.
dataset_df.head(n=5)

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017
...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018
653,http://www.politifact.com/wisconsin/statements...,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018


In [30]:
def get_words(text: str):
    """Return the whitespace-separated tokens of ``text`` that are purely alphabetic.

    Tokens containing any non-letter character (digits, hyphens, attached
    punctuation such as a trailing comma) are dropped entirely, not cleaned.
    """
    return list(filter(str.isalpha, text.split()))

def count_uppercase_words(words: list[str]):
    """Count the entries of ``words`` for which ``str.isupper()`` is true.

    Single capital letters such as "I" or "A" satisfy ``isupper()`` and are
    therefore included in the count.
    """
    uppercase_total = 0
    for candidate in words:
        if candidate.isupper():
            uppercase_total += 1
    return uppercase_total

def get_text_length(text: str):
    """Return the number of whitespace-separated tokens in ``text``.

    Every token counts, including numbers and punctuation-only tokens.
    """
    tokens = text.split()
    return len(tokens)

In [31]:
# Derive per-document word statistics from the configured text column.
# "words" holds only the purely alphabetic tokens (see get_words), so
# "num_words" can be smaller than "text_length", which counts every
# whitespace-separated token.
dataset_df["words"] = dataset_df[using_dataset["text_col"]].apply(get_words)
dataset_df["num_words"] = dataset_df["words"].apply(len)
dataset_df["num_uppercase_words"] = dataset_df["words"].apply(count_uppercase_words)

# Share of the counted words that are fully upper-case.
dataset_df["proportion_uppercase_words"] = dataset_df["num_uppercase_words"].div(dataset_df["num_words"])

dataset_df["text_length"] = dataset_df[using_dataset["text_col"]].apply(get_text_length)

dataset_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,words,num_words,num_uppercase_words,proportion_uppercase_words,text_length
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"[Residents, of, multiple, states, will, be, as...",407,0,0.000000,481
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,"[CA, United, States, Senator, Dianne, Feinstei...",160,4,0.025000,187
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,"[We, should, anticipate, black, and, gray, mar...",272,4,0.014706,335
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,"[As, a, ballot, initiative, calling, for, repe...",372,0,0.000000,433
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,"[DC, The, House, of, Representatives, today, v...",577,2,0.003466,669
...,...,...,...,...,...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017,"[Road, projects, across, the, state, are, stay...",26,0,0.000000,27
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018,"[The, last, thing, we, need, is, more, Madison...",39,0,0.000000,45
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018,"[When, told, firefighters, we, need, to, worry...",37,1,0.027027,47
653,http://www.politifact.com/wisconsin/statements...,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018,"[Now, that, its, an, election, I, would, like,...",511,2,0.003914,572


In [32]:
# Keep the rows in which every counted word is upper-case (proportion is exactly 1.0).
# NOTE(review): all_caps_df is not referenced again in the visible cells; the
# "all_caps" exports further down are built from dataset_df — confirm that is intended.
all_caps_df = dataset_df.loc[dataset_df["proportion_uppercase_words"].eq(1.0)]
all_caps_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,words,num_words,num_uppercase_words,proportion_uppercase_words,text_length
183,http://www.politifact.com/punditfact/statement...,https://x.com/cbrandonellis/status/89526051024...,BILL CLINTON GAVE NORTH KOREA $5 BILLION AND T...,,Social media,2017-08-09,2017,"[BILL, CLINTON, GAVE, NORTH, KOREA, BILLION, A...",15,15,1.0,17
502,http://www.politifact.com/truth-o-meter/statem...,https://www.facebook.com/LiberalsAreCool/posts...,TUITION-FREE PUBLIC UNIVERSITIES AND COLLEGES ...,,Social media,2016-06-02,2016,"[PUBLIC, UNIVERSITIES, AND, COLLEGES, ARE, JUS...",17,17,1.0,20
506,http://www.politifact.com/truth-o-meter/statem...,https://www.instagram.com/p/BCYF4O6mhex/,"""FIRST THEY IGNORE YOU, THEN THEY LAUGH AT YOU...",,Social media,2016-02-29,2016,"[THEY, IGNORE, THEN, THEY, LAUGH, AT, THEN, TH...",13,13,1.0,18
514,http://www.politifact.com/truth-o-meter/statem...,https://www.facebook.com/OccupyDemocrats/photo...,HAS TAKEN LESS VACATION DAYS THAN ANY OTHER PR...,,Social media,2016-08-20,2016,"[HAS, TAKEN, LESS, VACATION, DAYS, THAN, ANY, ...",23,23,1.0,23


In [33]:
# Group the full dataset by the configured year column. get_groups (from
# helpers) returns the group labels and a parallel sequence of DataFrames —
# presumably one per distinct year; confirm against helpers.
years, years_dfs = get_groups(
    dataset_df,
    using_dataset["year_col"],
)

# Preview the first group.
years_dfs[0].head(n=5)

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,words,num_words,num_uppercase_words,proportion_uppercase_words,text_length
428,http://www.politifact.com/truth-o-meter/statem...,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,"[Senator, Clinton, told, an, audience, at, the...",108,0,0.0,125


In [34]:
# Group the full dataset by the configured text-type column. get_groups (from
# helpers) returns the group labels and a parallel sequence of DataFrames —
# presumably one per distinct type; confirm against helpers.
types, types_dfs = get_groups(
    dataset_df,
    using_dataset["type_col"],
)

# Preview the first group.
types_dfs[0].head(n=5)

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,words,num_words,num_uppercase_words,proportion_uppercase_words,text_length
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"[Residents, of, multiple, states, will, be, as...",407,0,0.0,481
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,"[We, should, anticipate, black, and, gray, mar...",272,4,0.014706,335
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,"[As, a, ballot, initiative, calling, for, repe...",372,0,0.0,433
6,http://www.politifact.com/california/statement...,http://www.sfchronicle.com/opinion/openforum/a...,"Recently, a group of special interests threate...","Repeal Californias gas tax increase, says GOP ...",News and blog,2017-10-19,2017,"[a, group, of, special, interests, threatened,...",422,0,0.0,495
7,http://www.politifact.com/california/statement...,http://www.sfchronicle.com/politics/article/Th...,"COSTA MESA, Orange County It was a surreal vi...","The pro-Russia, pro-weed, pro-Assange GOP cong...",News and blog,2017-09-14,2017,"[COSTA, Orange, County, It, was, a, surreal, v...",898,5,0.005568,1052


In [35]:
def get_summary_df(dfs: list[pd.DataFrame], years: list[int]):
    """Build a table of ``text_length`` descriptive statistics, one row per year.

    Args:
        dfs: DataFrames that each contain a ``text_length`` column.
        years: Row labels for the result, in the same order as ``dfs``.

    Returns:
        A DataFrame indexed by ``year`` whose columns are the statistics
        returned by ``Series.describe()`` for each frame's ``text_length``.
    """
    per_year_stats = []
    for yearly_df in dfs:
        per_year_stats.append(yearly_df["text_length"].describe())

    year_index = pd.Index(data=years, name="year")
    return pd.DataFrame(per_year_stats, index=year_index)

## Writing DataFrames to Excel spreadsheets

In [36]:
def save_years(writer: pd.ExcelWriter, years: list[int], dfs: list[pd.DataFrame]):
    """Write one sheet per year plus a "Summary" sheet to ``writer``.

    Args:
        writer: Open Excel writer that receives the sheets; it is not closed here.
        years: Year labels; each becomes a sheet name via ``str(year)``.
        dfs: DataFrames paired positionally with ``years``.

    Only the columns listed in the notebook-level ``using_dataset["save_cols"]``
    are written to the per-year sheets, without the row index. The summary
    sheet holds the output of ``get_summary_df(dfs, years)``.
    """
    for sheet_label, year_df in zip(map(str, years), dfs):
        year_df.to_excel(
            writer,
            sheet_name=sheet_label,
            index=False,
            columns=using_dataset["save_cols"],
        )

    get_summary_df(dfs, years).to_excel(writer, sheet_name="Summary")

In [37]:
# Export the whole dataset to a single workbook: one sheet per year plus a
# summary sheet (see save_years).
output_path = make_output_path(using_dataset, "all_caps")

# The context manager closes the workbook even if save_years raises, so a
# failed export does not leave an open file handle behind.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

In [38]:
# Export one workbook per text type, each split into per-year sheets plus a
# summary sheet (see save_years).
# The loop uses its own variable names so that the builtin `type` is not
# shadowed, and so that the dataset-wide `years` / `years_dfs` / `output_path`
# from the earlier cells keep their values if those cells are re-run later.
for text_type, type_df in zip(types, types_dfs):
    type_years, type_years_dfs = get_groups(type_df, using_dataset["year_col"])

    type_output_path = make_output_path_for_type(using_dataset, text_type, "all_caps")

    # The context manager closes each workbook even if save_years raises.
    with pd.ExcelWriter(type_output_path, engine="xlsxwriter") as writer:
        save_years(writer, type_years, type_years_dfs)