In [27]:
from typing import Optional

import pandas as pd

In [28]:
class DatasetConfig:
    """Bundle of per-dataset settings used by the cells of this notebook.

    Each constructor argument is stored unchanged on the instance under the
    same name.

    Args:
        input_path: Path of the Excel workbook to read.
        output_path: Path of the Excel workbook for results.
        sheet_name: Name of the worksheet to read from ``input_path``.
        usecols: Column names handed to ``pd.read_excel(usecols=...)``.
            ``None`` is accepted and forwarded as-is, which makes pandas
            parse every column of the sheet.
        cols_to_save: Column names meant for the saved output.
            NOTE(review): the consumer of this field is not visible in this
            part of the notebook -- confirm how it is used.
    """

    input_path: str
    output_path: str
    sheet_name: str
    # Optional because one of the dataset configs passes None here.
    usecols: Optional[list[str]]
    cols_to_save: list[str]

    def __init__(
        self,
        input_path: str,
        output_path: str,
        sheet_name: str,
        usecols: Optional[list[str]],
        cols_to_save: list[str],
    ):
        self.input_path = input_path
        self.output_path = output_path
        self.sheet_name = sheet_name
        self.usecols = usecols
        self.cols_to_save = cols_to_save

In [29]:
# NOTE(review): both output paths and both `cols_to_save` lists mention
# "lexical_density", a column that none of the cells visible here create.
# Confirm these values are intended for this notebook before saving results.

# Fakespeak-ENG: read only the listed columns of the "Working" sheet.
fakespeak_config = DatasetConfig(
    input_path="./data/Fakespeak-ENG/Fakespeak-ENG modified.xlsx",
    output_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_lexical_density.xlsx",
    sheet_name="Working",
    usecols=[
        'ID',
        'combinedLabel',
        'originalTextType',
        'originalBodyText',
        'originalDateYear',
    ],
    cols_to_save=[
        "ID",
        "combinedLabel",
        "originalTextType",
        "originalBodyText",
        "lexical_density",
    ],
)

# MisInfoText: usecols=None, so no column subset is requested when reading.
misinfotext_config = DatasetConfig(
    input_path="./data/MisInfoText/PolitiFact_original_modified.xlsx",
    output_path="./data/MisInfoText/Analysis_output/MisInfoText_lexical_density.xlsx",
    sheet_name="Working",
    usecols=None,
    cols_to_save=[
        "factcheckURL",
        "originalURL",
        "originalTextType",
        "originalBodyText",
        "lexical_density",
    ],
)

In [30]:
# Pick the dataset configuration that the following cells operate on.
# Point this at `misinfotext_config` instead to run on the other dataset.
using_dataset = fakespeak_config

In [31]:
# Load the configured worksheet (and column subset, if any) of the selected
# dataset's workbook into a DataFrame.
dataset_df = pd.read_excel(
    using_dataset.input_path, sheet_name=using_dataset.sheet_name, usecols=using_dataset.usecols
)

# Bare trailing expression so the notebook renders the loaded frame.
dataset_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019
...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,Pants on Fire,Social media,A great lesson in Optics 101: The Monroe Doctr...,2023
2957,Politifact_Pants on Fire_Social media_231170,Pants on Fire,Social media,“One of these Joe’s is not like the other… one...,2023
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020
2959,Politifact_Pants on Fire_Social media_635418,Pants on Fire,Social media,She collapsed when she saw jfk jr. as she was ...,2021


In [32]:
def get_words(text: str) -> list[str]:
    """Return the purely alphabetic whitespace-separated tokens of ``text``.

    The text is split on whitespace and a token is kept only when
    ``str.isalpha()`` is true for it. Tokens that contain digits or carry
    attached punctuation (e.g. ``"Wall,"`` or ``"COVID-19"``) are therefore
    dropped, not cleaned.

    Args:
        text: The text to tokenise. A non-string value (such as the NaN that
            pandas produces for an empty spreadsheet cell) is treated as
            having no words.

    Returns:
        The kept tokens in their original order; an empty list when there
        are none or when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN, which has no .split(); treat them as
    # empty text instead of raising AttributeError in the middle of .apply().
    if not isinstance(text, str):
        return []
    return [token for token in text.split() if token.isalpha()]

def count_uppercase_words(words: list[str]) -> int:
    """Count how many entries of ``words`` are written entirely in capitals.

    An entry counts when ``str.isupper()`` is true for it, so one-letter
    capitals such as ``"A"`` or ``"I"`` are included in the total.

    Args:
        words: The words to inspect.

    Returns:
        The number of upper-case entries (0 for an empty list).
    """
    # Each isupper() result is a bool; summing them yields the count.
    return sum(candidate.isupper() for candidate in words)

def get_text_length(text: str) -> int:
    """Return the number of whitespace-separated tokens in ``text``.

    Unlike a filtered word list, every token counts here, including numbers
    and tokens with punctuation attached.

    Args:
        text: The text to measure. A non-string value (such as the NaN that
            pandas produces for an empty spreadsheet cell) is treated as
            empty text.

    Returns:
        The token count; 0 for empty, whitespace-only, or non-string input.
    """
    # Missing cells arrive as float NaN, which has no .split(); report a
    # length of 0 instead of raising AttributeError in the middle of .apply().
    if not isinstance(text, str):
        return 0
    return len(text.split())

In [33]:
# Alphabetic-only tokens of each body text, and how many there are.
dataset_df["words"] = dataset_df["originalBodyText"].apply(get_words)
dataset_df["num_words"] = dataset_df["words"].apply(len)

# Share of those tokens written fully in capitals. A row with zero kept
# words divides 0 by 0 and ends up as NaN.
dataset_df["num_uppercase_words"] = dataset_df["words"].apply(count_uppercase_words)
dataset_df["proportion_uppercase_words"] = dataset_df["num_uppercase_words"].div(dataset_df["num_words"])

# Total whitespace-separated token count, punctuation and numbers included.
dataset_df["text_length"] = dataset_df["originalBodyText"].apply(get_text_length)

dataset_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,words,num_words,num_uppercase_words,proportion_uppercase_words,text_length
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019,"[Mexico, is, paying, for, the, Wall, through, ...",44,1,0.022727,50
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"[Chuck, should, American, citizens, be, respon...",36,2,0.055556,42
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019,"[Billions, of, dollars, are, sent, to, the, St...",40,2,0.050000,48
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019,"[If, Billion, were, set, aside, to, go, toward...",32,0,0.000000,46
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019,"[sent, letter, to, every, person, requesting, ...",32,0,0.000000,41
...,...,...,...,...,...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,Pants on Fire,Social media,A great lesson in Optics 101: The Monroe Doctr...,2023,"[A, great, lesson, in, Optics, The, Monroe, Do...",625,30,0.048000,780
2957,Politifact_Pants on Fire_Social media_231170,Pants on Fire,Social media,“One of these Joe’s is not like the other… one...,2023,"[of, these, is, not, like, the, one, of, these...",11,0,0.000000,17
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"[Autopsies, Prove, that, is, a, Disseminated, ...",552,4,0.007246,664
2959,Politifact_Pants on Fire_Social media_635418,Pants on Fire,Social media,She collapsed when she saw jfk jr. as she was ...,2021,"[She, collapsed, when, she, saw, jfk, as, she,...",66,5,0.075758,77


In [34]:
# Keep only the rows whose retained words are all upper-case, i.e. whose
# upper-case proportion is exactly 1.0 (NaN proportions never match).
all_caps_df = dataset_df.loc[dataset_df["proportion_uppercase_words"].eq(1.0)]
all_caps_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,words,num_words,num_uppercase_words,proportion_uppercase_words,text_length
41,Politifact_FALSE_Social media_831395,False,Social media,WHO CHANGED THE LONG STANDING WHISTLEBLOWER RU...,2019,"[WHO, CHANGED, THE, LONG, STANDING, WHISTLEBLO...",16,16,1.0,18
50,Politifact_FALSE_Social media_15704,False,Social media,PRESIDENT TRUMP DONATED HIS SECOND QUARTER SAL...,2019,"[PRESIDENT, TRUMP, DONATED, HIS, SECOND, QUART...",24,24,1.0,27
55,Politifact_FALSE_Social media_427373,False,Social media,LET ME GET THIS STRAIGHT BIDEN BLACKMAILED UKR...,2019,"[LET, ME, GET, THIS, STRAIGHT, BIDEN, BLACKMAI...",34,34,1.0,35
62,Politifact_FALSE_Social media_427060,False,Social media,THE MISSILES THAT WERE LAUNCHED AT AMERICAN FO...,2020,"[THE, MISSILES, THAT, WERE, LAUNCHED, AT, AMER...",21,21,1.0,25
134,Politifact_FALSE_Social media_510570,False,Social media,REMEMBER WHEN ANONYMOUS SAID THEY WOULD EXPOSE...,2020,"[REMEMBER, WHEN, ANONYMOUS, SAID, THEY, WOULD,...",29,29,1.0,34
...,...,...,...,...,...,...,...,...,...,...
2933,Politifact_Pants on Fire_Social media_312815,Pants on Fire,Social media,WELL NEVER HAVE TO WORRY\nABOUT CHINA ATTACKIN...,2019,"[WELL, NEVER, HAVE, TO, WORRY, ABOUT, CHINA, A...",20,20,1.0,25
2936,Politifact_Pants on Fire_Social media_938988,Pants on Fire,Social media,FEMA CAMPS ARE OPEN FOR THE UPCOMING ARRESTS F...,2023,"[FEMA, CAMPS, ARE, OPEN, FOR, THE, UPCOMING, A...",11,11,1.0,12
2941,Politifact_Pants on Fire_Social media_152853,Pants on Fire,Social media,"TO ANGER A CONSERVATIVE,\nLIE TO HIM. TO ANGER...",2019,"[TO, ANGER, A, LIE, TO, TO, ANGER, A, TELL, HI...",12,12,1.0,17
2944,Politifact_Pants on Fire_Social media_508882,Pants on Fire,Social media,"THIS IS NEW HAMPSHIRE CONGRESSMAN\nARNIM ZOLA,...",2019,"[THIS, IS, NEW, HAMPSHIRE, CONGRESSMAN, ARNIM,...",20,20,1.0,23
