In [1]:
from pathlib import Path
from typing import Optional

import pandas as pd

In [2]:
class DatasetConfig:
    """Settings for one dataset processed by this notebook.

    The values are stored exactly as given; no validation is performed.

    Args:
        input_path: Path of the Excel workbook the dataset is read from.
        output_path: Path of the Excel workbook the results are written to.
        sheet_name: Name of the worksheet to read from the input workbook.
        usecols: Columns to load from the worksheet, or None to load all of them.
        save_cols: Dataset columns to keep in the output next to the LIWC columns.
        id_col: Column that identifies a row; used as the join key and as the index.
    """

    # Where the dataset workbook is read from.
    input_path: str
    # Where the per-year results workbook is written.
    output_path: str
    # Worksheet to read from the input workbook.
    sheet_name: str
    # Columns to load; None is a valid value and means "load every column".
    usecols: Optional[list[str]]
    # Dataset columns carried over into the output workbook.
    save_cols: list[str]
    # Column identifying each row.
    id_col: str

    def __init__(
        self,
        input_path: str,
        output_path: str,
        sheet_name: str,
        usecols: Optional[list[str]],
        save_cols: list[str],
        id_col: str,
    ):
        self.input_path = input_path
        self.output_path = output_path
        self.sheet_name = sheet_name
        self.usecols = usecols
        self.save_cols = save_cols
        self.id_col = id_col

In [3]:
# Fakespeak-ENG: only a subset of the worksheet's columns is loaded, rows are keyed by "ID".
fakespeak_config = DatasetConfig(
    id_col="ID",
    sheet_name="Working",
    input_path="./data/Fakespeak-ENG/Fakespeak-ENG modified.xlsx",
    output_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_liwc.xlsx",
    usecols=["ID", "combinedLabel", "originalTextType", "originalBodyText", "originalDateYear"],
    save_cols=["ID", "originalBodyText"],
)

# MisInfoText: every column of the worksheet is loaded (usecols=None), rows are keyed by "factcheckURL".
misinfotext_config = DatasetConfig(
    id_col="factcheckURL",
    sheet_name="Working",
    input_path="./data/MisInfoText/PolitiFact_original_modified.xlsx",
    output_path="./data/MisInfoText/Analysis_output/MisInfoText_liwc.xlsx",
    usecols=None,
    save_cols=["factcheckURL", "originalBodyText"],
)

In [4]:
# Select the dataset configuration that every later cell reads from.
using_dataset = misinfotext_config

In [5]:
# Read the selected dataset's worksheet into a DataFrame, restricted to the configured columns.
dataset_df = pd.read_excel(
    io=using_dataset.input_path,
    usecols=using_dataset.usecols,
    sheet_name=using_dataset.sheet_name,
)

In [6]:
# Preview the first rows of the loaded dataset.
dataset_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017


In [7]:
# The dataset is named after the folder that holds its input workbook. Taking the
# parent directory's name avoids relying on the path having exactly two leading
# components (the original indexed position 2 of a '/'-split).
dataset_name = Path(using_dataset.input_path).parent.name

# LIWC only needs the row identifier and the text to analyse.
data_for_liwc = dataset_df[[using_dataset.id_col, "originalBodyText"]]
data_for_liwc.to_csv(f"./{dataset_name}_for_liwc.csv", index=False)

Now that you've run the cells above, use the generated file (`./<dataset_name>_for_liwc.csv`) as the input for LIWC.

Then save the results file generated by LIWC as a CSV in this notebook's working directory; the next cell loads it to continue the analysis.

In [8]:
# Build the results filename from the selected dataset's name instead of
# hard-coding "MisInfoText", so switching `using_dataset` cannot silently load
# another dataset's LIWC results. For MisInfoText this is the same path as before.
liwc_file = f"./{dataset_name}_liwc_results.csv"
liwc_df = pd.read_csv(liwc_file)
liwc_df

Unnamed: 0,factcheckURL,Segment,WC,Analytic,Clout,Authentic,Tone,WPS,BigWords,Dic,...,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji
0,http://www.politifact.com/arizona/statements/2...,1,481,96.43,82.60,16.18,45.23,28.29,29.31,82.12,...,0.0,0.0,14.76,3.53,6.65,0.0,0.0,1.04,3.53,0.0
1,http://www.politifact.com/california/statement...,1,184,93.86,71.47,7.87,6.28,20.44,38.59,78.80,...,0.0,0.0,14.67,4.35,3.80,0.0,0.0,1.09,5.43,0.0
2,http://www.politifact.com/california/statement...,1,339,93.61,31.80,42.71,20.23,16.95,25.37,79.35,...,0.0,0.0,18.88,6.78,5.60,0.0,0.0,1.77,4.72,0.0
3,http://www.politifact.com/california/statement...,1,433,89.52,45.52,28.63,23.14,22.79,21.02,79.21,...,0.0,0.0,13.63,4.39,6.47,0.0,0.0,0.00,2.77,0.0
4,http://www.politifact.com/california/statement...,1,670,89.11,56.98,18.27,22.08,23.93,28.51,83.28,...,0.0,0.0,13.73,4.48,5.37,0.0,0.0,0.30,3.58,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,1,27,99.00,40.06,52.89,99.00,27.00,22.22,85.19,...,0.0,0.0,3.70,3.70,0.00,0.0,0.0,0.00,0.00,0.0
651,http://www.politifact.com/wisconsin/statements...,1,44,45.12,99.00,66.42,1.00,14.67,20.45,90.91,...,0.0,0.0,15.91,6.82,2.27,0.0,0.0,0.00,6.82,0.0
652,http://www.politifact.com/wisconsin/statements...,1,47,4.97,84.86,91.07,4.04,15.67,25.53,87.23,...,0.0,0.0,23.40,6.38,4.26,0.0,0.0,6.38,6.38,0.0
653,http://www.politifact.com/wisconsin/statements...,1,573,84.56,73.09,8.76,72.20,26.05,25.48,83.77,...,0.0,0.0,11.17,4.54,4.89,0.0,0.0,0.00,1.75,0.0


In [9]:
# Attach the LIWC measures to the dataset rows: both frames are indexed by the
# ID column and the dataset (left side) determines which rows are kept.
full_df = (
    dataset_df
    .set_index(using_dataset.id_col)
    .join(liwc_df.set_index(using_dataset.id_col), how="left")
)
full_df

Unnamed: 0_level_0,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,Segment,WC,Analytic,Clout,...,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji
factcheckURL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.politifact.com/arizona/statements/2016/may/13/blog-posting/popular-internet-story-claims-arizona-missouri-and/,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,1,481,96.43,82.60,...,0.0,0.0,14.76,3.53,6.65,0.0,0.0,1.04,3.53,0.0
http://www.politifact.com/california/statements/2016/aug/05/dianne-feinstein/feinsteins-claim-about-prime-time-marijuana-tv-ads/,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,1,184,93.86,71.47,...,0.0,0.0,14.67,4.35,3.80,0.0,0.0,1.09,5.43,0.0
http://www.politifact.com/california/statements/2017/aug/22/john-moorlach/has-crime-been-getting-worse-jerry-brown-was-elect/,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,1,339,93.61,31.80,...,0.0,0.0,18.88,6.78,5.60,0.0,0.0,1.77,4.72,0.0
http://www.politifact.com/california/statements/2017/dec/14/travis-allen/travis-allen-claims-californias-gas-tax-increase-w/,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,1,433,89.52,45.52,...,0.0,0.0,13.63,4.39,6.47,0.0,0.0,0.00,2.77,0.0
http://www.politifact.com/california/statements/2017/may/09/judy-chu/rape-pre-existing-condition-under-gop-bill-calif-c/,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,1,670,89.11,56.98,...,0.0,0.0,13.73,4.48,5.37,0.0,0.0,0.30,3.58,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
http://www.politifact.com/wisconsin/statements/2018/jan/26/scott-walker/checking-gov-walkers-claim-state-dot-ahead-game/,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017,1,27,99.00,40.06,...,0.0,0.0,3.70,3.70,0.00,0.0,0.0,0.00,0.00,0.0
http://www.politifact.com/wisconsin/statements/2018/jan/31/scott-walker/scott-walkers-overstated-attack-governor-rival-pau/,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018,1,44,45.12,99.00,...,0.0,0.0,15.91,6.82,2.27,0.0,0.0,0.00,6.82,0.0
http://www.politifact.com/wisconsin/statements/2018/mar/07/mahlon-mitchell/first-response-was-applaud-walker-recognizing-fire/,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018,1,47,4.97,84.86,...,0.0,0.0,23.40,6.38,4.26,0.0,0.0,6.38,6.38,0.0
http://www.politifact.com/wisconsin/statements/2018/mar/12/leah-vukmir/Vukmir-misfires-on-Baldwin-trade-barb/,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018,1,573,84.56,73.09,...,0.0,0.0,11.17,4.54,4.89,0.0,0.0,0.00,1.75,0.0


In [10]:
# Split the joined frame into one DataFrame per publication year.
grouped_by_year = full_df.groupby("originalDateYear")
# Mapping of year -> row labels; iterating over it yields the years themselves.
years = grouped_by_year.groups
# One frame per year, in the same order as `years`.
years_dfs = list(map(grouped_by_year.get_group, years))
years_dfs[0].head()

Unnamed: 0_level_0,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,Segment,WC,Analytic,Clout,...,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji
factcheckURL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.politifact.com/truth-o-meter/statements/2007/aug/30/john-mccain/mccain-picks-and-chooses-in-attack-on-clinton/,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,1,125,97.24,97.21,...,0.0,0.0,17.6,5.6,2.4,0.0,0.0,6.4,3.2,0.0


In [None]:
# Columns to export: the configured dataset columns followed by every LIWC column.
# dict.fromkeys removes duplicates while keeping first-seen order, so the column
# order in the workbook is the same on every run (a set has no defined order).
# The ID column is left out because it is written as the index.
save_cols = [
    col
    for col in dict.fromkeys(using_dataset.save_cols + list(liwc_df.columns))
    if col != using_dataset.id_col
]

# The context manager saves and closes the workbook even if writing a sheet fails.
with pd.ExcelWriter(using_dataset.output_path, engine="xlsxwriter") as writer:
    # One worksheet per year, named after the year.
    for df, year in zip(years_dfs, years):
        df.to_excel(writer, sheet_name=str(year), columns=save_cols)