In [1]:
import pandas as pd
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type

In [2]:
# Each dataset config is its base config extended with the LIWC-specific paths:
#   for_liwc_path     - CSV written by this notebook and fed to LIWC
#   liwc_results_path - CSV produced by LIWC and read back in below
fakespeak_config = {
    **BASE_FAKESPEAK_CONFIG,
    "for_liwc_path": "./liwc/Fakespeak-ENG_for_liwc.csv",
    "liwc_results_path": "./liwc/Fakespeak-ENG_liwc_results.csv",
}

misinfotext_config = {
    **BASE_MISINFOTEXT_CONFIG,
    "for_liwc_path": "./liwc/MisInfoText_for_liwc.csv",
    "liwc_results_path": "./liwc/MisInfoText_liwc_results.csv",
}

In [3]:
# Select the dataset configuration that every cell below reads from.
# Assign `fakespeak_config` here instead to process the other dataset.
using_dataset = misinfotext_config

In [4]:
dataset_df = pd.read_excel(
    using_dataset["input_path"],
    sheet_name=using_dataset["sheet_name"],
    usecols=using_dataset["usecols"],
)

# 2007 and 2008 contain too few rows to be useful, so drop both years.
year_values = dataset_df[using_dataset["year_col"]]
in_sparse_year = (year_values == 2007) | (year_values == 2008)
dataset_df = dataset_df[~in_sparse_year]

dataset_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017


Prepare a CSV file (ID and text columns only) to use as input for LIWC

In [5]:
# LIWC only needs the row identifier and the text to analyse.
liwc_input_cols = [using_dataset["id_col"], using_dataset["text_col"]]
data_for_liwc = dataset_df[liwc_input_cols]
data_for_liwc.to_csv(using_dataset["for_liwc_path"], index=False)

Now that you've run the above, use the generated file as input for LIWC

Save the LIWC results to the path configured as `liwc_results_path`; the cells below load that file and continue with the analysis

In [None]:
# Read back the results file that LIWC produced from the CSV exported earlier.
liwc_results_path = using_dataset["liwc_results_path"]
liwc_df = pd.read_csv(liwc_results_path)
liwc_df.head()

Unnamed: 0,factcheckURL,Segment,WC,Analytic,Clout,Authentic,Tone,WPS,BigWords,Dic,...,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji
0,http://www.politifact.com/arizona/statements/2...,1,481,96.43,82.60,16.18,45.23,28.29,29.31,82.12,...,0.0,0.0,14.76,3.53,6.65,0.00,0.00,1.04,3.53,0.0
1,http://www.politifact.com/california/statement...,1,184,93.86,71.47,7.87,6.28,20.44,38.59,78.80,...,0.0,0.0,14.67,4.35,3.80,0.00,0.00,1.09,5.43,0.0
2,http://www.politifact.com/california/statement...,1,339,93.61,31.80,42.71,20.23,16.95,25.37,79.35,...,0.0,0.0,18.88,6.78,5.60,0.00,0.00,1.77,4.72,0.0
3,http://www.politifact.com/california/statement...,1,433,89.52,45.52,28.63,23.14,22.79,21.02,79.21,...,0.0,0.0,13.63,4.39,6.47,0.00,0.00,0.00,2.77,0.0
4,http://www.politifact.com/california/statement...,1,670,89.11,56.98,18.27,22.08,23.93,28.51,83.28,...,0.0,0.0,13.73,4.48,5.37,0.00,0.00,0.30,3.58,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680,https://www.politifact.com/factchecks/2015/jul...,1,16,78.28,11.66,15.38,,16.00,25.00,68.75,...,0.0,0.0,18.75,0.00,0.00,6.25,0.00,6.25,6.25,0.0
681,https://www.politifact.com/factchecks/2015/jun...,1,24,73.36,6.61,15.38,1.00,8.00,20.83,87.50,...,0.0,0.0,25.00,12.50,0.00,0.00,0.00,4.17,8.33,0.0
682,https://www.politifact.com/factchecks/2015/mar...,1,849,90.00,38.01,3.85,9.89,30.32,30.74,81.86,...,0.0,0.0,20.02,4.36,4.24,0.00,0.00,3.06,8.36,0.0
683,https://www.politifact.com/factchecks/2015/feb...,1,390,93.65,77.41,13.53,6.78,19.50,28.97,86.15,...,0.0,0.0,16.67,5.38,3.85,0.77,0.26,0.51,5.90,0.0


In [None]:
# Attach the LIWC statistics to the original rows by aligning both frames on
# the dataset's ID column, then restore that ID as a regular column.
id_col = using_dataset["id_col"]
dataset_by_id = dataset_df.set_index(id_col)
liwc_by_id = liwc_df.set_index(id_col)
full_df = dataset_by_id.join(liwc_by_id).reset_index()
full_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,Segment,WC,Analytic,...,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,1,481,96.43,...,0.0,0.0,14.76,3.53,6.65,0.0,0.0,1.04,3.53,0.0
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,1,184,93.86,...,0.0,0.0,14.67,4.35,3.8,0.0,0.0,1.09,5.43,0.0
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,1,339,93.61,...,0.0,0.0,18.88,6.78,5.6,0.0,0.0,1.77,4.72,0.0
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,1,433,89.52,...,0.0,0.0,13.63,4.39,6.47,0.0,0.0,0.0,2.77,0.0
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,1,670,89.11,...,0.0,0.0,13.73,4.48,5.37,0.0,0.0,0.3,3.58,0.0


Get separate dataframes for each year

In [8]:
# Group the combined frame on the dataset's year column and preview the first group.
# NOTE(review): get_groups is a project helper; presumably it returns the group
# keys alongside the matching per-group dataframes - confirm in helpers.
year_col = using_dataset["year_col"]
years, years_dfs = get_groups(full_df, year_col)
years_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,Segment,WC,Analytic,...,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji
428,http://www.politifact.com/truth-o-meter/statem...,https://bachmann.house.gov/News/DocumentSingle...,"Washington, D.C., Mar 25 - In response to sugg...",Bachmann Demands Truth: Will Obama Administrat...,Press release,2009-03-25,2009,1,255,98.12,...,0.0,0.0,18.82,8.24,5.88,0.0,0.0,0.0,4.71,0.0
429,http://www.politifact.com/truth-o-meter/statem...,https://healthcare.nationalreview.com/post/?q=...,When most Americans talk about the need for he...,Taxpayer-Funded Abortion Is Not Health-Care Re...,News and blog,2009-07-23,2009,1,814,88.12,...,0.0,0.0,20.39,5.04,4.67,0.49,0.0,2.09,8.11,0.0
430,http://www.politifact.com/truth-o-meter/statem...,http://krugman.blogs.nytimes.com/2009/08/05/on...,A number of people in the news analysis busine...,One of these things is not like the other,News and blog,2009-08-05,2009,1,229,74.53,...,0.0,0.0,11.79,4.37,6.55,0.44,0.0,0.0,0.44,0.0
431,http://www.politifact.com/truth-o-meter/statem...,https://www.facebook.com/notes/1020383705144285/,Yesterday President Obama responded to my stat...,,Social media,2009-08-13,2009,1,1021,68.28,...,0.0,0.0,27.33,7.05,4.11,0.39,0.0,1.86,13.91,0.0
432,http://www.politifact.com/truth-o-meter/statem...,https://jumpinginpools.blogspot.com/2009/01/mi...,Secretary of Defense Robert Gates is extremely...,"Military to Pledge Oath To Obama, Not Constitu...",News and blog,2009-01-28,2009,1,374,96.09,...,0.0,0.0,17.65,6.68,5.88,0.0,0.0,0.27,4.81,0.0


Get separate dataframes for each text type

In [9]:
# Group the combined frame on the dataset's text-type column and preview the first group.
# NOTE(review): get_groups is a project helper; presumably it returns the group
# keys alongside the matching per-group dataframes - confirm in helpers.
type_col = using_dataset["type_col"]
types, types_dfs = get_groups(full_df, type_col)
types_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,Segment,WC,Analytic,...,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,1,481,96.43,...,0.0,0.0,14.76,3.53,6.65,0.0,0.0,1.04,3.53,0.0
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,1,339,93.61,...,0.0,0.0,18.88,6.78,5.6,0.0,0.0,1.77,4.72,0.0
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,1,433,89.52,...,0.0,0.0,13.63,4.39,6.47,0.0,0.0,0.0,2.77,0.0
6,http://www.politifact.com/california/statement...,http://www.sfchronicle.com/opinion/openforum/a...,"Recently, a group of special interests threate...","Repeal Californias gas tax increase, says GOP ...",News and blog,2017-10-19,2017,1,495,95.66,...,0.0,0.0,15.56,5.05,6.67,0.2,0.0,0.0,3.64,0.0
7,http://www.politifact.com/california/statement...,http://www.sfchronicle.com/politics/article/Th...,"COSTA MESA, Orange County It was a surreal vi...","The pro-Russia, pro-weed, pro-Assange GOP cong...",News and blog,2017-09-14,2017,1,1052,78.87,...,0.1,0.0,15.59,5.13,6.27,0.19,0.0,0.0,3.99,0.0


## Writing dataframes to Excel spreadsheets

Save to a single Excel file with tabs for each year

In [10]:
def save_years(
    writer: pd.ExcelWriter,
    years: list[int],
    dfs: list[pd.DataFrame],
    columns=None,
):
    """Write one worksheet per year into an open Excel writer.

    Args:
        writer: Open ``pd.ExcelWriter`` that receives the sheets. The caller
            is responsible for closing it.
        years: Year labels; each one becomes a sheet name via ``str(year)``.
        dfs: Dataframes paired positionally with ``years``.
        columns: Columns to write to each sheet. When omitted, the columns of
            the notebook-level ``liwc_df`` are used (the original behaviour),
            which requires the LIWC results cell to have been run first.
    """
    if columns is None:
        # Fall back to the notebook global so existing callers keep working.
        columns = liwc_df.columns

    for year, df in zip(years, dfs):
        df.to_excel(
            writer,
            sheet_name=str(year),
            index=False,
            columns=columns,
        )

In [11]:
output_path = make_output_path(using_dataset, "liwc")

# The context manager saves and closes the workbook even if writing a sheet
# raises, so no file handle is left open.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

For each text type, we separate further into dataframes per year, and save the Excel files as above

In [12]:
# Loop-local names are used deliberately: reusing `type`, `years`, `years_dfs`
# or `output_path` here would shadow the builtin and overwrite the
# notebook-level values that the all-types export cell depends on.
for text_type, type_df in zip(types, types_dfs):
    type_years, type_years_dfs = get_groups(type_df, using_dataset["year_col"])

    type_output_path = make_output_path_for_type(using_dataset, text_type, "liwc")

    # The context manager saves and closes each workbook even if a sheet fails to write.
    with pd.ExcelWriter(type_output_path, engine="xlsxwriter") as writer:
        save_years(writer, type_years, type_years_dfs)