In [1]:
import pandas as pd
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG

In [2]:
# Fakespeak settings: the shared base config extended with the LIWC-specific
# paths and export columns used by this notebook.
fakespeak_config = BASE_FAKESPEAK_CONFIG | dict(
    # Excel workbook written by the final export cell.
    output_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_liwc.xlsx",
    # Dataset columns to export alongside the LIWC result columns.
    save_cols=[
        BASE_FAKESPEAK_CONFIG["id_col"],
        BASE_FAKESPEAK_CONFIG["text_col"],
    ],
    # CSV written by this notebook, to be fed into LIWC.
    for_liwc_path="./liwc/Fakespeak-ENG_for_liwc.csv",
    # CSV produced by LIWC, read back in by this notebook.
    liwc_results_path="./liwc/Fakespeak-ENG_liwc_results.csv",
)

# MisInfoText settings: the shared base config extended with the LIWC-specific
# paths and export columns used by this notebook.
misinfotext_config = BASE_MISINFOTEXT_CONFIG | dict(
    # Excel workbook written by the final export cell.
    output_path="./data/MisInfoText/Analysis_output/MisInfoText_liwc.xlsx",
    # Dataset columns to export alongside the LIWC result columns.
    save_cols=[
        BASE_MISINFOTEXT_CONFIG["id_col"],
        BASE_MISINFOTEXT_CONFIG["text_col"],
    ],
    # CSV written by this notebook, to be fed into LIWC.
    for_liwc_path="./liwc/MisInfoText_for_liwc.csv",
    # CSV produced by LIWC, read back in by this notebook.
    liwc_results_path="./liwc/MisInfoText_liwc_results.csv",
)

In [3]:
# Select which dataset configuration every following cell operates on
# (assign fakespeak_config instead to process the other dataset).
using_dataset = misinfotext_config

In [None]:
# Load the selected dataset's sheet, restricted to the configured columns.
dataset_df = pd.read_excel(
    io=using_dataset["input_path"],
    sheet_name=using_dataset["sheet_name"],
    usecols=using_dataset["usecols"],
)

In [None]:
# Preview the first rows to sanity-check the load.
dataset_df.head(n=5)

In [None]:
# LIWC only needs the document ID and the text itself, so export just those
# two columns (without the positional index) to the CSV that LIWC will read.
liwc_input_cols = [using_dataset["id_col"], using_dataset["text_col"]]
data_for_liwc = dataset_df.loc[:, liwc_input_cols]
data_for_liwc.to_csv(using_dataset["for_liwc_path"], index=False)

Now that you've run the cells above, use the generated CSV file (written to the selected dataset's `for_liwc_path`) as the input for LIWC.

Then save the file generated by LIWC to the selected dataset's `liwc_results_path`; the cells below read it from there and continue with the analysis.

In [None]:
# Read back the results file that LIWC produced from the exported CSV.
liwc_df = pd.read_csv(filepath_or_buffer=using_dataset["liwc_results_path"])
# Display the loaded results as the cell output.
liwc_df

In [None]:
# Align both frames on the document ID, then attach the LIWC columns to the
# dataset rows. A left join (the DataFrame.join default) keeps every dataset row.
id_col = using_dataset["id_col"]
dataset_by_id = dataset_df.set_index(id_col)
liwc_by_id = liwc_df.set_index(id_col)
full_df = dataset_by_id.join(liwc_by_id, how="left")
full_df

In [None]:
# Split the joined data into one DataFrame per value of the year column.
grouped_by_year = full_df.groupby(using_dataset["year_col"])
# Mapping of year -> row labels; iterating it yields the year keys.
years = grouped_by_year.groups
# One frame per year, in the same order as `years`.
years_dfs = list(map(grouped_by_year.get_group, years))
# Preview the first year's frame.
years_dfs[0].head()

In [None]:
# Export one worksheet per year containing the configured dataset columns
# followed by every LIWC result column.
#
# The column list is de-duplicated with dict.fromkeys rather than set() so the
# column order in the workbook is deterministic (configured columns first, then
# LIWC columns in file order) instead of depending on string hash ordering.
id_col = using_dataset["id_col"]
save_cols = [
    col
    for col in dict.fromkeys(using_dataset["save_cols"] + list(liwc_df.columns))
    if col != id_col  # the ID column is already written as the index
]

# The context manager closes (and therefore saves) the workbook even if one of
# the per-year writes raises, so no file handle is leaked.
with pd.ExcelWriter(using_dataset["output_path"], engine="xlsxwriter") as writer:
    for df, year in zip(years_dfs, years):
        df.to_excel(writer, sheet_name=str(year), columns=save_cols)