In [None]:
import pandas as pd
import re
import liwc
from collections import Counter
import spacy
import unicodedata

In [None]:
# Load the anonymized dataset and discard its "datetime" column before any analysis.
# NOTE(review): unpickling executes code embedded in the file — only load pickles
# that come from a trusted source.
file_path = "../../data/adhd-beliefs-pt/adhd-beliefs-pt-anonymized.pkl"
df = pd.read_pickle(file_path).drop(columns=["datetime"])
df.head()

## Retrieve LIWC categories

In [None]:
# Free-text columns that are concatenated into a single document per row.
text_columns = ["special_interest", "diary_entry", "selfdefining_memory", "empty_sheet"]

# Missing values and the literal string "NaN" both become empty strings so that the
# row-wise join below only ever sees strings; the result is trimmed at both ends.
merged = (
    df[text_columns]
    .fillna("")
    .replace("NaN", "")
    .agg(" ".join, axis=1)
    .str.strip()
)

# Keep only rows that contain some text. The frame is built fresh from the filtered
# Series (rather than by slicing a slice of a DataFrame) so that the column
# assignments performed in later cells operate on an independent object and do not
# trigger pandas' chained-assignment warning.
liwc_df = merged[merged != ""].to_frame("merged_text")
liwc_df

In [None]:
# Load the LIWC dictionary. `parse` is called with a single token and yields the
# categories that token belongs to; `category_names` lists every category.
liwc_dictionary_path = '../../data/LIWC2007_Portugues_win.dic'
parse, category_names = liwc.load_token_parser(liwc_dictionary_path)
print("LIWC categories:", category_names)

In [None]:
def preprocess_text(text):
    """Lower-case, normalise and tokenise a text.

    Steps, in order:
      1. lower-case the input and normalise it to Unicode NFC;
      2. replace anything shaped like ``something@something`` with `` <EMAIL> ``;
      3. extract tokens: runs of (accented) Latin letters, or single characters
         that are neither word characters nor whitespace.

    Digits and underscores match neither alternative and are therefore dropped.
    The e-mail placeholder is itself split into the three tokens ``<``, ``EMAIL``
    and ``>`` by step 3.

    Args:
        text: The string to tokenise.

    Returns:
        A list of token strings in order of appearance.
    """
    normalized = unicodedata.normalize("NFC", text.lower())
    masked = re.sub(r'\S+@\S+', ' <EMAIL> ', normalized)
    return re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿÇç]+|[^\w\s]", masked, flags=re.UNICODE)

In [None]:
nlp = spacy.load("pt_core_news_lg")


def count_words(text):
    """Return the number of spaCy tokens in ``text``.

    Only the tokenizer is run (``nlp.make_doc``) instead of the whole pipeline:
    the token count is all that is needed here, so tagging, parsing and entity
    recognition would be wasted work on every row.

    NOTE(review): this counts every token the tokenizer produces, which presumably
    includes punctuation — confirm that this is the intended denominator for the
    proportional scores computed later.

    Args:
        text: The text to measure; a missing value (as detected by ``pd.isna``)
            counts as zero tokens.

    Returns:
        The token count as an ``int``.
    """
    if pd.isna(text):
        return 0
    return len(nlp.make_doc(text))

In [None]:
def create_category_columns():
    """Add (or reset) one zero-filled column per LIWC category on the global ``liwc_df``.

    Reads the module-level ``category_names`` and mutates the module-level
    ``liwc_df`` in place; nothing is returned.
    """
    for column_name in category_names:
        liwc_df[column_name] = 0

In [None]:
# Derive the token list and the word count from the merged text of each row, then
# add the zero-initialised LIWC category columns.
merged_texts = liwc_df["merged_text"]
liwc_df.loc[:, "processed_text"] = merged_texts.apply(preprocess_text)
liwc_df["word_count"] = merged_texts.apply(count_words)
create_category_columns()

In [None]:
# Display the frame to inspect the columns added so far.
liwc_df

In [None]:
def liwc_analyze(text):
    """Count LIWC category hits in ``text``.

    The text is tokenised with ``preprocess_text``; every token is passed to the
    LIWC ``parse`` function and each category it yields adds one to that
    category's tally.

    Args:
        text: The raw text to analyse.

    Returns:
        A ``Counter`` mapping category name to number of hits. Categories with no
        hits are absent.
    """
    return Counter(
        category
        for token in preprocess_text(text)
        for category in parse(token)
    )

def analyze_texts(df):
    """Fill the LIWC category columns of ``df`` with per-row hit counts.

    For every row, the ``merged_text`` value is analysed with ``liwc_analyze`` and
    each resulting count is written to the column named after the category.
    Categories for which ``df`` has no column are skipped, and columns for
    categories without hits are left untouched.

    The frame that is passed in is the one that is read, modified and returned;
    the function no longer reaches for the global ``liwc_df``, so it also works
    when called with a different frame.

    Args:
        df: DataFrame with a ``merged_text`` column and one column per LIWC
            category. It is modified in place.

    Returns:
        The same ``df`` object, for convenient reassignment.
    """
    # Iterating the single column avoids building a full row Series per iteration.
    for index, text in df["merged_text"].items():
        for category, count in liwc_analyze(text).items():
            if category in df.columns:
                df.at[index, category] = count
    return df

In [None]:
# Fill the category columns with the LIWC hit counts for each row, then display the result.
liwc_df = analyze_texts(liwc_df)
liwc_df

## Standardization

In [None]:
# Turn the raw category counts into proportions by dividing each row's counts by
# that row's word count. All other columns are carried over unchanged.
category_columns = list(category_names)
prop_liwc_df = liwc_df.copy()
prop_liwc_df[category_columns] = prop_liwc_df[category_columns].div(
    prop_liwc_df["word_count"], axis=0
)

In [None]:
# Standardise every category: subtract the mean of its proportional scores and
# divide by their sample standard deviation (ddof=1, which is also pandas' default).
# NOTE(review): a category whose proportions have zero spread presumably ends up
# as NaN after the division — confirm downstream code tolerates that.
zscore_liwc_df = liwc_df.copy()
for category in category_names:
    proportions = prop_liwc_df[category]
    category_mean = proportions.mean()
    category_std = proportions.std(ddof=1)
    zscore_liwc_df[category] = proportions.sub(category_mean).div(category_std)

## Saving to local disk

In [None]:
# Attach the raw LIWC counts to the full dataset and write the result to disk.
# how="left" keeps every row of `df`.
#
# Columns that `df` already has are removed from the right-hand side before joining.
# The original join already required that there be no such overlap on a fresh run,
# so this is a no-op there; it only matters when the cell is re-run after `liwc_df`
# has been rebound to the joined frame, where the join would otherwise fail on
# overlapping column names.
# NOTE(review): a genuine name clash between a `df` column and a LIWC category
# would now be dropped silently instead of raising — verify no such clash exists.
liwc_df = df.join(liwc_df.drop(columns=df.columns, errors="ignore"), how="left")
liwc_df.to_pickle("../../data/adhd-beliefs-pt/adhd-beliefs-pt-liwc.pkl")
liwc_df

In [None]:
# Attach the proportional LIWC scores to the matching rows of the dataset (only the
# rows present in `prop_liwc_df`) and write the result to disk.
#
# Columns that `df` already has are removed from the right-hand side before joining.
# The original join already required that there be no such overlap on a fresh run,
# so this is a no-op there; it only matters when the cell is re-run after
# `prop_liwc_df` has been rebound to the joined frame, where the join would
# otherwise fail on overlapping column names.
# NOTE(review): a genuine name clash between a `df` column and a LIWC category
# would now be dropped silently instead of raising — verify no such clash exists.
prop_liwc_df = df.loc[prop_liwc_df.index].join(
    prop_liwc_df.drop(columns=df.columns, errors="ignore"), how="left"
)
prop_liwc_df.to_pickle("../../data/adhd-beliefs-pt/adhd-beliefs-pt-liwc-proportional.pkl")
prop_liwc_df

In [None]:
# Attach the z-scored LIWC values to the matching rows of the dataset (only the rows
# present in `zscore_liwc_df`) and write the result to disk.
#
# Columns that `df` already has are removed from the right-hand side before joining.
# The original join already required that there be no such overlap on a fresh run,
# so this is a no-op there; it only matters when the cell is re-run after
# `zscore_liwc_df` has been rebound to the joined frame, where the join would
# otherwise fail on overlapping column names.
# NOTE(review): a genuine name clash between a `df` column and a LIWC category
# would now be dropped silently instead of raising — verify no such clash exists.
zscore_liwc_df = df.loc[zscore_liwc_df.index].join(
    zscore_liwc_df.drop(columns=df.columns, errors="ignore"), how="left"
)
zscore_liwc_df.to_pickle("../../data/adhd-beliefs-pt/adhd-beliefs-pt-liwc-zscore.pkl")
zscore_liwc_df