In [1]:
import pandas as pd
import re
import liwc
from collections import Counter
import spacy
import unicodedata

  import pkg_resources


In [2]:
# Load the survey responses from the project pickle and discard the
# "datetime" column in the same step.
# NOTE: pickle files can execute code on load; only read files you trust.
file_path = "../../data/adhd-beliefs-pt/adhd-beliefs-pt-anonymized.pkl"
df = pd.read_pickle(file_path).drop(columns=["datetime"])
df.head()

Unnamed: 0,sex,adhd_diagnosis,age,education,occupation,dialect,forgetting_objects,forgetting_responsabilities,emotion_management,emotion_reactions,...,need_fast_talk_interest,need_fast_talk_information,speaking_before_thinking,something_to_add,something_to_add_timid,something_to_add_impulsive,special_interest,diary_entry,selfdefining_memory,empty_sheet
0,Masculino,Não tenho,21,Licenciatura,Estudante,Português de Portugal,2,3,2,3,...,2,1,1,3,4,2,,,,
1,Feminino,Não tenho,22,Licenciatura,Estudante,Português de Portugal,3,3,4,4,...,1,1,3,4,4,2,"Adoro cozinhar, porque sinto que é a forma ide...","Hoje foi um dia bastante normal, como ontem fi...","Quando era mais nova, eu tinha uma professora ...",Recentemente tenho pensado muito no impacto qu...
2,Feminino,Não tenho,53,Licenciatura,Trabalhador,Português de Portugal,3,4,4,3,...,1,1,3,3,1,2,,,,
3,Feminino,Não tenho,23,Mestrado,Trabalhador-estudante,Português de Portugal,2,1,4,4,...,1,1,4,4,4,2,Gosto de ver séries porque relatam relações sa...,Os eventos que considero mais relevantes serão...,Não tenho,Este questionário foi um pouco extenso. Meu au...
4,Feminino,Não tenho,20,Licenciatura,Estudante,Português de Portugal,4,2,2,3,...,2,2,4,3,4,3,,,,


## Retrieve LIWC categories

In [3]:
# Free-text answers that are concatenated into a single document per respondent.
text_columns = ["special_interest", "diary_entry", "selfdefining_memory", "empty_sheet"]

# Treat missing answers (real NaN or a cell holding the literal string "NaN")
# as empty text so they can be joined safely.
liwc_df = df[text_columns].fillna("").replace("NaN", "")
liwc_df["merged_text"] = liwc_df.agg(" ".join, axis=1).str.strip()

# Keep only respondents who wrote something, and only the merged text.
# .copy() makes the result an independent frame, so the columns added to it in
# later cells are written to it directly instead of to a slice of a temporary
# (this avoids pandas' SettingWithCopyWarning).
liwc_df = liwc_df.loc[liwc_df["merged_text"] != "", ["merged_text"]].copy()
liwc_df

Unnamed: 0,merged_text
1,"Adoro cozinhar, porque sinto que é a forma ide..."
3,Gosto de ver séries porque relatam relações sa...
6,Ao longo de 7 anos nunca esperei acabar uma am...
8,Campos de ferias. Temos que criar o campo todo...
14,"Taylor Swift, amo o seu liricismo e a maneira ..."
...,...
314,Gosto de nadar e de estar em contacto com a ág...
315,Literatura dos séculos XIX e XX. Há imensas hi...
322,Pergunta difícil porque os meus interesses esp...
329,Musica e mecanismos da voz Cansaço com o dia a...


In [4]:
# Load the LIWC dictionary file. `parse` is later called once per token and
# yields the categories that token belongs to; `category_names` lists the
# categories defined by the dictionary.
parse, category_names = liwc.load_token_parser(
    '../../data/LIWC2007_Portugues_win.dic'
)
print(f"LIWC categories: {category_names}")

LIWC categories: ['funct', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'article', 'verb', 'auxverb', 'past', 'present', 'future', 'adverb', 'preps', 'conj', 'negate', 'quant', 'number', 'swear', 'social', 'family', 'friend', 'humans', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'cogmech', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'inhib', 'incl', 'excl', 'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual', 'ingest', 'relativ', 'motion', 'space', 'time', 'work', 'achieve', 'leisure', 'home', 'money', 'relig', 'death', 'assent', 'nonfl', 'filler']


In [5]:
def preprocess_text(text):
    """Lower-case, normalise and tokenise ``text`` for LIWC lookup.

    Steps:
      1. Lower-case the text and normalise it to Unicode NFC so that accented
         letters are single code points.
      2. Replace anything that looks like an e-mail address with the
         placeholder ``<EMAIL>``.
      3. Split into tokens: the ``<EMAIL>`` placeholder, runs of Latin letters
         (including accented ones), or single punctuation/symbol characters.

    Digits, underscores and whitespace are not matched by the token pattern
    and are therefore dropped.

    Args:
        text: The raw string to tokenise.

    Returns:
        A list of token strings in order of appearance.
    """
    text = unicodedata.normalize("NFC", text.lower())
    text = re.sub(r'\S+@\S+', ' <EMAIL> ', text)
    # The placeholder alternative must come first: otherwise "<EMAIL>" would be
    # shredded into "<", "EMAIL" and ">". The text is already lower-cased here,
    # so only placeholders inserted above can match the upper-case literal.
    return re.findall(
        r"<EMAIL>|[A-Za-zÀ-ÖØ-öø-ÿÇç]+|[^\w\s]", text, flags=re.UNICODE
    )

In [6]:
# spaCy pipeline used only for tokenisation-based length counts.
nlp = spacy.load("pt_core_news_lg")


def count_words(text):
    """Return the number of spaCy tokens in ``text`` (0 when ``text`` is missing).

    NOTE(review): the value is ``len()`` of the spaCy ``Doc``, i.e. a token
    count; spaCy normally emits punctuation as separate tokens, so this is
    probably larger than a strict word count. Confirm this is the intended
    denominator for the proportions computed later.
    """
    return 0 if pd.isna(text) else len(nlp(text))

In [7]:
def create_category_columns():
    """Add one integer column, initialised to 0, per LIWC category.

    Operates on the notebook-level ``liwc_df`` and reads the category list
    from the notebook-level ``category_names``; existing columns with the same
    name are reset to 0.
    """
    for name in category_names:
        liwc_df[name] = 0

In [8]:
# Tokenise every document, record its spaCy token count, and add the
# zero-initialised LIWC category columns.
liwc_df["processed_text"] = liwc_df["merged_text"].apply(preprocess_text)
liwc_df["word_count"] = liwc_df["merged_text"].apply(count_words)
create_category_columns()

In [9]:
# Sanity check before counting: every LIWC category column should still be 0.
liwc_df

Unnamed: 0,merged_text,processed_text,word_count,funct,pronoun,ppron,i,we,you,shehe,...,work,achieve,leisure,home,money,relig,death,assent,nonfl,filler
1,"Adoro cozinhar, porque sinto que é a forma ide...","[adoro, cozinhar, ,, porque, sinto, que, é, a,...",687,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Gosto de ver séries porque relatam relações sa...,"[gosto, de, ver, séries, porque, relatam, rela...",112,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Ao longo de 7 anos nunca esperei acabar uma am...,"[ao, longo, de, anos, nunca, esperei, acabar, ...",197,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Campos de ferias. Temos que criar o campo todo...,"[campos, de, ferias, ., temos, que, criar, o, ...",600,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,"Taylor Swift, amo o seu liricismo e a maneira ...","[taylor, swift, ,, amo, o, seu, liricismo, e, ...",348,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,Gosto de nadar e de estar em contacto com a ág...,"[gosto, de, nadar, e, de, estar, em, contacto,...",58,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
315,Literatura dos séculos XIX e XX. Há imensas hi...,"[literatura, dos, séculos, xix, e, xx, ., há, ...",203,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
322,Pergunta difícil porque os meus interesses esp...,"[pergunta, difícil, porque, os, meus, interess...",155,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
329,Musica e mecanismos da voz Cansaço com o dia a...,"[musica, e, mecanismos, da, voz, cansaço, com,...",31,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def liwc_analyze(text):
    """Count LIWC category hits in ``text``.

    The text is tokenised with ``preprocess_text`` and each token is passed to
    the LIWC ``parse`` function; every category yielded for a token adds one
    to that category's tally, so a token belonging to several categories is
    counted once in each of them.

    Returns:
        A ``collections.Counter`` mapping category name to hit count.
    """
    return Counter(
        category
        for token in preprocess_text(text)
        for category in parse(token)
    )

def analyze_texts(df):
    """Fill the LIWC category columns of ``df`` from its ``merged_text`` column.

    For every row, the text in ``merged_text`` is analysed with
    ``liwc_analyze`` and each resulting category count is written into the
    column of the same name. Categories without a matching column in ``df``
    are skipped; columns for categories that never occur keep their current
    value.

    The frame passed in is modified in place and also returned. (Previously
    the function iterated ``df`` but wrote to and returned the notebook-level
    ``liwc_df``, so it only worked when called with that exact frame.)

    Args:
        df: DataFrame with a ``merged_text`` column and one column per LIWC
            category to be filled.

    Returns:
        The same ``df`` object with the category columns updated.
    """
    for index, text in df["merged_text"].items():
        for category, count in liwc_analyze(text).items():
            if category in df.columns:
                df.at[index, category] = count
    return df

In [11]:
# Fill the category columns with per-document LIWC hit counts, then inspect the result.
liwc_df = analyze_texts(liwc_df)
liwc_df

Unnamed: 0,merged_text,processed_text,word_count,funct,pronoun,ppron,i,we,you,shehe,...,work,achieve,leisure,home,money,relig,death,assent,nonfl,filler
1,"Adoro cozinhar, porque sinto que é a forma ide...","[adoro, cozinhar, ,, porque, sinto, que, é, a,...",687,334,154,103,32,0,63,66,...,27,19,17,1,22,2,0,2,27,1
3,Gosto de ver séries porque relatam relações sa...,"[gosto, de, ver, séries, porque, relatam, rela...",112,41,17,11,1,0,8,6,...,4,2,1,0,3,1,0,0,5,0
6,Ao longo de 7 anos nunca esperei acabar uma am...,"[ao, longo, de, anos, nunca, esperei, acabar, ...",197,93,37,27,8,0,15,17,...,3,7,2,0,3,0,0,2,10,0
8,Campos de ferias. Temos que criar o campo todo...,"[campos, de, ferias, ., temos, que, criar, o, ...",600,249,100,63,14,0,45,40,...,13,15,7,0,13,0,1,1,18,2
14,"Taylor Swift, amo o seu liricismo e a maneira ...","[taylor, swift, ,, amo, o, seu, liricismo, e, ...",348,155,64,37,14,0,20,21,...,5,5,3,2,8,1,3,2,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,Gosto de nadar e de estar em contacto com a ág...,"[gosto, de, nadar, e, de, estar, em, contacto,...",58,29,12,9,4,0,5,4,...,0,0,1,0,0,0,0,1,3,0
315,Literatura dos séculos XIX e XX. Há imensas hi...,"[literatura, dos, séculos, xix, e, xx, ., há, ...",203,79,33,19,10,0,7,10,...,8,9,1,0,4,0,1,0,3,0
322,Pergunta difícil porque os meus interesses esp...,"[pergunta, difícil, porque, os, meus, interess...",155,62,19,11,2,0,9,7,...,5,3,1,0,4,0,0,1,6,0
329,Musica e mecanismos da voz Cansaço com o dia a...,"[musica, e, mecanismos, da, voz, cansaço, com,...",31,12,4,3,0,0,3,3,...,2,2,0,0,2,0,0,0,2,0


## Standardization

In [12]:
# Convert raw category counts into proportions of each document's token count.
# All category columns are divided row-wise by "word_count" in one step.
prop_liwc_df = liwc_df.copy()
prop_liwc_df[category_names] = prop_liwc_df[category_names].div(
    prop_liwc_df["word_count"], axis=0
)

In [13]:
# Standardise each category's proportions to z-scores across documents.
# A category whose proportions have zero variance yields NaN (division by 0).
zscore_liwc_df = liwc_df.copy()
for category in category_names:
    proportions = prop_liwc_df[category]
    col_mean = proportions.mean()
    col_std = proportions.std(ddof=1)  # sample std; ddof=1 is also the pandas default
    zscore_liwc_df[category] = (proportions - col_mean) / col_std

## Saving to local disk

In [14]:
# Attach the raw LIWC counts to every respondent (left join on the index);
# respondents without any text get NaN in the LIWC columns.
liwc_df = df.join(other=liwc_df, how="left")
liwc_df.to_pickle(path="../../data/adhd-beliefs-pt/adhd-beliefs-pt-liwc.pkl")
liwc_df

Unnamed: 0,sex,adhd_diagnosis,age,education,occupation,dialect,forgetting_objects,forgetting_responsabilities,emotion_management,emotion_reactions,...,work,achieve,leisure,home,money,relig,death,assent,nonfl,filler
0,Masculino,Não tenho,21,Licenciatura,Estudante,Português de Portugal,2,3,2,3,...,,,,,,,,,,
1,Feminino,Não tenho,22,Licenciatura,Estudante,Português de Portugal,3,3,4,4,...,27.0,19.0,17.0,1.0,22.0,2.0,0.0,2.0,27.0,1.0
2,Feminino,Não tenho,53,Licenciatura,Trabalhador,Português de Portugal,3,4,4,3,...,,,,,,,,,,
3,Feminino,Não tenho,23,Mestrado,Trabalhador-estudante,Português de Portugal,2,1,4,4,...,4.0,2.0,1.0,0.0,3.0,1.0,0.0,0.0,5.0,0.0
4,Feminino,Não tenho,20,Licenciatura,Estudante,Português de Portugal,4,2,2,3,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,Feminino,"Sim, diagnosticado",28,Licenciatura,Trabalhador,Português de Portugal,4,4,4,5,...,,,,,,,,,,
328,Feminino,Não tenho,26,Licenciatura,Estudante,Português de Portugal,3,3,4,5,...,,,,,,,,,,
329,Feminino,Não tenho,25,Licenciatura,Trabalhador,Português de Portugal,2,4,3,4,...,2.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0
330,Feminino,"Sim, diagnosticado",24,Licenciatura,Estudante,Português de Portugal,3,2,4,4,...,,,,,,,,,,


In [15]:
# Attach the proportional LIWC scores to the survey answers, keeping only the
# respondents that have a row in prop_liwc_df, and save the result.
prop_liwc_df = (
    df.loc[prop_liwc_df.index]
    .join(other=prop_liwc_df, how="left")
)
prop_liwc_df.to_pickle(path="../../data/adhd-beliefs-pt/adhd-beliefs-pt-liwc-proportional.pkl")
prop_liwc_df

Unnamed: 0,sex,adhd_diagnosis,age,education,occupation,dialect,forgetting_objects,forgetting_responsabilities,emotion_management,emotion_reactions,...,work,achieve,leisure,home,money,relig,death,assent,nonfl,filler
1,Feminino,Não tenho,22,Licenciatura,Estudante,Português de Portugal,3,3,4,4,...,0.039301,0.027656,0.024745,0.001456,0.032023,0.002911,0.000000,0.002911,0.039301,0.001456
3,Feminino,Não tenho,23,Mestrado,Trabalhador-estudante,Português de Portugal,2,1,4,4,...,0.035714,0.017857,0.008929,0.000000,0.026786,0.008929,0.000000,0.000000,0.044643,0.000000
6,Feminino,Não tenho,20,Ensino secundário,Estudante,Português de Portugal,3,2,4,4,...,0.015228,0.035533,0.010152,0.000000,0.015228,0.000000,0.000000,0.010152,0.050761,0.000000
8,Feminino,Não tenho,19,Licenciatura,Estudante,Português de Portugal,3,2,2,3,...,0.021667,0.025000,0.011667,0.000000,0.021667,0.000000,0.001667,0.001667,0.030000,0.003333
14,Feminino,Não tenho,25,Mestrado,Trabalhador,Português de Portugal,4,3,4,4,...,0.014368,0.014368,0.008621,0.005747,0.022989,0.002874,0.008621,0.005747,0.034483,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,Feminino,Não tenho,23,Licenciatura,Estudante,Português de Portugal,3,4,5,4,...,0.000000,0.000000,0.017241,0.000000,0.000000,0.000000,0.000000,0.017241,0.051724,0.000000
315,Masculino,"Sim, diagnosticado",20,Ensino secundário,Estudante,Português de Portugal,4,2,4,4,...,0.039409,0.044335,0.004926,0.000000,0.019704,0.000000,0.004926,0.000000,0.014778,0.000000
322,Feminino,"Sim, diagnosticado",36,Mestrado,Trabalhador,Português de Portugal,4,2,3,4,...,0.032258,0.019355,0.006452,0.000000,0.025806,0.000000,0.000000,0.006452,0.038710,0.000000
329,Feminino,Não tenho,25,Licenciatura,Trabalhador,Português de Portugal,2,4,3,4,...,0.064516,0.064516,0.000000,0.000000,0.064516,0.000000,0.000000,0.000000,0.064516,0.000000


In [16]:
# Attach the z-scored LIWC values to the survey answers, keeping only the
# respondents that have a row in zscore_liwc_df, and save the result.
zscore_liwc_df = (
    df.loc[zscore_liwc_df.index]
    .join(other=zscore_liwc_df, how="left")
)
zscore_liwc_df.to_pickle(path="../../data/adhd-beliefs-pt/adhd-beliefs-pt-liwc-zscore.pkl")
zscore_liwc_df

Unnamed: 0,sex,adhd_diagnosis,age,education,occupation,dialect,forgetting_objects,forgetting_responsabilities,emotion_management,emotion_reactions,...,work,achieve,leisure,home,money,relig,death,assent,nonfl,filler
1,Feminino,Não tenho,22,Licenciatura,Estudante,Português de Portugal,3,3,4,4,...,0.521953,0.082700,0.224769,-0.243972,0.466249,-0.125018,-0.278456,-0.081356,0.360382,-0.060915
3,Feminino,Não tenho,23,Mestrado,Trabalhador-estudante,Português de Portugal,2,1,4,4,...,0.377815,-0.371983,-0.386703,-0.520319,0.162758,0.352118,-0.278456,-0.357665,0.580878,-0.173442
6,Feminino,Não tenho,20,Ensino secundário,Estudante,Português de Portugal,3,2,4,4,...,-0.445366,0.448166,-0.339394,-0.520319,-0.506930,-0.355857,-0.278456,0.605909,0.833448,-0.173442
8,Feminino,Não tenho,19,Licenciatura,Estudante,Português de Portugal,3,2,2,3,...,-0.186659,-0.040559,-0.280849,-0.520319,-0.133866,-0.355857,0.021483,-0.199478,-0.023569,0.084244
14,Feminino,Não tenho,25,Mestrado,Trabalhador,Português de Portugal,4,3,4,4,...,-0.479948,-0.533886,-0.398606,0.570773,-0.057272,-0.128003,1.272955,0.187806,0.161476,-0.173442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,Feminino,Não tenho,23,Licenciatura,Estudante,Português de Portugal,3,4,5,4,...,-1.057289,-1.200544,-0.065331,-0.520319,-1.389342,-0.355857,-0.278456,1.278749,0.873188,-0.173442
315,Masculino,"Sim, diagnosticado",20,Ensino secundário,Estudante,Português de Portugal,4,2,4,4,...,0.526275,0.856573,-0.541438,-0.520319,-0.247568,-0.355857,0.608064,-0.357665,-0.651909,-0.173442
322,Feminino,"Sim, diagnosticado",36,Mestrado,Trabalhador,Português de Portugal,4,2,3,4,...,0.238934,-0.302491,-0.482462,-0.520319,0.106014,-0.355857,-0.278456,0.254671,0.335960,-0.173442
329,Feminino,Não tenho,25,Licenciatura,Trabalhador,Português de Portugal,2,4,3,4,...,1.535158,1.792967,-0.731881,-0.520319,2.349048,-0.355857,-0.278456,-0.357665,1.401232,-0.173442
