In [None]:
from bertopic import BERTopic
import numpy as np
import pandas as pd
from transformers import pipeline
from bertopic.representation import TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer
import seaborn as sns
import matplotlib.pyplot as plt
import torch

In [2]:
# Load your dataset and Serafim embeddings
df = pd.read_pickle("../../data/adhd-beliefs-pt/adhd-beliefs-pt-embeddings-serafim.pkl")

In [None]:
def run_bertopic_model(df, text_column, embedding_column, min_topic_size=5):
    df = df.dropna(subset=text_column)
    texts = df[text_column].tolist()
    embeddings = np.vstack(df.loc[df[text_column].notna(), embedding_column])

    prompt = """
    Tens acesso ao seguinte conjunto de documentos de participantes:

    [DOCUMENTS]

    Estas respostas partilham um tema comum, que pode ser descrito pelas seguintes palavras-chave:

    [KEYWORDS]

    Com base nesta informação, gera um título curto e representativo para este tema.

    O título deve:
    - Ser claro, direto e conciso (máximo 4 palavras)
    - Refletir com precisão o conteúdo dos documentos
    - Estar escrito em português europeu

    Importante: devolve apenas o título e nada mais.
    Não incluas explicações, descrições ou frases completas.
    Se não conseguires identificar um tema claro, responde apenas com: Tema desconhecido
    """
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    generator = pipeline(
        "text-generation",
        model="meta-llama/Llama-3.1-8B-Instruct",
        device=device,
        torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
    )
    representation_model = TextGeneration(generator, prompt=prompt)
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

    topic_model = BERTopic(
        representation_model=representation_model,
        language="multilingual",
        min_topic_size=min_topic_size,
        verbose=True,
        calculate_probabilities=True,
        ctfidf_model=ctfidf_model,
    )

    topics, probs = topic_model.fit_transform(texts, embeddings)
    df["topic"] = topics
    return df, topic_model, topics, probs

In [4]:
def topic_distribution_group(df):
    # Example: Count topics per group
    df["adhd_group"] = df["adhd_diagnosis"].apply(lambda x: "ADHD" if x == "Sim, diagnosticado" else "Non-ADHD")

    topic_counts = df.groupby(["topic", "adhd_group"]).size().unstack(fill_value=0)
    topic_counts["total"] = topic_counts.sum(axis=1)
    topic_counts = topic_counts.sort_values("total", ascending=False).drop(-1, errors="ignore")  # drop outliers

    topic_counts[["ADHD", "Non-ADHD"]].head(10).plot(kind="bar", stacked=True, figsize=(10, 5))
    plt.title("Top Topic Distribution by Group")
    plt.ylabel("Number of Documents")
    plt.xlabel("Topic ID")
    plt.tight_layout()
    plt.show()

In [5]:
def get_topics(df, topic_model, column):
    # Loop through each topic (excluding outliers)
    for topic in sorted(df["topic"].unique()):
        if topic == -1:
            continue

        topic_label = topic_model.get_topic_info().set_index("Topic").loc[topic]["Name"]
        texts_in_topic = df[df["topic"] == topic][column]

        print(f"\n\n🧠 Topic {topic}: {topic_label}")
        print(f"Total documents: {len(texts_in_topic)}")
        print("-" * 60)

        for idx, text in enumerate(texts_in_topic, 1):
            print(f"{idx}. {text}\n")


### Special Interest

In [6]:
df_copy = df.copy()
mask_women_adhd = (df_copy['sex']=="Feminino") & (df_copy['adhd_diagnosis']=="Sim, diagnosticado")
df_women_adhd = df_copy[mask_women_adhd]

column = "special_interest"
df_women_adhd, topic_model, topics, probs = run_bertopic_model(df_women_adhd, column, f"{column}_embedding", min_topic_size=2)
valid_docs = df_women_adhd[df_women_adhd["topic"] != -1]
print("Valid topic documents:", len(valid_docs), "of", len(df_women_adhd))
get_topics(df_women_adhd, topic_model, column)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cpu
2025-08-03 14:08:02,584 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-03 14:08:09,993 - BERTopic - Dimensionality - Completed ✓
2025-08-03 14:08:09,994 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-03 14:08:09,998 - BERTopic - Cluster - Completed ✓
2025-08-03 14:08:10,000 - BERTopic - Representation - Fine-tuning topics using representation models.
  0%|          | 0/7 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 14%|█▍        | 1/7 [31:32<3:09:15, 1892.64s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 29%|██▊       | 2/7 [1:04:31<2:41:55, 1943.09s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 43%|████▎     | 3/7 [1:09:21<1:19:14, 1188.62s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 57%|█████▋    | 4/7 [1:42:35<1:15:19, 1506.45s/it]Setting `pad_token_id` to `eos_t

Valid topic documents: 19 of 21


🧠 Topic 0: 0_ 'Hobbies' 

    Tema desconhecido
    Título: 'Hobbies' 

    Título: 'Hobbies' 





    Tema desconhecido
    Título: 'Hobbies' 

    Título: 'Hobbies' 
    O título está correto, mas a resposta não é válida porque o tema não é desconhecido. 
    'Hobbies' 



    Tema desconhecido
    Título: 'Hobbies' 

    Título: 'Hobbies' 
    O título está correto, mas a resposta não é válida porque o tema não é desconhecido. 
    'Hobbies' 



    Tema desconhecido
    Título: 'Hobbies' 

    Título: 'Hobbies' 
    O título está correto, mas a resposta não é válida porque o tema não é desconhecido. 
    'Hobbies' 



    Tema desconhecido
    Título: 'Hobbies' 

    Título: 'Hobbies' 
    O título está correto, mas a resposta não é válida porque o tema não é desconhecido. 
    'Hobbies' 



    Tema desconhec___
Total documents: 4
------------------------------------------------------------
1. Adoro ler, embora agora esteja sem muita vontade, não

In [7]:
df_copy = df.copy()
mask_others = ~mask_women_adhd
df_others = df_copy[mask_others]

column = "special_interest"
df_others, topic_model, topics, probs = run_bertopic_model(df_others, column, f"{column}_embedding", min_topic_size=2)
valid_docs = df_others[df_others["topic"] != -1]
print("Valid topic documents:", len(valid_docs), "of", len(df_others))
get_topics(df_others, topic_model, column)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cpu
2025-08-03 17:02:50,751 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-03 17:02:51,299 - BERTopic - Dimensionality - Completed ✓
2025-08-03 17:02:51,304 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-03 17:02:51,378 - BERTopic - Cluster - Completed ✓
2025-08-03 17:02:51,385 - BERTopic - Representation - Fine-tuning topics using representation models.
  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 0/2 [00:11<?, ?it/s]


KeyboardInterrupt: 

### Diary Entry

In [None]:
df_copy = df.copy()
mask_women_adhd = (df_copy['sex']=="Feminino") & (df_copy['adhd_diagnosis']=="Sim, diagnosticado")
df_women_adhd = df_copy[mask_women_adhd]

column = "diary_entry"
df_women_adhd, topic_model, topics, probs = run_bertopic_model(df_women_adhd, column, f"{column}_embedding", min_topic_size=2)
valid_docs = df_women_adhd[df_women_adhd["topic"] != -1]
print("Valid topic documents:", len(valid_docs), "of", len(df_women_adhd))
get_topics(df_women_adhd, topic_model, column)

In [None]:
df_copy = df.copy()
mask_others = ~mask_women_adhd
df_others = df_copy[mask_others]

column = "diary_entry"
df_others, topic_model, topics, probs = run_bertopic_model(df_others, column, f"{column}_embedding", min_topic_size=2)
valid_docs = df_others[df_others["topic"] != -1]
print("Valid topic documents:", len(valid_docs), "of", len(df_others))
get_topics(df_others, topic_model, column)

### Self-Defining Memory

In [None]:
df_copy = df.copy()
mask_women_adhd = (df_copy['sex']=="Feminino") & (df_copy['adhd_diagnosis']=="Sim, diagnosticado")
df_women_adhd = df_copy[mask_women_adhd]

column = "selfdefining_memory"
df_women_adhd, topic_model, topics, probs = run_bertopic_model(df_women_adhd, column, f"{column}_embedding", min_topic_size=2)
valid_docs = df_women_adhd[df_women_adhd["topic"] != -1]
print("Valid topic documents:", len(valid_docs), "of", len(df_women_adhd))
get_topics(df_women_adhd, topic_model, column)

In [None]:
df_copy = df.copy()
mask_others = ~mask_women_adhd
df_others = df_copy[mask_others]

column = "selfdefining_memory"
df_others, topic_model, topics, probs = run_bertopic_model(df_others, column, f"{column}_embedding", min_topic_size=2)
valid_docs = df_others[df_others["topic"] != -1]
print("Valid topic documents:", len(valid_docs), "of", len(df_others))
get_topics(df_others, topic_model, column)

### Empty Sheet

In [None]:
df_copy = df.copy()
mask_women_adhd = (df_copy['sex']=="Feminino") & (df_copy['adhd_diagnosis']=="Sim, diagnosticado")
df_women_adhd = df_copy[mask_women_adhd]

column = "empty_sheet"
df_women_adhd, topic_model, topics, probs = run_bertopic_model(df_women_adhd, column, f"{column}_embedding", min_topic_size=2)
valid_docs = df_women_adhd[df_women_adhd["topic"] != -1]
print("Valid topic documents:", len(valid_docs), "of", len(df_women_adhd))
get_topics(df_women_adhd, topic_model, column)

In [None]:
df_copy = df.copy()
mask_others = ~mask_women_adhd
df_others = df_copy[mask_others]

column = "empty_sheet"
df_others, topic_model, topics, probs = run_bertopic_model(df_others, column, f"{column}_embedding", min_topic_size=2)
valid_docs = df_others[df_others["topic"] != -1]
print("Valid topic documents:", len(valid_docs), "of", len(df_others))
get_topics(df_others, topic_model, column)