In [None]:
from bertopic import BERTopic
import numpy as np
import pandas as pd
from transformers import pipeline
from bertopic.representation import TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import datetime
import nltk
from nltk.corpus import stopwords
import logging
import os
from dotenv import load_dotenv
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
import openai
import spacy
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# Load variables from a local .env file (if any) before reading the API key.
load_dotenv()
# OPENAI_KEY is None when the variable is not set in the environment.
OPENAI_KEY = os.environ.get("OPENAI_KEY")

In [None]:
# Load the pickled DataFrame holding the survey responses and their precomputed embeddings.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle("../../data/adhd-beliefs-pt/adhd-beliefs-pt-embeddings-serafim.pkl")
df

In [None]:
# Reshape the wide frame into long format: every row of `df` is repeated once per
# question type, then the matching answer text and embedding are pulled into the
# shared `response` / `response_embedding` columns.
question_types = ["special_interest", "diary_entry", "selfdefining_memory", "empty_sheet"]
topic_df = (
    pd.concat([df.assign(question=name) for name in question_types], ignore_index=False)
    .assign(
        response=lambda frame: frame.apply(lambda row: row[row["question"]], axis=1),
        response_embedding=lambda frame: frame.apply(
            lambda row: row[f"{row['question']}_embedding"], axis=1
        ),
    )
    # The per-question (and merged) source columns are no longer needed in long format.
    .drop(
        columns=[
            *question_types,
            "merged_text",
            *(f"{name}_embedding" for name in question_types),
            "merged_text_embedding",
        ]
    )
    # Keep only rows where the participant actually answered the question.
    .dropna(subset=["response"])
)
topic_df

In [None]:
# Number of non-missing responses available for each question type.
question_counts = topic_df["question"].value_counts()
question_counts

In [None]:
# Build a "<sex>_<diagnosis>" label per row, e.g. "Female_ADHD" or "Male_noADHD".
# NOTE(review): `sex` values outside the mapping become the string 'nan' after
# astype(str), and every diagnosis value other than "Sim, diagnosticado" counts as noADHD.
topic_df['group'] = (
    topic_df['sex'].map({'Feminino':'Female','Masculino':'Male'}).astype(str)
    + '_'
    + np.where(topic_df['adhd_diagnosis'].eq("Sim, diagnosticado"), 'ADHD', 'noADHD')
)

# Distinct group labels and the number of responses in each group.
groups = topic_df['group'].unique().tolist()
group_counts = topic_df['group'].value_counts()
group_counts

In [None]:
# Make sure the NLTK stopword corpus is available locally (silent if already downloaded).
nltk.download("stopwords", quiet=True)

# Extra words to filter on top of NLTK's Portuguese stopword list.
additional_stopwords = ["pra", "pro", "tá", "já", "ter", "vai", "vou", "então", "assim", "aí", "sobre"]

# NLTK's Portuguese stopwords, extended in place with the extra words above.
portuguese_stopwords = stopwords.words("portuguese")
portuguese_stopwords += additional_stopwords

In [None]:
def _clean_openai_label(values, prefix="tópico:"):
    """Turn one OpenAI aspect entry (a list of (text, score) pairs) into a display label.

    Empty parts are skipped, and the leading "tópico:" that the prompt asks the
    model to emit is removed so the label contains only the topic name.
    """
    parts = [word.strip() for word, _ in values if word and word.strip()]
    label = " | ".join(parts)
    if label[:len(prefix)].lower() == prefix:
        label = label[len(prefix):].strip()
    return label


def run_bertopic_model(
    df,
    texts,
    embeddings,
    min_cluster_size=5,
    save_path="../../data/adhd-beliefs-pt/bertopic_models/women_adhd_response/",
    embedding_model_name="PORTULAN/serafim-900m-portuguese-pt-sentence-encoder",
):
    """Fit a BERTopic model on precomputed embeddings and label topics with OpenAI.

    Args:
        df: DataFrame with one row per entry of ``texts`` (same order). It is not
            modified; a copy with an added ``topic`` column is returned.
        texts: List of documents to cluster.
        embeddings: Precomputed document embeddings, one row per document.
        min_cluster_size: Minimum cluster size passed to HDBSCAN.
        save_path: Directory the fitted model is saved to (safetensors). Pass
            ``None`` to skip saving. Defaults to the original hard-coded location.
        embedding_model_name: Hugging Face id of the sentence encoder. It is used
            both to load the encoder and as the embedding-model pointer stored
            with the saved topic model.

    Returns:
        Tuple ``(df_with_topic, topic_model, topics, probs)`` where ``topics`` and
        ``probs`` are the outputs of ``BERTopic.fit_transform``.

    Raises:
        ValueError: If ``df``, ``texts`` and ``embeddings`` differ in length.
    """
    # Fail before the expensive fit (and the paid OpenAI calls) if inputs are misaligned.
    if not (len(df) == len(texts) == len(embeddings)):
        raise ValueError(
            f"df ({len(df)}), texts ({len(texts)}) and embeddings ({len(embeddings)}) "
            "must have the same length"
        )

    prompt = """
    Eu tenho um tópico que contem o seguinte conjunto de documentos:
    [DOCUMENTS]
    O tópico é descrito pelas seguintes palavras-chave: [KEYWORDS]

    Com base na informação acima, extrai um rótulo de tópico curto, mas altamente descritivo, de no máximo 5 palavras. Certifica-te de que está no seguinte formato:
    tópico: <rótulo tópico>
    """

    # Sub-models: dimensionality reduction, clustering, tokenisation and c-TF-IDF weighting.
    embedding_model = SentenceTransformer(embedding_model_name)
    umap_model = UMAP(n_neighbors=8, n_components=5, min_dist=0.0, metric='cosine', random_state=42) # try with pca as well
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True) # try with kmeans as well
    vectorizer_model = CountVectorizer(stop_words=portuguese_stopwords, min_df=2, ngram_range=(1, 2))
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    # Alternative topic representations, each stored as a named aspect of the model.
    keybert_model = KeyBERTInspired()
    pos_model = PartOfSpeech("pt_core_news_lg")
    mmr_model = MaximalMarginalRelevance(diversity=0.3)
    client = openai.OpenAI(api_key=OPENAI_KEY)
    openai_model = OpenAI(client, model="gpt-5", exponential_backoff=True, prompt=prompt)

    representation_model = {
        "KeyBERT": keybert_model,
        "OpenAI": openai_model,
        "MMR": mmr_model,
        "POS": pos_model,
    }

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        language="multilingual",
        top_n_words=10,
        verbose=True,
        calculate_probabilities=True,
        ctfidf_model=ctfidf_model,
    )

    topics, probs = topic_model.fit_transform(texts, embeddings)
    # Return a new frame instead of writing into the caller's (possibly sliced) DataFrame.
    df = df.assign(topic=topics)

    # Use the OpenAI-generated names as custom labels; topic -1 is the outlier bucket.
    chatgpt_topic_labels = {
        topic: _clean_openai_label(values)
        for topic, values in topic_model.topic_aspects_["OpenAI"].items()
    }
    chatgpt_topic_labels[-1] = "Outlier Topic"
    topic_model.set_topic_labels(chatgpt_topic_labels)

    if save_path is not None:
        # For safetensors serialization BERTopic expects the Hugging Face model id
        # (a string), not the loaded SentenceTransformer object.
        topic_model.save(
            save_path,
            serialization="safetensors",
            save_ctfidf=True,
            save_embedding_model=embedding_model_name,
        )
    return df, topic_model, topics, probs

In [None]:
def get_topics(df, topic_model, column, output_file=None):
    """Build a plain-text report listing the documents assigned to each topic.

    Args:
        df: DataFrame with a ``topic`` column and the text column ``column``.
        topic_model: Fitted model exposing ``get_topic_info()`` with ``Topic`` and
            ``Name`` columns.
        column: Name of the column in ``df`` that holds the document text.
        output_file: Path of the report file. Missing parent directories are
            created. When falsy, the report is printed instead.

    Returns:
        None. The report is written to ``output_file`` or printed.
    """
    # The topic table does not change inside the loop, so build the lookup once.
    topic_names = topic_model.get_topic_info().set_index("Topic")["Name"]

    lines = []
    # Loop through each topic (excluding outliers, which carry the id -1)
    for topic in sorted(df["topic"].unique()):
        if topic == -1:
            continue

        topic_label = topic_names.loc[topic]
        texts_in_topic = df.loc[df["topic"] == topic, column]

        lines.append(f"\n\n🧠 Topic {topic}: {topic_label}")
        lines.append(f"Total documents: {len(texts_in_topic)}")
        lines.append("-" * 60)
        lines.extend(f"{idx}. {text}\n" for idx, text in enumerate(texts_in_topic, 1))

    output = "\n".join(lines)
    if output_file:
        # Create the target directory first so open() does not fail on a fresh checkout.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        logging.info(f"Writing topics to {output_file}")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)
    else:
        logging.info("Output file not specified, printing topics to console.")
        # logging.info is dropped under the default logging level, so print the report itself.
        print(output)

## Women with ADHD

In [None]:
# Responses from the "Female_ADHD" group. An explicit copy is taken because a
# `topic` column is added to this frame later, which should not write into a
# slice of topic_df.
df_women_adhd = topic_df[topic_df["group"] == "Female_ADHD"].copy()
column = "response"
texts = df_women_adhd[column].tolist()
# Stack the per-row embedding vectors into a single array, in the same order as `texts`.
embeddings = np.vstack(df_women_adhd[f"{column}_embedding"])
# Timestamp used to build unique output file names.
time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

In [None]:
# Fit the topic model on this group's responses; min_cluster_size=2 is forwarded to HDBSCAN.
# The returned frame carries the assigned topic id of every response in its `topic` column.
df_women_adhd, topic_model, topics, probs = run_bertopic_model(df_women_adhd, texts, embeddings, min_cluster_size=2)

In [None]:
# Optional export (disabled): uncomment to write the per-topic document listing to a timestamped text file.
#get_topics(df_women_adhd, topic_model, column, f"data/bertopic_results/{time}_bertopic_{column}_women_adhd.txt")

In [None]:
# c-TF-IDF matrix of the fitted model.
topic_term_matrix = topic_model.c_tf_idf_
# Vocabulary of the fitted CountVectorizer. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement (converted back to a list).
words = topic_model.vectorizer_model.get_feature_names_out().tolist()

In [None]:
# Responses assigned to a real topic; topic id -1 marks outliers.
valid_docs = df_women_adhd[df_women_adhd["topic"] != -1]
print(f"Valid topic documents: {len(valid_docs)} of {len(df_women_adhd)}")

In [None]:
# Summary table of the fitted topics.
topic_model.get_topic_info()

In [None]:
# Response texts of this group (built with the same expression as `texts`), used for the per-document table.
docs = df_women_adhd[column].tolist()
topic_model.get_document_info(docs)

In [None]:
# Plot the fitted topics; custom_labels=True shows the labels registered with set_topic_labels.
topic_model.visualize_topics(custom_labels=True)

In [None]:
# Topic heatmap, annotated with the custom topic labels.
topic_model.visualize_heatmap(custom_labels=True)

In [None]:
# Use the question each response answers as its class, then compute and plot the topics per question.
classes = df_women_adhd["question"].tolist()
topics_per_class = topic_model.topics_per_class(texts, classes=classes)
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
# Project the document embeddings to 2-D once so the document plots below share one layout.
# random_state is fixed (same seed as the UMAP inside run_bertopic_model) so re-running
# the notebook reproduces the same projection.
reduced_embeddings = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)
# Plot the documents in the 2-D space, passing the response texts and using the custom topic labels.
# NOTE: You can hide the hover with `hide_document_hover=True` which is especially helpful if you have a large dataset
# NOTE: You can also hide the annotations with `hide_annotations=True` which is helpful to see the larger structure
topic_model.visualize_documents(texts, reduced_embeddings=reduced_embeddings, custom_labels=True)

In [None]:
# Datamap view of the documents, reusing the precomputed 2-D projection.
topic_model.visualize_document_datamap(texts, reduced_embeddings=reduced_embeddings, custom_labels=True)

In [None]:
# Interactive variant of the document datamap. The figure is kept in `fig` and also
# returned as the cell's last expression so it is displayed.
fig = topic_model.visualize_document_datamap(texts, reduced_embeddings=reduced_embeddings, interactive=True, custom_labels=True)
fig

In [None]:
# Topic probabilities of the first document, taken from the `probs` returned by fit_transform.
topic_model.visualize_distribution(probs[0], custom_labels=True) # topic_distr[0] if not HDBSCAN 

In [None]:
# Approximate topic distributions per document and per token.
topic_distr, topic_token_distr = topic_model.approximate_distribution(texts, calculate_tokens=True)
# Show the token-level distribution of the second document. The document is indexed
# from `texts`, the same list the distributions were computed from, so the text and
# its distribution always line up and the cell does not depend on `docs` from another cell.
df_topic_distr = topic_model.visualize_approximate_distribution(texts[1], topic_token_distr[1])
df_topic_distr

In [None]:
# Bar charts for the topics, titled with the custom topic labels.
topic_model.visualize_barchart(custom_labels=True)

In [None]:
# Term-rank plot per topic (default, non-log scale).
topic_model.visualize_term_rank(custom_labels=True)

In [None]:
# Same term-rank plot, drawn with log_scale=True.
topic_model.visualize_term_rank(log_scale=True, custom_labels=True)

In [None]:
# Topic hierarchy plot, without passing a precomputed hierarchy.
topic_model.visualize_hierarchy(custom_labels=True)

In [None]:
# Compute the topic hierarchy from the documents and plot it; `hierarchical_topics` is reused in later cells.
hierarchical_topics = topic_model.hierarchical_topics(texts)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, custom_labels=True)

In [None]:
# Build the topic tree from the computed hierarchy; print() is used to output the returned value.
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

In [None]:
# Documents across the topic hierarchy, passing the full-dimensional embeddings.
topic_model.visualize_hierarchical_documents(texts, hierarchical_topics, embeddings=embeddings, custom_labels=True)

In [None]:
# Same hierarchical document view, but reusing the precomputed 2-D projection.
topic_model.visualize_hierarchical_documents(texts, hierarchical_topics, reduced_embeddings=reduced_embeddings, custom_labels=True)