# Topic modelling for Dreamachine dataset

Author : Romy Beauté\
Date created : 13/05/2024\
Last modified : 16/09/2025\
Corresp : r.beaut@sussex.ac.uk

Selection of sentence transformer embedding models :
https://www.sbert.net/docs/pretrained_models.html

The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster and still offers good quality



### imports and global setup

In [None]:
# %%capture

import torch
import os
import numpy as np
import nltk
import sys
import pandas as pd
import nltk
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from umap import UMAP
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from bertopic.representation import KeyBERTInspired, LlamaCPP
import datamapplot
import random
from sklearn.metrics.pairwise import cosine_similarity
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
import matplotlib.pyplot as plt



# --- Reproducibility: one seed shared by every random number generator used below ---
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# CUDA-only settings (this branch is skipped on machines without a CUDA device)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False


# --- Environment ---
os.environ["TOKENIZERS_PARALLELISM"] = "True"
nltk.download('stopwords')

# --- Make the project packages importable ---
# The project root is taken to be the parent of the current working directory.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
sys.path.append(project_root)

# Import project modules
from src.preprocessor import split_sentences
from src.utils import get_params_grid, calculate_coherence,calculate_embedding_coherence
from src.model import setup_model, setup_umap, setup_hdbscan
from configs.dreamachine2 import config
from src.llama_CPP_custom import *

# Dataset and experimental condition analysed in this run
dataset, condition = "DREAMACHINE", "DL"

print(f"Current working directory: {os.getcwd()}")

# Input data are read from a Box folder under the user's home directory
BOX_DIR = os.path.expanduser(
    os.path.join("~", "Library", "CloudStorage", "Box-Box", "TMDATA")
)
print(f"Retrieving data from BOX, locally stored at: {BOX_DIR}")

DATA_DIR = os.path.join(BOX_DIR, dataset)
print(f"Data directory: {DATA_DIR}")

# Hyper-parameter search results (Optuna / grid search CSVs) are read from here
results_dir = os.path.join(project_root, "EVAL", dataset.lower())

### Define parameters for current run


In [None]:

# When True, reports are split into sentences in the pre-processing cell and the
# '..._sentences_...' results files are selected when loading tuned parameters.
sentences = True
# Seed handed to UMAP (training and 2D visualisation); reuses the global SEED.
random_seed = SEED # uses the global seed 

# Earlier outlier-reduction settings, kept for reference only (not executed);
# the active values are set in the outlier-reduction cell further down.
# # Outlier reduction settings
# reduce_outliers = False
# outlier_strategy = 'embeddings' 
# outlier_threshold = 0.7 #works well with 'embeddings' and 0.7 threshold OR 'probabilities' and 0.15 threshold


### Load and preprocess data


In [None]:

# Cleaned reflections file for the selected condition
reports_path = os.path.join(DATA_DIR, f"{condition}_reflections_APIcleaned.csv")
print("Using data from:", reports_path)


# Keep only the cleaned text column, drop missing entries and renumber from 0
df_reports = (
    pd.read_csv(reports_path)['cleaned_reflection']
    .dropna()
    .reset_index(drop=True)
)
df_reports


### Pre-processing


In [None]:
# Optionally split every report into sentences.
# NOTE(review): only the first element returned by split_sentences is kept —
# presumably the list of sentences; confirm against src.preprocessor.
if sentences:
    df_reports = split_sentences(df_reports)[0]
print(f"\nSuccessfully loaded and processed {len(df_reports)} sentences.")


# Partition sentences by length in a single pass
min_words = 2  # sentences with fewer whitespace-separated words than this are dropped
short_sentences = []
long_enough = []
for candidate in df_reports:
    if len(candidate.split()) < min_words:
        short_sentences.append(candidate)
    else:
        long_enough.append(candidate)

# Show what is about to be discarded, then how many there were
for discarded in short_sentences:
    print(discarded)
print(f"\nThere are {len(short_sentences)} sentences with less than {min_words} words.\n")

# Keep only the sentences that reach the minimum length
df_reports = long_enough
print(f"After removing short sentences, {len(df_reports)} sentences remain.")

# Drop exact duplicates, keeping the first occurrence and the original order
df_reports = list(dict.fromkeys(df_reports))
print(f"After removing duplicates, {len(df_reports)} remain.")

### Setup models (sentence transformer and countvectorizer) and generate embeddings

In [None]:
# Sentence-transformer and CountVectorizer, both configured from the config module
print("Setting up models and generating embeddings...")

transformer_name = config.transformer_model
embedding_model = SentenceTransformer(transformer_name)
print("Using embedding model:", transformer_name)

# Encode every document once; the resulting array is reused by later cells
embeddings = embedding_model.encode(df_reports, show_progress_bar=True)

# Vocabulary settings used for the c-TF-IDF topic keywords
vectorizer_settings = {
    "ngram_range": config.ngram_range,
    "stop_words": list(config.extended_stop_words),
    "max_df": config.max_df,
    "min_df": config.min_df,
}
vectorizer_model = CountVectorizer(**vectorizer_settings)

print("Embeddings and vectorizer ready.")

### Train BERTopic model 

In [None]:
### Choose where the hyper-parameters come from: the best Optuna trial, a specific
### Optuna trial, the best grid-search run, or the tailored defaults in the config file

param_selection = "optuna"  # can be "optuna", "default", or "grid_search"
target_trial_number = None  # e.g. 65 to load one specific Optuna trial, or None for the best one
sanitized_model_name = config.transformer_model.replace('/', '_')  # '/' is not usable in a file name
run_tag = 'sentences' if sentences else ''


if param_selection == "optuna":
    optuna_csv = os.path.join(
        results_dir,
        f"OPTUNA_results_{condition}_{run_tag}_{sanitized_model_name}.csv",
    )
    params = pd.read_csv(optuna_csv)
    if target_trial_number is None:
        # Rank trials by embedding coherence and keep the top one
        params = params.sort_values(by='embedding_coherence', ascending=False).reset_index(drop=True)
        chosen_params = params.iloc[0]
        print("Best parameters loaded from optuna:")
        print(chosen_params)
    else:
        chosen_params_row = params[params['trial_number'] == target_trial_number]
        # Fail loudly when the requested trial is not in the results file
        if chosen_params_row.empty:
            raise ValueError(f"Error: Trial number {target_trial_number} was not found in the results file.")
        # First (expected: only) matching row, as a Series of parameters
        chosen_params = chosen_params_row.iloc[0]
        print(f"Parameters successfully loaded for Optuna trial number: {target_trial_number}")
        print(chosen_params)

elif param_selection == "grid_search":
    grid_search_csv = os.path.join(results_dir, f"GS_results_{condition}_{run_tag}.csv")
    params = (
        pd.read_csv(grid_search_csv)
        .sort_values(by='embedding_coherence', ascending=False)
        .reset_index(drop=True)
    )
    chosen_params = params.iloc[0]
    print("Best parameters loaded from grid search:")
    print(chosen_params)

elif param_selection == "default":
    print(f"Loading default parameters for condition: {condition}")
    chosen_params = config.get_default_params(condition)
    print("Default parameters loaded:")
    print(pd.Series(chosen_params))

else:
    raise ValueError("param_selection must be 'optuna', 'default', or 'grid_search'")


In [None]:


# --- Dimensionality reduction (UMAP) built from the selected parameters ---
umap_settings = {
    "n_neighbors": int(chosen_params['n_neighbors']),
    "n_components": int(chosen_params['n_components']),
    "min_dist": float(chosen_params['min_dist']),
    "random_seed": random_seed,
}
umap_model = setup_umap(**umap_settings)


# --- Clustering (HDBSCAN); a missing min_samples value is forwarded as None ---
raw_min_samples = chosen_params['min_samples']
if pd.notna(raw_min_samples):
    min_samples_value = int(raw_min_samples)
else:
    min_samples_value = None

hdbscan_model = setup_hdbscan(
    min_cluster_size=int(chosen_params['min_cluster_size']),
    min_samples=min_samples_value,
)

# --- Main topic model ---
# No representation model is attached here; extra representations are added in later cells.
bertopic_settings = {
    "umap_model": umap_model,
    "hdbscan_model": hdbscan_model,
    "embedding_model": embedding_model,
    "vectorizer_model": vectorizer_model,
    "top_n_words": config.top_n_words,  # number of keywords in each topic's default representation
    "nr_topics": "auto",
    "verbose": True,
    "calculate_probabilities": True,  # needed for the 'probabilities' outlier reduction strategy
}
topic_model = BERTopic(**bertopic_settings)

# Main training step, run on the precomputed embeddings
topics, probs = topic_model.fit_transform(df_reports, embeddings)

### Check topic and embedding coherence

In [None]:
# Score the freshly trained model: word-based coherence first, then embedding coherence
initial_coherence, _ = calculate_coherence(topic_model, df_reports)
embedding_coherence = calculate_embedding_coherence(topic_model, df_reports, embeddings)

# NOTE: the topic count is the number of distinct values in `topics`, so the
# outlier label (-1) is included in it whenever outliers are present.
n_distinct_topics = np.unique(topics).size
n_outlier_sentences = sum(1 for assigned in topics if assigned == -1)

print(f"Initial Word-Based Coherence Score (C_v): {initial_coherence:.4f}")
print(f"Initial Embedding Coherence Score: {embedding_coherence:.4f}")
print(f"Initial number of topics extracted: {n_distinct_topics}")
print(f"Initial number of outlier sentences: {n_outlier_sentences}")

### Add representations (before LLM)

In [None]:
# Two additional keyword representations, computed on top of the fitted topics
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity=0.3)

# Each key is the name under which the representation is stored on the model
representation_model = {
    "KeyBERT": keybert,
    "MMR": mmr,
}

print("\nUpdating model with KeyBERT and MMR representations...")
# update_topics() recomputes the c-TF-IDF keywords and falls back to library
# defaults for every argument that is not passed (a plain CountVectorizer and
# top_n_words=10). The configured vectorizer and keyword count are therefore
# passed again so the stop-word / n-gram / document-frequency settings used at
# training time are kept.
topic_model.update_topics(
    df_reports,
    vectorizer_model=vectorizer_model,
    top_n_words=config.top_n_words,
    representation_model=representation_model,
)
print("Update complete.")

### Check topics that were generated, their preliminary representation, and check which docs (sentences) were used for representation

In [None]:
# Displayed as the cell output: one row per topic, as returned by get_topic_info()
topic_model.get_topic_info() #get an overview of the topics found

In [None]:
# Per-document table for the fitted model, displayed as the cell output.
# Later cells read its 'Topic', 'Probability', 'Document' and 'Representative_document' columns.
docs_info = topic_model.get_document_info(df_reports)
docs_info

In [None]:
topic_id = 1  # id of the topic to inspect in this cell

# Sentences BERTopic selected as representative of the topic
print(f"Representative sentences for topic {topic_id}: ",topic_model.get_representative_docs(topic_id))

# Every sentence assigned to the topic
all_sentences_topic = [doc for doc, topic in zip(df_reports, topics) if topic == topic_id]
print(f"\nNumber of documents assigned to topic {topic_id}: {len(all_sentences_topic)}\n")
for doc in all_sentences_topic:
    print(doc)


# Same sentences as a table, most confidently assigned first
docs_info = topic_model.get_document_info(df_reports)
topic_docs = docs_info[docs_info['Topic'] == topic_id]
topic_docs_sorted = topic_docs.sort_values(by='Probability', ascending=False)

topic_docs_sorted = topic_docs_sorted.rename(columns={
    'Document': 'Sentence',
    'Probability': 'Topic_Probability'
})

result_table = topic_docs_sorted[['Sentence', 'Topic_Probability','Representative_document']]

print(result_table)

# Write next to the other project outputs. The path is anchored on project_root
# (instead of a path relative to the working directory) and the folder is created
# when missing, so the export does not fail on a fresh checkout.
topics_content_dir = os.path.join(project_root, "RESULTS", dataset.lower(), "topics_content")
os.makedirs(topics_content_dir, exist_ok=True)
result_table.to_csv(os.path.join(topics_content_dir, f"topic{topic_id}.csv"))

### Experiment with Vectorizer Settings (without re-training)
Use the `.update_topics()` function to quickly test different `CountVectorizer` settings and see how they change the keywords (without re-running the clustering).


In [None]:
# --- Keywords of the current model, printed as a baseline for comparison ---
print("--- Initial Keywords (from loaded GS parameters) ---")

# Take the topic ids from the model itself rather than assuming that an outlier
# row exists and that ids run from 0 to n-1: with no outliers, subtracting one
# from the row count would silently drop the last topic.
topic_ids = sorted(t for t in topic_model.get_topic_info()['Topic'] if t != -1)
num_topics = len(topic_ids)

for topic_id in topic_ids:
    # get_topic() returns a list of (word, score) tuples; only the words are shown
    keywords = [word for word, _score in topic_model.get_topic(topic_id)]
    print(f"Topic {topic_id}: {keywords}")
print("\n" + "="*80 + "\n")

### Reduce outliers

In [None]:
# Collect, then list, every sentence that was assigned the outlier label (-1)
outlier_sentences = []
for doc, assigned_topic in zip(df_reports, topics):
    if assigned_topic == -1:
        outlier_sentences.append(doc)

print(f"N={len(outlier_sentences)} outlier sentences:")
for outlier in outlier_sentences:
    print(outlier)

In [None]:

# Snapshot (copy) of the assignments held by the model before any experiment
original_topics = list(topic_model.topics_)

# --- Strategies and thresholds to test ---
strategies = ['probabilities', 'embeddings']
thresholds_to_test = {
    'probabilities': np.arange(0.1, 0.51, 0.05),  # explore between 0.10 and 0.50
    'embeddings': np.arange(0.5, 0.81, 0.05)      # explore between 0.50 and 0.80
}

# update_topics() falls back to library defaults (plain CountVectorizer,
# top_n_words=10) for every argument that is not passed. The configured settings
# are passed on each call so that (a) coherence is scored on the same keyword
# representation as the trained model and (b) the reset really restores it.
# NOTE(review): representation_model is deliberately not passed here to keep the
# sweep fast, so KeyBERT/MMR keywords are not recomputed by these calls.
update_kwargs = {
    "vectorizer_model": vectorizer_model,
    "top_n_words": config.top_n_words,
}

results = []

# --- Iterate over each strategy and threshold ---
for strategy in strategies:
    for threshold in thresholds_to_test[strategy]:
        print(f"Testing strategy '{strategy}' with threshold {threshold:.2f}...")

        # reduce_outliers() reads the model's internal state, which is why the
        # model is put back to its original assignments after every iteration.
        new_topics = topic_model.reduce_outliers(
            df_reports,
            topics,  # the original assignments; never modified inside this loop
            strategy=strategy,
            probabilities=probs,
            embeddings=embeddings,
            threshold=threshold
        )

        try:
            # Temporarily apply the candidate assignments to score them
            topic_model.update_topics(df_reports, topics=new_topics, **update_kwargs)

            num_outliers = len([t for t in new_topics if t == -1])
            cv_coherence, _ = calculate_coherence(topic_model, df_reports)
            emb_coherence = calculate_embedding_coherence(topic_model, df_reports, embeddings)

            results.append({
                'Strategy': strategy,
                'Threshold': threshold,
                'Num_Outliers': num_outliers,
                'Cv_Coherence': cv_coherence,
                'Embedding_Coherence': emb_coherence
            })
        finally:
            # Always restore the original assignments, even if a metric fails,
            # so the model is never left in a half-evaluated state.
            topic_model.update_topics(df_reports, topics=original_topics, **update_kwargs)

# --- Create and display the results DataFrame ---
results_df = pd.DataFrame(results)
print("\n--- Evaluation Complete ---")
display(results_df)

In [None]:
# --- One panel per strategy: coherence (left axis) against remaining outliers (right axis) ---
# squeeze=False always returns a 2D array of axes, so the loop also works when
# only one strategy is evaluated (plt.subplots would otherwise return a bare Axes).
fig, axes = plt.subplots(
    len(strategies), 1, figsize=(12, 8 * len(strategies)), squeeze=False
)
fig.suptitle('Outlier Reduction Strategy Evaluation', fontsize=16)

for ax, strategy in zip(axes[:, 0], strategies):
    strategy_df = results_df[results_df['Strategy'] == strategy]

    # Coherence scores on the primary y-axis
    ax.plot(strategy_df['Threshold'], strategy_df['Cv_Coherence'], 'o-', color='b', label='Word Coherence (Cv)')
    ax.plot(strategy_df['Threshold'], strategy_df['Embedding_Coherence'], 's-', color='c', label='Embedding Coherence')
    ax.set_xlabel('Threshold')
    ax.set_ylabel('Coherence Score', color='b')
    ax.tick_params(axis='y', labelcolor='b')
    ax.set_title(f'Strategy: {strategy}')

    # Number of remaining outliers on a secondary y-axis
    ax2 = ax.twinx()
    ax2.plot(strategy_df['Threshold'], strategy_df['Num_Outliers'], 'o--', color='r', label='Number of Outliers')
    ax2.set_ylabel('Number of Outliers', color='r')
    ax2.tick_params(axis='y', labelcolor='r')

    # Single legend combining the entries of both axes
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='center right')

# Leave room at the top for the figure title
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()

In [None]:
# Apply the chosen outlier-reduction setting (optional)
outlier_strategy = 'embeddings'
outlier_threshold = 0.65


reduce_outliers = True

if reduce_outliers:
    print(f"\nReducing outliers using the '{outlier_strategy}' strategy with threshold {outlier_threshold}...")
    # The precomputed embeddings are passed (as in the strategy sweep) so the
    # 'embeddings' strategy does not have to re-encode every sentence.
    new_topics = topic_model.reduce_outliers(
        df_reports,
        topics,
        strategy=outlier_strategy,
        probabilities=probs,
        embeddings=embeddings,
        threshold=outlier_threshold,
    )

    # update the topics variable for subsequent steps
    topics = new_topics

    # Update the model with the new assignments. update_topics() falls back to a
    # plain CountVectorizer and top_n_words=10 when these are not passed, so the
    # configured vectorizer and keyword count are passed explicitly.
    # NOTE(review): representation_model is not passed, so KeyBERT/MMR keywords
    # are not recomputed for the new assignments by this call — confirm whether
    # the representation cell should be re-run afterwards.
    topic_model.update_topics(
        df_reports,
        topics=topics,
        vectorizer_model=vectorizer_model,
        top_n_words=config.top_n_words,
    )

    print(f"Remaining outliers after reduction: {len([t for t in topics if t == -1])}")

    # re-calculate coherence to see the impact of outlier reduction
    final_coherence, _ = calculate_coherence(topic_model, df_reports)
    print(f"Coherence Score after outlier reduction: {final_coherence}")
    embedding_coherence = calculate_embedding_coherence(topic_model, df_reports, embeddings)
    print(f"Embedding Coherence Score after outlier reduction: {embedding_coherence:.4f}")
    # The count is over distinct values in `topics`, so it includes -1 if any outliers remain
    print(f"Number of topics after outlier reduction: {len(np.unique(topics))}\n")

In [None]:
# Bar chart of the word scores per topic for the current condition (displayed as the cell output)
topic_model.visualize_barchart(title=f'Topic Word Scores for {condition}',autoscale=True)

In [None]:
# Document map before LLM labelling. The precomputed sentence embeddings are
# passed so the documents are not encoded a second time just for the plot.
topic_model.visualize_documents(df_reports, embeddings=embeddings, title=f'Documents and Topics for {condition}')

In [None]:
# Topic table holding the default keywords plus the KeyBERT and MMR columns
topic_info_df = topic_model.get_topic_info()

print("--- Keywords from c-TF-IDF, KeyBERT and MMR Representations ---")

# Walk the four relevant columns in lockstep, one topic per step
keyword_columns = zip(
    topic_info_df['Topic'],
    topic_info_df['Representation'],
    topic_info_df['KeyBERT'],
    topic_info_df['MMR'],
)
for topic_id, ctfidf_keywords, keybert_keywords, mmr_keywords in keyword_columns:
    # The outlier topic (-1) is not reported
    if topic_id != -1:
        print(f"\nTopic {topic_id}:")
        print(f"  - c-TF-IDF: {ctfidf_keywords}")
        print(f"  - KeyBERT: {keybert_keywords}")
        print(f"  - MMR:     {mmr_keywords}")

print("\n" + "="*80 + "\n")

### Update model with Llama 3 for topic labelling

In [None]:
print("\nSetting up Llama3 for topic representation...")

# Quantised GGUF weights, resolved to a local file path by hf_hub_download
gguf_source = {
    "repo_id": "NousResearch/Meta-Llama-3-8B-Instruct-GGUF",
    "filename": "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
}
model_path = hf_hub_download(**gguf_source)

# Sequences intended to cut the generation short
stop_sequences = ["Q:", "\n", "Label:", "Topic name:", "Keywords:", "(Note:", "Note:", "The label is:"]

# Load the quantised model through llama.cpp.
# NOTE(review): `stop` is given to the constructor here; llama-cpp-python documents
# `stop` as an argument of the completion call — confirm these sequences actually
# take effect, or whether they belong in the generation kwargs instead.
llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1,
    n_ctx=4096,
    stop=stop_sequences,
    verbose=False,
)

In [None]:

    
# Generation settings handed to every LLM representation model below via `pipeline_kwargs`.
# NOTE(review): presumably forwarded unchanged to the llama.cpp completion call — confirm.
pipeline_kwargs = {
    "max_tokens": 15, 
    "temperature": 0.2,
    "repeat_penalty": 1.2
}

# Prompt for the stock LlamaCPP representation.
# [DOCUMENTS] and [KEYWORDS] are placeholder tags substituted before the prompt is sent to the model.
prompt = """Below are documents and keywords that describe a single topic.

DOCUMENTS:
[DOCUMENTS]

KEYWORDS:
- [KEYWORDS]


TASK:
Generate a single, scientific label that summarises the topic.

RULES:
- The label must be in Title Case.
- It must be concise (max 10 words).
- It must reflect directly reported phenomena.

Only return the label itself, without any additional text or punctuation.


LABEL:
"""


# Prompt for the custom labellers (PhenoLabeler / MultiKeywordLLM).
# NOTE(review): [SENTENCES], [KEYBERT_KEYWORDS] and [MMR_KEYWORDS] are presumably
# filled in by the custom classes from src.llama_CPP_custom — confirm there.
prompt_custom = """Below are sentences and keyword sets that describe a single topic.

SENTENCES:
[SENTENCES]

KEYWORDS:
- [KEYWORDS]
- [KEYBERT_KEYWORDS]
- [MMR_KEYWORDS]

TASK:
Uses information from the SENTENCES and the KEYWORDS to generate a single, scientific label that summarises the topic.

RULES:
- The label must be in Title Case.
- It must be concise but precise (max 10 words).
- It must reflect directly reported phenomena.

Only return the label itself, without any additional text or punctuation.


LABEL:
"""



# Three labellers sharing the same LLM and generation settings:
# the stock BERTopic LlamaCPP representation and two custom variants.
llm_model = LlamaCPP(llm, prompt=prompt,nr_docs=10,pipeline_kwargs=pipeline_kwargs,diversity=0.3)
llm_model_custom = PhenoLabeler(llm, prompt=prompt_custom,nr_docs=10,pipeline_kwargs=pipeline_kwargs,diversity=0.3,verbose=False) #set verbose to True to see which sentences and keywords have been used as LLM input
llm_model_custom2 = MultiKeywordLLM(llm, prompt=prompt_custom, pipeline_kwargs=pipeline_kwargs,nr_docs=10)
lll_model_custom2 = llm_model_custom2  # previous (misspelt) name, kept as an alias

# Each key is the name under which the generated labels are stored on the model
representation_model = {"LLM": llm_model,"LLM_Custom": llm_model_custom,"LLM_Custom2": llm_model_custom2}

# Update the already fitted model. update_topics() recomputes the c-TF-IDF
# keywords and falls back to a plain CountVectorizer and top_n_words=10 when
# these are not passed, so the configured vectorizer and keyword count are
# passed again; otherwise the keywords inserted into the prompts would be
# computed without the configured stop words.
print("Updating topic representations with custom Llama 3 prompter...")
topic_model.update_topics(
    df_reports,
    vectorizer_model=vectorizer_model,
    top_n_words=config.top_n_words,
    representation_model=representation_model,
)
print("Update complete.")

# Check the newly generated labels
topic_model.get_topic_info()

In [None]:
def _first_label(cell):
    """Return the first line of the label stored in a topic-info cell.

    Handles the shapes a representation column can take: a list/tuple of
    strings (first item used), a plain string, or a missing value (NaN/None).
    Anything that does not yield a string gives "Not available".
    """
    if isinstance(cell, (list, tuple)):
        cell = cell[0] if len(cell) else None
    if not isinstance(cell, str):
        return "Not available"
    # Drop surrounding whitespace, then keep only the first line of the output
    return cell.strip().split('\n')[0].strip()


# Latest topic table, including the three LLM label columns
topic_info_df = topic_model.get_topic_info()

print("--- Comparison of LLM vs. LLM_Custom Labels ---")

for index, row in topic_info_df.iterrows():
    topic_id = row['Topic']

    # Skip the outlier topic
    if topic_id == -1:
        continue

    # The same extraction is used for all three columns, so a missing value,
    # a list or a raw string is handled the same way and never raises.
    # row.get() returns None when a column is absent.
    llm_label = _first_label(row.get('LLM'))
    llm_custom_label = _first_label(row.get('LLM_Custom'))
    llm_custom_label2 = _first_label(row.get('LLM_Custom2'))

    # --- Side-by-side comparison ---
    print(f"\nTopic {topic_id}:")
    print(f"  - {'LLM:'.ljust(15)} {llm_label}")
    print(f"  - {'LLM_Custom:'.ljust(15)} {llm_custom_label}")
    print(f"  - {'LLM_Custom2:'.ljust(15)} {llm_custom_label2}")

print("\n" + "="*80 + "\n")

### Process and apply topic labels generated by Llama3


In [None]:
# Raw "LLM" representation, keyed by topic id. The label text is read from
# the first element of the first entry of each topic's representation.
llm_aspect = topic_model.get_topics(full=True)["LLM"]

# Clean every label and keep it attached to its topic id, so the mapping does
# not depend on list positions or on the model's private outlier offset.
llm_label_by_topic = {}
for model_topic in sorted(llm_aspect):
    raw_label = llm_aspect[model_topic][0][0]
    # first line only, quotes removed, runs of whitespace collapsed
    cleaned = re.sub(r'\s+', ' ', raw_label.split("\n")[0].replace('"', '')).strip()
    llm_label_by_topic[model_topic] = cleaned if cleaned else "Unlabelled"

llm_labels_raw = [llm_aspect[model_topic][0][0] for model_topic in sorted(llm_aspect)]
llm_labels = list(llm_label_by_topic.values())

# Labels for the real topics only (the outlier topic -1 is never labelled)
topic_labels = {
    model_topic: label
    for model_topic, label in llm_label_by_topic.items()
    if model_topic != -1
}

# One label per sentence; outliers and unknown topic ids fall back to "Unlabelled"
all_labels = [topic_labels.get(topic, "Unlabelled") for topic in topics]
if len(all_labels) != len(df_reports):
    raise ValueError(
        f"Got {len(all_labels)} labels for {len(df_reports)} sentences; "
        "`topics` and `df_reports` are out of sync."
    )

# Set topic labels for the visualisations
unique_topics = sorted(set(topics))
topic_model.set_topic_labels(topic_labels)

print("\nGenerated Topic Labels:")
topic_labels


### Visualisation

In [None]:
# Basic visu (to compare with visu before LLM labelling).
# The precomputed sentence embeddings are passed so the documents are not
# encoded again just to draw the plot.
display(topic_model.visualize_documents(
    df_reports,
    embeddings=embeddings,
    title=f"Documents and Topics for {condition} (Llama 3)",
    hide_annotations=True,
    hide_document_hover=False,
    custom_labels=True,
))


# Datamapplot visu: project the embeddings to 2D with the tuned UMAP settings
reduced_embeddings = UMAP(
    n_neighbors=int(chosen_params['n_neighbors']),
    n_components=2,  # 2D is only used for plotting; the model itself used n_components from chosen_params
    min_dist=float(chosen_params['min_dist']),
    metric='cosine',
    random_state=random_seed
).fit_transform(embeddings)

datamapplot.create_plot(
   reduced_embeddings,
   all_labels,
   label_font_size=15,
   title=f"{condition}: Topic representations with Llama 3",
   sub_title="Labeled with `llama-3-8b-instruct` (representations on reduced 2D embeddings)",
   label_wrap_width=20,
   use_medoids=True
)

# Hierarchical visu, using the LLM labels set on the model
display(topic_model.visualize_hierarchy(custom_labels=True))

### create and save final topic summary

In [None]:
def create_topic_summary(topic_labels, df_reports, topics):
    """Build a per-topic summary table.

    Args:
        topic_labels: mapping from topic number to its label.
        df_reports: the documents, one entry per document.
        topics: the topic number assigned to each document (same order).

    Returns:
        A DataFrame with one row per (Topic_Label, Topic_Number) pair holding
        ``Sentence_Count`` and ``Content`` (the list of documents), sorted by
        ``Sentence_Count`` in descending order. Topic numbers without an entry
        in ``topic_labels`` are labelled "Outlier".
    """
    frame = pd.DataFrame({'Topic_Content': df_reports, 'Topic_Number': topics})

    # Attach the label of each topic number; unmapped numbers become "Outlier"
    mapped_labels = frame['Topic_Number'].map(topic_labels)
    frame['Topic_Label'] = mapped_labels.fillna("Outlier")

    # Count and collect the documents of every (label, number) pair
    per_topic = frame.groupby(['Topic_Label', 'Topic_Number']).agg(
        Sentence_Count=('Topic_Content', 'count'),
        Content=('Topic_Content', list),
    )

    # Largest topics first, with the group keys turned back into columns
    ranked = per_topic.sort_values('Sentence_Count', ascending=False)
    return ranked.reset_index()

# --- Generate and display the summary ---
topic_summary = create_topic_summary(topic_labels, df_reports, topics)

print("\nFinal Topic Summary:")
print(topic_summary[['Topic_Label', 'Topic_Number', 'Sentence_Count']])

# --- Save summary to CSV ---
# The output folder is created when missing so that the export cannot fail at
# the very end of the pipeline because of a missing directory.
summary_dir = os.path.join(project_root, "RESULTS")
os.makedirs(summary_dir, exist_ok=True)
summary_results_file = os.path.join(summary_dir, f"topic_analysis_{condition}.csv")
topic_summary.to_csv(summary_results_file, index=False)
print(f"\nSummary saved to {summary_results_file}")
