# Topic modelling for Dreamachine dataset : Deep Listening (DL) condition


Author : Romy Beauté\
Date created : 13/05/2024\
Last modified : 04/11/2024\
Corresp : r.beaut@sussex.ac.uk

Selection of sentence transformer embedding models :
https://www.sbert.net/docs/pretrained_models.html

The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster and still offers good quality.



In [None]:
%%capture
!pip install bertopic accelerate bitsandbytes xformers adjustText
!pip install llama-cpp-python
!{sys.executable} -m pip install "scipy==1.9.0" "scikit-image==0.23.2"

import os
import nltk
import sys
import pandas as pd

# The notebook is expected to be started from the NOTEBOOKS folder; the
# repository root ("lyra") is its parent directory.
current_dir = os.getcwd()
lyra_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
print("Lyra directory:", lyra_dir)


# Use the repository root both as first import location (project modules)
# and as working directory (relative data paths).
sys.path.insert(0, lyra_dir)
os.chdir(lyra_dir)
print("Current working directory:", os.getcwd())
print("Python path first entry:", sys.path[0])


# Project-local modules, importable because lyra_dir was inserted at the front
# of sys.path above.
# NOTE(review): later cells use SentenceTransformer, CountVectorizer, stopwords,
# UMAP, HDBSCAN, BERTopic, np and calculate_coherence before this notebook
# imports them itself, so they are presumably provided by this star import.
# Confirm against grid_search_colyra.
from grid_search_colyra import *
from META_helpers_colyra import split_sentences,reduced_custom_stopwords


# Set the TOKENIZERS_PARALLELISM environment variable for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "True"
# Download the NLTK 'stopwords' corpus (the vectorizer cell calls stopwords.words('english')).
nltk.download('stopwords')


# --- Run configuration -------------------------------------------------------
condition = "DL"      # condition code; part of the input and output file names
sentences = True      # if True, reports are passed through split_sentences() before modelling
reduced_GS = False    # if True, load the grid-search file carrying the "_reduced" suffix
random_seed = 42      # part of the grid-search file name and used as UMAP random_state
nr_topics = "auto"


# --- Grid-search results -----------------------------------------------------
# Built from lyra_dir instead of a machine-specific absolute path, and from
# random_seed instead of a hard-coded "seed42", so the file that is loaded always
# matches the configuration above.
gs_filename = (
    f"grid_search_results_{condition}_seed{random_seed}"
    f'{"_sentences" if sentences else ""}'
    f'{"_reduced" if reduced_GS else ""}.csv'
)
results_GS = pd.read_csv(os.path.join(lyra_dir, "RESULTS", gs_filename))

# --- Reports -----------------------------------------------------------------
# Tab-separated file; only the 'reflection_answer' column is used.
reports_path = os.path.join("DATA", f"{condition}_reflections_cleaned.csv")
df_reports = pd.read_csv(reports_path, sep="\t")['reflection_answer']


if sentences:
    df_reports = split_sentences(df_reports)

results_GS.head(5)

In [None]:
# Keep only the grid-search runs that produced between 10 and 25 topics
# (both bounds included), ordered from highest to lowest coherence score.
has_valid_topic_count = results_GS['n_topics'].between(10, 25)
results_GS = results_GS[has_valid_topic_count].sort_values(by='coherence_score', ascending=False)



results_GS.head(5)

In [None]:
# Select the first row of results_GS as the parameter set for the final model.
# Fail with an explicit message instead of an opaque IndexError when the
# table is empty.
if results_GS.empty:
    raise ValueError("results_GS is empty: no grid-search configuration available to select.")
chosen_params = results_GS.iloc[0]
chosen_params


In [None]:
# Sentence-transformer checkpoint used to embed the reports.
# Other candidates: "paraphrase-MiniLM-L6-v2", "BAAI/bge-small-en", "all-MiniLM-L6-v2"
sentence_transformer_model = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(sentence_transformer_model)
embeddings = embedding_model.encode(df_reports, show_progress_bar=True)
print(embedding_model)

# Token counts over unigrams and bigrams, with the NLTK English stop words removed.
# (reduced_custom_stopwords is imported but currently not appended to the list.)
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=list(stopwords.words('english')),
    max_df=0.9,
    min_df=2,
)

# Hyper-parameters of the selected grid-search row, cast to plain Python types.
n_neighbors, n_components, min_cluster_size, top_n_words = (
    int(chosen_params[param_name])
    for param_name in ('n_neighbors', 'n_components', 'min_cluster_size', 'top_n_words')
)
min_dist = float(chosen_params['min_dist'])
# min_samples may be missing in the results table; pass None in that case.
raw_min_samples = chosen_params['min_samples']
min_samples = None if pd.isna(raw_min_samples) else int(raw_min_samples)
nr_topics = "auto"



metric = 'cosine'

In [None]:
# 2-D projection of the document embeddings, kept for visualisation purposes.
plot_projection = UMAP(
    n_neighbors=n_neighbors,
    n_components=2,
    min_dist=min_dist,
    metric='cosine',
    random_state=random_seed,
)
reduced_embeddings = plot_projection.fit_transform(embeddings)

# Clustering model handed to BERTopic.
hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    gen_min_span_tree=True,
    prediction_data=True,
)

# Dimensionality reduction handed to BERTopic; seeded so runs can be reproduced.
umap_model = UMAP(
    n_neighbors=n_neighbors,
    n_components=n_components,
    min_dist=min_dist,
    metric=metric,
    random_state=random_seed,
)


In [None]:
# Topic model built from the grid-search configuration selected above.
model = BERTopic(
    umap_model=umap_model,
    low_memory=True,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=top_n_words,
    nr_topics=nr_topics,  # BERTopic's own default would be None
    language='english',
    calculate_probabilities=True,
    verbose=True)

# Hand over the embeddings computed earlier so the documents are not encoded
# a second time by the same embedding model.
topics, _ = model.fit_transform(df_reports, embeddings)

# Sanity check: the coherence should match the value obtained during the grid search.
coherence_score, coherence_score_umass = calculate_coherence(model, df_reports)
print("Coherence Score:", coherence_score)

# Topic -1 is BERTopic's outlier bucket, not a topic, so leave it out of the count.
n_extracted_topics = len(np.unique([t for t in topics if t != -1]))
print(f"n = {n_extracted_topics} topics extracted")


# Number of documents that HDBSCAN labelled as outliers (-1).
print(len(model.hdbscan_model.labels_[model.hdbscan_model.labels_==-1]))

In [None]:
#print sentences that are identified as outliers
outliers = np.where(model.hdbscan_model.labels_==-1)[0]
for i in outliers:
    print(df_reports[i])


In [None]:
# Reassign outlier documents to a topic based on embedding similarity
# (threshold 0.4). The precomputed embeddings are passed in so the outlier
# documents do not have to be encoded again.
new_topics = model.reduce_outliers(
    df_reports,
    topics,
    strategy="embeddings",
    embeddings=embeddings,
    threshold=0.4,
)

# Check the impact
n_outliers_before = len(model.hdbscan_model.labels_[model.hdbscan_model.labels_==-1])
n_outliers_after = sum(1 for t in new_topics if t == -1)
print(f"Original outliers: {n_outliers_before}")
print(f"Remaining outliers after reduction: {n_outliers_after}")

# Print the documents that are still outliers after the reduction.
outliers = np.where(np.array(new_topics)==-1)[0]
for i in outliers:
    print(df_reports[i])

# Verify coherence is maintained
# NOTE(review): assigning topics_ directly changes the per-document assignment
# only; it does not recompute the topic representations. BERTopic documents
# update_topics() for that step. Confirm whether calculate_coherence depends on
# the representations, in which case this check cannot detect a change.
model.topics_ = new_topics
coherence_score, coherence_score_umass = calculate_coherence(model, df_reports)
print(f"New coherence score: {coherence_score:.4f}")

In [None]:
# From here on, work with the outlier-reduced assignment.
topics = new_topics
# Pass the precomputed embeddings so transform() does not re-encode every document.
_, probabilities = model.transform(df_reports, embeddings=embeddings)

n_topics_after = len(np.unique([t for t in topics if t != -1]))
n_outliers_left = len([t for t in topics if t == -1])
print(f"Number of unique topics after update: {n_topics_after}")
print(f"Number of remaining outliers: {n_outliers_left}")


In [None]:
# Bar chart of the word scores per topic for the current condition.
barchart_title = f'Topic Word Scores for {condition}'
model.visualize_barchart(title=barchart_title, autoscale=True)

In [None]:
# Plot on the 2-D projection computed earlier with a fixed random_state.
# Without reduced_embeddings, visualize_documents re-embeds the documents and
# fits its own UMAP, which is slower and not tied to random_seed.
model.visualize_documents(
    df_reports,
    reduced_embeddings=reduced_embeddings,
    title=f'Documents and Topics for {condition}',
)

# LLAMA 3

In [None]:
# Optional one-off setup, left commented out: reinstall llama-cpp-python with
# the LLAMA_CUBLAS CMake flag switched on, and download a GGUF model file.
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
# # !wget https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q4_K_M.gguf

# # DataMapPlot
# Optional: install datamapplot from a clone of its GitHub repository.
# !git clone https://github.com/TutteInstitute/datamapplot.git
# !pip install datamapplot/.

import datamapplot
import torch
# Release GPU memory that PyTorch is caching but no longer using.
torch.cuda.empty_cache()

# # Remove existing installations
# !pip uninstall -y cudf cuml cugraph cupy rmm

# Install RAPIDS packages in correct order (rmm, then cudf, then cuml).
# The "-cu12" suffix presumably selects the CUDA 12 builds; confirm it matches
# the CUDA version of the machine.
!pip install --upgrade rmm-cu12 --extra-index-url=https://pypi.nvidia.com/simple
!pip install --upgrade cudf-cu12 --extra-index-url=https://pypi.nvidia.com/simple
!pip install --upgrade cuml-cu12 --extra-index-url=https://pypi.nvidia.com/simple

In [None]:
from huggingface_hub import hf_hub_download  # For loading the model instance from Hugging Face
import os  # For creating system directories
from llama_cpp import Llama  # LLM Wrapper
from bertopic.representation import KeyBERTInspired, LlamaCPP  # Representation Comparison
from sentence_transformers import SentenceTransformer  # Embedding Model Wrapper

from bertopic import BERTopic  # For topic modeling with BERTopic
import PIL  # For image processing
import numpy as np  # For numerical computations
import requests  # For making HTTP requests
import re  # For regular expressions


#to speed up computation
# NOTE(review): these imports rebind the names UMAP and HDBSCAN from this point
# on. The umap_model and hdbscan_model objects handed to BERTopic further down
# were instantiated in an earlier cell, before this rebinding, so importing the
# cuML classes here does not by itself switch those objects to the cuML
# implementations. Rebuild them after this cell if that is the intention.
import torch
torch.cuda.empty_cache()  # release GPU memory cached by PyTorch
from cuml.manifold import UMAP  # For UMAP dimensionality reduction
from cuml.cluster import HDBSCAN  # For clustering with HDBSCAN

#add periodic memory clearing (use if too slow to run)
def process_batch(batch, *, process_fn=None, clear_every=100):
    """Process every item of *batch*, periodically clearing the CUDA cache.

    Args:
        batch: Iterable of items to process.
        process_fn: Callable applied to each item. When omitted, the function
            falls back to a ``process_item`` callable looked up in the
            surrounding namespace at call time (the original behaviour).
        clear_every: Call ``torch.cuda.empty_cache()`` each time this many
            results have been collected. Must be at least 1.

    Returns:
        List with one result per item, in input order.

    Raises:
        ValueError: If ``clear_every`` is smaller than 1.
        NameError: If ``process_fn`` is omitted, the batch is not empty and
            no ``process_item`` is defined.
    """
    if clear_every < 1:
        raise ValueError("clear_every must be a positive integer")

    results = []
    for item in batch:
        # Resolve the fallback lazily so that an empty batch never needs it.
        handler = process_item if process_fn is None else process_fn
        results.append(handler(item))
        if len(results) % clear_every == 0:
            torch.cuda.empty_cache()
    return results


In [None]:
# Hugging Face repository and file name of the quantized Llama 3 model (GGUF).
model_name_or_path = "NousResearch/Meta-Llama-3-8B-Instruct-GGUF"
model_basename = "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"

# Local directory used as download cache; created if it does not exist yet.
path = "model"
os.makedirs(path, exist_ok=True)


# Fetch the model file (or reuse the cached copy) and get its local path.
model_path = hf_hub_download(
    repo_id=model_name_or_path,
    filename=model_basename,
    cache_dir=path,
)

# Load the quantized LLM through llama.cpp.
# NOTE(review): `stop` is normally a generation-time argument in
# llama-cpp-python; confirm that it has any effect when passed to the
# constructor. The label post-processing further down still handles newlines
# in the model output.
llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1,
    n_ctx=4096,
    stop=["Q:", "\n"],
    verbose=False,
)

In [None]:



# Prompt template handed to the LlamaCPP representation; it contains the
# [DOCUMENTS] and [KEYWORDS] placeholders.
# NOTE(review): the requirement list is numbered 2, 3, 5, 6 (items 1 and 4 are
# absent). Left untouched because any change to the prompt changes the labels.
prompt = """Q:
Topic documents:
[DOCUMENTS]

Topic keywords: '[KEYWORDS]'

Create a specific, scientific label for this topic based on the above information. The label should:

2. Use proper spacing between words
3. Use Title Case (capitalize each significant word)
5. Not start or end with spaces
6. Be a complete title (not cut off mid-word or mid-idea)

Only return the label itself, without any additional text or punctuation.
A:
"""


# Two additional topic representations, stored under the keys "KeyBERT" and "LLM".
keybert_representation = KeyBERTInspired()
llm_representation = LlamaCPP(llm, prompt=prompt)
representation_model = {
    "KeyBERT": keybert_representation,
    "LLM": llm_representation,
}


In [None]:
torch.cuda.empty_cache()

# Same configuration as the first model, plus the KeyBERT / LLM representations.
# vectorizer_model and top_n_words are passed here as well: without them this
# model would fall back to BERTopic's default vectorizer (no stop-word removal,
# unigrams only) and default top_n_words, so its keywords would not correspond
# to the grid-search configuration used for the first model.
topic_model = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyper-parameters
  top_n_words=top_n_words,
  nr_topics=nr_topics,
  verbose=True
)

# Train model (reusing the precomputed document embeddings)
topics, probs = topic_model.fit_transform(df_reports, embeddings)

# Show topics
topic_model.get_topic_info()



In [None]:
# Outlier reduction using embeddings: reassign outlier documents to a topic
# based on embedding similarity (threshold 0.4). The precomputed embeddings are
# passed in so the outlier documents are not encoded again.
new_topics = topic_model.reduce_outliers(
    df_reports,
    topics,
    strategy='embeddings',
    embeddings=embeddings,
    threshold=0.4,
)

# Check the impact
fitted_labels = topic_model.hdbscan_model.labels_
print(f"Original outliers: {len(fitted_labels[fitted_labels==-1])}")
print(f"Remaining outliers after reduction: {len([t for t in new_topics if t == -1])}")

# NOTE(review): assigning topics_ directly changes the per-document assignment
# only; the topic representations (keywords, LLM labels) are those of the
# original fit. BERTopic documents update_topics() for recomputing them.
topic_model.topics_ = new_topics

topics = new_topics


In [None]:



# Raw LLM representation per topic id; the label text is read from the first
# entry of each representation.
llm_representations = topic_model.get_topics(full=True)["LLM"]

# Clean each label: keep the first line, drop quotes, collapse runs of non-word
# characters into one space and strip the ends. Empty results become "Unlabelled".
# Labels are keyed by topic id, so the lookups below do not depend on list
# positions or on whether an outlier topic (-1) is present.
label_by_topic = {}
for topic_id, representation in llm_representations.items():
    first_line = representation[0][0].split("\n")[0].replace('"', '')
    cleaned_label = re.sub(r'\W+', ' ', first_line).strip()
    label_by_topic[topic_id] = cleaned_label if cleaned_label else "Unlabelled"

# Kept as a list, in topic-id order as returned by get_topics().
llm_labels = list(label_by_topic.values())

# One label per document; outliers (-1) and unknown topic ids are "Unlabelled".
all_labels = [
    label_by_topic.get(topic, "Unlabelled") if topic != -1 else "Unlabelled"
    for topic in topics
]

filtered_labels = [label for label in all_labels if label != "Unlabelled"] #remove -1 topics (outliers)



# Set topic labels for the visualisations. The previous positional lookup
# (enumerate over the sorted topic ids) shifted every label by one whenever no
# outlier remained in `topics`, because llm_labels still starts with the -1 entry.
unique_topics = sorted(set(topics))
topic_labels = {
    topic_id: label_by_topic[topic_id]
    for topic_id in unique_topics
    if topic_id != -1 and topic_id in label_by_topic
}
topic_model.set_topic_labels(topic_labels)

topic_labels

In [None]:
# Plot on the 2-D projection computed earlier with a fixed random_state, the
# same coordinates the datamapplot figure uses. Without reduced_embeddings,
# visualize_documents re-embeds the documents and fits its own UMAP.
topic_model.visualize_documents(
    df_reports,
    reduced_embeddings=reduced_embeddings,
    title=f"Documents and Topics for {condition} (Llama 3)",
    hide_annotations=True,
    hide_document_hover=False,
    custom_labels=True,
)

In [None]:
# Run the visualization
datamapplot.create_plot(
   reduced_embeddings,
   all_labels,
   label_font_size=11,
   title=f"{condition} : Topic representations with Llama 3",
   sub_title="labeled with `llama-3-8b-instruct` (representations on reduced 2D embeddings)",
   label_wrap_width=20,
   use_medoids=True
);

In [None]:

# Topic hierarchy figure, drawn with the custom (LLM) topic labels.
hierarchy_vis = topic_model.visualize_hierarchy(
    custom_labels=True,
    color_threshold=.9,
)
hierarchy_vis

In [None]:
# Create a DataFrame with topic statistics and content
topic_analysis = pd.DataFrame({
    'Topic_Label': all_labels,
    'Topic_Content': df_reports,
    'Topic_Number': topics
}).reset_index()

# Get topic counts and contents
topic_summary = (
    topic_analysis
    .groupby(['Topic_Label', 'Topic_Number'])
    .agg({
        'index': 'count',  # Count of sentences
        'Topic_Content': lambda x: list(x)  # List of all sentences
    })
    .rename(columns={'index': 'Sentence_Count'})
    .sort_values('Sentence_Count', ascending=False)
    .reset_index()
)

# Print summary for each topic
for _, row in topic_summary.iterrows():
    print(f"\n{'='*80}")
    print(f"Topic Label: {row['Topic_Label']}")
    print(f"Number of sentences: {row['Sentence_Count']}")
    print("\nSentences in this topic:")
    for i, sentence in enumerate(row['Topic_Content'], 1):
        print(f"{i}. {sentence}")

# Save to CSV if needed
topic_summary.to_csv(f'topic_analysis_{condition}.csv', index=False)

# Display DataFrame
print("\nSummary DataFrame:")
print(topic_summary[['Topic_Label', 'Topic_Number', 'Sentence_Count']])