@Author: Romy Beauté\
@Contact: r.beaut@sussex.ac.uk\
@Date: 07-2023\
@Last modification: 08-03-2024

In [1]:
import pandas as pd
import os 
from pathlib import Path
from bertopic import BERTopic
import re 
from nltk.corpus import stopwords
import sys


#import helpers functions manually created
project_path = os.path.abspath('/Users/rb666/projects/TopicModelling_META')
if project_path not in sys.path:
    sys.path.append(project_path)
from helpers.BERT_helpers import *


#Select parameters for dataset and preprocessing
HighSensory = False #If False, corresponds to deep listening experience (no flicker)
remove_stopwords = True
extended_stopwords = False


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


# Prepare reports dataset

In [2]:
dataset_name = "SensoryTool_CombinedData.csv"
metaproject_name = 'TopicModelling_META'
subproject_name = 'TopMod_pipeline'

condition = 'highsensory' if HighSensory else 'deeplistening'
print(f'Condition : "{condition}"')

PROJDIR = os.path.expanduser(f"~/projects/{metaproject_name}")
DATADIR = os.path.join(PROJDIR,f'DATA/{dataset_name}')
CODEDIR = os.path.join(PROJDIR,f'{subproject_name}')

df = pd.read_csv(DATADIR)
dataset = df[df['meta_HighSensory'] == HighSensory]['reflection_answer']
reports = dataset[dataset.notna() & (dataset != '')]

print('N={} reports (HighSensory = {})'.format(len(reports),HighSensory))
reports.head()




Condition : "deeplistening"
N=98 reports (HighSensory = False)


  df = pd.read_csv(DATADIR)


29      thecoloursmade patterns when i had my eyes clo...
252     not sure what the puroose of it was except bei...
1062    thus should be available for everyone all the ...
1292    detachment as though my body was in one box, m...
1469                                           a doorway 
Name: reflection_answer, dtype: object

### Start preprocessing

In [3]:
#select stop words to remove
stop_words = set(stopwords.words('english'))
if extended_stopwords:
    stop_words = stop_words.union(custom_stopwords) #load custom stopwords from BERT_helpers.py

reports_cleaned = reports.apply(clean_text)
reports_filtered = reports_cleaned[reports_cleaned.apply(lambda x: len(x.split()) > 1)]

print(reports_filtered.head())
#print the number of reports after cleaning
print('N={} reports after cleaning'.format(len(reports_filtered)))

29      thecoloursmade patterns eyes closec almost lik...
252     sure puroose except muxic certainly inuced sle...
1062                         thus available everyone time
1292    detachment though body one box mind another an...
1509                 visuals trees heatsun likeexperience
Name: reflection_answer, dtype: object
N=90 reports after cleaning


# Embeddings and BERTopic Model

After preporcessing we convert clean text into embeddings and apply BERTopic model. This step includes choosing the right transformer model for embeddings, setting BERTopic parameters, and training the model 

In [4]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer



# convert text to embeddings
model = "all-MiniLM-L6-v2"  #"paraphrase-MiniLM-L6-v2" #
sentence_model = SentenceTransformer(model)
embeddings = sentence_model.encode(reports_filtered.values,show_progress_bar=True)
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")



# BERTopic model initialisation
model = BERTopic(language="english", 
                #  vectorizer_model=vectorizer_model, 
                 calculate_probabilities=True,
                 min_topic_size=5,
                 embedding_model=model)



# Fit BERTopic model
topics, probs = model.fit_transform(reports_filtered.values, embeddings) 



Batches:   0%|          | 0/3 [00:00<?, ?it/s]

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [5]:
model.visualize_topics()

### visualisation and evaluation

In [6]:
topic_freq = model.get_topic_freq().head() #Get the frequency of each topic
freq = model.get_topic_info() #Get the frequency of each topic
print("N number of topics: ",len(freq))
print(freq.head())

N number of topics:  5
   Topic  Count                           Name  \
0     -1     23  -1_saw_way_everything_colours   
1      0     21   0_eyes_colours_see_different   
2      1     21         1_like_felt_ocean_blue   
3      2     16              2_music_ho_un_che   
4      3      9     3_deep_mind_complete_would   

                                      Representation  \
0  [saw, way, everything, colours, like, felt, li...   
1  [eyes, colours, see, different, saw, lights, c...   
2  [like, felt, ocean, blue, also, things, one, l...   
3  [music, ho, un, che, space, took, journey, fel...   
4  [deep, mind, complete, would, thoughts, back, ...   

                                 Representative_Docs  
0  [windows busy life flashes past seeing lovely ...  
1  [saw different cokour lights centre closed eye...  
2  [felt like deep ocean dark also brilliant blue...  
3  [first hearing gongs saw sammi lings buddhist ...  
4  [detachment though body one box mind another a...  


In [7]:
model.visualize_barchart() #Visualize the top 10 topics

In [8]:
model.visualize_documents(reports_filtered.values) #Visualize the documents in each topic

In [9]:
#Visualize the documents that belong to the top 10 topics
model.visualize_heatmap()
model.visualize_hierarchy()

