@Author: Romy Beauté\
@Contact: r.beaut@sussex.ac.uk\
@Date: 07-2023\
@Last modification: 08-03-2024

In [84]:
import os
import re
import sys
from pathlib import Path

import pandas as pd
from bertopic import BERTopic
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer


#import helpers functions manually created
project_path = os.path.abspath('/Users/rb666/projects/TopicModelling_META')
if project_path not in sys.path:
    sys.path.append(project_path)
from helpers.BERT_helpers import *


#Select parameters for dataset and preprocessing
HighSensory = True #If False, corresponds to deep listening experience (no flicker)
remove_stopwords = True
extended_stopwords = False


# Prepare reports dataset

In [85]:
dataset_name = "SensoryTool_CombinedData.csv"
metaproject_name = 'TopicModelling_META'
subproject_name = 'TopMod_pipeline'

condition = 'highsensory' if HighSensory else 'deeplistening'
print(f'Condition : "{condition}"')

PROJDIR = os.path.expanduser(f"~/projects/{metaproject_name}")
DATADIR = os.path.join(PROJDIR,f'DATA/{dataset_name}')
CODEDIR = os.path.join(PROJDIR,f'{subproject_name}')

df = pd.read_csv(DATADIR)
dataset = df[df['meta_HighSensory'] == HighSensory]['reflection_answer']
reports = dataset[dataset.notna() & (dataset != '')]

print('N={} reports (HighSensory = {})'.format(len(reports),HighSensory))
reports.head()




Condition : "highsensory"
N=336 reports (HighSensory = True)



Columns (23,24,71,73,75,77,109,111,112,113,114,115,116,117,118,119,120,121,122,123,124,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,157,159,161,165,168,196,203,205) have mixed types. Specify dtype option on import or set low_memory=False.



83     Intense chaos. And then my mind checked out an...
99                                         good stuff.\n
120    a pattern of red and white lights thatflashed ...
172    i wentback to many hard and mostly beautiful m...
189                                                Hello
Name: reflection_answer, dtype: object

### Start preprocessing

In [86]:
#select stop words to remove
stop_words = set(stopwords.words('english'))
if extended_stopwords:
    stop_words = stop_words.union(custom_stopwords) #load custom stopwords from BERT_helpers.py

reports_cleaned = reports.apply(clean_text)
reports_filtered = reports_cleaned[reports_cleaned.apply(lambda x: len(x.split()) > 1)]

print(reports_filtered.head())
#print the number of reports after cleaning
print('N={} reports after cleaning'.format(len(reports_filtered)))

83     intense chaos mind checked subconscious took s...
99                                            good stuff
120    pattern red white lights thatflashed andbecame...
172    wentback many hard mostly beautiful memories w...
191                                          hope colour
Name: reflection_answer, dtype: object
N=312 reports after cleaning


# Embeddings and BERTopic Model

After preprocessing, we convert the cleaned text into embeddings and apply the BERTopic model. This step includes choosing the transformer model used for the embeddings, setting the BERTopic parameters, and training the model.

In [87]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer



# convert text to embeddings
model = "all-MiniLM-L6-v2"  #"paraphrase-MiniLM-L6-v2" #
sentence_model = SentenceTransformer(model)
embeddings = sentence_model.encode(reports_filtered.values,show_progress_bar=True)
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")



# BERTopic model initialisation
model = BERTopic(language="english", 
                #  vectorizer_model=vectorizer_model, 
                 calculate_probabilities=True,
                 min_topic_size=5,
                 embedding_model=model)



# Fit BERTopic model
topics, probs = model.fit_transform(reports_filtered.values, embeddings) 



Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [88]:
model.visualize_topics()

### Visualisation and evaluation

In [89]:
topic_freq = model.get_topic_freq().head() #Get the frequency of each topic
freq = model.get_topic_info() #Get the frequency of each topic
print("N number of topics: ",len(freq))
print(freq.head())

N number of topics:  15
   Topic  Count                             Name  \
0     -1    100   -1_felt_experience_like_really   
1      0     70            0_shapes_saw_like_one   
2      1     20  1_peace_peaceful_year_happiness   
3      2     17         2_ocean_saw_visuals_away   
4      3     16       3_colours_orange_like_pink   

                                      Representation  \
0  [felt, experience, like, really, could, memori...   
1  [shapes, saw, like, one, moving, colours, colo...   
2  [peace, peaceful, year, happiness, calm, self,...   
3  [ocean, saw, visuals, away, charlie, boat, fac...   
4  [colours, orange, like, pink, colors, patterns...   

                                 Representative_Docs  
0  [thought fluffy speech writtenhealth safetyasp...  
1  [experienced range images chose many colours s...  
2  [peace sunlight ture, utter peace happiness, s...  
3  [interesting experience definitely easy moment...  
4  [first unsettling lights flashing like kaledis..

In [90]:
model.visualize_barchart() #Visualize the top 10 topics

In [91]:
model.visualize_documents(reports_filtered.values) #Visualize the documents in each topic

In [92]:
#Visualize the documents that belong to the top 10 topics
model.visualize_heatmap()
model.visualize_hierarchy()

