<a href="https://colab.research.google.com/github/poffertje/TextMining/blob/master/code/topic_modelling/BERTopic2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modelling with BERTopic

## Mounting the Drive (Google Colab)

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Importing The Packages

In [None]:
!pip install octis

In [None]:
!pip install bertopic
!pip install -U sentence-transformers
!pip install datashader


In [81]:
import numpy as np
import pandas as pd

pd.options.display.max_rows = 15
np.set_printoptions(precision = 4, suppress = True)

# Creates a progress bar for pandas functions
from tqdm import tqdm
tqdm.pandas()

# Used for resolving paths
from pathlib import Path

# Topic modelling with BERT necessities 
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

# Evaluation
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from sklearn.feature_extraction.text import CountVectorizer

# Filter out the irrelevant warnings
import warnings
warnings.filterwarnings('ignore')

## Resolving File Paths

In [4]:
cur_dir = Path().resolve() # this should provide you with the folder in which this notebook is placed

### Google Colab

In [5]:
# use this for colab
gdrive_path_to_datasets = Path.joinpath(cur_dir, 'gdrive/Shareddrives/Minecraft/Datasets')
print(gdrive_path_to_datasets)
print('Does path exist? ->', Path.exists(gdrive_path_to_datasets))

path_to_datasets = gdrive_path_to_datasets

/content/gdrive/Shareddrives/Minecraft/Datasets
Does path exist? -> True


### Local Repository

In [None]:
# use this for local repository
local_path_to_datasets = Path.joinpath(cur_dir, 'datasets')
print(local_path_to_datasets)
print('Does path exist? ->', Path.exists(local_path_to_datasets))

path_to_datasets = local_path_to_datasets

In [6]:
# same for colab and local repository
path_to_docs = Path.joinpath(path_to_datasets, '8April_100k_sentiment_sample_50_50_mixed.csv')
print(path_to_docs)
print('Does path exist? ->', Path.exists(path_to_docs))

/content/gdrive/Shareddrives/Minecraft/Datasets/8April_100k_sentiment_sample_50_50_mixed.csv
Does path exist? -> True


## Importing The Data

In [7]:
docs = pd.read_csv(path_to_docs)
docs = docs["review"]
docs.head()

0    Solid breakfast menu. For people who actually ...
1    FOOD:  the dumplings (dim sum) were not that i...
2    The food was good. I was looking for an inexpe...
3    Unique ambiance and good quality food (althoug...
4    We just got delivery from here after not havin...
Name: review, dtype: object

## Training

In [58]:
# Set the random state in the UMAP model to prevent stochastic behavior 
umap_model = UMAP(n_neighbors=15, n_components=5, 
                  min_dist=0.0, metric='cosine', random_state=42)

In [80]:
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words="english")

### Embedding Model 1: paraphrase-distilroberta-base-v2

In [9]:
# Create embeddings
sentence_model = SentenceTransformer("sentence-transformers/paraphrase-distilroberta-base-v2")
embeddings_1 = sentence_model.encode(docs, show_progress_bar = True)

Downloading:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/686 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [10]:
# saving embeddings as npz file
from numpy import savez_compressed
savez_compressed(Path.joinpath(cur_dir, 'gdrive/Shareddrives/Minecraft/Embeddings/paraphrase-distilroberta-base-v2.npz'), embeddings_1)

#### 10 Topics

In [24]:
# Fit the model
first_model_10 = BERTopic(umap_model = umap_model, nr_topics = 10, min_topic_size = 8)
first_model_10_topics, _ = first_model_10.fit_transform(docs, embeddings_1)

In [82]:
first_model_10.update_topics(docs, first_model_10_topics, vectorizer_model=vectorizer_model)

#### 20 Topics

In [29]:
first_model_20 = BERTopic(umap_model = umap_model, nr_topics = 20, min_topic_size = 8)
first_model_20_topics, _ = first_model_20.fit_transform(docs, embeddings_1)

In [83]:
first_model_20.update_topics(docs, first_model_20_topics, vectorizer_model=vectorizer_model)

#### 30 Topics

In [30]:
first_model_30 = BERTopic(umap_model = umap_model, nr_topics = 30, min_topic_size = 8)
first_model_30_topics, _ = first_model_30.fit_transform(docs, embeddings_1)

In [84]:
first_model_30.update_topics(docs, first_model_30_topics, vectorizer_model=vectorizer_model)

Saving Models and Topics

In [85]:
path_to_models = Path.joinpath(cur_dir, 'gdrive/Shareddrives/Minecraft/Our_Models/BERTopic')
first_model_10.save(Path.joinpath(path_to_models, 'first_model_10'))
first_model_20.save(Path.joinpath(path_to_models, 'first_model_20'))
first_model_30.save(Path.joinpath(path_to_models, 'first_model_30'))

### Embedding Model 2: paraphrase-MiniLM-L12-v2

In [35]:
sentence_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L12-v2")
embeddings_2 = sentence_model.encode(docs, show_progress_bar = True)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [36]:
# saving embeddings as npz file
savez_compressed(Path.joinpath(cur_dir, 'gdrive/Shareddrives/Minecraft/Embeddings/paraphrase-MiniLM-L12-v2.npz'), embeddings_2)

#### 10 Topics

In [37]:
second_model_10 = BERTopic(umap_model = umap_model, nr_topics = 10, min_topic_size = 8)
second_model_10_topics, _ = second_model_10.fit_transform(docs, embeddings_2)

In [86]:
second_model_10.update_topics(docs, second_model_10_topics, vectorizer_model=vectorizer_model)

#### 20 Topics

In [38]:
second_model_20 = BERTopic(umap_model = umap_model, nr_topics = 20, min_topic_size = 8)
second_model_20_topics, _ = second_model_20.fit_transform(docs, embeddings_2)

In [87]:
second_model_20.update_topics(docs, second_model_20_topics, vectorizer_model=vectorizer_model)

#### 30 Topics

In [39]:
second_model_30 = BERTopic(umap_model = umap_model, nr_topics = 30, min_topic_size = 8)
second_model_30_topics, _ = second_model_30.fit_transform(docs, embeddings_2)

In [88]:
second_model_30.update_topics(docs, second_model_30_topics, vectorizer_model=vectorizer_model)

Saving Models and Topics

In [89]:
second_model_10.save(Path.joinpath(path_to_models, 'second_model_10'))
second_model_20.save(Path.joinpath(path_to_models, 'second_model_20'))
second_model_30.save(Path.joinpath(path_to_models, 'second_model_30'))

### Embedding Model 3: paraphrase-mpnet-base-v2

In [55]:
sentence_model = SentenceTransformer("sentence-transformers/paraphrase-mpnet-base-v2")
embeddings_3 = sentence_model.encode(docs, show_progress_bar = True)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/594 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [56]:
# saving embeddings as npz file
savez_compressed(Path.joinpath(cur_dir, 'gdrive/Shareddrives/Minecraft/Embeddings/paraphrase-mpnet-base-v2.npz'), embeddings_3)

10 Topics

In [67]:
third_model_10 = BERTopic(umap_model = umap_model, nr_topics = 10)
third_model_10_topics, _ = third_model_10.fit_transform(docs, embeddings_3)

In [90]:
third_model_10.update_topics(docs, third_model_10_topics, vectorizer_model=vectorizer_model)

20 Topics

In [68]:
third_model_20 = BERTopic(umap_model = umap_model, nr_topics = 20)
third_model_20_topics, _ = third_model_20.fit_transform(docs, embeddings_3)

In [91]:
third_model_20.update_topics(docs, third_model_20_topics, vectorizer_model=vectorizer_model)

30 Topics

In [69]:
third_model_30 = BERTopic(umap_model = umap_model, nr_topics = 30)
third_model_30_topics, _ = third_model_30.fit_transform(docs, embeddings_3)

In [None]:
third_model_30.update_topics(docs, third_model_30_topics, vectorizer_model=vectorizer_model)

Saving Models and Topics

In [92]:
third_model_10.save(Path.joinpath(path_to_models, 'third_model_10'))
third_model_20.save(Path.joinpath(path_to_models, 'third_model_20'))
third_model_30.save(Path.joinpath(path_to_models, 'third_model_30'))

## Evaluation

### NPMI Coherence Scores

In [26]:
def calculuate_coherence_score(topic_model, topics, docs):  
    """

    ----------
    Calculate NPMI coherence score for a fitted BERTopic model.

    Author: Maarten Grootendorst
    Source: https://github.com/MaartenGr/BERTopic/issues/90
    ----------
    
    :param topic_model: BERTopic model for evaluation.
    :param topics: The topics generated by the model.
    :param docs: The data used for evaluation. 
    :return: The coherence score for the generated topics.

    """
    
    # Preprocess Documents
    documents = pd.DataFrame({"Document": docs,
                            "ID": range(len(docs)),
                            "Topic": topics})

    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in topic_model.get_topic(topic) if words != ''] 
        for topic in range(0, (len(set(topics))-1))]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words, 
                                    texts=tokens, 
                                    corpus=corpus,
                                    dictionary=dictionary, 
                                    coherence='c_v')
    coherence = coherence_model.get_coherence()

    return coherence

CV Scores for The First Embedding Model

In [93]:
cv_first_model_10 = calculuate_coherence_score(first_model_10, first_model_10_topics, docs)
cv_first_model_10

0.5716354618102059

In [94]:
cv_first_model_20 = calculuate_coherence_score(first_model_20, first_model_20_topics, docs)
cv_first_model_20

0.6230034348458839

In [95]:
cv_first_model_30 = calculuate_coherence_score(first_model_30, first_model_30_topics, docs)
cv_first_model_30

0.6170944074267501

CV Scores for The Second Embedding Model

In [96]:
cv_second_model_10 = calculuate_coherence_score(second_model_10, second_model_10_topics, docs)
cv_second_model_10

0.5545338754678701

In [97]:
cv_second_model_20 = calculuate_coherence_score(second_model_20, second_model_20_topics, docs)
cv_second_model_20

0.5928406223632277

In [98]:
cv_second_model_30 = calculuate_coherence_score(second_model_30, second_model_30_topics, docs)
cv_second_model_30

0.6106763863050818

CV Scores for The Third Embedding Model

In [99]:
cv_third_model_10 = calculuate_coherence_score(third_model_10, third_model_10_topics, docs)
cv_third_model_10

0.5316643797779589

In [100]:
cv_third_model_20 = calculuate_coherence_score(third_model_20, third_model_20_topics, docs)
cv_third_model_20

0.5716643763481318

In [79]:
cv_third_model_30 = calculuate_coherence_score(third_model_30, third_model_30_topics, docs)
cv_third_model_30

0.610815256722585

### Topic Diversity Scores

In [48]:
metric = TopicDiversity(topk=10)  # Initialize metric

TD for The First Model

In [54]:
td_first_model_10 = metric.score(first_model_10) # Compute score of the metric
td_first_model_10

TypeError: ignored

In [None]:
td_first_model_20 = metric.score(first_model_20_topics)
td_first_model_20

In [None]:
td_first_model_30 = metric.score(first_model_30_topics)
td_first_model_30

TD for The Second Model

In [None]:
td_second_model_10 = metric.score(second_model_10_topics) # Compute score of the metric
td_second_model_10

In [None]:
td_second_model_20 = metric.score(second_model_20_topics)
td_second_model_20

In [None]:
td_second_model_30 = metric.score(second_model_30_topics)
td_second_model_30

TD for The Third Model

In [None]:
td_third_model_10 = metric.score(third_model_10_topics) # Compute score of the metric
td_third_model_10

In [None]:
td_third_model_20 = metric.score(third_model_20_topics) 
td_third_model_20

In [None]:
td_third_model_30 = metric.score(third_model_30_topics) 
td_third_model_30

### Visualization

In [101]:
first_model_20.visualize_topics()

In [102]:
first_model_20.visualize_hierarchy()

In [104]:
first_model_20.get_topics()

{-1: [('food', 0.02213895901167306),
  ('good', 0.01916942408293472),
  ('place', 0.01858008277490456),
  ('great', 0.015288269745647237),
  ('like', 0.015069551405505548),
  ('just', 0.014973259293914196),
  ('service', 0.014410101650959172),
  ('really', 0.013280351836502334),
  ('time', 0.012436744380385039),
  ('restaurant', 0.011988220350685457)],
 0: [('pizza', 0.13699311358541555),
  ('crust', 0.03134264847156243),
  ('slice', 0.028500305935417625),
  ('pie', 0.025759282779072197),
  ('good', 0.021173758319086786),
  ('place', 0.02026518440582996),
  ('pizzas', 0.018924306568498913),
  ('best', 0.018035730127280457),
  ('just', 0.01751144817245275),
  ('like', 0.016720207682618397)],
 1: [('brunch', 0.06625631990924506),
  ('eggs', 0.03879193857818006),
  ('pancakes', 0.03310648396001137),
  ('coffee', 0.027586700435963302),
  ('breakfast', 0.0243601268228855),
  ('toast', 0.02406616409775676),
  ('wait', 0.019793737231335512),
  ('good', 0.01969028638154123),
  ('french', 0.018