<a href="https://colab.research.google.com/github/poffertje/TextMining/blob/master/code/topic_modelling/BERTopic2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modelling with BERTopic

## Mounting the Drive (Google Colab)

In [None]:
# Mount Google Drive under /content/gdrive so the shared-drive folders used
# further down (datasets, embeddings, models) are reachable from this runtime.
# This cell belongs to the Google Colab workflow named in the heading above.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Importing The Packages

In [None]:
!pip install bertopic
!pip install -U sentence-transformers
!pip install datashader


In [None]:
import numpy as np
import pandas as pd

# Display settings: show at most 15 rows of a frame, and print numpy arrays
# with 4 decimals and without scientific notation.
pd.options.display.max_rows = 15
np.set_printoptions(precision = 4, suppress = True)

# Creates a progress bar for pandas functions
from tqdm import tqdm
tqdm.pandas()

# Used for resolving paths
from pathlib import Path

# Topic modelling with BERT necessities
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

# Evaluation
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import umap.plot

# Filter out the irrelevant warnings
# NOTE(review): 'ignore' without a category silences every warning, including
# deprecation notices from the libraries above — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

## Resolving File Paths

In [None]:
# Absolute path of the current working directory; the dataset, embedding and
# model paths further down are all built relative to it.
cur_dir = Path().resolve() # this should provide you with the folder in which this notebook is placed

### Google Colab

In [None]:
# Colab only: the datasets live on the mounted shared drive
gdrive_path_to_datasets = cur_dir / 'gdrive/Shareddrives/Minecraft/Datasets'
print(gdrive_path_to_datasets)
print('Does path exist? ->', gdrive_path_to_datasets.exists())

path_to_datasets = gdrive_path_to_datasets

/content/gdrive/Shareddrives/Minecraft/Datasets
Does path exist? -> True


### Local Repository

In [None]:
# use this for local repository
local_path_to_datasets = cur_dir / 'datasets'
print(local_path_to_datasets)
print('Does path exist? ->', local_path_to_datasets.exists())

# Only switch to the local folder when it is really there, or when no dataset
# folder has been chosen yet. Without this guard a "Run All" on Colab would
# overwrite the valid Drive path set by the Colab cell with a folder that does
# not exist on that machine.
if local_path_to_datasets.exists() or 'path_to_datasets' not in globals():
    path_to_datasets = local_path_to_datasets

In [None]:
# The file name is identical on Colab and in the local repository
path_to_docs = path_to_datasets / 'new_100k_sentiment_sample_50_50.csv'
print(path_to_docs)
print('Does path exist? ->', path_to_docs.exists())

/content/gdrive/Shareddrives/Minecraft/Datasets/new_100k_sentiment_sample_50_50.csv
Does path exist? -> True


## Importing The Data

In [None]:
# Load the sample and keep only the review texts: these are the documents
# that get embedded and clustered below.
docs = pd.read_csv(path_to_docs)["review"]
docs.head()

0    I went there last night with a large group so ...
1    I really wanted to like ONE 53, but I'm not su...
2    Business in the land of Lidia is doing well. W...
3    Pleasant little joint for a sandwich.  Food is...
4    I think that the food is great and this place ...
Name: review, dtype: object

## Training

### Embedding Model 1: all-distilroberta-v1

In [None]:
# Create embeddings
# encode() is documented for a string or a list of strings, so hand it a plain
# Python list instead of the pandas Series; this also keeps the call
# independent of the Series' index labels.
sentence_model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
embeddings_1 = sentence_model.encode(docs.tolist(), show_progress_bar = True)

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [None]:
# Persist the distilroberta embeddings as a compressed .npz archive on the shared drive
from numpy import savez_compressed
savez_compressed(cur_dir / 'gdrive/Shareddrives/Minecraft/Embeddings/all-distilroberta-v1.npz', embeddings_1)

#### 10 Topics

In [None]:
# Fit the model once and keep the per-document topics from that very same fit.
# Fitting a second, independent BERTopic instance just to obtain the topics
# doubles the run time and yields assignments that do not belong to the model
# that is stored and evaluated later on.
first_model_10 = BERTopic(nr_topics = 10)
first_model_10_topics, _ = first_model_10.fit_transform(docs, embeddings_1)

#### 20 Topics

In [None]:
# Single fit: the model and its per-document topics come from the same run, so
# the topics used for evaluation match the model that is stored later on.
first_model_20 = BERTopic(nr_topics = 20)
first_model_20_topics, _ = first_model_20.fit_transform(docs, embeddings_1)

#### 30 Topics

In [None]:
# Single fit: the model and its per-document topics come from the same run, so
# the topics used for evaluation match the model that is stored later on.
first_model_30 = BERTopic(nr_topics = 30)
first_model_30_topics, _ = first_model_30.fit_transform(docs, embeddings_1)

#### Saving Models and Topics

In [None]:
# Folder on the shared drive that collects every fitted BERTopic model
path_to_models = cur_dir / 'gdrive/Shareddrives/Minecraft/Models/BERTopic'

# Store the three distilroberta-based models under their variable names
for model_name, fitted_model in (('first_model_10', first_model_10),
                                 ('first_model_20', first_model_20),
                                 ('first_model_30', first_model_30)):
    fitted_model.save(path_to_models / model_name)

### Embedding Model 2: all-MiniLM-L6-v2

In [None]:
# Create embeddings with the second sentence-transformer.
# encode() is documented for a string or a list of strings, so hand it a plain
# Python list instead of the pandas Series.
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings_2 = sentence_model.encode(docs.tolist(), show_progress_bar = True)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [None]:
# saving embeddings as npz file
# Call the function through the `np` alias imported at the top of the notebook,
# so this cell does not depend on an import made inside an earlier section's cell.
np.savez_compressed(cur_dir / 'gdrive/Shareddrives/Minecraft/Embeddings/all-MiniLM-L6-v2.npz', embeddings_2)

#### 10 Topics

In [None]:
# Single fit: the model and its per-document topics come from the same run, so
# the topics used for evaluation match the model that is stored later on.
second_model_10 = BERTopic(nr_topics = 10)
second_model_10_topics, _ = second_model_10.fit_transform(docs, embeddings_2)

#### 20 Topics

In [None]:
# Single fit: the model and its per-document topics come from the same run, so
# the topics used for evaluation match the model that is stored later on.
second_model_20 = BERTopic(nr_topics = 20)
second_model_20_topics, _ = second_model_20.fit_transform(docs, embeddings_2)

#### 30 Topics

In [None]:
# Single fit: the model and its per-document topics come from the same run, so
# the topics used for evaluation match the model that is stored later on.
second_model_30 = BERTopic(nr_topics = 30)
second_model_30_topics, _ = second_model_30.fit_transform(docs, embeddings_2)

#### Saving Models and Topics

In [None]:
# Store the three MiniLM-based models under their variable names
for model_name, fitted_model in (('second_model_10', second_model_10),
                                 ('second_model_20', second_model_20),
                                 ('second_model_30', second_model_30)):
    fitted_model.save(path_to_models / model_name)

### Embedding Model 3: all-mpnet-base-v2 

In [None]:
# Create embeddings with the third sentence-transformer.
# encode() is documented for a string or a list of strings, so hand it a plain
# Python list instead of the pandas Series.
sentence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
embeddings_3 = sentence_model.encode(docs.tolist(), show_progress_bar = True)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [None]:
# saving embeddings as npz file
# Call the function through the `np` alias imported at the top of the notebook,
# so this cell does not depend on an import made inside an earlier section's cell.
np.savez_compressed(cur_dir / 'gdrive/Shareddrives/Minecraft/Embeddings/all-mpnet-base-v2.npz', embeddings_3)

In [None]:
# Fixing random_state prevents stochastic behavior of the UMAP model between runs
umap_model = UMAP(
    n_neighbors = 200,
    n_components = 5,
    min_dist = 0.0,
    metric = 'cosine',
    random_state = 42,
)

In [None]:
# Fit the configured UMAP model directly on the all-mpnet-base-v2 embeddings
# and keep whatever fit() returns under the name `mapper`.
mapper = umap_model.fit(embeddings_3)

In [None]:
# umap.plot.points can only draw 2-D embeddings, whereas the UMAP model defined
# above reduces to n_components = 5 — plotting `mapper` is what raised the
# ValueError. Fit a separate 2-D projection that is used for visualisation
# only; the remaining settings mirror the clustering UMAP.
plot_mapper = UMAP(n_neighbors = 200, n_components = 2,
                   min_dist = 0.0, metric = 'cosine', random_state = 42).fit(embeddings_3)
umap.plot.points(plot_mapper)

ValueError: ignored

In [None]:
# Fit BERTopic on the all-mpnet-base-v2 embeddings using the seeded UMAP model,
# and keep the per-document topic assignments of that same fit for evaluation.
# NOTE(review): the variable names end in `_30` but nr_topics is 10 — confirm
# which of the two is intended.
third_model_30 = BERTopic(umap_model = umap_model, nr_topics = 10)
third_model_30_topics, _ = third_model_30.fit_transform(docs, embeddings_3)

## Evaluation

### Topic Coherence Scores (C_v)

In [None]:
def calculuate_coherence_score(topic_model, topics, docs, coherence='c_v'):
    """
    Calculate a gensim topic-coherence score for a fitted BERTopic model.

    ----------
    Adapted from a snippet by Maarten Grootendorst.
    Source: https://github.com/MaartenGr/BERTopic/issues/90
    ----------

    All documents assigned to the same topic are joined into one long text,
    cleaned with the model's own preprocessing and tokenised with the
    analyzer of the model's vectorizer. The resulting tokens and the top
    words of every topic are then passed to gensim's ``CoherenceModel``.

    Note: the measure computed by default is 'c_v' (this has always been the
    case, even though this helper used to be described as NPMI). Pass
    ``coherence='c_npmi'`` to obtain an NPMI score instead.

    :param topic_model: Fitted BERTopic model for evaluation.
    :param topics: One topic id per document, produced by that same model.
        The id -1 (BERTopic's outlier topic) is not scored.
    :param docs: The documents the topics were assigned to, in the same order
        as ``topics``.
    :param coherence: Name of the gensim coherence measure, 'c_v' by default.
    :return: The coherence score for the generated topics.
    :raises ValueError: If no topic with at least one word is left to score.
    """

    # Build one concatenated document per topic
    documents = pd.DataFrame({"Document": docs,
                              "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Tokenise exactly the way the model's vectorizer does
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]

    # gensim inputs for the coherence computation
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Use the topic ids that actually occur instead of assuming they are
    # 0..n-1 plus one outlier label; the old range silently dropped the last
    # topic whenever no document was labelled -1.
    topic_ids = sorted({int(topic) for topic in topics} - {-1})
    topic_words = []
    for topic_id in topic_ids:
        # get_topic yields (word, score) pairs; tolerate a falsy result for an
        # id the model does not know.
        word_scores = topic_model.get_topic(topic_id) or []
        top_words = [word for word, _ in word_scores if word != '']
        if top_words:
            topic_words.append(top_words)

    if not topic_words:
        raise ValueError("No topics with words available to compute a coherence score.")

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence=coherence)
    return coherence_model.get_coherence()

In [None]:
# Score the all-mpnet-base-v2 model on the documents it was fitted on; the
# helper above requests gensim's 'c_v' coherence measure.
calculuate_coherence_score(third_model_30, third_model_30_topics, docs)

0.34033934797588294