# Topic Modelling with BERTopic

## Importing The Packages

In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_rows = 15
np.set_printoptions(precision=4, suppress=True)

# Creates a progress bar for pandas functions
from tqdm import tqdm

tqdm.pandas()

# Used for resolving paths
from pathlib import Path

# Topic modelling with BERT necessities
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Filter out the irrelevant warnings
import warnings

warnings.filterwarnings("ignore")

## Resolving File Paths

In [8]:
cur_dir = (
    Path().resolve()
)  # this should provide you with the folder in which this notebook is placed
path_to_topic = Path.joinpath(cur_dir, "yelp_100k.csv")
print(path_to_topic)
print("Does path exist? ->", Path.exists(path_to_topic))

C:\Users\lmps\github\TextMining\datasets\yelp_100k.csv
Does path exist? -> True


## Importing The Data

In [31]:
yelp_100k = pd.read_csv(path_to_topic)
yelp_100k.head()

Unnamed: 0,userID,productID,rating,label,date,review,sentiment label
0,38779,1701,4.0,1,2012-06-15,"Okay, so first of all... this place is BYOB. A...",positive
1,120326,1701,4.0,1,2012-06-06,"The Chicken Fingers are to die for - also, the...",positive
2,87807,1701,4.0,1,2013-01-23,The only thing preventing a 5 star review here...,positive
3,5555,1701,5.0,1,2012-04-29,I came here last week and am still thinking ab...,positive
4,15686,1701,4.0,1,2012-02-26,Comfort's the right word. Yay they have itis o...,positive


In [36]:
def build_vocab(words, verbose=True):
    """
    :param words: list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(words, disable=(not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [34]:
words = yelp_100k["review"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(words)
print({k: vocab[k] for k in list(vocab)[:10]})

100%|██████████| 108457/108457 [00:01<00:00, 62915.25it/s]
100%|██████████| 108457/108457 [00:02<00:00, 41770.41it/s]


{'Okay,': 182, 'so': 42931, 'first': 9818, 'of': 151732, 'all...': 53, 'this': 62158, 'place': 46763, 'is': 143557, 'BYOB.': 246, 'And': 7876}


## Training

### Embedding Model 1: all-distilroberta-v1

In [None]:
# Create embeddings
docs = yelp_100k["review"]
sentence_model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
embeddings = sentence_model.encode(docs, show_progress_bar=True)

In [86]:
# Fit the model
review_model = BERTopic().fit(docs, embeddings)

In [117]:
test = BERTopic(nr_topics = 10).fit(docs, embeddings)

In [118]:
test.get_topics()

{-1: [('the', 0.05512069271021429),
  ('and', 0.046426999052854044),
  ('to', 0.036714269572826405),
  ('was', 0.034139876488866905),
  ('of', 0.030376427117135334),
  ('is', 0.02929217648671531),
  ('it', 0.02780060323090121),
  ('for', 0.026707072510036647),
  ('in', 0.02499882415824387),
  ('with', 0.02266480483042815)],
 0: [('burger', 0.08546214718221784),
  ('the', 0.05933803784979652),
  ('and', 0.04276857435521607),
  ('to', 0.033942242009514685),
  ('burgers', 0.03341900187975403),
  ('it', 0.033330726829555776),
  ('fries', 0.03252901297100106),
  ('was', 0.031424647717342685),
  ('is', 0.030148349273371636),
  ('of', 0.029872361492441893)],
 1: [('to', 0.05328595717308142),
  ('the', 0.049401696611656776),
  ('we', 0.04687423521657769),
  ('and', 0.04190089341917041),
  ('was', 0.03553858767980425),
  ('that', 0.031531658318209733),
  ('our', 0.027941012868806615),
  ('of', 0.027653648631668883),
  ('for', 0.026817967518679917),
  ('she', 0.0263785061339603)],
 2: [('pizza',

In [87]:
# Create topics for later evaluation
topics, _ = BERTopic().fit_transform(docs, embeddings)

### Embedding Model 2: all-MiniLM-L6-v2

In [93]:
# Create embeddings
sentence_model_2 = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings_2 = sentence_model_2.encode(docs, show_progress_bar=True)

Batches: 100%|██████████| 3390/3390 [1:18:48<00:00,  1.39s/it]  


In [109]:
# Fit the model
review_model_2 = BERTopic().fit(docs, embeddings_2)

In [110]:
# Create topics for later evaluation
topics_2, _ = BERTopic().fit_transform(docs, embeddings_2)

Embedding Model 3: 

# Evaluation


### NPMI Coherence Scores

In [2]:
def calculuate_coherence_score(topic_model, topics, docs):  
    """

    ----------
    Calculate NPMI coherence score for a fitted BERTopic model.

    Author: Maarten Grootendorst
    Source: https://github.com/MaartenGr/BERTopic/issues/90
    ----------
    
    :param topic_model: BERTopic model for evaluation.
    :param topics: The topics generated by the model.
    :param docs: The data used for evaluation. 
    :return: The coherence score for the generated topics.

    """
    
    # Preprocess Documents
    documents = pd.DataFrame({"Document": docs,
                            "ID": range(len(docs)),
                            "Topic": topics})

    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in topic_model.get_topic(topic) if words != ''] 
        for topic in range(0, (len(set(topics))-1))]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words, 
                                    texts=tokens, 
                                    corpus=corpus,
                                    dictionary=dictionary, 
                                    coherence='c_v')
    coherence = coherence_model.get_coherence()
    ass 

    return coherence

In [116]:
print(type(embeddings))

<class 'numpy.ndarray'>


In [107]:
docs = yelp_100k["review"]
docs_list = docs.tolist()


Coherence score for the Embedding Model 1

In [90]:
calculuate_coherence_score(review_model, topics, docs_list)

0.5326838476419081

Coherence score for the Embedding Model 2

In [115]:
calculuate_coherence_score(review_model_2, topics_2, docs_list)

TypeError: 'bool' object is not iterable

In [24]:
topic_model.save("BERTopic")

  self._set_arrayXarray(i, j, x)


In [5]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

sentence_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
topic_model_loaded = BERTopic.load("BERTopic", embedding_model=sentence_model)

In [11]:
topics_loaded, probabilities_loaded = topic_model_loaded.fit_transform(yelp_100k['review'])

In [15]:
topic_model_loaded.get_topics()

{-1: [('was', 0.0022139900345926945),
  ('we', 0.0021764214818189103),
  ('and', 0.002061209813820475),
  ('the', 0.0020518477048900963),
  ('to', 0.002026982922339576),
  ('were', 0.002017625854200519),
  ('food', 0.001965000215226433),
  ('for', 0.0019588356863114134),
  ('our', 0.0019472972973097215),
  ('had', 0.0019465432213367476)],
 0: [('pizza', 0.02283854844417964),
  ('crust', 0.006491202040423056),
  ('pie', 0.005149500385615752),
  ('slice', 0.005005529052034542),
  ('pizzas', 0.0041548292750675404),
  ('thin', 0.003726017663344131),
  ('oven', 0.00334446425110005),
  ('best', 0.0032783625916562326),
  ('pies', 0.0031351241816716213),
  ('slices', 0.0027585671115468993)],
 1: [('burger', 0.015791086596599127),
  ('fries', 0.00885232156125609),
  ('burgers', 0.008557563478705812),
  ('label', 0.006336829556151927),
  ('minetta', 0.005308049149223259),
  ('black', 0.005053788624379973),
  ('marrow', 0.0038035580726671816),
  ('tavern', 0.003264652552067825),
  ('bone', 0.0030

In [19]:
reduced_topics_30, reduced_probs_30 = topic_model_loaded.reduce_topics(yelp_100k['review'], topics_loaded, probabilities_loaded, nr_topics = 30)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
import copy
topic_model_30 = copy.deepcopy(topic_model_loaded)
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words = "english")
topic_model_30.update_topics(yelp_100k['review'], reduced_topics_30, vectorizer_model=vectorizer_model)

In [21]:
topic_model_30.get_topics()

{-1: [('food', 0.019485920284504826),
  ('place', 0.015781217670183312),
  ('good', 0.015757578604954452),
  ('great', 0.014991569384794103),
  ('service', 0.013386094294458474),
  ('like', 0.012233537431168154),
  ('just', 0.012020263536357283),
  ('time', 0.01166014892489276),
  ('restaurant', 0.01146017428477528),
  ('really', 0.010759805983819138)],
 0: [('pizza', 0.13624635482554368),
  ('crust', 0.024407485513474592),
  ('best', 0.022115488420041966),
  ('pie', 0.021446241825365996),
  ('good', 0.018360578528070177),
  ('slice', 0.018322651151635706),
  ('place', 0.018214261963433127),
  ('great', 0.01600885436103235),
  ('pizzas', 0.014296942408977087),
  ('wait', 0.013878806980704284)],
 1: [('burger', 0.08494874378552016),
  ('fries', 0.041628486584619664),
  ('burgers', 0.034202597066964165),
  ('label', 0.021996257518861716),
  ('black', 0.019824688500130548),
  ('good', 0.01790007928394429),
  ('minetta', 0.01783122672274776),
  ('best', 0.014601471927764701),
  ('great', 0

In [24]:
topic_model_loaded.get_topics()

{-1: [('the', 0.02574885759114598),
  ('and', 0.023601142485296534),
  ('to', 0.02048505676879029),
  ('was', 0.02045553773393932),
  ('it', 0.017510338876801982),
  ('of', 0.017324648126489078),
  ('for', 0.0161849560958391),
  ('is', 0.01589823760832481),
  ('we', 0.01578295991448046),
  ('in', 0.014597782873321682)],
 0: [('pizza', 0.08340811712716682),
  ('the', 0.02535181910087107),
  ('is', 0.02359074514479289),
  ('and', 0.02147480293453824),
  ('it', 0.020370380535542354),
  ('to', 0.020247557014911047),
  ('in', 0.019187440910606174),
  ('you', 0.016539491773638417),
  ('of', 0.016251554260995456),
  ('for', 0.014920916785626818)],
 1: [('burger', 0.052099670406976564),
  ('the', 0.02767241163533619),
  ('fries', 0.024791282063168975),
  ('and', 0.02158144281204974),
  ('it', 0.0208519296888324),
  ('burgers', 0.019688642290462754),
  ('to', 0.019205739337276068),
  ('was', 0.018918991360806183),
  ('of', 0.01770503038032618),
  ('but', 0.016371296001942665)],
 2: [('sushi', 0

In [25]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

sentence_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
topic_model_loaded_2 = BERTopic.load("BERTopic", embedding_model=sentence_model)

In [26]:
reduced_topics_20, reduced_probs_20 = topic_model_loaded_2.reduce_topics(yelp_100k['review'], topics_loaded, probabilities_loaded, nr_topics = 20)

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
import copy
topic_model_20 = copy.copy(topic_model_loaded_2)
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words = "english")
topic_model_20.update_topics(yelp_100k['review'], reduced_topics_20, vectorizer_model=vectorizer_model)

In [31]:
topic_model_30.get_topics()

{-1: [('food', 0.019485920284504826),
  ('place', 0.015781217670183312),
  ('good', 0.015757578604954452),
  ('great', 0.014991569384794103),
  ('service', 0.013386094294458474),
  ('like', 0.012233537431168154),
  ('just', 0.012020263536357283),
  ('time', 0.01166014892489276),
  ('restaurant', 0.01146017428477528),
  ('really', 0.010759805983819138)],
 0: [('pizza', 0.13624635482554368),
  ('crust', 0.024407485513474592),
  ('best', 0.022115488420041966),
  ('pie', 0.021446241825365996),
  ('good', 0.018360578528070177),
  ('slice', 0.018322651151635706),
  ('place', 0.018214261963433127),
  ('great', 0.01600885436103235),
  ('pizzas', 0.014296942408977087),
  ('wait', 0.013878806980704284)],
 1: [('burger', 0.08494874378552016),
  ('fries', 0.041628486584619664),
  ('burgers', 0.034202597066964165),
  ('label', 0.021996257518861716),
  ('black', 0.019824688500130548),
  ('good', 0.01790007928394429),
  ('minetta', 0.01783122672274776),
  ('best', 0.014601471927764701),
  ('great', 0

In [1]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

sentence_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
topic_model_loaded_3 = BERTopic.load("BERTopic", embedding_model=sentence_model)

In [6]:
reduced_topics_10, reduced_probs_10 = topic_model_loaded_3.reduce_topics(yelp_100k['review'], topics_loaded, probabilities_loaded, nr_topics = 10)

NameError: name 'topics_loaded' is not defined

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
import copy
topic_model_10 = copy.copy(topic_model_loaded_3)
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words = "english")
topic_model_10.update_topics(yelp_100k['review'], reduced_topics_10, vectorizer_model=vectorizer_model)

In [37]:
topic_model_30.get_topics()

{-1: [('food', 0.019485920284504826),
  ('place', 0.015781217670183312),
  ('good', 0.015757578604954452),
  ('great', 0.014991569384794103),
  ('service', 0.013386094294458474),
  ('like', 0.012233537431168154),
  ('just', 0.012020263536357283),
  ('time', 0.01166014892489276),
  ('restaurant', 0.01146017428477528),
  ('really', 0.010759805983819138)],
 0: [('pizza', 0.13624635482554368),
  ('crust', 0.024407485513474592),
  ('best', 0.022115488420041966),
  ('pie', 0.021446241825365996),
  ('good', 0.018360578528070177),
  ('slice', 0.018322651151635706),
  ('place', 0.018214261963433127),
  ('great', 0.01600885436103235),
  ('pizzas', 0.014296942408977087),
  ('wait', 0.013878806980704284)],
 1: [('burger', 0.08494874378552016),
  ('fries', 0.041628486584619664),
  ('burgers', 0.034202597066964165),
  ('label', 0.021996257518861716),
  ('black', 0.019824688500130548),
  ('good', 0.01790007928394429),
  ('minetta', 0.01783122672274776),
  ('best', 0.014601471927764701),
  ('great', 0

In [38]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

sentence_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
topic_model_loaded_4 = BERTopic.load("BERTopic", embedding_model=sentence_model)

In [39]:
reduced_topics_5, reduced_probs_5 = topic_model_loaded_4.reduce_topics(yelp_100k['review'], topics_loaded, probabilities_loaded, nr_topics = 5)