# Topic Modeling based on the transformer models

## Install necessary packages

In [None]:
!pip install --upgrade pip
!pip install --upgrade numpy
!pip install --upgrade sentence_transformers
!conda install -c conda-forge hdbscan --y
!pip install bokeh
!pip install --upgrade bertopic[visualization]
!pip install octis

## Import necessary libraries

In [12]:
import os

import pytest

import numpy
import numpy as np

from click.testing import CliRunner
from octis.evaluation_metrics.topic_significance_metrics import *
from octis.evaluation_metrics.classification_metrics import F1Score, PrecisionScore
from octis.evaluation_metrics.classification_metrics import AccuracyScore, RecallScore
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO, KLDivergence, LogOddsRatio, \
    WordEmbeddingsInvertedRBO
from octis.evaluation_metrics.similarity_metrics import WordEmbeddingsRBOMatch, PairwiseJaccardSimilarity, RBO, \
    WordEmbeddingsCentroidSimilarity, WordEmbeddingsPairwiseSimilarity

from octis.evaluation_metrics.coherence_metrics import *
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA

In [13]:
from bertopic import BERTopic
import pandas as pd
import random
# random.seed(42)
import warnings
warnings.filterwarnings('ignore')

from sentence_transformers import SentenceTransformer
import sklearn.manifold
from sklearn.cluster import KMeans
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from bertopic.backend import WordDocEmbedder
import gensim.downloader as api
import gensim.corpora as corpora
from gensim.models import CoherenceModel

from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import plasma, d3, Turbo256
from bokeh.plotting import figure
from bokeh.transform import transform
import bokeh.io
bokeh.io.output_notebook()

from octis.evaluation_metrics.coherence_metrics import Coherence

import bokeh.plotting as bpl
import bokeh.models as bmo
bpl.output_notebook()
from transformers import RobertaModel

In [14]:
# Mount Google Drive at /content/drive (Colab-only); the dataset loaded by a
# later cell is read from a path below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import Dataset

In [15]:
# The CSV is read without a header row (header=None), so its first column is
# labelled 0; it is renamed to 'ErrorMessage' for the rest of the notebook.
data = (
    pd.read_csv('/content/drive/MyDrive/output_type_2_w_shuffle.csv', header=None, encoding="utf-8")
    .rename(columns={0: 'ErrorMessage'})
)
print("The size of input data is:", len(data))
data.head(5)

The size of input data is: 7823


Unnamed: 0,ErrorMessage
0,set array element sequence
1,tensor name embed w find checkpoint file model...
2,custom spectral norm already register function...
3,least populate class member minimum number gro...
4,dll load fail specify module could find


## [BERTopic framework](https://maartengr.github.io/BERTopic/index.html)

### Prepare custom models

In [16]:
# Settings for the custom dimensionality-reduction model.
umap_kwargs = {
    'n_neighbors': 15,
    'n_components': 10,
    'metric': 'cosine',
    'low_memory': False,
}
umap_model = UMAP(**umap_kwargs)

# Settings for the custom clustering model.
hdbscan_kwargs = {
    'min_cluster_size': 15,
    'metric': 'euclidean',
    'cluster_selection_method': 'eom',
    'prediction_data': True,
    'min_samples': 2,
}
hdbscan_model = HDBSCAN(**hdbscan_kwargs)

# Optional n-gram vectorizer (disabled):
# vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")

# Optional word embedding model (disabled):
# ft = api.load('fasttext-wiki-news-subwords-300')

# Document embedding model -- exactly one of the candidates below is active.
# embedding_model = SentenceTransformer('all-mpnet-base-v2')
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# embedding_model = SentenceTransformer('all-distilroberta-v1')
embedding_model = SentenceTransformer('microsoft/codebert-base')
# embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Optional combined word + document embedder to hand to BERTopic (disabled):
# word_doc_embedder = WordDocEmbedder(embedding_model=embedding_model, word_embedding_model=ft)



In [17]:
# NOTE(review): this cell instantiates the same 'microsoft/codebert-base'
# SentenceTransformer that the previous cell already bound to
# `embedding_model`; it only matters when switching between the candidates
# below without re-running the previous cell.
# embedding_model = SentenceTransformer('all-mpnet-base-v2')
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# embedding_model = SentenceTransformer('all-distilroberta-v1')
embedding_model = SentenceTransformer('microsoft/codebert-base')
# embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')



### Clustering

In [18]:
# Drop rows whose error message is missing and collect the remaining messages
# as a plain Python list of documents.
data = data[~data["ErrorMessage"].isna()]
text = data["ErrorMessage"].values.tolist()

# Set to True to load a previously saved model instead of fitting a new one.
USE_SAVED_MODEL = False

if USE_SAVED_MODEL:
    model = BERTopic.load("/content/drive/MyDrive/bertopic_model.pt")
    # Later cells read `topics`, so the loaded-model branch has to define the
    # (topics, probs) pair as well; previously it left both names undefined.
    # NOTE(review): transform() re-predicts the assignments, which may differ
    # from the ones produced when the model was originally fitted -- confirm
    # this is acceptable for the reports below.
    topics, probs = model.transform(text)
else:
    # Fit a new model. The custom UMAP/HDBSCAN/vectorizer models are left
    # disabled here; only the embedding model is passed in.
    model = BERTopic(
        # umap_model=umap_model,
        # hdbscan_model=hdbscan_model,
        embedding_model=embedding_model,
        top_n_words=10,
        min_topic_size=30,
        # embedding_model=word_doc_embedder,
        # vectorizer_model=vectorizer_model,
        language="english",
        nr_topics="auto",
        calculate_probabilities=True,
        verbose=True,
        # n_gram_range=(1, 2),
    )

    topics, probs = model.fit_transform(text)

    # Uncomment to persist the fitted model for USE_SAVED_MODEL = True:
    # model.save("/content/drive/MyDrive/bertopic_model.pt")

Batches:   0%|          | 0/245 [00:00<?, ?it/s]

2022-04-19 21:14:07,628 - BERTopic - Transformed documents to Embeddings
2022-04-19 21:14:52,600 - BERTopic - Reduced dimensionality with UMAP
2022-04-19 21:14:55,220 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-04-19 21:14:56,120 - BERTopic - Reduced number of topics from 76 to 61


In [19]:
# Display the parameters of the BERTopic instance created in the previous cell.
model.get_params()

{'calculate_probabilities': True,
 'diversity': None,
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f4f911c6a90>,
 'hdbscan_model': HDBSCAN(min_cluster_size=30, prediction_data=True),
 'language': None,
 'low_memory': False,
 'min_topic_size': 30,
 'n_gram_range': (1, 1),
 'nr_topics': 'auto',
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(),
 'verbose': True}

T-distributed Stochastic Neighbor Embedding.

t-SNE [1] is a tool to visualize high-dimensional data. It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. t-SNE has a cost function that is not convex, i.e. with different initializations we can get different results.

It is highly recommended to use another dimensionality reduction method (e.g. PCA for dense data or TruncatedSVD for sparse data) to reduce the number of dimensions to a reasonable amount (e.g. 50) if the number of features is very high. This will suppress some noise and speed up the computation of pairwise distances between samples. For more tips see Laurens van der Maaten’s FAQ [2].

In [20]:
  # Embed every document and project the embeddings to 2-D with t-SNE; the
  # result (`out`) provides the x/y coordinates of the scatter plot below.
  # Pretrained model overview: https://www.sbert.net/docs/pretrained_models.html
  #
  # Reuse the SentenceTransformer already bound to `embedding_model` instead of
  # instantiating the same 'microsoft/codebert-base' checkpoint once more.
  model_st = embedding_model

  embeddings = model_st.encode(text)
  # t-SNE's cost function is not convex, so different initializations give
  # different layouts; a fixed seed keeps the projection reproducible.
  out = sklearn.manifold.TSNE(n_components=2, random_state=42).fit_transform(embeddings)



Let's take a look at the most frequent topic that was generated, topic 0:

In [21]:
# Show the entries BERTopic stores for topic 0 (word/score pairs, as seen in
# the output below).
model.get_topic(0)

[('check', 0.033206678745581156),
 ('error', 0.031355629314180934),
 ('array', 0.028880924132580865),
 ('model', 0.027540922708192833),
 ('target', 0.026589063037552284),
 ('expect', 0.02505340292585566),
 ('dense', 0.024119035992345033),
 ('get', 0.02105724666281114),
 ('shape', 0.018393291129556004),
 ('use', 0.01807797242832623)]

### BERTopic reports

In [22]:
# Topic-frequency table from BERTopic; its 'Topic' column is read below to
# attach keywords to each row.
topic_df = model.get_topic_freq()

def get_keywords(i):
    """Return the keywords of topic ``i`` joined by single spaces.

    Topic ``-1`` is reported as the literal string ``'outlier'``. For any
    other id, the first element of every entry returned by
    ``model.get_topic(i)`` is taken and the elements are joined with spaces.
    """
    if i != -1:
        return ' '.join([entry[0] for entry in model.get_topic(i)])
    return 'outlier'

# Add a 'keywords' column holding the space-joined keywords of every topic.
topic_df = topic_df.assign(keywords=topic_df['Topic'].apply(get_keywords))
topic_df

Unnamed: 0,Topic,Count,keywords
0,-1,2559,outlier
1,0,580,check error array model target expect dense ge...
2,1,375,module name tensorflow attribute sklearn pytho...
3,2,311,lib python package site users tensorflow local...
4,3,265,attribute object module sequential classifier ...
...,...,...,...
56,55,35,dimensional size xx channel batch weight inste...
57,56,34,squeeze convd dimension check input expect dim...
58,57,34,pickle lock thread io object text wrapper seri...
59,58,33,instead array expect get scalar tensor input


In [23]:
# Display BERTopic's topic visualization for the model.
model.visualize_topics()

In [24]:
# To export the chart to a file instead of displaying it inline:
# fig = model.visualize_barchart()
# fig.write_html("path/to/file.html")
model.visualize_barchart()

In [25]:
# model.visualize_distribution(probabilities=probs[3], width=800, height=800)

In [26]:
# Topic heatmap, limited via top_n_topics=15 and rendered with width/height
# of 1024 each.
model.visualize_heatmap(top_n_topics=15, width=1024, height=1024)

The hierarchy visualization below performs hierarchical clustering based on the cosine distance matrix between topic embeddings.

In [27]:
# Topic hierarchy plot, limited via top_n_topics=20.
model.visualize_hierarchy(top_n_topics=20)

In [28]:
# Display BERTopic's term-rank visualization for the model.
model.visualize_term_rank()

### Create a plot based on t-SNE:

In [29]:
# Build one label per topic ("<id>: kw1 kw2 ...") and map every document to
# the label of its assigned topic.
#
# The labels are keyed by the topic ids that actually occur in `topics`
# rather than by position. The previous positional lookup assumed that the
# outlier topic -1 is always present; without outliers it skipped the last
# topic and then failed with an IndexError.
topic_ids = sorted(set(topics))
label_by_topic = {-1: '-1: outlier'}
for i in topic_ids:
    if i == -1:
        continue
    tpc = model.get_topic(i)
    words = [x[0] for x in tpc]
    label_by_topic[i] = ' '.join([str(i) + ':'] + words)

# One label per distinct topic, ordered by topic id (outlier first if present).
topic_words = [label_by_topic[i] for i in topic_ids]

# Label of the assigned topic for every document, aligned with `topics`.
exp_topics = [label_by_topic[x] for x in topics]

In [30]:
# Scatter plot of the 2-D t-SNE projection, one point per document, coloured
# by topic label.

# Draw one random Turbo256 colour per distinct topic.
clrs = random.sample(Turbo256, len(set(topics)))
color_map = bmo.CategoricalColorMapper(factors=topic_words, palette=clrs)

list_x = out[:,0]
list_y = out[:,1]
desc = text

source = ColumnDataSource(data=dict(x=list_x, y=list_y, desc=desc, topic=exp_topics))
# Hover tooltip: point index, the document text and its topic label.
hover = HoverTool(tooltips=[
    ("index", "$index"),
    ('desc', '@desc'),
    ('topic', '@topic')
])

# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3.0 removed
# from figure(); bokeh is installed unpinned above, so the old names fail on a
# current release.
p = figure(width=800, height=800, tools=[hover], title="")
p.circle('x', 'y', size=10, source=source,
         fill_color=transform('topic', color_map),
        #  legend='topic'
)
# p.legend.location = "top_left"
# p.legend.click_policy="mute"

bpl.show(p)

## Coherence evaluation

In [31]:
print ("preprocess for coherence...")
# Concatenate all documents assigned to the same topic into one string per
# topic, then run them through BERTopic's own text preprocessing.
documents = pd.DataFrame({"Document": text,
                          "ID": range(len(text)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

print ("vectorizer and analyzer for coherence...")
# Extract vectorizer and analyzer from BERTopic
vectorizer = model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
print ("Extract features for Topic Coherence evaluation...")
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only for releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
# Keyword list of every non-outlier topic. Iterating over the ids that occur
# in `topics` avoids assuming that an outlier topic (-1) is always present.
# Note: this rebinds `topic_words` to a list of keyword lists.
topic_words = [[word for word, _ in model.get_topic(topic)]
               for topic in sorted(set(topics)) if topic != -1]

# Evaluate
print ("Evaluate...")
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

print("BERTopic coherence score: {}".format(coherence))

preprocess for coherence...
vectorizer and analyzer for coherence...
Extract features for Topic Coherence evaluation...
Evaluate...
BERTopic coherence score: 0.6570059794856513


In [32]:
# Wrap the per-topic keyword lists in the dict handed to the metrics'
# score() calls below.
modell_output = {"topics": topic_words}

In [33]:
# Score the topics with each metric, check that the result is a float inside
# [0, 1], and print it. isinstance() replaces the exact type() comparisons so
# that any Python or NumPy floating type is accepted.
for label, metric in (("RBO: ", RBO(topk=10)),
                      ("Diversity: ", TopicDiversity(topk=10))):
    score = metric.score(modell_output)
    assert isinstance(score, (float, np.floating))
    assert 0 <= score <= 1
    print(label, score)

RBO:  0.018714663002782572
Diversity:  0.6283333333333333


min_size = 50
model = 1
BERTopic coherence score: 0.6706251823962723
RBO:  0.019753925430256336
Diversity:  0.7214285714285714

min_size = 10
model = 1
BERTopic coherence score: 0.6792914366121349
RBO:  0.009155892909584629
Diversity:  0.5473958333333333

min_size = 20
model = 1
BERTopic coherence score: 0.6984410625183131
RBO:  0.013576561794235609
Diversity:  0.6081632653061224

min_size = 30 model = 1 
BERTopic coherence score: 0.6673988337320986
RBO:  0.015177221377390748
Diversity:  0.659016393442623

min_size = 50
model = 2
BERTopic coherence score: 0.6221637165148065
RBO:  0.022952914300016336
Diversity:  0.6923076923076923

min_size = 10
model = 2
BERTopic coherence score: 0.5091741313153112
RBO:  0.05896325038990691
Diversity:  0.4785714285714286

min_size = 20
model = 2
BERTopic coherence score: 0.5457264041132985
RBO:  0.04056855081728916
Diversity:  0.6157894736842106

min_size = 30 
model = 2
BERTopic coherence score: 0.6436740585947095
RBO:  0.014773030611398341
Diversity:  0.64

Hype

min_size = 10
model = 1

min_size = 20
model = 1

min_size = 30
model = 1
BERTopic coherence score: 0.6949257730604134
RBO:  0.009170568962736825
Diversity:  0.5852112676056338

min_size = 50
model = 1

In [34]:
# # 4. save the top 50 topics and its keywords
# topn = 50
# top_list = []
# topic_info = topic_model.get_topic_info()
# topic_num = topic_info["Topic"].tolist()[1:51]
# topic_size = topic_info["Count"].tolist()[1:51]
# for i in range(topn):
#     top_list.append({"topic": topic_num[i], "size": topic_size[i], "keywords": topic_model.get_topic(topic=i)})

In [35]:
# model.visualize_topics().write_html("images/topics.html")
# model.visualize_barchart().write_html("images/barchart.html")
# model.visualize_heatmap().write_html("images/heatmap.html")
# date = datetime.now().strftime("%Y-%m-%d")
# np.save(f"/home/paperspace/src/news/data/topics/bertopic_{date}.npy", arr_topics_probs)

In [36]:
# # Further reduce topics
# new_topics, new_probs = topic_model.reduce_topics(docs, topics, probs, nr_topics=16)

In [37]:
# def matrix(topics, probs):
#     matrix = []
#     for i in range(len(topics)):
#         matrix.append([topics[i]])
#         matrix[i].append(probs[i])
#     return matrix
# mat = matrix(topics, probs)
# def writeFile(matrix):
#     mat = np.matrix(matrix)
#     file_path = '../bert_data/data_from_bert/' + programa + '_topics_and_probs.txt'
#     path = os.path.relpath(file_path, cur_path)
#     #with open('bert_data/data_from_bert/output.txt', 'wb') as file:
#     with open(path, 'wb') as file:
#         for line in mat:
#             np.savetxt(file, line, fmt='%s')

In [38]:
# # Remove web links
# def remove_links(text):
#     link_regex = re.compile(
#         "((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)", re.DOTALL
#     )
#     links = re.findall(link_regex, text)
#     for link in links:
#         text = text.replace(link[0], " , ")
#     return text


# # Remove: email adresses, and all hashtags but the first one
# def remove_emails_hashtags(text):
#     entity_prefixes = ["@", "#", "_"]
#     for separator in string.punctuation:
#         if separator not in entity_prefixes:
#             text = text.replace(separator, " ")
#     words = []
#     hastag_counts = 0
#     for word in text.split():
#         word = word.strip()
#         if "#" in word:
#             hastag_counts += 1
#         if word:
#             if "#" in word and hastag_counts < 2 or word[0] not in entity_prefixes:
#                 words.append(word)

#     return " ".join(words)

#         # tweet = re.sub(r'@[^\s\n\r]+', '', tweet) 
#         # tweet = re.sub(r'[Hh]ttps?://[^\s\n\r]+', '', tweet) 