In [1]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
# Ask the Hugging Face tokenizers backend to run in its parallel mode.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Corpus: the 'all' subset of 20 Newsgroups, kept as the dataset's `data` field.
# (Disabled alternative: the first 250,000 `headline_text` rows of a local
# abcnews-date-text.csv file.)
data = fetch_20newsgroups(subset="all").data

In [2]:
# Model instance whose internal pipeline stages are invoked one at a time in
# the following cells.
topic_model = BERTopic()

# One row per document: the raw text, a positional ID, and a Topic column that
# is all None at this point.
documents = pd.DataFrame(
    {
        "Document": data,
        "ID": range(len(data)),
        "Topic": None,
    }
)

In [3]:
%%time
# Extract one sentence embedding per document.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(
    # encode() is documented for a str or a list of str. Pass a plain list so
    # the call does not rely on the Series having a default 0..n-1 index when
    # encode() looks sentences up by position.
    documents["Document"].tolist(),
    show_progress_bar=False,
    # Add device="cpu" here to force CPU inference.
)

CPU times: user 3min 17s, sys: 13.7 s, total: 3min 30s
Wall time: 35.9 s


In [4]:
%%time
# Dimensionality Reduction
# Calls BERTopic's private `_reduce_dimensionality` helper directly on the
# sentence embeddings so that this stage can be timed on its own.
# NOTE(review): private API — presumably the UMAP step, judging by the result
# name; confirm the signature against the installed BERTopic version.
umap_embeddings = topic_model._reduce_dimensionality(embeddings)

CPU times: user 12min 20s, sys: 8.06 s, total: 12min 28s
Wall time: 27.8 s


In [6]:
%%time
# Cluster UMAP embeddings with HDBSCAN
# The private `_cluster_embeddings` helper takes the reduced embeddings and the
# documents frame; `documents` is rebound to the frame it returns, alongside
# a second return value stored as `probabilities`.
# NOTE(review): presumably the returned frame has its Topic column filled with
# cluster labels — confirm against the installed BERTopic version.
documents, probabilities = topic_model._cluster_embeddings(umap_embeddings, documents)

CPU times: user 901 ms, sys: 604 ms, total: 1.5 s
Wall time: 1.86 s


In [7]:
%%time
# Sort and Map Topic IDs by their frequency
# Only done when `nr_topics` is falsy (e.g. unset); `documents` is rebound to
# the frame returned by the private helper.
if not topic_model.nr_topics:
    documents = topic_model._sort_mappings_by_frequency(documents)

# Extract topics by calculating c-TF-IDF
# Called for its effect on `topic_model`; the return value is not used.
topic_model._extract_topics(documents) # does both topic extraction and representation

CPU times: user 6.56 s, sys: 160 ms, total: 6.72 s
Wall time: 6.71 s


In [8]:
## Running the full pipeline end-to-end with fit_transform

In [9]:
%%time
# Fit a second, fresh BERTopic model end-to-end on the same raw documents.
# NOTE(review): fit_transform presumably performs the embedding, reduction,
# clustering and topic-extraction stages internally, i.e. the stages that the
# earlier cells ran and timed one by one — confirm.
topic_model_pipeline = BERTopic()
topics, probs = topic_model_pipeline.fit_transform(data)

CPU times: user 21min, sys: 35.1 s, total: 21min 36s
Wall time: 1min 2s


In [10]:
%%time
# Per-topic summary table (Topic, Count, Name) for the manually assembled model.
# NOTE(review): this inspects `topic_model`, not the `topic_model_pipeline`
# fitted in the preceding cell — confirm that is the model intended here.
topic_model.get_topic_info()

CPU times: user 4.52 ms, sys: 7 µs, total: 4.52 ms
Wall time: 3.89 ms


Unnamed: 0,Topic,Count,Name
0,-1,6280,-1_email_program_file_information
1,0,853,0_baseball_game_team_year
2,1,307,1_gun_guns_militia_firearms
3,2,187,2_image_3d_graphics_processing
4,3,152,3_atheists_atheism_atheist_god
...,...,...,...
362,361,10,361_solar_sail_sails_snydefjengauburnedu
363,362,10,362_widgets_gabi_gui_motif
364,363,10,363_mining_ecofreaks_miners_basalts
365,364,10,364_s1_s2_u1_u2


In [11]:
%%time
# (term, score) pairs for topic 0 of the manually assembled model.
topic_model.get_topic(0)

CPU times: user 17 µs, sys: 1e+03 ns, total: 18 µs
Wall time: 28.4 µs


[('baseball', 0.00672086361283629),
 ('game', 0.005817995879831344),
 ('team', 0.005618226784043931),
 ('year', 0.005353854197670103),
 ('players', 0.005285759166032568),
 ('braves', 0.0051786048900586456),
 ('games', 0.004970825796636513),
 ('hit', 0.004924050725108594),
 ('runs', 0.004664071624908669),
 ('pitching', 0.004447818375223756)]