In [1]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
# Set the TOKENIZERS_PARALLELISM environment variable to "true" for this kernel
# process before any encoding is run.
# NOTE(review): presumably consumed by the tokenizer library used underneath
# SentenceTransformer — confirm it is still honoured when set after the imports.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [2]:
# Location of the ABC news headlines CSV. Kept in one named constant so the
# notebook can be pointed at a different copy of the dataset without editing
# the load call itself.
ABCNEWS_CSV = '/datasets/vjawa/abcnews/abcnews-date-text.csv'

# Only the headline text is used by the rest of the notebook, so ask pandas to
# parse just that column instead of materialising the whole file first.
# `data` is still a pandas Series of headlines, exactly as before.
data = pd.read_csv(ABCNEWS_CSV, usecols=['headline_text'])['headline_text']

In [3]:
# Model instance that the following cells drive one internal step at a time.
topic_model = BERTopic()

# Working frame for the step-by-step run: one row per headline, carrying the
# text, a running integer ID, and a Topic column that starts out empty.
documents = pd.DataFrame(
    dict(
        Document=data,
        ID=range(len(data)),
        Topic=None,
    )
)

## Extract embeddings

In [4]:
%%time
#Extract embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(
    documents.Document,
    batch_size=64,
    show_progress_bar=True,
    # device='cpu'
)

Batches:   0%|          | 0/19161 [00:00<?, ?it/s]

CPU times: user 35min 54s, sys: 4min 33s, total: 40min 28s
Wall time: 3min 13s


## Dimensionality Reduction

In [5]:
%%time
# Dimensionality Reduction
# Feeds the sentence embeddings from the previous cell through BERTopic's
# private `_reduce_dimensionality` step and keeps the result as
# `umap_embeddings` for the clustering cell.
# NOTE(review): this is a private BERTopic method (leading underscore), so its
# signature may differ between library versions — confirm against the
# installed release.
# In the recorded run this is by far the slowest step (wall time ~1h 40min).
umap_embeddings = topic_model._reduce_dimensionality(embeddings)

CPU times: user 2d 21h 36min 1s, sys: 1h 40min 47s, total: 2d 23h 16min 49s
Wall time: 1h 40min 8s


## Cluster UMAP embeddings with HDBSCAN


In [6]:
%%time
# Cluster UMAP embeddings with HDBSCAN
# Calls BERTopic's private `_cluster_embeddings` with the reduced embeddings
# and the working `documents` frame. It returns a pair: the frame (rebound to
# `documents` here) and `probabilities`.
# NOTE(review): `documents` is overwritten in place, so re-running this cell
# operates on the already-clustered frame rather than the one built earlier.
documents, probabilities = topic_model._cluster_embeddings(umap_embeddings, documents)

CPU times: user 6min 41s, sys: 4.51 s, total: 6min 46s
Wall time: 7min 20s


## Sort and map topic IDs by frequency, then extract topics

In [7]:
%%time
# Sort and Map Topic IDs by their frequency
if not topic_model.nr_topics:
    documents = topic_model._sort_mappings_by_frequency(documents)

# Extract topics by calculating c-TF-IDF
topic_model._extract_topics(documents) # does both topic extraction and representation

CPU times: user 21.5 s, sys: 1.08 s, total: 22.5 s
Wall time: 22.5 s


## End-to-end: running `fit_transform`

In [8]:
%%time
# End-to-end run for comparison with the step-by-step cells: a fresh BERTopic
# instance is fitted on the raw headlines in a single `fit_transform` call.
# The call returns two values, kept here as `topics` and `probs`.
topic_model_pipeline = BERTopic()
topics, probs = topic_model_pipeline.fit_transform(data)

CPU times: user 1d 1h 16min 34s, sys: 56min, total: 1d 2h 12min 35s
Wall time: 42min 19s


In [9]:
%%time
# Summary table of the fitted topics; the recorded output has Topic, Count and
# Name columns, with topic -1 holding the largest count.
# NOTE(review): this inspects `topic_model` (the instance driven step by step),
# not `topic_model_pipeline`, which was fitted end-to-end in the cell just
# before — confirm which model was meant to be shown here.
topic_model.get_topic_info()

CPU times: user 24.6 ms, sys: 9.17 ms, total: 33.7 ms
Wall time: 32.5 ms


Unnamed: 0,Topic,Count,Name
0,-1,527782,-1_guilty_fire_woman_arrested
1,0,2795,0_flood_flash_flooding_floods
2,1,2700,1_fiji_tonga_tongan_fijis
3,2,2100,2_shark_sharks_surfer_sighting
4,3,1973,3_png_pngs_sorcery_oneill
...,...,...,...
11261,11469,10,11469_gospers_severe_yulara_sols
11260,11470,10,11470_rearguard_guard_bashed_ppe
11259,11471,10,11471_feria_verita_eno_parramasala
11258,11314,10,11314_pieman_tracks_4wd_coronor


In [10]:
%%time
# Show the terms for topic 0; the recorded output is a list of (word, score)
# pairs in descending score order.
# NOTE(review): scores are presumably c-TF-IDF weights — confirm against the
# BERTopic documentation. As in the previous cell, this queries `topic_model`
# rather than `topic_model_pipeline`; verify that is intended.
topic_model.get_topic(0)

CPU times: user 20 µs, sys: 0 ns, total: 20 µs
Wall time: 29.1 µs


[('flood', 0.013034136241098766),
 ('flash', 0.011030052416256098),
 ('flooding', 0.009824295447533259),
 ('floods', 0.007223232063935848),
 ('mitigation', 0.005399498008327533),
 ('floodwaters', 0.0052701544148756685),
 ('deluge', 0.0035567393420317893),
 ('ses', 0.002957053886676728),
 ('recede', 0.0029317263206314545),
 ('flooded', 0.0028729524787099456)]