In [1]:
import os
# Set before the GPU libraries below are imported. The path is specific to the
# machine this notebook was authored on (CUDA 11.5); adjust it to your install.
# NOTE(review): presumably consumed by CuPy to locate nvcc -- confirm.
os.environ['NVCC']="/usr/local/cuda-11.5/bin/nvcc"

from sklearn.datasets import fetch_20newsgroups
import cudf
import cupy as cp
import pandas as pd
from cuBERTopic import gpu_BERTopic
from embedding_extraction import create_embeddings
import rmm
from transformers import AutoModel

os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Switch RMM to a pool allocator with an initial pool of 5e9 bytes. The size is
# a byte count, so pass it as an integer rather than the float literal 5e+9.
rmm.reinitialize(pool_allocator=True, initial_pool_size=5_000_000_000)

## Read Dataset

In [2]:
# Read the ABC news CSV with cuDF, then keep only the headline text column as
# the corpus used by every later cell.
test_dataset = cudf.read_csv(
    '/datasets/vjawa/abcnews/abcnews-date-text.csv'
)
data = test_dataset['headline_text']

In [3]:
# One row per headline: the text, a running integer ID, and a "Topic" column
# that starts out empty.
documents = cudf.DataFrame(
    {
        "Document": data,
        "ID": cp.arange(len(data)),
        "Topic": None,
    }
)
# Model object whose individual stage methods are called in the cells below.
gpu_topic = gpu_BERTopic()

## Extract embeddings

In [4]:
# Load the pretrained sentence-transformer checkpoint, then move it to the GPU.
embedding_model = AutoModel.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2"
)
embedding_model = embedding_model.to('cuda')

In [5]:
%%time
# Extract embeddings
embeddings = create_embeddings(documents.Document, 
                               embedding_model,
                               vocab_file='../vocab/voc_hash.txt')

CPU times: user 2min 17s, sys: 1.68 s, total: 2min 19s
Wall time: 2min 13s


## UMAP Dimensionality Reduction

In [6]:
%%time
# Dimensionality Reduction
umap_embeddings = gpu_topic.reduce_dimensionality(embeddings)
del embeddings

CPU times: user 1min 4s, sys: 34.2 s, total: 1min 39s
Wall time: 1min 38s


## Cluster Embeddings

In [7]:
%%time
# Cluster UMAP embeddings with HDBSCAN
documents, probabilities = gpu_topic.cluster_embeddings(
    umap_embeddings,
    documents
)
del umap_embeddings

Label prop iterations: 45
Label prop iterations: 14
Label prop iterations: 9
Label prop iterations: 9
Label prop iterations: 7
Label prop iterations: 5
Label prop iterations: 4
Iterations: 7
2689,8352,16122,23,650,23704
Label prop iterations: 6
Label prop iterations: 3
Label prop iterations: 3
Iterations: 3
1141,165,7008,9,267,671
Label prop iterations: 5
Label prop iterations: 3
Iterations: 2
1000,124,5260,10,186,412
CPU times: user 1min 6s, sys: 24.2 s, total: 1min 30s
Wall time: 1min 30s


## C-TF-IDF

In [8]:
%%time
# Topic representation
tf_idf, vectorizer, topic_labels = gpu_topic.create_topics(documents)

CPU times: user 998 ms, sys: 274 ms, total: 1.27 s
Wall time: 1.3 s


## Topic representation

In [14]:
%%timeit
top_n_words, name_repr = gpu_topic.extract_top_n_words_per_topic(
    tf_idf, vectorizer, topic_labels, n=30
)

gpu_topic.topic_sizes_df["Name"] = gpu_topic.topic_sizes_df["Topic"].map(
    name_repr
)

624 ms ± 6.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Display the per-topic table (Topic id, Count, Name) built in the previous cell.
gpu_topic.topic_sizes_df

Unnamed: 0,Topic,Count,Name
0,-1,556862,-1_fire_train_guilty_assault
6291,6290,5875,6290_gaza_israeli_palestinian_israel
8657,8656,3984,8656_water_irrigators_restrictions_allocations
8591,8590,3029,8590_flood_flash_flooding_floods
7146,7145,2948,7145_electoral_voting_polling_ballot
...,...,...,...
10915,10914,10,10914_molotov_cocktail_cocktails_bollards
10919,10918,10,10918_breivik_behring_anders_breiviks
10934,10933,10,10933_springborg_cute_blighs_lawrence
10952,10951,10,10951_kohler_alan_finance_tuesday


## Running the end-to-end pipeline to extract topics

In [11]:
%%time
gpu_topic_model = gpu_BERTopic(vocab_file='../vocab/voc_hash.txt')
topics_gpu, probs_gpu = gpu_topic_model.fit_transform(data)

Label prop iterations: 37
Label prop iterations: 13
Label prop iterations: 11
Label prop iterations: 7
Label prop iterations: 6
Label prop iterations: 6
Label prop iterations: 2
Label prop iterations: 2
Iterations: 8
2716,8391,17865,24,746,20056
Label prop iterations: 9
Label prop iterations: 3
Label prop iterations: 2
Iterations: 3
961,162,7011,11,271,743
Label prop iterations: 3
Iterations: 1
969,82,3506,6,95,162
CPU times: user 4min 35s, sys: 57.8 s, total: 5min 33s
Wall time: 5min 26s


In [12]:
%%time
# Summary table for the end-to-end model (Topic id, Count, Name), timed.
gpu_topic_model.get_topic_info()

CPU times: user 21.7 ms, sys: 0 ns, total: 21.7 ms
Wall time: 18.4 ms


Unnamed: 0,Topic,Count,Name
0,-1,558053,-1_guilty_man_murder_fire
8018,0,3221,0_korea_jong_korean_koreas
10000,1,3010,1_coronavirus_covid_cases_covid19
9308,2,2941,2_wine_grape_winery_winemakers
9276,3,2780,3_bushfire_bushfires_contained_downgraded
...,...,...,...
10804,10865,10,10865_patel_lavarch_airfare_patels
10810,10866,10,10866_mailings_asbestos_lacey_missions
10821,10867,10,10867_asbestos_knowingly_minimise_pits
10851,10868,10,10868_oats_xi_scatter_ticehurst


In [13]:
# Show the (word, score) pairs stored for topic 0 of the end-to-end model.
gpu_topic_model.get_topic(0)

[('korea', 0.01919563531487256),
 ('jong', 0.01433898681735657),
 ('korean', 0.012157346925094625),
 ('koreas', 0.009394400597796893),
 ('kim', 0.009285031247095787),
 ('north', 0.00619449878965439),
 ('missile', 0.005732123749010998),
 ('il', 0.005723975448621466),
 ('nkorea', 0.0052962113104115804),
 ('koreans', 0.005070683452143232)]