In [1]:
from sklearn.datasets import fetch_20newsgroups
import cudf
import cupy as cp
import pandas as pd
from cuBERTopic import gpu_BERTopic
from embedding_extraction import create_embeddings
import rmm
import os
from transformers import AutoModel

# Set the TOKENIZERS_PARALLELISM flag for the rest of the session
# (presumably read by the HuggingFace tokenizers backend -- confirm it is
# actually consulted by this pipeline).
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Switch RMM to a pool allocator with a 5 GB initial pool.
# The pool size is a byte count, so pass an exact integer instead of the
# float literal 5e+9 (same value, no reliance on implicit float->int coercion).
rmm.reinitialize(pool_allocator=True, initial_pool_size=5 * 10**9)

## Read Dataset

In [2]:
# Location of the abcnews CSV. The original hard-coded path stays the default;
# set the ABCNEWS_CSV environment variable to read the file from elsewhere.
dataset_path = os.environ.get(
    "ABCNEWS_CSV", '/datasets/vjawa/abcnews/abcnews-date-text.csv'
)
test_dataset = cudf.read_csv(dataset_path)

# The `headline_text` column is the corpus handed to the topic model below.
data = test_dataset['headline_text']

In [3]:
# Topic-model object whose individual stages are invoked cell by cell below.
gpu_topic = gpu_BERTopic()

# One row per entry of `data`: the text itself, a positional ID, and a
# "Topic" column initialised to None.
documents = cudf.DataFrame(
    {
        "Document": data,
        "ID": cp.arange(len(data)),
        "Topic": None,
    }
)

## Extract embeddings

In [4]:
# Load the pretrained checkpoint first, then move the model onto the GPU.
embedding_model = AutoModel.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2"
)
embedding_model = embedding_model.to('cuda')

In [5]:
%%time
# Extract embeddings
embeddings = create_embeddings(documents.Document, 
                               embedding_model,
                               vocab_file='../vocab/voc_hash.txt')

CPU times: user 2min 44s, sys: 1.57 s, total: 2min 46s
Wall time: 2min 53s


## UMAP Dimensionality Reduction

In [6]:
%%time
# Dimensionality Reduction
umap_embeddings = gpu_topic.reduce_dimensionality(embeddings)
del embeddings

CPU times: user 1min 4s, sys: 34.3 s, total: 1min 38s
Wall time: 1min 38s


## Cluster Embeddings

In [7]:
%%time
# Cluster UMAP embeddings with HDBSCAN
documents, probabilities = gpu_topic.clustering_hdbscan(
    umap_embeddings,
    documents
)
del umap_embeddings

Label prop iterations: 42
CPU times: user 1min 6s, sys: 24.6 s, total: 1min 31s
Wall time: 1min 31s


## Topic representation

In [8]:
%%time
# Topic representation
tf_idf, count, docs_per_topics_topics = gpu_topic.create_topics(
    documents
)
top_n_words, name_repr = gpu_topic.extract_top_n_words_per_topic(
    tf_idf, count, docs_per_topics_topics, n=30
)

gpu_topic.topic_sizes_df["Name"] = gpu_topic.topic_sizes_df["Topic"].map(
    name_repr
)

CPU times: user 5.35 s, sys: 572 ms, total: 5.92 s
Wall time: 6.07 s


## Running the end-to-end pipeline to extract topics

In [9]:
%%time
gpu_topic_model = gpu_BERTopic(vocab_file='../vocab/voc_hash.txt')
topics_gpu, probs_gpu = gpu_topic_model.fit_transform(data)

CPU times: user 5min 12s, sys: 57 s, total: 6min 9s
Wall time: 6min 2s


In [10]:
# Bare final expression so the notebook renders the returned topic table
# (Topic, Count and Name columns, as seen in the cell output).
gpu_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,556666,-1_dealer_burglary_girlfriend_pentagon
6139,0,5903,0_hezbollah_abbas_palestinians_beirut
6794,1,3425,1_jong_koreas_il_koreans
8758,2,2979,2_winemakers_wines_winery_grapes
8682,3,2723,3_fijian_tongan_fijis_tonga
...,...,...,...
10913,10944,10,10944_fluoridation_fluoride_warrumbungle_fluor...
10916,10945,10,10945_varischetti_stoate_belinda_hiscock
10917,10946,10,10946_toad_lone_inspection_hop
10929,10947,10,10947_hodgman_hodgmans_ricahrd_018


In [11]:
%%time
# Look up topic 0; the displayed value is a list of (word, score) pairs.
gpu_topic_model.get_topic(0)

CPU times: user 330 ms, sys: 9.87 ms, total: 340 ms
Wall time: 575 ms


[('hezbollah', 0.00610913676936764),
 ('abbas', 0.005759148089882871),
 ('palestinians', 0.005563681779571561),
 ('beirut', 0.005238712666775752),
 ('lebanese', 0.005119743222673001),
 ('jerusalem', 0.004823289645864537),
 ('israelis', 0.004637208955532768),
 ('sharon', 0.004572436689574688),
 ('fatah', 0.004421821650886115),
 ('israels', 0.0035350177821009965)]