In [1]:
from sklearn.datasets import fetch_20newsgroups
import cudf
import cupy as cp
import pandas as pd
from cuBERTopic import gpu_BERTopic
from embedding_extraction import create_embeddings
import rmm
import os
from transformers import AutoModel

# Environment flag read by the Hugging Face `tokenizers` library; "true"
# leaves its internal parallelism enabled.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Re-initialise RMM with a pool allocator and an initial pool of 5 * 10**9 bytes.
# RMM documents the pool size as an integer byte count, so pass an exact int
# (same value as the former float literal 5e+9) instead of relying on an
# implicit float-to-integer conversion.
rmm.reinitialize(pool_allocator=True, initial_pool_size=5 * 10**9)

## Read Dataset

In [2]:
# Location of the ABC news headlines CSV. The default is the original
# hard-coded path; set the ABCNEWS_CSV_PATH environment variable to run the
# notebook against a copy stored elsewhere without editing this cell.
dataset_path = os.environ.get(
    "ABCNEWS_CSV_PATH", "/datasets/vjawa/abcnews/abcnews-date-text.csv"
)
test_dataset = cudf.read_csv(dataset_path)

# The headline text column is the corpus handed to the topic model below.
data = test_dataset["headline_text"]

In [3]:
# Model instance driven step by step in the cells that follow.
gpu_topic = gpu_BERTopic()

# One row per headline: the text, a running integer ID, and a "Topic" column
# that starts out as None.
doc_ids = cp.arange(len(data))
documents = cudf.DataFrame({"Document": data, "ID": doc_ids, "Topic": None})

## Extract embeddings

In [4]:
# Load the pretrained checkpoint and move the resulting model to the GPU.
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = AutoModel.from_pretrained(checkpoint).to('cuda')

In [5]:
%%time
# Compute embeddings for the "Document" column using the loaded model and the
# vocabulary hash file.
embeddings = create_embeddings(
    documents["Document"],
    embedding_model,
    vocab_file='../vocab/voc_hash.txt',
)

CPU times: user 2min 19s, sys: 1.59 s, total: 2min 21s
Wall time: 2min 14s


## UMAP Dimensionality Reduction

In [6]:
%%time
# Reduce the dimensionality of the document embeddings.
umap_embeddings = gpu_topic.reduce_dimensionality(
    embeddings,
)
# The full-size embeddings are not referenced again below; drop the name.
del embeddings

CPU times: user 1min 4s, sys: 34.1 s, total: 1min 39s
Wall time: 1min 38s


## Cluster Embeddings

In [7]:
%%time
# Run HDBSCAN clustering on the reduced embeddings. `documents` is rebound to
# the frame returned by the call, alongside a `probabilities` result.
documents, probabilities = gpu_topic.clustering_hdbscan(umap_embeddings, documents)
# The reduced embeddings are not referenced again below; drop the name.
del umap_embeddings

Label prop iterations: 36
CPU times: user 1min 7s, sys: 24.4 s, total: 1min 31s
Wall time: 1min 31s


## Topic representation

In [8]:
%%time
# Build the topic representation inputs from the clustered documents.
tf_idf, count, docs_per_topics_topics = gpu_topic.create_topics(documents)

# Extract the top words per topic (n=30) together with a name representation.
top_n_words, name_repr = gpu_topic.extract_top_n_words_per_topic(
    tf_idf,
    count,
    docs_per_topics_topics,
    n=30,
)

# Add a "Name" column to the topic-size table by mapping each "Topic" value
# through name_repr.
topic_sizes = gpu_topic.topic_sizes_df
topic_sizes["Name"] = topic_sizes["Topic"].map(name_repr)

CPU times: user 2.31 s, sys: 536 ms, total: 2.85 s
Wall time: 3.69 s




## Running the end-to-end pipeline to extract topics

In [9]:
%%time
# End-to-end run: a fresh model configured with the vocabulary hash file is
# fitted on the raw headlines in a single fit_transform call.
gpu_topic_pipeline = gpu_BERTopic(
    vocab_file='../vocab/voc_hash.txt',
)
topics_gpu, probs_gpu = gpu_topic_pipeline.fit_transform(data)

CPU times: user 4min 41s, sys: 57.4 s, total: 5min 39s
Wall time: 5min 32s
