In [14]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
import string
import numpy as np
import random

custom_words_to_filter = ['Reuters', 'Reuters.com', 'CNET']

docs_StarWars = ["Count Dooku Voice Actor Corey Burton Tried Something New on Star Wars: Tales of the Jedi", "'Andor' Episode 8 Explained: 'Rogue One' Cameos and a 'Star Wars' Hell Prison - CNET", "Andor Gave Us the Gayest Screen Fade in Star Wars History", "Ahsoka Tano Herself, Ashley Eckstein, Breaks Down Star Wars: Tales of the Jedi", "Star Wars: The Deckbuilding Game could rule the galaxy of 2-player card games", "There's a new Star Wars project from Damon Lindelof in the works", "Star Wars Fatigue Shouldn't Stop You From Watching 'Andor' - CNET", "'Andor' is the best 'Star Wars' show since 'The Mandalorian' — but the least popular", "Star Wars characters take over Mexico City - Reuters"]

docs_China = ["Alarmed by suicide attack, China and Pakistan join hands in probe - Reuters", "Death of boy in lockdown fuels backlash against China's zero-Covid policy", "UPDATE 2-Hong Kong stocks tumble as Xi appointments fan economic fears; yuan weakens", "China supports central SOEs to issue tech innovation bonds - Reuters"]

docs_WorldCup = ["World Cup: FIFA head comments on beer ban - CP24", "T20 World Cup: Bangladesh bowlers impress to secure victory over Netherlands", "Canada's goalkeeper Crepeau to miss World Cup with broken leg - Reuters", "World Cup stadium alcohol ban emblematic of contradictions", "T20 World Cup: Pakistan beat Netherlands to avoid elimination"]

docs_Ukraine = ["Ukraine war: Wagner chief Prigozhin defends brutal killing video", "Chance for peace in Ukraine, says France's Macron - Reuters", "Ukraine nuclear agency thickens alleged dirty bomb plot - CTV News", "Ukraine war round-up: Missile blast in Poland and Zambian family's grief", "Ukraine round-up: Refugees urged to stay away and 'dirty bomb' claims", "Russia says Ukraine hands over 50 prisoners of war - Reuters.com"]

docs = [docs_StarWars, docs_China, docs_WorldCup, docs_Ukraine]
docs = list(np.concatenate(docs).flat)
random.shuffle(docs)

lemmatizer = nltk.stem.WordNetLemmatizer()
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
stop_words = nltk.corpus.stopwords.words('english')
words_to_filter = np.concatenate((stop_words, custom_words_to_filter))

def clean_text(text):
    text = "".join([char for char in text if char not in string.punctuation])
    text = lemmatizer.lemmatize(text)
    text = tokenizer.tokenize(text)
    text = [word for word in text if word not in words_to_filter]
    text = ' '.join(text)
    text = text.lower()

    return text

docs = [clean_text(doc) for doc in docs]
print(docs)

['world cup stadium alcohol ban emblematic contradictions', 'ukraine roundup refugees urged stay away dirty bomb claims', 'ukraine war roundup missile blast poland zambian familys grief', 'ahsoka tano herself ashley eckstein breaks down star wars tales jedi', 't20 world cup pakistan beat netherlands avoid elimination', 'star wars characters take mexico city', 'andor best star wars show since the mandalorian least popular', 't20 world cup bangladesh bowlers impress secure victory netherlands', 'star wars fatigue shouldnt stop you from watching andor', 'russia says ukraine hands 50 prisoners war reuterscom', 'chance peace ukraine says frances macron', 'alarmed suicide attack china pakistan join hands probe', 'andor gave us gayest screen fade star wars history', 'world cup fifa head comments beer ban cp24', 'ukraine nuclear agency thickens alleged dirty bomb plot ctv news', 'andor episode 8 explained rogue one cameos star wars hell prison', 'china supports central soes issue tech innovati

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\steem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\steem\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
from bertopic import BERTopic
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
embeddings = sentence_model.encode(docs)


cluster_model = AgglomerativeClustering(linkage='ward', distance_threshold=1.5, n_clusters=None)
topic_model = BERTopic(hdbscan_model=cluster_model).fit(docs, embeddings)
topics, probs = topic_model.fit_transform(docs)

topic_labels = topic_model.generate_topic_labels(nr_words=3,
                                                 topic_prefix=False,
                                                 word_length=15,
                                                 separator=", ")
topic_model.set_topic_labels(topic_labels)

topic_model.get_topic_info()

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Unnamed: 0,Topic,Count,Name,CustomName
0,0,9,0_wars_star_andor_jedi,"wars, star, andor"
1,1,6,1_ukraine_war_says_roundup,"ukraine, war, says"
2,2,5,2_cup_world_netherlands_t20,"cup, world, netherlands"
3,3,4,3_china_2hong_yuan_fuels,"china, 2hong, yuan"


In [17]:
topic_model.visualize_documents(docs, embeddings=embeddings, custom_labels=True)