In [1]:
from bertopic import BERTopic
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import pandas as pd
from pathlib import Path

In [2]:
data_path = ( Path.cwd() / "data/RoundedCleanedArticles.csv").resolve()
data = pd.read_csv(data_path, encoding='utf-8')

In [3]:
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
embeddings = sentence_model.encode(data['title_topic'])

# TODO: try n_gram_range=(a, b) inside BERTopic
cluster_model = AgglomerativeClustering(linkage='ward', distance_threshold=1.5, n_clusters=None)
topic_model = BERTopic(hdbscan_model=cluster_model).fit(data['title_topic'], embeddings)
topics, probs = topic_model.fit_transform(data['title_topic'])

topic_labels = topic_model.generate_topic_labels(nr_words=3,
                                                 topic_prefix=False,
                                                 word_length=15,
                                                 separator=", ")
topic_model.set_topic_labels(topic_labels)
# topic_model.save('topic_model')

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName
0,0,9,0_star_wars_andor_jedi,"star, wars, andor"
1,1,6,1_ukraine_war_bomb_dirty,"ukraine, war, bomb"
2,2,5,2_cup_world_netherlands_t20,"cup, world, netherlands"
3,3,4,3_china_alarmed_yuan_covid,"china, alarmed, yuan"


In [4]:
topic_labels_series = pd.Series(topic_labels)
docs_topic = topic_labels_series[topics].tolist()
data['predicted_topic'] = docs_topic
display(data)

data_path = ( Path.cwd() / "data/RoundedWithTopic.csv").resolve()
data.to_csv(data_path, encoding='utf-8', index=False)

Unnamed: 0,title,title_stance,title_topic,predicted_topic
0,Count Dooku Voice Actor Corey Burton Tried Som...,count dooku voice actor corey burton tried som...,count dooku voice actor corey burton tried som...,"star, wars, andor"
1,'Andor' Episode 8 Explained: 'Rogue One' Cameo...,andor episode 8 explained rogue one cameos and...,andor episode 8 explained rogue one cameos sta...,"star, wars, andor"
2,Andor Gave Us the Gayest Screen Fade in Star W...,andor gave us the gayest screen fade in star w...,andor gave us gayest screen fade star wars his...,"star, wars, andor"
3,"Ahsoka Tano Herself, Ashley Eckstein, Breaks D...",ahsoka tano herself ashley eckstein breaks dow...,ahsoka tano ashley eckstein breaks star wars t...,"star, wars, andor"
4,Star Wars: The Deckbuilding Game could rule th...,star wars the deckbuilding game could rule the...,star wars deckbuilding game could rule galaxy ...,"star, wars, andor"
5,There's a new Star Wars project from Damon Lin...,there s a new star wars project from damon lin...,new star wars project damon lindelof works,"star, wars, andor"
6,Star Wars Fatigue Shouldn't Stop You From Watc...,star wars fatigue shouldn t stop you from watc...,star wars fatigue stop watching andor,"star, wars, andor"
7,'Andor' is the best 'Star Wars' show since 'Th...,andor is the best star wars show since the man...,andor best star wars show since mandalorian le...,"star, wars, andor"
8,Star Wars characters take over Mexico City - R...,star wars characters take over mexico city,star wars characters take mexico city,"star, wars, andor"
9,"Alarmed by suicide attack, China and Pakistan ...",alarmed by suicide attack china and pakistan j...,alarmed suicide attack china pakistan join han...,"china, alarmed, yuan"
