In [1]:
!pip install BERTopic

Collecting BERTopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from BERTopic)
  Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from BERTopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->BERTopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB

In [2]:
from bertopic import BERTopic
import json
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer

In [3]:
import json
# Path to the exported comment records, relative to the notebook's working directory.
file_path = 'controversial_users_comments.json'

# Read the file as UTF-8 explicitly instead of relying on the platform's
# default codec: the comments contain non-ASCII characters (emoji, curly
# apostrophes) that a non-UTF-8 default would fail on or mis-decode.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Step 1 - Extract embeddings
# Sentence-transformers checkpoint used to embed each comment.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
# UMAP is stochastic: without a fixed random_state each run produces a
# different projection, and therefore different clusters and topic ids.
# Seeding makes the topic model reproducible across kernel restarts, at the
# cost of UMAP's parallelism.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# Step 3 - Cluster reduced embeddings
# prediction_data=True keeps the extra data HDBSCAN needs to assign topics to
# documents after fitting.
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
# English stop words are dropped before topic words are counted.
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Build the list of comment bodies: take the 'text' value of every record in
# `data` that actually carries that key, skipping records without it.
comments = [
    record.get('text')
    for record in data
    if 'text' in record
]

In [9]:
# Assemble the full BERTopic pipeline from the components configured in the
# earlier cell, then fit it on the extracted comments.
topic_model = BERTopic(
    # Step 1 - Extract embeddings
    embedding_model=embedding_model,
    # Step 2 - Reduce dimensionality
    umap_model=umap_model,
    # Step 3 - Cluster reduced embeddings
    hdbscan_model=hdbscan_model,
    # Step 4 - Tokenize topics
    vectorizer_model=vectorizer_model,
    # Step 5 - Extract topic words
    ctfidf_model=ctfidf_model,
    # Step 6 - (Optional) Fine-tune topic representations
    representation_model=representation_model,
)

# Fit the model and keep the per-comment results for later inspection.
topics, probs = topic_model.fit_transform(comments)

In [10]:
# Show the fitted topics: a dict keyed by topic id whose values are lists of
# (word, score) pairs. The dict covers every topic, so the output is long.
topic_model.get_topics()

{-1: [('gaza', 0.628116),
  ('palestinians', 0.6015337),
  ('palestinian', 0.5458102),
  ('hamas', 0.5379767),
  ('palestine', 0.52899337),
  ('israeli', 0.47779593),
  ('israel', 0.472199),
  ('israelis', 0.45431316),
  ('bombing', 0.27720353),
  ('terrorists', 0.26562995)],
 0: [('gaza', 0.6286634),
  ('gazan', 0.5910463),
  ('hamas', 0.57940197),
  ('palestinian', 0.53890187),
  ('israeli', 0.48496854),
  ('children', 0.4567936),
  ('israel', 0.42835388),
  ('terrorists', 0.39732218),
  ('infants', 0.38199216),
  ('militants', 0.3800876)],
 1: [('idf', 0.83128124),
  ('idfs', 0.8141105),
  ('civilians', 0.45841882),
  ('military', 0.39022696),
  ('terrorists', 0.3871957),
  ('soldiers', 0.3804301),
  ('civilian', 0.37368897),
  ('army', 0.37277108),
  ('terrorist', 0.3637004),
  ('bombing', 0.34860817)],
 2: [('genocide', 0.6841537),
  ('genocides', 0.6267208),
  ('genocided', 0.58907926),
  ('palestinians', 0.5717684),
  ('holocaust', 0.5483699),
  ('gaza', 0.5450766),
  ('palestin

In [11]:
# Per-comment view of the fit: one row per document with its assigned topic,
# the topic's name and representation, representative documents, top words,
# and the assignment probability.
topic_model.get_document_info(comments)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,i genuinely pity anyone who can read this and ...,831,831_cheering_cheer_cheeringtop_cheerleaders,"[cheering, cheer, cheeringtop, cheerleaders, a...","[half of reddit is cheering it on., not cheeri...",cheering - cheer - cheeringtop - cheerleaders ...,1.000000,False
1,doesn't shock me. israel is showing it's true ...,-1,-1_gaza_palestinians_palestinian_hamas,"[gaza, palestinians, palestinian, hamas, pales...",[nrc is the dutch paper of record. here is an ...,gaza - palestinians - palestinian - hamas - pa...,0.000000,False
2,there is no real peace here. the two sides hat...,-1,-1_gaza_palestinians_palestinian_hamas,"[gaza, palestinians, palestinian, hamas, pales...",[nrc is the dutch paper of record. here is an ...,gaza - palestinians - palestinian - hamas - pa...,0.000000,False
3,i guess they didn't condemn hamas loudly enoug...,61,61_hamas_palestinian_condemn_hamasis,"[hamas, palestinian, condemn, hamasis, condemn...","[do you condemn hamas, but i condemn hamas, 😢,...",hamas - palestinian - condemn - hamasis - cond...,0.560773,False
4,"the comments. too many new people, some are c...",1,1_idf_idfs_civilians_military,"[idf, idfs, civilians, military, terrorists, s...","[same with the idf, and the idf., idf too]",idf - idfs - civilians - military - terrorists...,0.879226,False
...,...,...,...,...,...,...,...,...
198874,only because it served their needs.,-1,-1_gaza_palestinians_palestinian_hamas,"[gaza, palestinians, palestinian, hamas, pales...",[nrc is the dutch paper of record. here is an ...,gaza - palestinians - palestinian - hamas - pa...,0.000000,False
198875,"i certainly didn't, but you try to justify the...",-1,-1_gaza_palestinians_palestinian_hamas,"[gaza, palestinians, palestinian, hamas, pales...",[nrc is the dutch paper of record. here is an ...,gaza - palestinians - palestinian - hamas - pa...,0.000000,False
198876,stupid racist shit is fine since it didn’t hap...,27,27_racism_racist_racists_racial,"[racism, racist, racists, racial, race, blacks...","[that's racism., racism, because racism.]",racism - racist - racists - racial - race - bl...,0.922957,False
198877,germany and forcing other eu members to take o...,-1,-1_gaza_palestinians_palestinian_hamas,"[gaza, palestinians, palestinian, hamas, pales...",[nrc is the dutch paper of record. here is an ...,gaza - palestinians - palestinian - hamas - pa...,0.000000,False


In [12]:
# Persist the fitted model in safetensors format.
# - save_ctfidf=True also stores the c-TF-IDF representation, which the
#   safetensors format leaves out by default.
# - save_embedding_model records the Hugging Face id of the embedding model
#   explicitly, so loading does not depend on BERTopic inferring which
#   sentence-transformers checkpoint was used.
topic_model.save(
    "Israel_Hamas_ControversialComments_model",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
)