In [1]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.9.4-py2.py3-none-any.whl (57 kB)
[?25l[K     |█████▊                          | 10 kB 24.1 MB/s eta 0:00:01[K     |███████████▍                    | 20 kB 24.9 MB/s eta 0:00:01[K     |█████████████████               | 30 kB 25.9 MB/s eta 0:00:01[K     |██████████████████████▊         | 40 kB 14.0 MB/s eta 0:00:01[K     |████████████████████████████▍   | 51 kB 13.3 MB/s eta 0:00:01[K     |████████████████████████████████| 57 kB 3.4 MB/s 
Collecting hdbscan>=0.8.27
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 23.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 9.5 MB/s 
[?25hCollecting umap-learn>=0.5.0
  Downloa

In [2]:
from bertopic import BERTopic
import pandas as pd

## Data scarcity issue

First I tried `nrows=10`, thinking to speed up the exploratory part with just a little bit of data. But I kept getting the error `zero-size array to reduction operation maximum which has no identity` even after increasing to 100 rows. Luckily I found an issue in `MaartenGr/BERTopic` about this, which revealed that too little data will do that as there is not enough text from which to extract any meaningful topics.

I then increased to `nrows=500`, since the issue indicated that this seemed to be the minimum amount of data required. Then I had another problem: 

In [79]:
# Change to nrows=5563 for the full dataset
#df = pd.read_json('https://files.ifi.uzh.ch/cl/siclemat/lehre/fs21/tm/data/all_de_topics.jsonl', lines=True, nrows=500)
df = pd.read_json('https://files.ifi.uzh.ch/cl/siclemat/lehre/fs21/tm/data/all_de_topics.jsonl',lines = True, nrows=5563)
docs = df.iloc[:,0].tolist()

topic_model = BERTopic(verbose=True, language='German')
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()
topic_model.get_topic(0) # full of stop-words 

Batches:   0%|          | 0/174 [00:00<?, ?it/s]

2022-04-24 12:42:19,074 - BERTopic - Transformed documents to Embeddings
2022-04-24 12:42:35,513 - BERTopic - Reduced dimensionality with UMAP
2022-04-24 12:42:35,749 - BERTopic - Clustered UMAP embeddings with HDBSCAN


Unnamed: 0,content,target_names
0,"Ich bitte Sie namens der Minderheit, diese Mot...",FraktionderSchweizerischenVolkspartei
1,Die Kommission für Verkehr und Fernmeldewesen ...,FraktionderSchweizerischenVolkspartei
2,"Ich bitte Sie, in dieser Frage dem Weg der Kom...",FraktionderSchweizerischenVolkspartei


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('german')
#stops = set(stopwords.words('german'))
stop_words.extend(['geht', 'müssen', 'muss','ja','sagen','frage','mehr','immer','schon','wurde','000', 'gesagt','sowie','de','bänziger','gibt'])
stops = set(stop_words)

vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stops)

topic_model = BERTopic(verbose=True, language='German', vectorizer_model=vectorizer_model)
topics_svp, probs = topic_model.fit_transform(docs)

topic_model.get_topic_info()
topic_model.get_topic(0)

In [99]:
parteis = set(df.iloc[:,1].tolist()) # two main parties (SVP/SP)

# get SVP - speeches -> create doc_svp out of it
# get SP - speeches -> create doc_sp out of it
df_svp = df[df['target_names']=='FraktionderSchweizerischenVolkspartei']
df_sp = df[df['target_names']=='SozialdemokratischeFraktion']
docs_svp = df_svp.iloc[:,0].tolist()
docs_sp = df_sp.iloc[:,0].tolist()

"""
# SVP topic model - stop-word problem:

topic_model = BERTopic(verbose=True, language='German')
topics_svp, probs = topic_model.fit_transform(docs_svp)
topic_model.get_topic_info()
topic_model.get_topic(0) # full of stop-words 
"""

"\n# SVP topic model - stop-word problem:\n\ntopic_model = BERTopic(verbose=True, language='German')\ntopics_svp, probs = topic_model.fit_transform(docs_svp)\ntopic_model.get_topic_info()\ntopic_model.get_topic(0) # full of stop-words \n"

The topics were full of stopwords! I found yet another issue in `MaartenGr/BERTopic` about this, where they said again that too little data will result in stopwords flooding the results.

## Stopword issue

For dev I didn't want to increase the data, though, since it takes so long to train, so I threw in the code I found there assigning the `vectorizer_model` to `CountVectorizer` with German stop words as an argument. I'll remove this step for the final model.

# Full Data Analysis ( SVP & SP ) 

In [129]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('german')
#stops = set(stopwords.words('german'))
stop_words.extend(['geht', 'müssen', 'muss','ja','sagen','frage','mehr','immer','schon','wurde','000', 'gesagt','sowie','de','bänziger','gibt','deshalb'])
stops = set(stop_words)

vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stops)

topic_model = BERTopic(verbose=True, language='German', vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(docs)

topic_model.get_topic_info()
topic_model.get_topic(0)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Batches:   0%|          | 0/174 [00:00<?, ?it/s]

2022-04-24 14:34:30,167 - BERTopic - Transformed documents to Embeddings
2022-04-24 14:34:46,540 - BERTopic - Reduced dimensionality with UMAP
2022-04-24 14:34:46,777 - BERTopic - Clustered UMAP embeddings with HDBSCAN


[('kommission', 0.0058161546705156626),
 ('bundesrat', 0.0054529228576496395),
 ('heute', 0.004848917864558722),
 ('minderheit', 0.004667703747043915),
 ('initiative', 0.004188996847788368),
 ('artikel', 0.004007288003161748),
 ('schweiz', 0.003975946503917055),
 ('vorlage', 0.003955318067026467),
 ('herr', 0.0037283346949688197),
 ('antrag', 0.0037179997701869464)]

In [130]:
topic_model.visualize_topics()

In [131]:
topic_model.visualize_barchart(top_n_topics = 48,  n_words = 3) 

In [132]:
topic_model.visualize_heatmap()

In [133]:
topic_model.visualize_hierarchy()

# SVP - Data: 

In [134]:
topic_model = BERTopic(verbose=True, language='German', vectorizer_model=vectorizer_model)
topics_svp, probs = topic_model.fit_transform(docs_svp)

topic_model.get_topic_info()
topic_model.get_topic(0)

Batches:   0%|          | 0/84 [00:00<?, ?it/s]

2022-04-24 14:48:34,421 - BERTopic - Transformed documents to Embeddings
2022-04-24 14:48:49,666 - BERTopic - Reduced dimensionality with UMAP
2022-04-24 14:48:49,880 - BERTopic - Clustered UMAP embeddings with HDBSCAN


[('kommission', 0.007767467400567754),
 ('bundesrat', 0.0076977241379395794),
 ('initiative', 0.007164449902951744),
 ('heute', 0.006415394270296625),
 ('artikel', 0.005714969984451405),
 ('vorlage', 0.00553398243031703),
 ('minderheit', 0.005393769969959506),
 ('schweiz', 0.005323504999277161),
 ('motion', 0.0050166898444387546),
 ('bitte', 0.004847449839187931)]



## Visualizing the topics SVP

In [135]:
topic_model.visualize_topics()

In [136]:
# as we can see above there are 47 + 0 = 48 topics so therefore we need to change the defalt parameter to 47 topics.
# For easier complehension we increase the n_words from 5 to 10 words per Topic  

#topic_model.visualize_barchart(top_n_topics = 48,  n_words = 10) 
topic_model.visualize_barchart(top_n_topics = 48,  n_words = 10) 


In [137]:
topic_model.visualize_heatmap()

These visualizations look cool, but the presence of Topic -1 is kind of annoying and I can't seem to find a way to remove it.

# Hierarchical Clustering

Clusters the articles that include similar topics together


In [138]:
topic_model.visualize_hierarchy()

In [181]:

topics,similarity = topic_model.find_topics("co2", top_n=5)
print(topics)
for top in topic:
  to = topic_model.get_topic(top) # lsva: (Leistungsabhängige Schwerverkehrsabgabe) 
  print(to[0])


[16, 12, 9, 19, 21]
('co2', 0.0614439804064923)
('lsva', 0.01295256603120509)
('energien', 0.017631765368218717)
('luftfahrt', 0.028443119183008677)
('forschung', 0.018187095702797373)


# SP - Data:  

In [182]:
topic_model = BERTopic(verbose=True, language='German', vectorizer_model=vectorizer_model)
topics_sp, probs = topic_model.fit_transform(docs_sp)

topic_model.get_topic_info()
topic_model.get_topic(0)

Batches:   0%|          | 0/91 [00:00<?, ?it/s]

2022-04-24 15:28:36,008 - BERTopic - Transformed documents to Embeddings
2022-04-24 15:28:52,673 - BERTopic - Reduced dimensionality with UMAP
2022-04-24 15:28:52,807 - BERTopic - Clustered UMAP embeddings with HDBSCAN


[('patienten', 0.009965335565900365),
 ('kosten', 0.007683977288511517),
 ('gesundheitswesen', 0.0066789226826184395),
 ('kantone', 0.00656847295417767),
 ('patientinnen', 0.00642367470567881),
 ('heute', 0.006259612191178983),
 ('initiative', 0.006240220941709791),
 ('grundversicherung', 0.006090622226047194),
 ('patientinnen patienten', 0.005749803226826428),
 ('versicherten', 0.005487596062982706)]

In [183]:
topic_model.visualize_topics()

In [188]:
topic_model.visualize_barchart(top_n_topics = 48,  n_words = 10) 

In [185]:
topic_model.visualize_heatmap()

In [186]:
topic_model.visualize_hierarchy()

In [187]:
topics,similarity = topic_model.find_topics("co2", top_n=5)
print(topics)
for top in topics:
  to = topic_model.get_topic(top)
  print(to[0])

[10, 26, 29, 16, 33]
('energien', 0.020609364708972882)
('erschöpfung', 0.04168836101149215)
('akw', 0.02703706663042125)
('gpk', 0.019364436399263007)
('natur', 0.01427017005899465)
