In [3]:
# coding: utf-8
import codecs
import os
import sklearn
import bertopic
import zipfile

In [5]:
# Unpack the corpus archive into the local 'corpus' directory.
with zipfile.ZipFile('/content/les_lem.zip', mode='r') as corpus_archive:
    corpus_archive.extractall(path='corpus')

In [6]:
# os.listdir() returns entries in arbitrary order; sort the names so the
# documents (and the per-document topic assignments derived from them) are
# always built in the same, reproducible order.
corpus_files = sorted(os.listdir('corpus/les_lem'))

In [7]:
# Load the stop-word list: every whitespace-separated token of the UTF-8
# file 'swl_optimum.txt' becomes one entry.
with open('swl_optimum.txt', encoding='utf-8') as stopword_file:
    stopword_text = stopword_file.read()
stw_list = stopword_text.split()

In [9]:
# Build the corpus: one string per .txt file in corpus/les_lem, with the
# stop words from stw_list removed.
# Each document should contain lemmatized words separated by spaces.
corpus_dir = os.path.join('corpus', 'les_lem')

# Membership tests against a set are O(1); testing every token against the
# raw list rescans the whole stop-word list each time.
stop_words = set(stw_list)

documents = []
for name in corpus_files:
    if not name.endswith(".txt"):
        continue  # skip anything that is not a text file
    with open(os.path.join(corpus_dir, name), encoding='utf-8') as f:
        kept_words = [word for word in f.read().split() if word not in stop_words]
    documents.append(' '.join(kept_words))

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Term extraction for the topic representations: unigrams and bigrams, the
# same stop-word list applied again at the vectorizer level, and (per the
# scikit-learn docs) max_df=0.95 drops terms found in more than 95% of documents.
vectorizer_model = CountVectorizer(
    stop_words=stw_list,
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95,
)

# NOTE(review): no random seed is set anywhere in this cell, so topic
# assignments may differ between runs — confirm against the BERTopic docs
# and consider supplying a seeded dimensionality-reduction model.
topic_model = BERTopic(
    language="multilingual",
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the prepared documents; keeps the two values returned by fit_transform.
topics, probs = topic_model.fit_transform(documents)

Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

2022-12-21 21:58:18,356 - BERTopic - Transformed documents to Embeddings
2022-12-21 21:58:32,547 - BERTopic - Reduced dimensionality
2022-12-21 21:58:32,711 - BERTopic - Clustered reduced embeddings


In [11]:
# Per-topic summary table (Topic, Count, Name); display at most the first 40 rows.
freq = topic_model.get_topic_info()
freq.head(40)

Unnamed: 0,Topic,Count,Name
0,-1,426,-1_ряд_группа_категория_исторический
1,0,443,0_категория_ряд_семантический_речь
2,1,154,1_ряд_звуковой_ударение_согласные
3,2,43,2_реконструкция_исторический_праязык_исследование
4,3,41,3_группа_тюркский_категория_падеж
5,4,35,4_индоевропейский_славянский_артикль_праславян...
6,5,32,5_институт_литовский_ссср_славянский
7,6,32,6_арабский_знак_западносемитский_письма
8,7,32,7_русский язык_словарь_русский языка_ссср
9,8,28,8_романский_испанский_французский_романский язык


In [12]:
# Inspect topic 0, the most frequent topic in the topic-info table (topic -1
# is listed separately); the result is a list of (term, score) pairs.
topic_model.get_topic(0)

[('категория', 0.005266784268863703),
 ('ряд', 0.004974539798885833),
 ('семантический', 0.004923794709351456),
 ('речь', 0.004802903552947709),
 ('единица', 0.004775782015229298),
 ('синтаксический', 0.004543711480567601),
 ('группа', 0.0044816894157778805),
 ('понятие', 0.0044534957953424395),
 ('исторический', 0.004312916251312472),
 ('теория', 0.004244919922592249)]

In [13]:
# Last expression of the cell, so the notebook displays whatever
# visualize_topics() returns for the fitted model.
# NOTE(review): presumably BERTopic's intertopic distance map — no output is
# captured here to confirm.
topic_model.visualize_topics()

In [14]:
# Last expression of the cell, so the notebook displays whatever
# visualize_barchart() returns for the fitted model.
# NOTE(review): presumably per-topic bar charts of the top terms — no output
# is captured here to confirm.
topic_model.visualize_barchart()

In [15]:
# Last expression of the cell, so the notebook displays whatever
# visualize_heatmap() returns for the fitted model.
# NOTE(review): presumably a topic-to-topic similarity heatmap — no output is
# captured here to confirm.
topic_model.visualize_heatmap()