In [1]:
import os
import glob

In [None]:
import nltk
nltk.download('punkt')

In [None]:
import nltk
nltk.download('punkt_tab')

In [9]:
f_path = 'corpus_chin'

In [10]:
txt_files = glob.glob(os.path.join(f_path, '*.txt'))

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

In [17]:
import re
import string
def clean_tokens(tokens):
    cleaned_tokens = []
    for token in tokens:
        token = token.lower()
        token = re.sub(r'[{}]+'.format(string.punctuation), '', token)
        if token: 
            cleaned_tokens.append(token)
    return cleaned_tokens


In [None]:
all_texts = []
for f in txt_files:
    with open(f, 'r', encoding='utf8') as file:
        text = file.read()
        sentences = sent_tokenize(text, language="russian")
        words = word_tokenize(text, language="russian")
        cleaned_words = clean_tokens(words)
        all_texts.append(cleaned_words)
all_texts

In [None]:
!pip install pymorphy3 pymorphy3-dicts-ru

In [None]:
import pymorphy3

lemmas = []
morph = pymorphy3.MorphAnalyzer(lang='ru')

for doc in all_texts:
    normal_form = [morph.parse(word)[0].normal_form for word in doc]
    lemmas.append(normal_form)
lemmas

In [None]:
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))

In [None]:
cleaned_lemmas = []
for doc in lemmas:
    cleaned_doc = [word for word in doc if word not in stop_words and not word.isdigit() and word not in string.punctuation
                   and word != '–' and word != 'n' and word != 'это' and word != 'который']
    cleaned_lemmas.append(cleaned_doc)
cleaned_lemmas

In [None]:
model = Word2Vec(cleaned_lemmas, vector_size=200, window=7, min_count=2, workers=4)

In [74]:
model.wv.most_similar("суд", topn=5)

[('арбитражный', 0.9968951344490051),
 ('дело', 0.9406508207321167),
 ('юрисдикция', 0.9315577149391174),
 ('судно', 0.92897629737854),
 ('инстанция', 0.9087719321250916)]

In [75]:
model.wv.most_similar("юрисдикция", topn=5)

[('судно', 0.9972535371780396),
 ('инстанция', 0.9934084415435791),
 ('решение', 0.987621009349823),
 ('третейский', 0.9859511852264404),
 ('судья', 0.9856148362159729)]

In [57]:
model.wv.most_similar("регистрация", topn=5)

[('учёт', 0.9862680435180664),
 ('недвижимый', 0.9854000210762024),
 ('кадастровый', 0.9841115474700928),
 ('недвижимость', 0.9792335033416748),
 ('государственный', 0.9732100367546082)]

In [76]:
model.wv.most_similar("срок", topn=5)

[('предусматривать', 0.9965723752975464),
 ('данный', 0.9957142472267151),
 ('упомянуть', 0.9953294992446899),
 ('типовой', 0.9950700402259827),
 ('таможенный', 0.9949985146522522)]

In [77]:
model.wv.most_similar("соответствие", topn=5)

[('июль', 0.9928719997406006),
 ('согласно', 0.9924408197402954),
 ('подпункт', 0.9920815825462341),
 ('статья', 0.9905498623847961),
 ('114фз', 0.9889357089996338)]

In [78]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [None]:
cleaned_lemmas

In [80]:
dictionary = Dictionary(cleaned_lemmas)
corpus = [dictionary.doc2bow(doc) for doc in cleaned_lemmas]

In [81]:
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

In [82]:
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.022*"суд" + 0.013*"дело" + 0.012*"арбитражный" + 0.009*"решение" + 0.009*"право" + 0.008*"мочь" + 0.008*"судебный" + 0.007*"спор" + 0.007*"год" + 0.006*"всё"
Topic 1: 0.020*"российский" + 0.020*"федерация" + 0.010*"закон" + 0.010*"орган" + 0.010*"гражданин" + 0.009*"федеральный" + 0.008*"лицо" + 0.007*"статья" + 0.007*"суд" + 0.007*"налоговый"
Topic 2: 0.011*"государственный" + 0.008*"кадастровый" + 0.008*"объект" + 0.007*"земельный" + 0.007*"участок" + 0.007*"мочь" + 0.007*"право" + 0.006*"недвижимость" + 0.006*"учёт" + 0.005*"регистрация"
Topic 3: 0.012*"федеральный" + 0.010*"орган" + 0.008*"государственный" + 0.008*"российский" + 0.007*"год" + 0.007*"федерация" + 0.006*"электронный" + 0.006*"закон" + 0.006*"бюджетный" + 0.006*"система"
Topic 4: 0.009*"закон" + 0.009*"российский" + 0.008*"год" + 0.008*"россия" + 0.006*"сотрудник" + 0.005*"внутренний" + 0.005*"орган" + 0.005*"полиция" + 0.005*"работа" + 0.005*"федерация"


In [83]:
!pip install pyLDAvis




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [84]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

lda_display = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)

pyLDAvis.display(lda_display)

In [None]:
pyLDAvis.save_html(lda_display, 'lda_visualization_officials_for_docs.html')