In [1]:
import json
import os

import stanza

In [2]:
# Build the Stanza pipeline for Belarusian ("be") with only the processors
# this notebook uses: tokenization, POS tagging and lemmatization.
# The download method is REUSE_RESOURCES (see the Stanza docs for its exact
# semantics; presumably it avoids re-fetching resources on every run).
stanza_analyzer = stanza.Pipeline(
    "be",
    processors=",".join(("tokenize", "pos", "lemma")),
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)

2024-05-01 15:47:38 INFO: Loading these models for language: be (Belarusian):
| Processor | Package      |
----------------------------
| tokenize  | hse          |
| pos       | hse_nocharlm |
| lemma     | hse_nocharlm |

2024-05-01 15:47:38 INFO: Using device: cpu
2024-05-01 15:47:38 INFO: Loading: tokenize
2024-05-01 15:47:40 INFO: Loading: pos
2024-05-01 15:47:40 INFO: Loading: lemma
2024-05-01 15:47:40 INFO: Done loading processors!


In [3]:
# Characters that disqualify a whole sentence: the lowercase Latin alphabet
# plus the Cyrillic letters "и", "щ", "ъ".
# NOTE(review): presumably meant to drop non-Belarusian (e.g. English or
# Russian) sentences - confirm with the corpus author.
BANNED_SYMBOLS = (
    "abcdefghijklmnopqrstuvwxyz"
    "ищъ"
)

# Universal POS tags whose words are left out of the lemma lists.
BANNED_POS = [
    "PUNCT",
    "SYM",
    "X",
]

In [4]:
# Lemmatize every text file under results/texts and collect the corpus as a
# list of sentences, each sentence being a list of lemmas. The result is
# saved to results/sentences.json and summary statistics are printed.
sentences = []

# os.path.join keeps the cell portable: a literal "results\\texts" is a
# directory path only on Windows (on POSIX it names a single file).
texts_dir = os.path.join("results", "texts")

# Set form of BANNED_SYMBOLS, built once instead of scanning the string for
# every character of every sentence.
banned_chars = frozenset(BANNED_SYMBOLS)

# scandir yields entries in arbitrary order and holds an OS directory handle:
# close it via the context manager and sort by name so that the order of the
# saved corpus is reproducible between runs and platforms.
with os.scandir(texts_dir) as dir_entries:
    text_files = sorted(
        (entry for entry in dir_entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

for entry in text_files:
    print(entry.path, end="")

    # Read the text first so the file is not kept open during the NLP pass.
    with open(entry.path, mode="r", encoding="utf-8") as file:
        text = file.read()
    doc = stanza_analyzer(text)

    sentences_count = 0
    words_count = 0
    for sentence in doc.sentences:
        # Drop the whole sentence if it contains any banned character.
        if not banned_chars.isdisjoint(sentence.text.lower()):
            continue

        # Keep the lemma of every word whose POS tag is not banned.
        lemmas = [
            word.lemma
            for word in sentence.words
            if word.upos not in BANNED_POS
        ]
        words_count += len(lemmas)

        # Sentences that end up empty after filtering are not stored.
        if lemmas:
            sentences.append(lemmas)
            sentences_count += 1

    print(f" | sentences: {sentences_count:<5} | words: {words_count}")

# ensure_ascii=False stores Cyrillic text as-is instead of \uXXXX escapes.
with open("results/sentences.json", mode="w", encoding="utf-8") as file:
    json.dump(sentences, file, ensure_ascii=False)

print(
    f"----------\n"
    f"Overall:\n"
    f"----------\n"
    f"Sentences: {len(sentences)}\n"
    f"Words: {sum(map(len, sentences))}"
)

results\texts\01.txt | sentences: 12865 | words: 111642
results\texts\02.txt | sentences: 1143  | words: 5384
results\texts\03.txt | sentences: 612   | words: 9597
results\texts\04.txt | sentences: 6196  | words: 80911
results\texts\05.txt | sentences: 1892  | words: 21174
results\texts\06.txt | sentences: 1314  | words: 28022
results\texts\07.txt | sentences: 1983  | words: 28287
results\texts\08.txt | sentences: 965   | words: 14301
results\texts\09.txt | sentences: 2335  | words: 27301
results\texts\10.txt | sentences: 4147  | words: 46929
results\texts\11.txt | sentences: 9244  | words: 63851
results\texts\12.txt | sentences: 3083  | words: 28372
results\texts\13.txt | sentences: 3893  | words: 42875
results\texts\14.txt | sentences: 6835  | words: 82240
results\texts\15.txt | sentences: 2478  | words: 31969
results\texts\16.txt | sentences: 8600  | words: 89304
results\texts\17.txt | sentences: 1905  | words: 25790
results\texts\18.txt | sentences: 3279  | words: 36618
results\tex