In [1]:
# Turn off jedi in the IPython completer (sets Completer.use_jedi to False).
%config Completer.use_jedi = False

In [2]:
import re
from tqdm.notebook import tqdm
# import pymorphy2
import os
from typing import List

import nltk
# Download every NLTK resource this notebook uses, so that it also runs on a
# fresh environment (Restart Kernel -> Run All):
#   - "stopwords": the English stopword list loaded below;
#   - "punkt" / "punkt_tab": models behind nltk.word_tokenize and
#     nltk.tokenize.sent_tokenize (recent NLTK releases look for "punkt_tab",
#     older ones for "punkt");
#   - "wordnet": the corpus WordNetLemmatizer reads from.
# nltk.download is a no-op for packages that are already up to date.
for nltk_resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(nltk_resource)

# English stopwords; tokens found in this list are dropped before lemmatization.
stopwords = nltk.corpus.stopwords.words('english')

# Matches every character that is NOT a lowercase latin letter or a space.
latin = re.compile(r'[^a-z ]')
# Matches any run of whitespace.
mult_ws = re.compile(r'\s+')

nltk_lemmatizer = nltk.stem.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/uadmin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# not word_normal_form in ['.', ',', ':', '?', '!'])

def lemmatize(text: str, lemmatizer, min_word_size: int = 5) -> List[str]:
    """Tokenize ``text``, drop stopwords and return the lemmas that are long enough.

    Args:
        text: Text to split into tokens with ``nltk.word_tokenize``.
        lemmatizer: Object exposing ``lemmatize(word)``; called once for each
            token that is not a stopword.
        min_word_size: Lemmas shorter than this many characters are discarded.

    Returns:
        The kept lemmas, in the order their tokens appear in ``text``.
    """
    # Build the lookup set once per call: membership tests on the module-level
    # list would otherwise scan the whole list for every token.
    stopword_set = set(stopwords)
    res = []
    for word in nltk.word_tokenize(text):
        # Stopwords are matched on the raw token, before lemmatization.
        if word in stopword_set:
            continue
        word_normal_form = lemmatizer.lemmatize(word)
        if len(word_normal_form) >= min_word_size:
            res.append(word_normal_form)
    return res

def process_regex(re_words_to_save,
                  re_mult_ws,
                  text: str) -> str:
    '''
    String-level preprocessing of a piece of text.

    Steps: lowercase, collapse whitespace runs to a single space, delete every
    match of ``re_words_to_save``, collapse whitespace again, strip both ends.

    Args:
        re_words_to_save: pattern (string or compiled) whose matches are
            REMOVED from the text. Despite the name, it describes what to
            drop; the characters it does not match are the ones kept.
        re_mult_ws: pattern (string or compiled) matching whitespace runs;
            every match is replaced by one space.
        text: text to clean.

    Returns:
        The cleaned text.
    '''
    text = text.lower()
    # Normalise whitespace BEFORE deleting characters. If the deletion pattern
    # also matches newlines or tabs (e.g. r'[^a-z ]'), deleting first would glue
    # together words that were separated only by a line break.
    text = re.sub(re_mult_ws, ' ', text)
    text = re.sub(re_words_to_save, '', text)
    # Deleting characters can leave adjacent spaces behind; collapse them again.
    text = re.sub(re_mult_ws, ' ', text)
    return text.strip()

def process_doc(doc: List[str], lemmatizer, min_word_size=3, min_sent_size=10) -> List[str]:
    '''
    Preprocess a whole document sentence by sentence.

    Each sentence is cleaned with ``process_regex`` (using the module-level
    ``latin`` and ``mult_ws`` patterns) and then lemmatized token by token.
    Sentences that end up with fewer than ``min_sent_size`` tokens are dropped;
    the remaining ones are returned as space-joined strings.
    '''
    cleaned_sentences = []
    for raw_sentence in doc:
        normalized = process_regex(latin, mult_ws, text=raw_sentence)
        tokens = lemmatize(normalized, lemmatizer, min_word_size)
        # Keep only sentences that reach the minimum token count.
        if len(tokens) >= min_sent_size:
            cleaned_sentences.append(' '.join(tokens))
    return cleaned_sentences

def sentenize_doc(doc: str) -> List[str]:
    """Split ``doc`` into sentences with ``nltk.tokenize.sent_tokenize``, dropping empty ones."""
    sentences = nltk.tokenize.sent_tokenize(doc)
    return list(filter(len, sentences))

In [4]:
def process_all_docs(source_dir: str, min_word_size, min_sent_size) -> List[List[str]]:
    """Read and preprocess every file found directly inside ``source_dir``.

    Args:
        source_dir: Directory whose entries are read. Sub-directories are skipped.
        min_word_size: Forwarded to ``process_doc``.
        min_sent_size: Forwarded to ``process_doc``.

    Returns:
        One list of processed sentences per file, in ``os.listdir(source_dir)``
        order with directory entries left out.
    """
    docs = []
    for doc_name in tqdm(os.listdir(source_dir)):
        doc_path = os.path.join(source_dir, doc_name)
        # os.listdir yields bare names, so the directory test must use the path
        # joined with source_dir; testing the bare name would look in the
        # current working directory instead.
        if os.path.isdir(doc_path):
            continue
        with open(doc_path) as fh:
            doc = fh.read()
        # The file is closed before the (slow) processing starts.
        doc_sentenized = sentenize_doc(doc)
        proc_doc_sentenized = process_doc(doc_sentenized,
                                          lemmatizer=nltk_lemmatizer,
                                          min_word_size=min_word_size,
                                          min_sent_size=min_sent_size)
        docs.append(proc_doc_sentenized)
    return docs

In [5]:
# Run the preprocessing pipeline over every file in the 'txt' directory.
procced_docs = process_all_docs(
    source_dir='txt',
    min_word_size=3,
    min_sent_size=10,
)

  0%|          | 0/2657 [00:00<?, ?it/s]

In [6]:
def save_clean_text(doc: List[str], filename: str = None, target_dir: str = 'clean_txt') -> None:
    """Write the sentences of ``doc`` to ``target_dir/filename``, one per line.

    Args:
        doc: Sentences to write; each one is followed by a newline.
        filename: Name of the output file inside ``target_dir``. Required,
            even though the signature keeps a ``None`` default.
        target_dir: Output directory; created (with parents) when missing.

    Raises:
        ValueError: If ``filename`` is ``None``. Without this check the text
            would be written to a file literally named ``None``.
    """
    if filename is None:
        raise ValueError('filename must be provided')
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, filename)
    # Nothing is written when the target name is already taken by a directory.
    if os.path.isdir(target_path):
        return
    with open(target_path, 'w') as fh:
        fh.writelines(f'{sent}\n' for sent in doc)

In [7]:
# Pair every processed document with the file it was read from.
# Directory entries are left out of the listing, because the processing step
# is meant to skip them; pairing against the raw listing would shift every
# later document onto the wrong file name.
source_names = [
    name for name in os.listdir('txt')
    if not os.path.isdir(os.path.join('txt', name))
]
# Fail loudly instead of silently writing documents under the wrong names.
assert len(source_names) == len(procced_docs), (
    f'{len(procced_docs)} processed documents but {len(source_names)} source files'
)
for doc, fname in zip(procced_docs, source_names):
    save_clean_text(doc, fname)