### Импорт

In [1]:
from natasha import (
    Segmenter,
    MorphVocab,
    PER, 
    NewsEmbedding,
    Doc,
    NewsMorphTagger,
    NewsNERTagger,
    NamesExtractor
)
import nltk
from nltk.corpus import stopwords
from string import punctuation
import requests
import re
from requests.adapters import HTTPAdapter, Retry
import time

# --- Natasha components ---------------------------------------------------
# One NewsEmbedding instance is created and passed to both taggers.
emb = NewsEmbedding()
segmenter = Segmenter()
morph_vocab = MorphVocab()
morph_tagger = NewsMorphTagger(emb)
ner_tagger = NewsNERTagger(emb)
names_extractor = NamesExtractor(morph_vocab)

# --- HTTP session ---------------------------------------------------------
# A single Session whose adapter carries the retry policy below; the same
# adapter object is mounted for both URL schemes.
session = requests.Session()
retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
for scheme in ("https://", "http://"):
    session.mount(scheme, adapter)


### Предобработка

In [None]:
# Tokens treated as noise: Russian stop words from NLTK, every character of
# string.punctuation, and four extra literal tokens.
noise = [*stopwords.words("russian"), *punctuation, '—', '«', '»', '\'\'']

# Names (first names, surnames and full names) that char_filter uses to drop
# NER spans -- presumably the fictional characters of the analysed novel.
# Each entry appears exactly once; order follows first appearance.
characters = [
    "Татарский",
    "Морковин",
    "Гусейн",
    "Сергей",
    "Лена",
    "Азадовский",
    "Слава",
    "Зайцев",
    "Вова",
    "Морковина",
    "Вавилен",
    "Дмитрий",
    "Пугин",
    "Дмитрий Пугин",
    "Сергей Морковин",
    "Гиреев",
    "Андрей",
    "Андрей Гиреев",
    "Леонид",
    "Леонид Азадовский",
    "Вовчик",
    "Малой",
    "Вовчик Малой",
    "Саша Бло",
    "Саша",
    "Леша Чикунов",
    "Эдик",
    "Григорий",
    "Владимир Ханин",
    "Ханин",
    "Владимир",
    "Малюта",
    "Аркаша",
    "Алла",
    "Семен",
    "Фарсейкин",
    "Фарсук Сейфуль-Фарсейкин",
    "Фарсук",
    "Фарсук Карлович",
    "Манька",
]

# def cleaner(text):
#     tokens = nltk.word_tokenize(text)
#     tokens = [token for token in tokens if token not in noise]
#     clean_text = " ".join(tokens)
#     return clean_text

# Read the source text line by line. For every line: strip a leading double
# quote, replace a trailing double quote (plus trailing whitespace) with one
# space, and collapse runs of tabs/newlines into a single space.
# The cleaned pieces are collected in a list and joined once, rather than
# concatenating onto a growing string on every iteration.
cleaned_lines = []

with open("Pelevin_Generation_p.txt", 'r', encoding="UTF-8") as f:
    for line in f:
        line = re.sub(r'^"', '', line)
        line = re.sub(r'"\s*$', ' ', line)
        line = re.sub(r'[\t\n\r]+', ' ', line)
        cleaned_lines.append(line)

text = "".join(cleaned_lines)

doc = Doc(text)
# Display only a short preview; echoing the whole text would flood the
# notebook output and bloat the saved file.
doc.text[:500]

In [3]:
# Tokenization
doc.segment(segmenter)

# Morphological tagging
doc.tag_morph(morph_tagger)

# Lemmatization: every token is lemmatized with the shared morph_vocab
for token in doc.tokens:
    token.lemmatize(morph_vocab)

# Preview the first five annotated tokens
doc.tokens[:5]

[DocToken(stop=7, text='Leonard', pos='X', feats=<Yes>, lemma='leonard'),
 DocToken(start=8, stop=13, text='Cohen', pos='X', feats=<Yes>, lemma='cohen'),
 DocToken(start=14, stop=22, text='Когда-то', pos='ADV', feats=<Pos>, lemma='когда-то'),
 DocToken(start=23, stop=24, text='в', pos='ADP', lemma='в'),
 DocToken(start=25, stop=31, text='России', pos='PROPN', feats=<Inan,Loc,Fem,Sing>, lemma='россия')]

#### Функции

In [None]:
def char_filter(doc, names=None):
    """Return the spans of ``doc`` that do not mention a known character.

    A span is dropped as soon as one of its token lemmas equals, ignoring
    case, any word of any entry in ``names``. Entries are split on
    whitespace because lemmas are compared one token at a time, so a
    multi-word entry such as "Леша Чикунов" could otherwise never match.

    Args:
        doc: object exposing ``spans``; every span exposes ``tokens`` and
            every token exposes ``lemma``.
        names: iterable of name strings to filter out. When omitted, the
            module-level ``characters`` list is used.

    Returns:
        A new list holding the spans that were kept, in original order.
        ``doc`` itself is not modified.
    """
    if names is None:
        names = characters

    # Built once per call: set membership instead of scanning the whole
    # list for every token. casefold() makes the comparison case-insensitive
    # on both sides, which also covers hyphenated surnames.
    blocked = {word.casefold() for name in names for word in name.split()}

    def mentions_character(span):
        # A token with no lemma (None or empty) cannot match and is skipped
        # instead of raising AttributeError.
        return any(
            token.lemma and token.lemma.casefold() in blocked
            for token in span.tokens
        )

    return [span for span in doc.spans if not mentions_character(span)]

### NER

In [None]:
# Run the NER tagger on the document (presumably this fills doc.spans, which
# char_filter reads below).
doc.tag_ner(ner_tagger)


# Replace doc.spans with only the spans that char_filter keeps.
doc.spans = char_filter(doc) 

Получение списка личностей, организаций и локаций

In [None]:
# Accumulators for the lemmas of entity tokens, one list per entity kind,
# plus a set with the distinct person lemmas.
list_per, list_loc, list_org = [], [], []
set_per = set()

for span in doc.spans:
    # Pick the destination list once per span.
    is_person = span.type == "PER"
    if is_person:
        bucket = list_per
    elif span.type == "LOC":
        bucket = list_loc
    else:
        # Any type other than PER / LOC ends up in the organisation list.
        bucket = list_org

    for token in span.tokens:
        bucket.append(token.lemma)
        if is_person:
            set_per.add(token.lemma)
            print(token.lemma)

In [None]:
def is_valid_entity(entity):
    """Tell whether an entity string should be kept.

    An entity is rejected when it is shorter than three characters or when
    it contains at least one digit; anything else is accepted.

    Args:
        entity: the entity text to check.

    Returns:
        True if the entity passes both checks, False otherwise.
    """
    long_enough = len(entity) >= 3
    return long_enough and re.search(r'\d', entity) is None

# Distinct person lemmas as a list. Sorted so the order is identical on every
# kernel run: iteration order of a set of strings is not stable across
# interpreter sessions because string hashes are randomized.
unique = sorted(set_per)