In [None]:
import nltk
import string
import numpy as np
import pandas as pd

from collections import Counter
from abc import ABCMeta, abstractmethod

from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import sent_tokenize, wordpunct_tokenize

# Tools

In [None]:
# Characters treated as punctuation: ASCII punctuation plus a few typographic
# symbols that show up in Russian text.
PUNKT = set(string.punctuation + "«»№_—")
# NLTK's Russian stop-word list, kept as a set for fast membership tests.
STOPWORDS = set(stopwords.words("russian"))

# The only constructor argument of RussianStemmer is the `ignore_stopwords` flag,
# so the positional string that used to be passed here merely acted as a truthy
# flag value. Passing the flag by name keeps the behaviour and states the intent.
stemmer = RussianStemmer(ignore_stopwords=True)

    
def text_to_sentences(text):
    """Split ``text`` into sentences and return them as a new list.

    ``sent_tokenize`` is called without a language argument, so it runs with
    its default settings.
    """
    return list(sent_tokenize(text))


def is_punkt(word):
    """Tell whether ``word`` should be dropped from a token list.

    A word is dropped when every character belongs to ``PUNKT`` (an empty
    string trivially qualifies) or when the word, compared exactly as given
    (case-sensitively), is a member of ``STOPWORDS``.
    """
    has_other_symbol = any(symbol not in PUNKT for symbol in word)
    if not has_other_symbol:
        return True
    return word in STOPWORDS


def sentence_to_tokens(sentence):
    """Tokenize ``sentence`` and return the tokens that ``is_punkt`` does not reject.

    Tokens come from ``wordpunct_tokenize`` and keep their original order.
    """
    kept = []
    for candidate in wordpunct_tokenize(sentence):
        if is_punkt(candidate):
            continue
        kept.append(candidate)
    return kept
 
    
def parse(text):
    """Lazily yield one filtered token list per sentence of ``text``.

    The text is split into sentences when iteration starts; each sentence is
    tokenized only when its turn comes.
    """
    yield from map(sentence_to_tokens, text_to_sentences(text))

        
def get_index(values, p):
    """Count the entries whose cumulative share of the total is strictly below ``p``.

    ``values`` is normalised by its sum, accumulated with ``np.cumsum`` and
    compared against ``p``; the number of positions that pass is returned.
    ``values`` must support boolean-mask indexing (e.g. a 1-D NumPy array).
    """
    shares = values / sum(values)
    running_total = np.cumsum(shares)
    below_threshold = running_total < p
    return len(values[below_threshold])


def is_allowed(ch):
    """Return True if ``ch`` is a Russian letter or a space.

    The ranges а–я and А–Я do not contain the letter ё/Ё (U+0451 / U+0401 sit
    outside those contiguous runs), so it is checked separately; otherwise
    ``filter_text`` would silently strip it from Russian words.

    Args:
        ch: the character to test.

    Returns:
        bool: True for Cyrillic а–я, А–Я, ё, Ё and the space character.
    """
    return (
        'а' <= ch <= 'я'
        or 'А' <= ch <= 'Я'
        # ё / Ё live outside the two ranges above.
        or ch in ('ё', 'Ё')
        or ch == ' '
    )


def filter_text(text):
    """Return ``text`` with every character rejected by ``is_allowed`` removed."""
    return ''.join(filter(is_allowed, text))

# Data

In [None]:
# Sample input: one sentence per line. The literal starts and ends with a
# newline, so splitting on '\n' also produces empty strings at both ends.
raw_data = '''
Барак Обама принимает в Белом доме своего французского коллегу Николя Саркози.
О возможном включении благотворительного фонда в список "иностранных агентов" 7 мая написала газета «Ведомости».
'''

In [None]:
# Non-empty lines of the sample text, in their original order.
data = [line for line in raw_data.split('\n') if len(line) > 0]

In [None]:
# Set of known first names, used for membership tests.
# The column is selected by label rather than via `.values.squeeze()`: squeeze
# turns a one-row file into a 0-d array, which `set()` cannot iterate. Building
# the set in one expression also keeps the name bound to a single kind of object.
russian_names = set(
    pd.read_csv('data/russian_names.csv', sep=';', usecols=['Name'])['Name']
)

In [None]:
# Set of known surnames, used for membership tests.
# The column is selected by label rather than via `.values.squeeze()`: squeeze
# turns a one-row file into a 0-d array, which `set()` cannot iterate. Building
# the set in one expression also keeps the name bound to a single kind of object.
russian_surnames = set(
    pd.read_csv('data/russian_surnames.csv', sep=';', usecols=['Surname'])['Surname']
)

# Approaches

## Dummy

In [None]:
def is_person(token: str) -> bool:
    """Return True when ``token`` is a known first name or a known surname.

    The comparison is an exact membership test against ``russian_names`` and
    then ``russian_surnames``.
    """
    if token in russian_names:
        return True
    return token in russian_surnames

In [None]:
def is_org(token: str) -> bool:
    """Return True when ``token`` is the organisation marker "OOO".

    Both spellings are accepted: three Latin capital O's and three Cyrillic
    capital О's (U+041E). The two look identical on screen but are different
    characters, and Russian text uses the Cyrillic form.

    Args:
        token: a single token taken from a sentence.

    Returns:
        bool: True if the token equals either spelling exactly.
    """
    # First entry is Latin, second is Cyrillic (written with escapes to make
    # the difference visible in the source).
    return token in ('OOO', '\u041e\u041e\u041e')

In [None]:
def extract(sentence: str, token: str) -> [int]:
    """Yield the start index of every occurrence of ``token`` in ``sentence``.

    Matches are plain substring matches found with ``str.find``; the search
    resumes one character after each hit, so overlapping occurrences are
    reported too. Indices come out in increasing order.
    """
    position = sentence.find(token)
    while position != -1:
        yield position
        position = sentence.find(token, position + 1)

        
def extract_person(sentence: str, token: str):
    """Return a lazy iterator of ``(start, length, 'PERSON')`` tags.

    One tag is produced for each position reported by ``extract``; ``length``
    is ``len(token)``.
    """
    def as_tag(start):
        return (start, len(token), 'PERSON')

    return map(as_tag, extract(sentence, token))


def extract_org(sentence: str, token: str):
    """Return a lazy iterator of ``(start, length, 'ORG')`` tags.

    One tag is produced for each position reported by ``extract``; ``length``
    is ``len(token)``.
    """
    def as_tag(start):
        return (start, len(token), 'ORG')

    return map(as_tag, extract(sentence, token))


def tag_to_str(tag) -> str:
    """Render the first three items of ``tag`` as one space-separated string."""
    start, length, label = tag[0], tag[1], tag[2]
    return f'{start} {length} {label}'

In [None]:
# Tag every sentence with the dictionary-based approach and print one line per
# sentence: space-separated "start length LABEL" triples followed by "EOL".
for sentence in data:
    tags = []

    # Visit each distinct token once (first-seen order is kept). The extractors
    # already report every occurrence of a token in the sentence, so visiting a
    # repeated token again would append the same tags a second time.
    for token in dict.fromkeys(sentence_to_tokens(sentence)):
        if is_person(token):
            tags.extend(extract_person(sentence, token))
        elif is_org(token):
            tags.extend(extract_org(sentence, token))

    # Join tags and terminator in one go so that a sentence without any tags
    # prints "EOL" rather than " EOL" with a stray leading space.
    result = ' '.join([*map(tag_to_str, tags), 'EOL'])
    print(result)