In [10]:
import requests
from bs4 import BeautifulSoup
import spacy
nlp = spacy.load('ru_core_news_sm')

In [11]:
def crawl_next_page(url, next_button_xpath):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    next_button = soup.select_one(next_button_xpath)

    # Check if there is a next page
    if next_button:
        next_url = next_button.get('href')
        return next_url
    else:
        return None


In [13]:
def recursive_children(token):
    children = []
    for child in token.children:
        if child.pos_ != "VERB":
            children.append(child)
            children.extend(recursive_children(child))
    return children

In [17]:
def write_csv(file, data):
    with open(file, 'w', encoding='utf-8') as f:
        for row in data:
            f.write(row + '\n')

**Code for extracting the necrolog from russian web site.**

Code loops through the text of headlines, looking for a words associated with death. If the word is found, all the children of this word are stored(recursively). Then we look throught those children. If the child is a PER(person), we store it in a list. Then we unify those children PER, which belong to one entity in text.

In [20]:
url = "https://tvtambov.ru/category/news/svo/page/"
heading_xpath = '//*[@id="posts-container"]/li/div/h2'
sub_heading_xpath = '//*[@id="posts-container"]/li/div/p'
death_words = ["проститься", "гибель", "погибнуть", "оборваться",
               "посмертно", "умереть", "скончаться", "покойный", "мертвый", "похоронили"]
page_count = 33
necrolog = set()

# Crawling through pages and scraping more headlines
i = 1
while url and i < page_count:
    response = requests.get(url+str(i))
    soup = BeautifulSoup(response.content, 'html.parser')
    headlines =soup.find_all(class_='post-title')
    sub_headlines = soup.find_all(class_='post-excerpt')
    items = [headline.text.strip() + '. ' + sub_headline.text.strip()
             for headline, sub_headline in zip(headlines, sub_headlines)]

    for item in items:
        doc = nlp(item)
        necrolog_separated = list()
        for token in doc:
            if token.lemma_ in death_words:
                children = recursive_children(token)
                for child in children:
                    if child.ent_type_ == "PER":
                        necrolog_separated.append(child)
        for ent in doc.ents:
            if all(token in necrolog_separated for token in ent):
                necrolog.add(ent)

    i += 1

text = [ent.text for ent in necrolog]
print(text)
write_csv('necrolog.csv', text)

['Павел Карев', 'Артем Милованов', 'Алексей Жирков', 'Александр Горнов', 'Эльдар Дубровин', 'Сергей Зайцев', 'Кирилл Исанбаев', 'Денис Иванов', 'Иван', 'Максим Блохин', 'Иванов', 'Андреем Кондрашкиным']


This is a sketch of the code that I used to extract the necrologies from the news website. I tested different approaches.

In [15]:
# url = "https://pestrecy-rt.ru/news/tag/list/specoperaciia/"
# heading_xpath = 'body > main > ul > li > a > div.all-news__list_text-container > h2'
# sub_heading_xpath = 'body > main > ul > li > a > div.all-news__list_text-container > p'
# next_button_xpath = 'body > main > div.all-news__buttons-container > div > a'
# text = "Мертвый сын Валерий Иваненко с честью погиб в зоне СВО, а младший, Алексей, служит сегодня там. Он до сих пор защищает свою родину."

# doc = nlp(text)
# death_words = ["проститься", "гибель", "погибнуть", "оборваться",
#                "посмертно", "умереть", "скончаться", "покойный", "мертвый"]
# necrolog_separated = list()
# necrolog = set()
# for token in doc:
#     if token.lemma_ in death_words:    
#         children = recursive_children(token)
#         # print(children)
#         for child in children:
#             if child.ent_type_ == "PER":
#                 # print(child.text, child.ent_type_)
#                 necrolog_separated.append(child)
# for ent in doc.ents:
#     if all(token in necrolog_separated for token in ent):
#         necrolog.add(ent)
# print([dead.text for dead in necrolog])     


# # Iterate over the entities in the document
# for ent in doc.ents:
#     if ent.label_ == "PER":
#         entity_tokens = [token for token in ent]
#         entity_text = " ".join([token.text for token in ent])
#         # same_head = all(token.head == entity_tokens[0].head for token in entity_tokens)
#         print(entity_text, entity_tokens[0].head.text)
#         print([ancestor for ancestor in entity_tokens[0].ancestors])
#         # print([child for child in entity_tokens[0].children] )

# for token in doc:
#     print(token.text, token.pos_, token.dep_)
#     print([ancestor for ancestor in token.ancestors])
#     print([child for child in token.children])
#     print()