In [1]:
import lxml.html
import requests
import re
import lxml.etree as etree

In [2]:
from difflib import SequenceMatcher
def similar(a, b):
    """Return the difflib similarity ratio between two sequences.

    The result is a float in ``[0.0, 1.0]``: ``1.0`` for identical inputs,
    ``0.0`` when nothing matches. No junk heuristic is supplied.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [3]:
import dataset
import json
import os

In [4]:
class QueryByIndex:
    """A fetched and parsed index page whose hits can be looked up by number.

    On construction the page at ``url`` is downloaded and every ``<li>`` that
    has a direct ``<span class="b-doc-expl">`` child is stored in
    ``self.matches``. The ``start`` attribute of the first ``<ol start=...>``
    on the page is stored in ``self.starts_from`` and is used as the number of
    the first stored hit.
    """

    # Prefix prepended to the ``href`` of an expand link to build the final URL.
    NK_PREFIX = 'https://processing.ruscorpora.ru/'
    # Trailing "[...] [...] ←…→" block removed from a hit's text before comparing.
    _END_CLEAR = re.compile(r'\s+\[[^\]]+\]\s+\[[^\]]+\]\s+←…→$')
    # Any run of whitespace; collapsed to a single space before comparing.
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, url, timeout=None):
        """Download ``url`` and index its hits.

        Args:
            url: Address of the page to fetch.
            timeout: Forwarded to ``requests.get``. The default ``None`` keeps
                the previous behaviour (wait indefinitely); pass a number of
                seconds to bound the request.

        Raises:
            IndexError: The page contains no ``<ol start=...>`` element.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        page = lxml.html.fromstring(response.text)
        self.matches = page.xpath("//li/span[@class='b-doc-expl']/parent::*")
        numbered_lists = page.xpath("//ol[@start]")
        if not numbered_lists:
            # Same exception type as the bare ``[0]`` lookup raised before,
            # but with a message that says what was missing.
            raise IndexError(f"no <ol start=...> element found at {url}")
        self.starts_from = int(numbered_lists[0].get('start'))

    def get_match_by_position(self, position):
        """Return the stored hit whose number on the page is ``position``.

        Raises:
            IndexError: ``position`` lies outside the hits stored for this
                page. A position below ``starts_from`` used to wrap around to
                a negative index and silently return the wrong hit.
        """
        offset = position - self.starts_from
        if not 0 <= offset < len(self.matches):
            raise IndexError(
                f"position {position} is outside this page "
                f"(starts at {self.starts_from}, {len(self.matches)} matches)")
        return self.matches[offset]

    def get_link_from_match(self, match, items):
        """Return the absolute expand URL that belongs to ``items``.

        Args:
            match: Element (as returned by ``get_match_by_position``) that is
                searched for ``<a>`` descendants containing ``←…→``.
            items: Iterable of mappings with a ``"text"`` key; the values are
                concatenated to form the text the hit is compared against.

        Returns:
            ``NK_PREFIX`` followed by the chosen link's ``href``. With several
            candidate links, the one whose parent text is most similar to the
            item text is chosen; on equal scores the later link wins.

        Raises:
            IndexError: ``match`` contains no expand link.
        """
        exp_links = match.xpath(".//a[contains(., '←…→')]")
        if not exp_links:
            raise IndexError("match contains no '←…→' expand link")
        if len(exp_links) == 1:
            return self.NK_PREFIX + exp_links[0].get('href')

        # Only needed when there is something to disambiguate.
        item_text = self._WHITESPACE.sub(' ', "".join(i["text"] for i in items))

        best_index, best_score = 0, -1.0
        for index, link in enumerate(exp_links):
            li_text = link.xpath('string(./..)')
            li_text = self._END_CLEAR.sub("", li_text)
            li_text = self._WHITESPACE.sub(' ', li_text)
            score = similar(li_text, item_text)
            # ``>=`` keeps the original tie-break: the last best link wins.
            if score >= best_score:
                best_index, best_score = index, score
        return self.NK_PREFIX + exp_links[best_index].get('href')

In [5]:
def update_on_file(dataset_name, datasets_dir=None):
    """Add an ``itemExpandURL`` to every item of one dataset file, in place.

    The file is read as JSON, and for each entry of
    ``["nk:datasetContent"]["items"]`` the expand link is resolved through
    ``QueryByIndex`` and stored under ``"itemExpandURL"``. The file is then
    rewritten. If any item already carries ``"itemExpandURL"`` the file is
    left untouched and a "Skipping" line is printed instead.

    Args:
        dataset_name: File name of the dataset inside ``datasets_dir``.
        datasets_dir: Directory holding the dataset files. Defaults to
            ``../files/datasets`` relative to the working directory.

    Raises:
        IndexError: Propagated from ``QueryByIndex`` when a page or hit cannot
            be resolved; the file is not modified in that case.
    """
    if datasets_dir is None:
        datasets_dir = os.path.join("..", "files", "datasets")
    dataset_path = os.path.join(datasets_dir, dataset_name)

    # ``with`` closes the handle; the previous ``open(...).read()`` leaked it.
    with open(dataset_path, encoding="utf-8") as json_reader:
        content = json.load(json_reader)
    items = content["nk:datasetContent"]["items"]

    # Decide up front whether to skip, so no pages are fetched for a file
    # that is going to be left unchanged anyway.
    if any("itemExpandURL" in item for item in items):
        print(f"Skipping {dataset_name}")
        return

    # One parsed index page per page number.
    # NOTE(review): assumes items sharing an itemPageIndex share an itemURL.
    index_pages = {}
    for item in items:
        page = int(item["itemPageIndex"])
        position = int(item["indexInPage"])
        if page not in index_pages:
            index_pages[page] = QueryByIndex(item["itemURL"])

        query = index_pages[page]
        # ``item`` is the dict stored in ``items``, so this updates it in place.
        item["itemExpandURL"] = query.get_link_from_match(
            query.get_match_by_position(position), item["text"])

    # Serialise before opening for writing: "w" truncates the file, so a
    # failure while encoding must not be able to leave it empty.
    serialized = json.dumps(content)
    with open(dataset_path, "w", encoding="utf-8") as json_writer:
        json_writer.write(serialized)

    print(f"Updated file {dataset_name}")
        

In [6]:
from time import sleep

In [7]:
# Index of the first dataset to process; earlier ones are skipped so an
# interrupted run can be resumed.
start_from = 20
# How many times one dataset is tried before moving on, and the pause between
# tries so a failing page is not requested again immediately.
max_attempts = 5
retry_delay_seconds = 2

# os.listdir returns names in arbitrary order; sorting makes the numbering
# (and therefore start_from) mean the same thing on every run.
dataset_names = sorted(os.listdir(os.path.join("..", "files", "datasets")))

for n, dataset_name in enumerate(dataset_names):
    if n < start_from:
        continue
    print(f"({n}/{len(dataset_names)}) {dataset_name}")
    for attempt in range(1, max_attempts + 1):
        try:
            update_on_file(dataset_name)
            break
        except IndexError:
            # Raised when a page or hit could not be resolved; try again.
            print("x")
            if attempt < max_attempts:
                sleep(retry_delay_seconds)
    else:
        # Every attempt failed: say so instead of moving on silently.
        print(f"Giving up on {dataset_name} after {max_attempts} attempts")
        

(20/120) НКРЯ_за_ВАР_ДвРефлексивИнтерпозиция.json
Skipping НКРЯ_за_ВАР_ДвРефлексивИнтерпозиция.json
(21/120) НКРЯ_за_ВАР_ОтрМестИнтерпозиция.json
Skipping НКРЯ_за_ВАР_ОтрМестИнтерпозиция.json
(22/120) НКРЯ_за_ВАР_Удвоение.json
x
x
x
x
x
(23/120) НКРЯ_из-за_ВАР_АмальгамыИнтерпозиция.json
Skipping НКРЯ_из-за_ВАР_АмальгамыИнтерпозиция.json
(24/120) НКРЯ_из-за_ВАР_АппроксИнверсия.json
Skipping НКРЯ_из-за_ВАР_АппроксИнверсия.json
(25/120) НКРЯ_из-за_ВАР_ОтрМестИнтерпозиция.json
Skipping НКРЯ_из-за_ВАР_ОтрМестИнтерпозиция.json
(26/120) НКРЯ_из-за_ВАР_Удвоение.json
Skipping НКРЯ_из-за_ВАР_Удвоение.json
(27/120) НКРЯ_из_ВАР_АппроксИнверсия.json
Skipping НКРЯ_из_ВАР_АппроксИнверсия.json
(28/120) НКРЯ_из_ВАР_ДвРефлексивИнтерпозиция.json
Skipping НКРЯ_из_ВАР_ДвРефлексивИнтерпозиция.json
(29/120) НКРЯ_из_ВАР_ОтрМестИнтерпозиция.json
Skipping НКРЯ_из_ВАР_ОтрМестИнтерпозиция.json
(30/120) НКРЯ_из_ВАР_Удвоение.json
Skipping НКРЯ_из_ВАР_Удвоение.json
(31/120) НКРЯ_к_ВАР_АппроксИнверсия.json
Skipping Н