In [1]:
import pandas as pd
import os
from glob import glob

import re

In [2]:
RU_WEBS_DIR = "./../a-crawling/crawling/crawling/spiders/pages/ru"
EN_WEBS_DIR = "./../a-crawling/crawling/crawling/spiders/pages/en"

# Only files directly inside each language directory are matched: without
# recursive=True, glob treats "**" exactly like "*", so the pattern is spelled
# "*.html" to say what it really does.
# glob returns entries in arbitrary order; sorting keeps the processing order
# (and therefore the order of the generated index) reproducible between runs.
ru_webs_pl = sorted(glob(f"{RU_WEBS_DIR}/*.html"))
en_webs_pl = sorted(glob(f"{EN_WEBS_DIR}/*.html"))

print(f"Russian websites: {len(ru_webs_pl)}")
print(f"English websites: {len(en_webs_pl)}")

Russian websites: 195
English websites: 565


In [3]:
# Per-language lemma files in the sibling b-tokens directory.
RU_LEMMAS_FILE, EN_LEMMAS_FILE = (
    "./../b-tokens/ru_lemmas.txt",
    "./../b-tokens/en_lemmas.txt",
)

# Per-language token files in the same directory.
RU_TOKENS_FILE, EN_TOKENS_FILE = (
    "./../b-tokens/ru_tokens.txt",
    "./../b-tokens/en_tokens.txt",
)

In [4]:
# config

# RU or EN -- the only value to edit. Everything below is derived from it so
# the page list, the lemma file and the output name cannot drift out of sync.
lang = "RU"

# Per-language inputs: (list of page paths, lemma file path).
_LANG_SETTINGS = {
    "RU": (ru_webs_pl, RU_LEMMAS_FILE),
    "EN": (en_webs_pl, EN_LEMMAS_FILE),
}
if lang not in _LANG_SETTINGS:
    raise ValueError(f"Unsupported lang {lang!r}; expected 'RU' or 'EN'")

webs_pl, lemmas_file = _LANG_SETTINGS[lang]

# "ru_inverted_index.txt" for RU, "en_inverted_index.txt" for EN.
index_output_file_name = f"{lang.lower()}_inverted_index.txt"

In [5]:
# Known lemmas: each lemma (key) maps to the list of its tokens (value).
lemmas_map = {}

In [6]:
# Inverted index: each lemma (key) maps to a container object that holds the
# indexes of the files the lemma occurs in (value).
index = {}

In [7]:
# Populate lemmas_map from the lemma file. Each non-blank line is read as
# whitespace-separated fields: the first field is the lemma followed by one
# trailing character that is dropped (presumably a ':' separator -- TODO
# confirm against the file written by the tokenization step); the remaining
# fields are the tokens belonging to that lemma.
# The context manager closes the file even if parsing fails, and the explicit
# encoding keeps the read independent of the platform default.
with open(lemmas_file, "r", encoding="utf-8") as lemmas_fh:
    for line in lemmas_fh:
        # str.split() with no argument splits on runs of whitespace and never
        # yields empty strings, so the last token is kept even when the final
        # line of the file has no trailing newline.
        fields = line.split()
        if not fields:
            # Blank line: nothing to register (avoids creating an '' lemma).
            continue

        lemma = fields[0][:-1]
        lemmas_map[lemma] = fields[1:]

In [8]:
import html2text
# Shared HTML-to-text converter; the indexing loop calls h2t.handle() on every page.
h2t = html2text.HTML2Text()
# html2text option that leaves link markup out of the converted text.
h2t.ignore_links = True

In [9]:
import spacy
import spacy_transformers

# Both spaCy pipelines are loaded up front, regardless of the configured
# language; get_tokens picks one of them according to `lang`.
nlp_en = spacy.load("en_core_web_sm")
nlp_ru = spacy.load("ru_core_news_sm")

def get_tokens(text):
    """Run the spaCy pipeline for the configured language over ``text``.

    The Russian pipeline is used when the module-level ``lang`` equals "RU";
    any other value selects the English pipeline.

    Returns a list of the resulting tokens that are alphabetic (``is_alpha``)
    and are not stop words (``is_stop``).
    """
    pipeline = nlp_ru if lang == "RU" else nlp_en
    return [tok for tok in pipeline(text) if tok.is_alpha and not tok.is_stop]

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [10]:
class InvertedIndexEntry:
    """Index record for one lemma: a running count plus a set of file indexes."""

    def __init__(self, lemma):
        self.lemma = lemma
        self.files = set()
        self.count = 0

    def add_count(self, count):
        """Increase the running count by ``count``."""
        self.count += count

    def add_file(self, file_idx):
        """Record one occurrence of the lemma in file ``file_idx``.

        The count goes up by one on every call, even when ``file_idx`` is
        already present in the set of files.
        """
        self.files.add(file_idx)
        self.count += 1

    def __repr__(self):
        # File indexes are shown in ascending order for stable, readable output.
        ordered_files = sorted(self.files)
        return f"InvertedIndexEntry(lemma = '{self.lemma}', count = {self.count}, files = {ordered_files})"

In [11]:
# Rebuild the index from scratch so that re-running this cell does not add the
# same occurrences to the existing entries a second time.
index.clear()

for pl in webs_pl:
    # Page files are expected to be named "<number>-<anything>.html"; the
    # leading number becomes the file index stored in the inverted index.
    # Matching on the base name keeps this independent of the path separator
    # and of digits that may appear in parent directory names.
    page_name = os.path.basename(pl)
    file_idx_match = re.match(r'(\d+)-.*\.html', page_name)
    if file_idx_match is None:
        raise ValueError(f"Cannot extract a file index from page name: {pl!r}")
    file_idx = int(file_idx_match.group(1))

    # Explicit encoding so the read does not depend on the platform default.
    # NOTE(review): assumes the crawler stored pages as UTF-8 -- confirm.
    with open(pl, "r", encoding="utf-8") as file:
        file_content = file.read()

    file_text = h2t.handle(file_content)
    file_tokens = get_tokens(file_text)

    for token in file_tokens:
        lemma = token.lemma_

        # Known lemma: it is already present in the map of detected lemmas.
        # Tokens whose lemma is not in lemmas_map are ignored.
        if lemma in lemmas_map:
            if lemma not in index:
                index[lemma] = InvertedIndexEntry(lemma)
            # One call per token occurrence: the entry's count grows by one
            # and the file index is added to its set of files.
            index[lemma].add_file(file_idx)

In [12]:
# Spot-check a single entry. The key is a Cyrillic lemma, so this lookup
# raises KeyError unless the index built above contains it.
print(index['анализ'])

InvertedIndexEntry(lemma = 'анализ', count = 68, files = [3, 4, 14, 15, 19, 25, 27, 31, 38, 56, 66, 82, 91, 102, 104, 109, 112, 113, 121, 122, 130, 150, 154, 171, 194])


In [15]:
# write to index file
# One line per lemma: "<lemma>[<count>] <file indexes, ascending, space-separated>".
# The encoding is explicit so that non-ASCII lemmas are written the same way
# on every platform instead of depending on the locale's default encoding.
with open(index_output_file_name, "w", encoding="utf-8") as f:
    for index_entry in index.values():
        sorted_files = sorted(index_entry.files)
        f.write(f"{index_entry.lemma}[{index_entry.count}] {' '.join(map(str, sorted_files))}\n")