In [1]:
from math import log
from tqdm import tqdm

In [2]:
class Tree:
    """Node of a boolean query tree.

    Inner nodes hold an operator in ``value`` ('&' or '|') and two children.
    Leaves have no children and hold the index entry of a query term in
    ``value``. The remaining fields are per-node scratch state used while a
    query is evaluated over Fibonacci-coded posting streams.
    """

    def __init__(self):
        # Operator for inner nodes, index entry of the term for leaves.
        self.value = None
        self.left = None
        self.right = None
        # Enclosing node. parse() sets this on every child it creates; it is
        # declared here so that the root has the attribute too (as None).
        self.parent = None
        # Leaf only: single-bit cursor into the coded stream
        # (None = not started, 0 = exhausted).
        self.mask = None
        # Leaf only: running total of the decoded gaps, i.e. the last
        # document number produced.
        self.sum = 0
        # Inner node only: candidates already pulled from each child but not
        # yet emitted.
        self.left_num = None
        self.right_num = None

In [3]:
def parse(s, tree, index):
    """Parse the boolean query ``s`` into the tree rooted at ``tree``.

    Supported syntax: terms, the binary operators '&' and '|', and
    parentheses. '|' binds weaker than '&': the expression is split at the
    first top-level '|' if there is one, otherwise at the first top-level '&'.

    Inner nodes get the operator as ``value`` and two freshly created ``Tree``
    children (each with ``parent`` set). A leaf gets
    ``index[stemmer.stem(term.lower())]`` as ``value``; ``stemmer`` is the
    module-level stemmer, and a term missing from ``index`` raises KeyError.

    Parameters:
        s: query text.
        tree: node to fill in.
        index: mapping from stemmed term to its index entry.
    """
    def mask_groups(expr):
        # Replace every parenthesised group (parentheses included) with
        # spaces so that only top-level operators remain visible. Character
        # positions are preserved, so offsets found in the mask are valid
        # offsets into ``expr``.
        balance = 0
        masked = []
        for letter in expr:
            if letter == '(':
                balance += 1
            if letter == ')':
                balance -= 1
            if balance > 0 or (balance == 0 and letter == ')'):
                masked.append(' ')
            else:
                masked.append(letter)
        return ''.join(masked)

    temp = mask_groups(s)
    # A blank mask means the whole expression is wrapped in parentheses.
    # Peel one layer at a time and re-mask after every peel: the contents may
    # contain nested groups whose operators must stay hidden, and there may
    # be several enclosing layers, e.g. '((a | b) & c)' or '((a))'.
    while s and not temp.strip(' '):
        s = s[1:-1].strip()
        temp = mask_groups(s)

    umper = temp.find('&')
    stick = temp.find('|')
    if stick == -1 and umper == -1:
        # No top-level operator left: this is a single term.
        tree.value = index[stemmer.stem(s.lower())]
        return

    tree.left = Tree()
    tree.left.parent = tree
    tree.right = Tree()
    tree.right.parent = tree
    if stick != -1:
        split_at = stick
        tree.value = '|'
    else:
        split_at = umper
        tree.value = '&'

    parse(s[:split_at].strip(), tree.left, index)
    parse(s[split_at + 1:].strip(), tree.right, index)

In [4]:
def count_docs_sets(tree):
    """Evaluate a query tree with plain set algebra.

    A node missing either child is treated as a leaf and contributes
    ``set(tree.value)``. An inner node combines the results of its children:
    intersection when its value is '&', union for any other value.
    """
    is_leaf = tree.left is None or tree.right is None
    if is_leaf:
        return set(tree.value)

    left_docs = count_docs_sets(tree.left)
    right_docs = count_docs_sets(tree.right)
    return left_docs & right_docs if tree.value == '&' else left_docs | right_docs

In [5]:
def count_docs_qtree(tree):
    """Evaluate a query tree over Fibonacci-coded posting streams.

    Each leaf's ``value`` is expected to be an integer bit stream: a leading
    sentinel 1 bit followed by coded gaps between document numbers. Leaves
    are decoded one number at a time with ``take_num`` and merged bottom-up:
    '&' nodes advance whichever side is behind until both agree, '|' nodes
    emit the smaller candidate.

    The traversal position is stored in the nodes themselves (``mask``,
    ``sum``, ``left_num``, ``right_num``) and is not reset afterwards, so a
    tree has to be rebuilt before the same query can be evaluated again.

    Returns:
        list of matching document numbers, in the order they were produced.
    """
    def supp(tree):
        # Produce the next document number for this subtree, or None when
        # the subtree is exhausted.
        if tree.left is None or tree.right is None:
            if tree.mask is None:
                # First visit: put the cursor on the bit just below the
                # sentinel. bit_length() is exact for integers of any size;
                # int(log(value, 2)) rounds the stream to a float first and
                # can land one bit off for long streams (for example when
                # the top 53+ bits are all ones).
                tree.mask = 1 << (tree.value.bit_length() - 2)
            elif tree.mask == 0:
                return
            tree.mask, num = take_num(tree.mask, tree.value)
            # The stream stores gaps; the running sum is the document number.
            tree.sum += num
            return tree.sum

        # Refill whichever candidate was consumed by the previous call.
        if tree.left_num is None:
            tree.left_num = supp(tree.left)
        if tree.right_num is None:
            tree.right_num = supp(tree.right)
        if tree.value == '&':
            # Intersection: advance the side that is behind until both sides
            # agree. If either side runs out, there are no more matches.
            while tree.left_num != tree.right_num:
                if tree.left_num is None or tree.right_num is None:
                    return
                if tree.left_num < tree.right_num:
                    tree.left_num = supp(tree.left)
                else:
                    tree.right_num = supp(tree.right)
            # Both equal (or both None): emit and consume both candidates.
            result = tree.left_num
            tree.left_num = None
            tree.right_num = None
            return result
        else:
            # Union: emit the smaller candidate and consume only that one;
            # on a tie emit once and consume both.
            if tree.left_num is None and tree.right_num is None:
                return
            elif tree.left_num is None:
                result = tree.right_num
                tree.right_num = None
                return result
            elif tree.right_num is None:
                result = tree.left_num
                tree.left_num = None
                return result
            elif tree.left_num < tree.right_num:
                result = tree.left_num
                tree.left_num = None
                return result
            elif tree.left_num > tree.right_num:
                result = tree.right_num
                tree.right_num = None
                return result
            result = tree.left_num
            tree.left_num = None
            tree.right_num = None
            return result

    docs = []
    new_doc = supp(tree)
    while new_doc is not None:
        docs.append(new_doc)
        new_doc = supp(tree)
    return docs

In [6]:
def index2jmp(index):
    """Convert posting lists into gap ("jump") lists.

    For every key the first number is kept as is and each following number
    is replaced by its difference from the previous one, e.g.
    ``[1, 4, 6]`` becomes ``[1, 3, 2]``. The input mapping is not modified.
    An empty posting list raises IndexError.
    """
    return {
        key: [item[0]] + [item[i] - item[i - 1] for i in range(1, len(item))]
        for key, item in index.items()
    }

In [7]:
# Table of the first 32 Fibonacci numbers: fibs[0] = 0, fibs[1] = 1, ...,
# fibs[31] = 1346269.
fibs = [0, 1]
for _ in range(30):
    fibs.append(sum(fibs[-2:]))

In [8]:
def num2fib(num):
    """Return the Zeckendorf (Fibonacci) representation of ``num`` as a bit mask.

    Bit k of the result carries the weight of the k-th term of the sequence
    1, 2, 3, 5, 8, ... The representation is built greedily from the largest
    weight down, so it never contains two adjacent 1 bits. ``num2fib(0)``
    is 0.

    The weights are generated as far as ``num`` requires instead of being
    read from a fixed-size table, so large numbers no longer produce a
    malformed code.

    Raises:
        ValueError: if ``num`` is negative.
    """
    if num < 0:
        raise ValueError('num2fib expects a non-negative integer, got %r' % (num,))

    # weights[k] is the value of bit k; extend until one weight exceeds num.
    weights = [1, 2]
    while weights[-1] <= num:
        weights.append(weights[-2] + weights[-1])

    r = 0
    for k in range(len(weights) - 1, -1, -1):
        if weights[k] <= num:
            r |= 1 << k
            num -= weights[k]
    return r

In [9]:
def fib_in_index(index, word, num):
    """Append the Fibonacci code of ``num`` to the bit stream ``index[word]``.

    A stream is a single integer. It starts as 1 (a sentinel bit) the first
    time ``word`` is seen; the code bits of ``num`` are then appended lowest
    bit first, followed by one extra 1 bit. ``index`` is updated in place.

    Note: ``num2fib(0)`` is 0, so for ``num == 0`` only the trailing 1 bit is
    appended.
    """
    code = num2fib(num)
    stream = index[word] if word in index else 1
    while code:
        stream = (stream << 1) | (code & 1)
        code >>= 1
    # Trailing 1 bit that closes the code.
    index[word] = (stream << 1) | 1

In [10]:
def bin_inverse(num):
    """Reverse the bits of ``num`` that lie below its leading 1 bit.

    The leading 1 acts as a length sentinel and is dropped, e.g.
    ``0b1011 -> 0b110`` and ``0b1 -> 0``.

    Raises:
        ValueError: if ``num`` is smaller than 1. Such a value has no
            sentinel bit, and shifting it right never reaches 1.
    """
    if num < 1:
        raise ValueError('bin_inverse expects a positive integer, got %r' % (num,))
    res = 0
    while num != 1:
        # Move the lowest remaining bit of num onto the low end of res.
        res = (res << 1) | (num & 1)
        num >>= 1
    return res

In [11]:
def index2fib(index):
    """Encode every list in ``index`` into a Fibonacci-coded bit stream.

    Each number of each list is passed, in order, to ``fib_in_index`` together
    with its key; the resulting mapping is returned. The input is left
    untouched.
    """
    encoded = {}
    pairs = ((word, number) for word, numbers in index.items() for number in numbers)
    for word, number in pairs:
        fib_in_index(encoded, word, number)
    return encoded

In [12]:
def fib2num(num):
    """Decode a Fibonacci bit mask back into the number it represents.

    Bit k of ``num`` carries the weight of the k-th term of the sequence
    1, 2, 3, 5, 8, ...; the result is the sum of the weights of the set bits.
    The weights are generated on the fly, so the length of the code is not
    limited by a precomputed table.

    Raises:
        ValueError: if ``num`` is negative (right-shifting a negative integer
            never reaches 0).
    """
    if num < 0:
        raise ValueError('fib2num expects a non-negative integer, got %r' % (num,))
    res = 0
    weight, following = 1, 2
    while num:
        if num & 1:
            res += weight
        weight, following = following, weight + following
        num >>= 1
    return res

In [13]:
def take_num(mask, value):
    """Decode the next number from the bit stream ``value``.

    ``mask`` is a single-bit cursor pointing at the first bit of the next
    code; bits are read from that position downwards. A code ends at the
    first pair of adjacent 1 bits; the second 1 of that pair is the
    terminator and is not part of the code.

    Returns:
        (mask, number): the cursor moved past the terminator (0 when the
        stream is finished) and the decoded number.

    Raises:
        ValueError: if the cursor runs off the end of the stream before a
            terminator is found (``mask == 0`` on entry, or a truncated
            stream). Without this check the loop would never end.
    """
    mb_end = False
    # The leading 1 is a sentinel that preserves the length of the code.
    fnum = 1
    while True:
        if mask == 0:
            raise ValueError('bit stream ended before a code terminator was found')
        bit = mask & value
        if bit != 0 and mb_end:
            # Second 1 in a row: terminator. Step over it and stop.
            mask >>= 1
            break
        if bit != 0:
            fnum = (fnum << 1) | 1
            mb_end = True
        else:
            fnum <<= 1
            mb_end = False
        mask >>= 1
    while fnum & 1 == 0:
        fnum >>= 1
    # Bits were collected lowest-weight first; flip them back before decoding.
    return mask, fib2num(bin_inverse(fnum))

In [15]:
import re
import string
from nltk.stem import SnowballStemmer

In [16]:
# Inverted index filled by the indexing loop: stem -> list of document numbers.
index_real = {}
# URLs of the documents found so far, in the order they were encountered.
docs = []
# Extra punctuation characters (quotes, dashes, inverted marks) in addition to
# string.punctuation.
more_punct = '«»–—‒―⸺⸻—¿¡'
# Russian Snowball stemmer from NLTK.
stemmer = SnowballStemmer("russian") 
# Memo of stemmer results so each distinct word is stemmed only once.
STEMMER_CACHE = {}
# Document counter: incremented by the indexing loop each time it records a URL.
ind = 0

In [17]:
# Input files, opened one after another by the indexing loop. The names are
# relative, so they are resolved against the current working directory.
files = [
    'lenta.ru_d1f7e910-b5f1-4719-b724-090093e143fe_01',
    'lenta.ru_b6838708-1aa9-496f-bf88-e277374f93a8_01',
    'lenta.ru_b81aa623-ba55-43dc-b3c5-47ae2253ad27_01',
    'lenta.ru_aa5a1ef9-6ca4-4dc7-890f-308d4d62db59_01',
    'lenta.ru_6398c7e2-16da-40d2-8923-95f65aaaeb07_01',
    'lenta.ru_159b9f4b-972b-48b1-8ec3-44fbd6be33c4_01',
    'lenta.ru_80e74243-83da-4367-8ae3-fe38d333f283_01',
    'lenta.ru_4deb864d-3c46-45e6-85f4-a7ff7544a3fb_01'
]

In [18]:
# Build the inverted index. Each file is read line by line as bytes:
#   * a line that is not valid UTF-8 is searched for a URL; if one is found
#     it is recorded in `docs` and the document counter `ind` moves on;
#   * any other line is split into Cyrillic words, which are stemmed and
#     indexed under the current value of `ind`.
punct_chars = string.punctuation + more_punct  # loop-invariant, built once
for f in files:
    with open(f, 'rb') as file:
        for line in tqdm(file):
            try:
                text = line.decode()
            except UnicodeDecodeError:
                # Only the decode is guarded: a bare `except:` around the
                # whole body would also hide real errors (and Ctrl-C).
                mb = re.findall(r'https?://.*/', str(line))
                if mb:
                    docs.append(mb[0])
                    ind += 1
                continue

            words = [item.lower() for item in re.findall(r'[а-яА-ЯёЁ]+', text)
                     if item not in punct_chars]
            for word in words:
                # Keyed by the word itself: the dict hashes its keys anyway,
                # and keying by hash(word) would merge colliding words.
                if word not in STEMMER_CACHE:
                    STEMMER_CACHE[word] = stemmer.stem(word)
                stem_word = STEMMER_CACHE[word]
                postings = index_real.setdefault(stem_word, [])
                # `ind` never decreases while indexing, so a document that is
                # already listed can only be the last entry. Checking that
                # entry is O(1); scanning the whole list for every word made
                # indexing slow down as the lists grew.
                if not postings or postings[-1] != ind:
                    postings.append(ind)

13679it [00:03, 4276.39it/s]
11358it [00:02, 4834.42it/s]
11020it [00:03, 3632.47it/s]
11470it [00:03, 3033.71it/s]
11043it [00:04, 2479.54it/s]
11966it [00:05, 2069.69it/s]
12842it [00:07, 1783.45it/s]
11348it [00:06, 1640.94it/s]


In [19]:
# Turn the posting lists of the real index into gap lists, then pack every
# gap list into one Fibonacci-coded integer per stem.
jmpfib = index2fib(index2jmp(index_real))

In [20]:
# Example boolean query: three terms joined with '&' (AND).
s = 'украина & сша & нато'

In [21]:
# Empty root node; parse() fills it in.
tree = Tree()

In [22]:
# Build the query tree in place. Leaf values are looked up in jmpfib, so a
# query term whose stem is not in the index raises KeyError.
parse(s.strip(), tree, jmpfib)

In [20]:
# Small hand-made index (term -> sorted document numbers) for checking the
# encoder by eye.
# NOTE(review): this rebinds `ind`, which the indexing loop uses as its
# integer document counter. Re-running that loop after this cell breaks
# (`ind += 1` on a dict); a separate name would avoid the clash.
ind = {
    'putin': [1, 4, 6, 12, 67],
    'medvedev': [2, 6, 12, 67, 123],
    'makron': [7, 12, 45, 87],
    'biden': [1, 6, 43, 45, 89]
}

In [23]:
# Encode the toy index and display the result.
# NOTE(review): this overwrites the `jmpfib` that was built from index_real.
jmpfib = index2fib(index2jmp(ind))
jmpfib

{'putin': 30264323,
 'medvedev': 6294605315,
 'makron': 180835859,
 'biden': 476329619}

In [483]:
import json

In [485]:
# Load the JSON file 'index_real.json' (relative to the working directory)
# into `data`.
# NOTE(review): open() is called without `encoding`, so the platform default
# is used; confirm the file decodes correctly that way if it contains
# non-ASCII text.
with open('index_real.json') as file:
    data = json.load(file)