In [1]:
import zipfile
import codecs

import pymorphy2
import re
import tqdm

In [2]:
def read_texts(fn='data/texts.zip'):
    """Lazily yield the decoded lines of ``texts.txt`` stored inside a zip archive.

    Args:
        fn: Path to the zip archive that contains a member named ``texts.txt``.

    Yields:
        str: UTF-8 decoded chunks, one per line of the member; line endings
        are kept as they appear in the file.
    """
    # Both handles stay open only while the generator is being consumed.
    with zipfile.ZipFile(fn) as archive, archive.open('texts.txt') as raw_lines:
        for line in codecs.iterdecode(raw_lines, 'utf-8'):
            yield line

In [3]:
class Parser:
    """Split text into word tokens and reduce each token to a normal form.

    Attributes:
        pm: ``pymorphy2.MorphAnalyzer`` instance used to look up normal forms.
        r: Compiled pattern matching runs of non-word characters; it is used
            as the token separator.
    """

    def __init__(self):
        self.pm = pymorphy2.MorphAnalyzer()
        self.r = re.compile(r'\W+')

    def parse(self, text):
        """Return the normalized tokens of ``text`` in their original order.

        The text is split on runs of non-word characters, empty fragments
        (produced by leading/trailing separators) are dropped, and every
        remaining token is replaced by the first normal form reported by the
        morphological analyzer.

        Args:
            text: String to tokenize.

        Returns:
            list[str]: One normalized token per word; ``[]`` for text with no
            word characters.
        """
        words = (word for word in self.r.split(text) if word)
        # The normalized forms must be what is returned: previously they were
        # built lazily and discarded, so raw tokens leaked into the index.
        return [self.pm.normal_forms(word)[0] for word in words]

In [4]:
from collections import defaultdict, Counter

class Doc:
    """Plain record for one document: its id, its title and its full text."""

    def __init__(self, doc_id, title, text):
        # Store the three constructor arguments unchanged.
        self.doc_id, self.title, self.text = doc_id, title, text

class Posting:
    """One inverted-index entry: a document id plus the positions of a term in it."""

    def __init__(self, doc_id, positions: list):
        # The positions list is stored by reference, not copied.
        self.positions = positions
        self.doc_id = doc_id
        
class Indexer:
    """In-memory positional inverted index with a simple tf/df ranking.

    Attributes:
        inv_index: Maps a term to the list of ``Posting`` objects for the
            documents that contain it.
        dictionary: ``Counter`` mapping a term to its document frequency
            (number of indexed documents containing the term).
        parser: ``Parser`` used for both documents and queries, so that both
            sides are tokenized the same way.
        docs: Indexed documents; a posting's ``doc_id`` is an index into
            this list.
    """

    def __init__(self):
        self.inv_index = defaultdict(list)
        self.dictionary = Counter()
        self.parser = Parser()
        self.docs = []

    def index_doc(self, doc: "Doc"):
        """Add ``doc`` to the index.

        Args:
            doc: Document whose ``text`` attribute is tokenized and indexed.
        """
        # term -> token positions of that term inside this document
        doc_index = defaultdict(list)
        for pos, word in enumerate(self.parser.parse(doc.text)):
            doc_index[word].append(pos)

        # Each distinct term of the document raises its document frequency by one.
        self.dictionary.update(doc_index.keys())

        # ToDo: fix it to get tf*idf!!!
        # Postings carry the position of the document in self.docs (not
        # doc.doc_id), because search() resolves hits through self.docs.
        internal_id = len(self.docs)
        for word, positions in doc_index.items():
            self.inv_index[word].append(Posting(internal_id, positions))

        self.docs.append(doc)

    def search(self, q, n=10):
        """Rank indexed documents against the query ``q``.

        For every query term, each document containing it receives
        ``tf / df``, where ``tf`` is the number of occurrences of the term in
        the document and ``df`` is the term's document frequency; the
        contributions of all query terms are summed.

        Args:
            q: Query string; parsed with the same parser as the documents.
            n: Maximum number of results to return.

        Returns:
            list[tuple]: ``(doc, score)`` pairs sorted by descending score.
            Documents matching no query term are omitted.
        """
        q_words = self.parser.parse(q)
        scores = defaultdict(float)

        for q_word in q_words:
            # .get() instead of indexing: reading a missing key of a
            # defaultdict would insert an empty posting list, so searching
            # must not go through self.inv_index[...].
            postings = self.inv_index.get(q_word)
            if not postings:
                continue
            df = self.dictionary[q_word]
            for posting in postings:
                tf = len(posting.positions)
                scores[posting.doc_id] += tf / df

        res = sorted(scores.items(), key=lambda x: -x[1])[:n]
        return [(self.docs[x], rel) for (x, rel) in res]

In [8]:
from itertools import islice

# Number of texts to index. The same value drives the progress-bar total so
# the bar matches the number of documents actually processed.
N_DOCS = 150

indexer = Indexer()

docs = enumerate(islice(read_texts(), N_DOCS))
# NOTE(review): newer tqdm releases deprecate tqdm_notebook in favour of
# tqdm.notebook.tqdm -- confirm against the installed version before switching.
for doc_id, text in tqdm.tqdm_notebook(docs, total=N_DOCS):
    indexer.index_doc(Doc(doc_id, '', text))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [9]:
# Run a sample one-word query; search() returns at most its default of 10 hits.
res = indexer.search(q='браун')

In [10]:
# Show each hit as "<score> ------ <first 50 characters of the document text>".
for doc, rel in res:
    print(f"{rel} ------ {doc.text[:50]}")