In [1]:
from collections import namedtuple

In [2]:
# Lightweight record types shared by the parsing and search cells.
# Article: one parsed collection record; Query: one parsed query;
# Document: the (id, text) pair that gets indexed.
Article = namedtuple('Article', 'id title authors info abstract')
Query = namedtuple('Query', 'id query_string')
Document = namedtuple('Document', 'id text')

In [3]:
# Number of records expected in data/cran.all.1400; the parsing cell asserts
# that splitting the file on '.I' yields exactly this many chunks.
DOCS_COUNT = 1400

In [4]:
# Parse the collection file into a list of Article records.
articles = list()
with open('data/cran.all.1400') as f:
    raw_text = f.read()
# Each record starts with an '.I' marker; whatever precedes the first marker is dropped.
documents_data = raw_text.split('.I')[1:]
assert len(documents_data) == DOCS_COUNT
for doc_data in documents_data:
    # Record layout, as consumed below: <id> .T <title> .A <authors> .B <info> .W <abstract>
    (doc_id, doc_data) = doc_data.split('.T')
    doc_id = int(doc_id.strip())
    if doc_id != 240:
        (title, doc_data) = doc_data.split('.A')
    else:
        # Record 240 splits into three parts on '.A'; the third part is discarded.
        (title, doc_data, _) = doc_data.split('.A')
    title = title.strip()
    (authors, doc_data) = doc_data.split('.B')
    authors = authors.strip()
    if doc_id not in (576, 578):
        (info, abstract) = doc_data.split('.W')
    else:
        # Records 576 and 578 split into three parts on '.W'; the third part is discarded.
        (info, abstract, _) = doc_data.split('.W')
    info = info.strip()
    # Strip the abstract itself; previously this line overwrote `info` with the
    # abstract text and left `abstract` unstripped.
    abstract = abstract.strip()
    articles.append(Article(
        id=doc_id,
        title=title,
        authors=authors,
        info=info,
        abstract=abstract,
    ))

In [5]:
# Parse the query file into a list of Query records.
queries = []

with open('data/cran.qry') as f:
    raw_text = f.read()

# Each query starts with an '.I' marker; whatever precedes the first marker is dropped.
queries_data = raw_text.split('.I')[1:]
for position, query_data in enumerate(queries_data, start=1):
    parts = query_data.split('.W')
    # Exactly one '.W' marker is expected per query: header before it, query text after it.
    assert len(parts) == 2
    # Queries are numbered by their 1-based position in the file, not by the header value.
    queries.append(Query(id=position, query_string=parts[1]))

In [12]:
from collections import defaultdict
from collections import namedtuple

# Per-(query token, document) statistics handed to the scorer by Search.search.
Features = namedtuple(
    'Features',
    [
        'query_token_count',  # occurrences of the token in the query
        'doc_token_count',    # occurrences of the token in the document
        'token_df',           # number of indexed documents containing the token
        'all_docs',           # total number of documents given to Search
    ],
)

def get_counts_dict(sequence):
    """Count how many times each element occurs in ``sequence``.

    Returns a ``defaultdict(int)`` mapping element -> number of occurrences,
    with keys in order of first appearance.
    """
    counts = defaultdict(int)
    for item in sequence:
        counts[item] = counts.get(item, 0) + 1
    return counts


class Search(object):
    """In-memory search over documents, backed by an inverted index.

    ``documents`` are objects with ``id`` and ``text`` attributes.
    ``tokenizer`` is called on raw text and returns an iterable of tokens.
    ``scorer`` is called with the list of ``Features`` gathered for one
    document and returns a sortable score.
    """

    def __init__(self, documents, tokenizer, scorer):
        self._tokenizer = tokenizer
        self._scorer = scorer
        self._documents = documents
        # Built last: indexing uses self._tokenizer.
        self._inverted_index = self._build_invert_index(documents)

    def _build_invert_index(self, documents):
        """Return a mapping token -> list of ``(doc_id, count)`` pairs.

        Documents are visited in ascending ``id`` order, so every posting
        list is ordered by document id.
        """
        postings = defaultdict(list)
        for document in sorted(documents, key=lambda d: d.id):
            counts = get_counts_dict(self._tokenizer(document.text))
            for token, occurrences in counts.items():
                postings[token].append((document.id, occurrences))
        return postings

    def search(self, query):
        """Return ids of documents sharing a token with ``query``, highest score first.

        Documents with equal scores keep the order in which they were first matched.
        """
        query_counts = get_counts_dict(self._tokenizer(query))
        features_by_doc = defaultdict(list)
        for token, query_count in query_counts.items():
            postings = self._inverted_index.get(token, [])
            document_frequency = len(postings)
            for doc_id, doc_count in postings:
                features_by_doc[doc_id].append(Features(
                    query_token_count=query_count,
                    doc_token_count=doc_count,
                    token_df=document_frequency,
                    all_docs=len(self._documents),
                ))
        scored = [
            (doc_id, self._scorer(features))
            for doc_id, features in features_by_doc.items()
        ]
        # list.sort is stable, including with reverse=True.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [doc_id for doc_id, _ in scored]

In [31]:
import string
import nltk

# Make sure the NLTK resources used by this notebook are installed before they
# are read: the stopword corpus is loaded on the next statement and the Punkt
# models are needed by nltk.word_tokenize. A missing resource raises
# LookupError, in which case it is downloaded once.
for resource_path, package in (('corpora/stopwords', 'stopwords'),
                               ('tokenizers/punkt', 'punkt')):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package)

# Tokens the tokenizer drops: English stopwords plus single punctuation characters.
STOPWORDS = set(nltk.corpus.stopwords.words('english') + list(string.punctuation))

# Shared Porter stemmer instance used by Tokenizer.
STEMMER = nltk.stem.porter.PorterStemmer()

class Tokenizer(object):
    """Callable turning text into lowercased, stopword-free, stemmed tokens."""

    def __call__(self, text):
        stems = []
        for raw_token in nltk.word_tokenize(text):
            lowered = raw_token.lower()
            # Stopwords and punctuation are matched on the lowercased form, before stemming.
            if lowered in STOPWORDS:
                continue
            stems.append(STEMMER.stem(lowered))
        return stems

class Scorer(object):
    """Placeholder scorer: ignores its input and returns a random score.

    Called with the list of features collected for one document; returns a
    random integer between 1 and 10000 inclusive.
    """

    def __call__(self, doc_data):
        # `random` is not imported anywhere else in the notebook, so import it
        # here; without this the first scored document raises NameError.
        import random

        return random.randint(1, 10000)

In [34]:
# Smoke check: punctuation and the stopword 'a' are dropped; the remaining
# tokens are lowercased and stemmed.
Tokenizer()("Hello, PlAyers a")

['hello', 'player']

In [29]:
# Fetch the Punkt tokenizer models into the local nltk_data directory.
# NOTE(review): nltk.word_tokenize presumably needs these, so on a fresh
# environment this has to run before the first Tokenizer call — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/v_satanevsky/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [38]:
# Index every article by its abstract; the article id doubles as the document id.
search = Search(
    documents=[
        Document(id=article.id, text=article.abstract)
        for article in articles
    ],
    tokenizer=Tokenizer(),
    scorer=Scorer(),
)

In [37]:
# An empty query produces no tokens, so no document is matched and the result is [].
search.search('')

[]