In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import os
import functools
import operator
from sklearn.feature_extraction.text import CountVectorizer
import requests

import pymorphy2
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
# Single shared morphological analyzer; lemmatizer() further down calls morph.parse() on it.
morph = pymorphy2.MorphAnalyzer()

In [2]:
# Every document path below the content root, written as '<subdirectory>/<file name>'
# and sorted lexicographically.
docnames = sorted(
    subdir + '/' + file_name
    for subdir in os.listdir('./content/content')
    for file_name in os.listdir('./content/content' + '/' + subdir)
)
print(docnames[:2])

['20170702/doc.0000.dat', '20170702/doc.0001.dat']


In [3]:
# Tab-separated, header-less table; column 1 is used as the row index (and kept as a column)
# so that rows can be looked up by its value.
urls = (
    pd.read_csv('./urls.numerate.txt', sep='\t', header=None)
    .set_index(1, drop=False)
)

In [4]:
def extract_doc_url(docname):
    """Return the first whitespace-delimited token of a raw document file.

    This notebook uses that token as the document's URL when looking the
    document up in ``urls``.

    Parameters
    ----------
    docname : str
        Path of the document relative to ``./content/content/``.

    Returns
    -------
    str
        The first token found in the file.

    Raises
    ------
    IndexError
        If the file is empty or contains only whitespace.
    """
    path = './content/content/' + docname
    with open(path, errors='ignore') as read_file:
        # Stop at the first line that holds a token instead of loading and
        # splitting the whole (potentially large) file.
        for line in read_file:
            tokens = line.split()
            if tokens:
                return tokens[0]
    raise IndexError('no token found in %r: file is empty or whitespace-only' % path)

In [5]:
from tqdm.notebook import tqdm

# Pair every document file with the value stored in column 0 of `urls` for its URL,
# then list the file names in ascending order of that value.
result = [
    (urls.at[extract_doc_url(name), 0], name)
    for name in tqdm(docnames)
]
docnames_ordered_by_doc_id = [
    name for _, name in sorted(result, key=operator.itemgetter(0))
]

HBox(children=(FloatProgress(value=0.0, max=38114.0), HTML(value='')))




In [6]:
# Preview the first 20 file names after re-ordering by document id.
docnames_ordered_by_doc_id[:20]

['20170707/doc.2351.dat',
 '20170707/doc.2661.dat',
 '20170707/doc.1883.dat',
 '20170707/doc.0713.dat',
 '20170707/doc.0996.dat',
 '20170707/doc.0995.dat',
 '20170707/doc.2381.dat',
 '20170707/doc.3112.dat',
 '20170707/doc.1037.dat',
 '20170707/doc.3958.dat',
 '20170707/doc.1590.dat',
 '20170707/doc.0645.dat',
 '20170707/doc.3294.dat',
 '20170707/doc.4243.dat',
 '20170707/doc.1350.dat',
 '20170707/doc.3269.dat',
 '20170707/doc.0558.dat',
 '20170707/doc.3762.dat',
 '20170707/doc.1911.dat',
 '20170707/doc.1544.dat']

In [7]:
sample_subm = pd.read_csv('./sample.technosphere.ir1.textrelevance.submission.txt')
# One array per row of the sample submission (column 0, column 1, ...).
tuples = list(sample_subm.values)

# Column 0 value -> list of (column 1 value - 1), i.e. the ids shifted to be zero-based,
# in the order the rows appear in the file.
groups_mapping = {}
for row in tuples:
    groups_mapping.setdefault(row[0], []).append(row[1] - 1)
        

## Обработка текстов

In [8]:
from tqdm.notebook import tqdm
class GEN_file_to_tokens_title():
    """Re-iterable corpus that yields one token list per parsed document.

    For every name in ``docnames_ordered_by_doc_id`` the matching ``.txt`` file is
    read line by line; each line before the first blank line becomes one token
    (its trailing newline removed). This leading section is what the notebook
    treats as the document title.
    """

    def __iter__(self):
        for docname in tqdm(docnames_ordered_by_doc_id):
            path = './text-relevance-parsed/content/content/' + docname + '.txt'
            title_words = []
            with open(path, errors='ignore') as parsed:
                while True:
                    line = parsed.readline()
                    # '' means end of file, '\n' is the blank line closing the section.
                    if line == '' or line == '\n':
                        break
                    title_words.append(line.rstrip('\n'))
            yield title_words

In [9]:
from tqdm.notebook import tqdm
class GEN_file_to_tokens_body():
    """Re-iterable corpus that yields one token list per parsed document.

    For every name in ``docnames_ordered_by_doc_id`` the matching ``.txt`` file is
    read line by line; everything after the first blank line becomes the token
    list, one token per line with its trailing newline removed. Later blank
    lines are kept as empty-string tokens. A file without any blank line yields
    an empty list.
    """

    def __iter__(self):
        for docname in tqdm(docnames_ordered_by_doc_id):
            path = './text-relevance-parsed/content/content/' + docname + '.txt'
            with open(path, errors='ignore') as parsed:
                # Consume the leading section up to and including the first blank line.
                for line in parsed:
                    if line == '\n':
                        break
                # The file object keeps its position, so this picks up right after it.
                body_words = [rest.rstrip('\n') for rest in parsed]
            yield body_words

In [10]:
# Each document already arrives as a list of tokens (see GEN_file_to_tokens_title), so the
# tokenizer is the identity function and lower-casing is switched off.
vectorizer1 = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
gen_title_corpus = GEN_file_to_tokens_title()
# Document-by-term count matrix for the title section; rows follow docnames_ordered_by_doc_id.
sparse_all_docs_title = vectorizer1.fit_transform(gen_title_corpus)

HBox(children=(FloatProgress(value=0.0, max=38114.0), HTML(value='')))




In [11]:
# Each document already arrives as a list of tokens (see GEN_file_to_tokens_body), so the
# tokenizer is the identity function and lower-casing is switched off.
vectorizer2 = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
gen_body_corpus = GEN_file_to_tokens_body()
# Document-by-term count matrix for the body section; rows follow docnames_ordered_by_doc_id.
sparse_all_docs_body = vectorizer2.fit_transform(gen_body_corpus)

HBox(children=(FloatProgress(value=0.0, max=38114.0), HTML(value='')))




In [12]:
# Show the shape and number of stored elements of the title count matrix.
sparse_all_docs_title

<38114x34400 sparse matrix of type '<class 'numpy.int64'>'
	with 255180 stored elements in Compressed Sparse Row format>

In [13]:
# Show the shape and number of stored elements of the body count matrix.
sparse_all_docs_body

<38114x3644258 sparse matrix of type '<class 'numpy.int64'>'
	with 78445533 stored elements in Compressed Sparse Row format>

In [14]:
# Token -> column index mappings of the two fitted vectorizers; ranking() uses them to pick
# the matrix columns that correspond to query tokens.
term_index_in_sparse_title = vectorizer1.vocabulary_
term_index_in_sparse_body = vectorizer2.vocabulary_

In [15]:
# Fetch the third-party BM25Transformer implementation that the next cell imports as `Bm25`.
# NOTE(review): this runs code downloaded from the moving `master` branch. Pin the URL to a
# reviewed commit (or vendor the file) to keep the notebook reproducible and safe.
# The download is skipped when the file is already present, so re-runs work offline.
if not os.path.exists('Bm25.py'):
    r = requests.get('https://raw.githubusercontent.com/arosh/BM25Transformer/master/bm25.py',
                     timeout=30)
    # Fail here with the HTTP error rather than silently skipping the write and hitting an
    # ImportError in the next cell.
    r.raise_for_status()
    # Explicit UTF-8 so the written module does not depend on the platform default encoding.
    with open('Bm25.py', 'w', encoding='utf-8') as f:
        f.write(r.text)

In [16]:
# BM25Transformer comes from the Bm25.py file written by the previous cell, which is why this
# import cannot live in the top import cell.
from Bm25 import BM25Transformer

# Separate transformer instances are fitted for the title and the body count matrices
# (presumably producing BM25 term weights; see Bm25.py for the exact formula).
vectorizer3 = BM25Transformer()
bm25_vectorized_title = vectorizer3.fit_transform(sparse_all_docs_title)

vectorizer4 = BM25Transformer()
bm25_vectorized_body = vectorizer4.fit_transform(sparse_all_docs_body)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


## Обработка запросов

In [17]:
queries = pd.read_csv('./queries.numerate.txt', sep='\t', header=None)
# Keep only column 1 (the query text) as an array.
queries = queries.values[:, 1]

# Spell-checker endpoint. The query text is sent through `params`, so characters such as
# '&', '#' or '+' are URL-encoded instead of corrupting the request.
json_path = 'https://speller.yandex.net/services/spellservice.json/checkText'

corrected_queries = []
# One session for all requests, so the HTTP connection is reused.
with requests.Session() as session:
    for query in queries:
        response = session.get(json_path, params={'text': query}, timeout=10)
        if response.status_code != 200:
            # Best effort: report the failure and keep the query unchanged rather than
            # trying to parse an error payload as a list of corrections.
            print('ERROR')
            corrected_queries.append(query)
            continue

        # Flagged word -> first suggestion, for entries that carry at least one suggestion.
        changes = {change['word']: change['s'][0] for change in response.json() if len(change['s']) > 0}

        corr_query = query
        for word, suggestion in changes.items():
            # Replace whole words only; a plain str.replace would also rewrite the same
            # letters when they occur inside a longer, correctly spelled word.
            whole_word = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
            # A function replacement keeps backslashes in the suggestion literal.
            corr_query = re.sub(whole_word, lambda match: suggestion, corr_query)

        corrected_queries.append(corr_query)
queries = corrected_queries

In [18]:
# Russian stop-word list with a handful of question/conjunction words taken out of it,
# so those words survive query cleaning.
my_stopwords = list(
    set(stopwords.words('russian')).difference(
        ['как', 'когда', 'почему', 'зачем', 'чтобы', 'что']
    )
)
my_stopwords[:5]

['же', 'какая', 'во', 'вот', 'себя']

In [19]:
# Word -> normal form, filled lazily by lemmatizer() so each distinct word is parsed once.
PYMORPHY_CACHE = {}
def lemmatizer(words):
    """Lazily yield the normal form of every word in ``words``.

    The normal form is taken from the first parse returned by ``morph.parse``
    and memoised in ``PYMORPHY_CACHE``.

    The cache is keyed by the word itself. Keying it by ``hash(word)`` would let
    two different words with colliding hashes share one cached lemma, and the
    dict already hashes its keys anyway.

    Parameters
    ----------
    words : iterable of str
        Words to normalise.

    Yields
    ------
    str
        The normal form of each input word, in input order.
    """
    for word in words:
        if word not in PYMORPHY_CACHE:
            PYMORPHY_CACHE[word] = morph.parse(word)[0].normal_form
        yield PYMORPHY_CACHE[word]

In [20]:
def clean_query(query):
    """Turn a raw query string into a list of lemmatised, stop-word-free tokens.

    The query is split on whitespace, each piece is passed through
    ``lemmatizer``, and tokens found in ``my_stopwords`` or in the English
    stop-word list are dropped.

    Parameters
    ----------
    query : str
        Raw query text.

    Returns
    -------
    list of str
        Remaining tokens in their original order.
    """
    # Load the English stop words once per call; asking the corpus reader again for
    # every single token repeats the same work needlessly.
    english_stopwords = set(stopwords.words('english'))
    return [q_tok for q_tok in lemmatizer(query.split())
            if q_tok not in my_stopwords and q_tok not in english_stopwords]

## Ранжирование

In [21]:
def ranking(vectorized, term_index_in_sparse, query_tokens):
    """Score every row of ``vectorized`` against the query tokens.

    Parameters
    ----------
    vectorized : sparse matrix
        Row-per-document matrix of term weights.
    term_index_in_sparse : dict
        Token -> column index of ``vectorized``.
    query_tokens : list of str
        Query tokens; tokens missing from ``term_index_in_sparse`` are ignored,
        repeated tokens contribute once per repetition.

    Returns
    -------
    numpy.ndarray
        One value per row: the sum of that row's weights over the selected columns.
    """
    columns = []
    for token in query_tokens:
        if token in term_index_in_sparse:
            columns.append(term_index_in_sparse[token])
    columns.sort()

    selected = vectorized[:, columns].toarray()
    return selected.sum(axis=1)

In [22]:
DocsId = []
QueryId = []

weight = 1.5  # Title weight

from tqdm import tqdm
for q_id, query in tqdm(enumerate(queries)):
    query_tokens = clean_query(query)

    # Queries in queries.txt are numbered from one
    query_number = q_id + 1
    # Zero-based row indices of the documents attached to this query.
    group_rows = groups_mapping[query_number]

    title_scores = ranking(bm25_vectorized_title[group_rows], term_index_in_sparse_title, query_tokens)
    body_scores = ranking(bm25_vectorized_body[group_rows], term_index_in_sparse_body, query_tokens)
    candidates = title_scores * weight + body_scores

    # Positions inside the group of the ten highest combined scores, best first.
    most_relevant_docs_idx = candidates.argsort()[::-1][:10]
    most_relevant_docs = np.array(group_rows)[most_relevant_docs_idx]

    QueryId.extend([query_number] * len(most_relevant_docs))
    # Documents are numbered from one as well
    DocsId.extend(row + 1 for row in most_relevant_docs)


399it [00:04, 88.07it/s]


In [23]:
# Assemble the submission table: one row per (query, selected document) pair, in the order
# the ranking loop produced them.
result_df = pd.DataFrame({
    'QueryId' : QueryId,
    'DocumentId' : DocsId
})
result_df

Unnamed: 0,QueryId,DocumentId
0,1,78
1,1,28
2,1,51
3,1,44
4,1,7
...,...,...
3985,399,38029
3986,399,38067
3987,399,38019
3988,399,38043


In [24]:
# Write the submission as CSV without the DataFrame index column.
result_df.to_csv('subm_final.txt', index=False)