In [40]:
from gensim.models import FastText
import re
import json
import string
from stop_words import get_stop_words
import pickle
import numpy as np
from tqdm import notebook
import json
import pymorphy2
import hnswlib
from string import ascii_letters
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.phrases import Phrases, Phraser

Для того, чтобы определить наиболее близкий вопрос к заданному полезны могут быть слова, написанные кириллицей и латиницей. Пунктуация в данном датасете обширно используется дя придания экспрессивной окраски, но на содержательную составляющую влияет слабо. Числа могли бы быть полезны.

В некоторых текстах попадаются html теги, которые целесообразно удалить (тематика вопросов всё-таки не такая узкая).

In [23]:
tag_re = re.compile(r'<[^>]+>')

In [24]:
token_re = re.compile(rf'[А-Яа-яЁёA-Za-z]+|[0-9]+')

In [25]:
def tokenize(text):
    return re.findall(token_re, text)

In [26]:
def clean(text):
    text = tag_re.sub('', text)
    tokens = tokenize(text)
    return [token.lower() for token in tokens]

In [35]:
EMBEDDING_SIZE = 200
MAX_LINES = 100_000

In [49]:
def read_corpus(path, limit=0):
    sentences = []
    count = 0
    with open(path, 'r') as file:
        for line in notebook.tqdm(file):
            sentence = clean(line)
            count += 1
            if sentence:
                sentences.append(sentence)
            if limit and count > limit:
                break
    return sentences

In [157]:
def preprocess(text):
    text = tag_re.sub('', text)
    tokens = tokenize(text)
    
    return [token for token in tokens if token not in stop_words]

In [148]:
def vectorize_sent(tokens, model):
    vector = np.zeros(model.vector_size)
    n_tokens = len(tokens)
    if not n_tokens:
        return vector
    for token in tokens:
        vector += model.wv.get_vector(token)
    vector /= n_tokens
    return vector

In [2]:
morph = pymorphy2.MorphAnalyzer()

In [47]:
def normalize(tokens):
    return [morph.parse(token.lower())[0].normal_form for token in tokens]

In [117]:
stop_words = set(get_stop_words("ru")).union(set(('стать', 'иметь', 'быть')))

def remove_stop_words(tokens):
    return [token for token in tokens if token not in stop_words]

In [118]:
sentences = read_corpus('/mnt/f/data/answers.txt', MAX_LINES)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [121]:
tmp = []
for sentence in notebook.tqdm(sentences):
    tmp.append(normalize(sentence))

HBox(children=(FloatProgress(value=0.0, max=84779.0), HTML(value='')))




In [124]:
sentences = tmp

In [125]:
%%time
phrases = Phrases(sentences,
                  common_terms=list(stop_words),
                  threshold=10,
                  min_count=5)
bigram = Phraser(phrases)
tokens = list(bigram[sentences])

CPU times: user 13.8 s, sys: 594 ms, total: 14.3 s
Wall time: 14.4 s


In [128]:
tokens = [remove_stop_words(sent) for sent in tokens]

In [130]:
%%time
ft = FastText(sentences=sentences,
              size=EMBEDDING_SIZE,
              min_count=10,
              window=3,
              workers=-1,
              max_vocab_size=30_000,
              negative=10,
              bucket=1000
             )

CPU times: user 2.8 s, sys: 78.1 ms, total: 2.88 s
Wall time: 2.88 s


In [156]:
p = hnswlib.Index(space ='cosine', dim=EMBEDDING_SIZE)

In [157]:
p.init_index(max_elements=len(tokens), ef_construction=200, M=16)

In [158]:
v_sents = np.vstack([vectorize_sent(sentence, ft) for sentence in tokens])

In [159]:
v_sents.shape

(84779, 200)

In [160]:
p.add_items(v_sents, list(range(len(tokens))))

In [161]:
labels, distances = p.knn_query(v_sents[600], k=3)

In [162]:
labels

array([[  600, 18621, 18628]], dtype=uint64)

In [163]:
distances

array([[-3.5762787e-07, -3.5762787e-07, -3.5762787e-07]], dtype=float32)

In [165]:
tokens[600]

['огонь']