In [75]:
import annoy
from gensim.models import FastText
from nltk import word_tokenize
import re
import json
import string
from stop_words import get_stop_words
import pickle
import numpy as np

In [23]:
stop_words = set(get_stop_words("ru"))
exclude = set(string.punctuation).union(stop_words)

In [31]:
tag_re = re.compile(r'<[^>]+>')

In [39]:
def preprocess(text):
    text = tag_re.sub('', text)
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token not in exclude and token.isalpha()]
    return tokens

In [43]:
def read_corpus(path, limit=0):
    sentences = []
    count = 0
    with open(path, 'r') as file:
        for line in notebook.tqdm(file):
            sentence = preprocess(line)
            count += 1
            if sentence:
                sentences.append(sentence)
            if limit and count > limit:
                break
    return sentences
    

In [47]:
if 1:
    sentences = read_corpus('/mnt/f/data/answers.txt', 1_000_000)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [70]:
def prepare_questions(path):
    
    with open(path, 'r') as file:
        lines = file.readlines()
    questions_and_answers = ''.join(lines).split('--\n')
    
    result = {}
    
    for q_n_a in questions_and_answers:
        q_n_a = q_n_a.split('\n')
        if (q_n_a and len(q_n_a[0]) > 3 and
            len(q_n_a) > 1 and
            q_n_a[0] not in result.keys()):
            
            result[q_n_a[0]] = q_n_a[1:]
    return result


In [72]:
if 1:
    q_and_a = prepare_questions('/mnt/f/data/answers.txt')

In [78]:
if 1:
    with open('/mnt/f/data/questions_and_answers_separated.txt', 'w') as file:
        json.dump(q_and_a, file)

In [54]:
EMBEDDING_SIZE = 70

In [55]:
%%time
ft = FastText(sentences=sentences, size=EMBEDDING_SIZE, min_count=10, window=5, workers=-1)

CPU times: user 26.7 s, sys: 4.48 s, total: 31.2 s
Wall time: 31.1 s


In [56]:
ft_index = annoy.AnnoyIndex(EMBEDDING_SIZE ,'angular')

In [86]:
def make_index(questions_and_answers, model, index):
    
    index_map = {}
    counter = 0
    
    for question in notebook.tqdm(questions_and_answers.keys()):
        
         
        answers = questions_and_answers[question]
        index_map[counter] = answers
        
        question = preprocess(question)
        
        question_vec = np.zeros(model.wv.vector_size)
        for word in question:
            question_vec += model.wv[word]
        index.add_item(counter, question_vec)
        
        counter += 1
    
    index.build(10)
    return index_map

In [87]:
index_map = make_index(q_and_a, ft, ft_index)

HBox(children=(FloatProgress(value=0.0, max=1158521.0), HTML(value='')))




In [92]:
def get_response(question, index, model, index_map, num_candidates=1):
    question = preprocess_txt(question)
    vector = np.zeros(model.wv.vector_size)
    norm = len(question)
    for word in question:
        vector += model.wv[word]
        norm += 1
    if norm > 1:
        vector = vector / norm
    answers = index.get_nns_by_vector(vector, num_candidates)
    return [index_map[i] for i in answers]

In [103]:
get_response('Как починить кран?', ft_index, ft, index_map)

[['Ни какая!!. ',
  'у неё температура кипения всего 115 градусов, у современных тормознух выше. ',
  'БСК (бутиловый спирт и касторовое масло) лучше заменить на современную ТЖ., промыв всюмозную систему, а лучше заменить все манжеты на новые и залить свежую ТЖ. БСК может работать только в барабанных тормозных механизмах и в узком диапазоне температур.. ',
  'ЭСК-то есть ЭТИЛОВЫЙ спирт+касторка.. ',
  '-']]

In [99]:
if 1:
    with open('/mnt/f/data/index_map.txt', 'w') as file:
        json.dump(index_map, file)

In [100]:
if 1:
    ft_index.save('/mnt/f/data/ft_index.txt')

In [102]:
from gensim.test.utils import get_tmpfile
if 1:
    fname = get_tmpfile("/mnt/f/data/fasttext.model")
    ft.save(fname)