In [17]:
import yaml
import hnswlib
import pickle
import json
from src.utils import clean, normalize, remove_stop_words
from gensim.models import FastText
import numpy as np

In [2]:
# Capacity handed to the HNSW index as `max_elements` when it is loaded.
NUM_ELEMENTS = 154536
# Dimensionality the HNSW index is created with.
# NOTE(review): presumably equal to the FastText vector size — confirm.
DIM = 100
# Not referenced by any of the visible cells.
MAX_WORDS = 50

In [3]:
# Load the saved gensim FastText model used to embed question tokens.
# NOTE(review): hard-coded absolute path; the notebook only runs where this file exists.
ft = FastText.load('/mnt/f/data/bot/ft.model')

In [6]:
# Create an (empty) hnswlib index over DIM-dimensional vectors using the cosine space.
p = hnswlib.Index(space ='cosine', dim=DIM)

In [7]:
# Fill the index from the saved file, with capacity NUM_ELEMENTS.
p.load_index('/mnt/f/data/bot/index.bin', max_elements=NUM_ELEMENTS)

In [15]:
# Restore the pickled TF-IDF object used to weight tokens in vectorize_sent.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
with open('/mnt/f/data/bot/tfidf.pkl', 'rb') as file:
    tfidf = pickle.load(file)

In [9]:
# Load the answers lookup (indexed by str(label) in get_answer).
# The encoding is given explicitly so that decoding the non-ASCII answer text
# does not depend on the locale's default encoding.
with open('/mnt/f/data/bot/answers.json', encoding='utf-8') as file:
    answers = json.load(file)

In [30]:
def vectorize_sent(tokens, model, tfidf):
    """Return the TF-IDF-weighted average of the token embeddings.

    Args:
        tokens: Sequence of string tokens making up one sentence.
        model: Embedding model exposing ``vector_size`` and
            ``wv.get_vector(token)``.
        tfidf: Fitted TF-IDF vectorizer exposing ``vocabulary_`` (a mapping
            term -> column index) and ``transform``.

    Returns:
        A 1-D numpy array of length ``model.vector_size``. It is all zeros
        when ``tokens`` is empty or when no token carries a non-zero TF-IDF
        weight.
    """
    vector = np.zeros(model.vector_size)

    # Nothing to embed: skip the TF-IDF transform entirely.
    if len(tokens) == 0:
        return vector

    # `vocabulary_` gives the column of a term in O(1); the previous
    # `get_feature_names().index(token)` scanned the whole vocabulary for
    # every token and relies on a method removed from recent scikit-learn.
    vocabulary = tfidf.vocabulary_
    tfidf_row = tfidf.transform([' '.join(tokens)])

    weight_sum = 0
    for token in tokens:
        column = vocabulary.get(token)
        if column is None:
            # Token unknown to the TF-IDF vocabulary: its weight is zero.
            continue

        weight = tfidf_row[0, column]
        if not weight:
            # A zero weight contributes nothing, so skip the embedding lookup.
            continue

        vector += model.wv.get_vector(token) * weight
        weight_sum += weight

    # No weighted token at all: avoid dividing by zero.
    if not weight_sum:
        return np.zeros(model.vector_size)

    vector /= weight_sum
    return vector

In [48]:
def get_answer_candidate(question: str, model=ft, tfidf=tfidf):
    """Find the nearest indexed entry for a question.

    The question is passed through ``clean`` and ``normalize``, embedded with
    ``vectorize_sent`` and looked up in the global index ``p`` with ``k=1``.

    Returns:
        A ``(label, distance)`` pair for the single nearest neighbour.
    """
    query_vector = vectorize_sent(normalize(clean(question)), model, tfidf)
    labels, distances = p.knn_query(query_vector, k=1)
    nearest_label = labels[0][0]
    nearest_distance = distances[0][0]
    return nearest_label, nearest_distance

In [64]:
def get_answer(question: str, answers=answers, max_distance: float = 0.3):
    """Return an answer for ``question``, or a not-found message.

    Args:
        question: The user's question text.
        answers: Mapping from ``str(label)`` to a list of candidate answers.
        max_distance: Largest neighbour distance still accepted as a match.
            Defaults to 0.3, the threshold previously hard-coded here.

    Returns:
        One candidate answer (chosen at random when several are stored), or
        'Ответ не найден!' when the nearest neighbour is too far away or has
        no stored answers.
    """
    not_found = 'Ответ не найден!'

    label, distance = get_answer_candidate(question)
    if distance > max_distance:
        return not_found

    # A label missing from `answers`, or an empty candidate list, is reported
    # as "not found" instead of surfacing as KeyError / IndexError.
    try:
        answer_candidates = answers[str(label)]
    except KeyError:
        return not_found
    if not answer_candidates:
        return not_found

    if len(answer_candidates) > 1:
        return answer_candidates[np.random.choice(len(answer_candidates))]
    return answer_candidates[0]

In [75]:
# Try the pipeline on a sample question.
get_answer('Кто такие вампиры?')

'Токие живые трупы, пьющие кровь - вампиры или упыри. \n'

In [76]:
# Same question again: get_answer picks randomly when several candidates are stored,
# so the returned text can differ between calls.
get_answer('Кто такие вампиры?')

'вампиры очень бледные кровососущие твари!!! =))))). \n'

In [83]:
# A different question; an answer is returned whenever the nearest neighbour's
# distance does not exceed the threshold used in get_answer.
get_answer('Почему заходит солнце?')

'тоже Леонардо? ну удачи))). \n'