In [1]:
from os import walk

# walk() yields (dirpath, dirnames, filenames) tuples; only the first one,
# describing ./data itself, is needed. walk() yields nothing when the directory
# is missing or unreadable, so give next() a default and report the problem
# explicitly instead of letting a bare StopIteration escape.
_, _, filenames = next(walk('./data'), (None, None, None))
if filenames is None:
    raise FileNotFoundError("data directory './data' is missing or unreadable")

# Directory listings come back in arbitrary order; sort so that the
# question/answer indices built from these files are the same on every run.
filenames = sorted(filenames)

In [2]:
import re

# Parallel lists: answers[i] is the reply paired with questions[i].
questions = []
answers = []

for fname in filenames:
    # `with` closes the handle even if parsing fails part-way through.
    # NOTE(review): assumes the corpus files are UTF-8 encoded — confirm.
    with open('./data/' + fname, 'r', encoding='utf-8') as f:
        for sent in f:
            # Each line holds utterances separated by the '__eou__' marker;
            # strip the single space left on either side of each piece.
            q_a = [re.sub('^ | $', '', part) for part in sent.split('__eou__')]

            # Skip lines without both a question and an answer *before*
            # appending anything, so the two lists can never drift apart.
            if len(q_a) < 2:
                continue

            questions.append(q_a[0])
            answers.append(q_a[1])

# remove punctuation (questions only; answers are kept verbatim)
questions = [re.sub(r'[^\w\s]','',sent) for sent in questions]

print(len(questions))
print(len(answers))
print(questions[:2])
print(answers[:2])

5855
5855
['Bạn thuê trọ bao nhiêu tiền', 'Nhà bạn có bao nhiêu người vậy ']
['Mình thuê khoảng 3 tr một tháng', 'Nhà mình có 7 người']


In [3]:
import vncorenlp

# Shared VnCoreNLP handle used by every tokenize() call in this notebook.
# Only the "wseg" annotator is requested, and the max heap size is passed
# as '-Xmx500m'.
annotator = vncorenlp.VnCoreNLP(
    "VnCoreNLP/VnCoreNLP-1.1.1.jar",
    annotators="wseg",
    max_heap_size='-Xmx500m',
)

In [4]:
# One token list per question, index-aligned with `questions` and `answers`.
questions_tokenized = []

for sent in questions:
    # tokenize() returns a list of sentences, each a list of tokens. Flatten
    # them into a single token list and append exactly one entry per question:
    # extending with the raw result would add several entries for a
    # multi-sentence question (or none for an empty one) and shift every
    # later index away from its matching answer.
    segmented = annotator.tokenize(sent.lower())
    questions_tokenized.append(
        [token for sentence in segmented for token in sentence]
    )

In [5]:
# Flat list of tokenized sentences taken from every answer: each entry is the
# token list of one sentence that tokenize() returned.
answers_tokenized = []

for sent in answers:
    lowered = sent.lower()
    answers_tokenized += annotator.tokenize(lowered)

#Word2Vec

In [6]:
from gensim.models import KeyedVectors, Word2Vec
# pretrained_w2v = KeyedVectors.load_word2vec_format("./PretrainW2V/wiki.vi.model.bin.gz", binary=True)

# print("Vocab size: " + str(len(pretrained_w2v.vocab)))

Vocab size: 231486


In [7]:
# Train Word2Vec on the tokenized questions and answers together.
# size=50 matches the default `length` of sum_vector().
w2v_model = Word2Vec(
    sentences=questions_tokenized+answers_tokenized,
    size=50,
    window=5,
    sg=1,
    min_count=2,
)

print(
    "Vocab size: "
    + str(len(w2v_model.wv.vocab))
)

Vocab size: 2928


In [8]:
import numpy as np

def sum_vector(list_of_vector, length=50):
    """Add up a collection of vectors element-wise.

    Args:
        list_of_vector: iterable of vectors to add together.
        length: size of the zero vector the running total starts from.

    Returns:
        The element-wise total; an all-zero array of `length` entries when
        `list_of_vector` is empty.
    """
    accumulated = np.zeros(length)
    for current in list_of_vector:
        # Build a new array each step rather than adding in place.
        accumulated = np.add(accumulated, current)

    return accumulated

def concat_vector(list_of_vector):
    """Join the given vectors end to end along axis 0 and return the result."""
    joined = np.concatenate(list_of_vector, axis=0)
    return joined

In [10]:
# One summed word-vector per tokenized question.
questions_vectorized = []

for sent in questions_tokenized:
    sent_vectorized = []
    for word in sent:
        try:
            sent_vectorized.append(w2v_model.wv[word])
        except KeyError:
            # Word is not in the trained vocabulary: leave it out of the sum.
            # Catching only KeyError keeps real failures and Ctrl-C visible.
            continue

    # A sentence with no known words becomes the all-zero vector.
    questions_vectorized.append(sum_vector(sent_vectorized))

# Search space used by the get_best_index_* helpers.
corpus = questions_vectorized

In [71]:
import math

def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between vectors `x` and `y`.

    Iterates over the indices of `x`, so `y` must have at least as many
    entries as `x`.
    """
    squared_diffs = (math.pow(x[k] - y[k], 2) for k in range(len(x)))
    return math.sqrt(sum(squared_diffs))

def cosine_similarity(v1,v2):
    """Compute the cosine similarity of v1 to v2: (v1 dot v2) / (||v1|| * ||v2||).

    Args:
        v1: first vector; its indices drive the iteration.
        v2: second vector, with at least as many entries as `v1`.

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either
        vector has zero norm. The ratio is undefined in that case, and a
        sentence made only of out-of-vocabulary words is vectorized as all
        zeros, so this input does occur.
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for i, x in enumerate(v1):
        y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y

    # Guard the division: a zero norm would otherwise raise ZeroDivisionError
    # (plain numbers) or yield NaN (numpy scalars).
    if sumxx == 0 or sumyy == 0:
        return 0.0

    return sumxy/math.sqrt(sumxx*sumyy)

def get_best_index_byEuclidDistance(question, corpus=corpus):
    """Return the index of the corpus vector closest to `question`.

    Closeness is measured with euclidean_distance(); on ties the earliest
    index wins, and 0 is returned for an empty corpus.

    Note: the default for `corpus` is bound when this `def` executes, so
    re-run this cell after rebuilding the global `corpus`.
    """
    nearest_index = 0
    smallest_distance = math.inf

    for position, candidate in enumerate(corpus):
        distance = euclidean_distance(question, candidate)
        # Only a strictly smaller distance replaces the current best.
        if not distance < smallest_distance:
            continue
        smallest_distance = distance
        nearest_index = position

    return nearest_index

def get_best_index_byCosineSimilarity(question, corpus=corpus):
    """Return the index of the corpus vector most similar to `question`.

    Similarity is measured with cosine_similarity(); on ties the earliest
    index wins, and 0 is returned for an empty corpus.

    Note: the default for `corpus` is bound when this `def` executes, so
    re-run this cell after rebuilding the global `corpus`.
    """
    top_index = 0
    highest_similarity = -math.inf

    for position, candidate in enumerate(corpus):
        similarity = cosine_similarity(question, candidate)
        # Only a strictly larger similarity replaces the current best.
        if not similarity > highest_similarity:
            continue
        highest_similarity = similarity
        top_index = position

    return top_index

def _vectorize_question(question):
    """Turn a raw question string into a single summed word vector.

    Applies the same steps used to build the corpus: strip punctuation,
    lower-case, tokenize, look up each word's vector and sum them.
    """
    question = re.sub(r'[^\w\s]','',question)

    # tokenize() returns one token list per sentence. Use the tokens of every
    # sentence: indexing with [-1] would raise IndexError on an empty result
    # and would silently discard all but the last sentence.
    sentences = annotator.tokenize(question.lower())
    question_tokenized = [token for sentence in sentences for token in sentence]

    question_vectorized = []
    for word in question_tokenized:
        try:
            question_vectorized.append(w2v_model.wv[word])
        except KeyError:
            # Word is not in the trained vocabulary: leave it out of the sum.
            continue

    return sum_vector(question_vectorized)

def get_answer_byEuclidDistance(question):
    """Return the answer paired with the corpus question nearest to `question`.

    Nearest means the smallest Euclidean distance between summed word vectors.
    """
    best_index = get_best_index_byEuclidDistance(_vectorize_question(question))

    return answers[best_index]

def get_answer_byCosineSimilarity(question):
    """Return the answer paired with the corpus question most similar to `question`.

    Most similar means the highest cosine similarity between summed word vectors.
    """
    best_index = get_best_index_byCosineSimilarity(_vectorize_question(question))

    return answers[best_index]

In [72]:
# Sanity check with hand-computable vectors: 5 / sqrt(7 * 4) ≈ 0.9449.
cosine_similarity([1,2,1,1],[1,1,1,1])

0.944911182523068

In [78]:
question = "chào bạn, cho mình làm quen nhé"

# Ask both retrieval strategies the same question to compare their answers.
print(
    "Answer by euclid distance: "
    + get_answer_byEuclidDistance(question)
)
print(
    "Answer by cosine similarity: "
    + get_answer_byCosineSimilarity(question)
)

Answer by euclid distance: mình có người yêu rồi
Answer by cosine similarity: công ty mình môi trường chuyên nghiệp


In [79]:
question = "mình yêu bạn, làm người yêu mình nhé"

# Ask both retrieval strategies the same question to compare their answers.
print(
    "Answer by euclid distance: "
    + get_answer_byEuclidDistance(question)
)
print(
    "Answer by cosine similarity: "
    + get_answer_byCosineSimilarity(question)
)

Answer by euclid distance: Okay luôn nhá
Answer by cosine similarity: Yêu chứ, yêu cả đất nước này luôn cơ
