Libraries

In [18]:
import streamlit as st
import pandas as pd
import faiss
import gensim
import numpy as np


## Component 1: load the question-answer dataset

In [19]:
import pandas as pd

# Read the question/answer CSV into a DataFrame; later cells use its
# "Question" and "Answer" columns and its row count.
df = pd.read_csv(
    filepath_or_buffer="/workspaces/word2vec-qa-chatbot-2-miku-fa/data/Question_Answer_Dataset_v1.2_S10.csv"
)
# Show the first rows as a quick visual check of the loaded columns.
df.head()


Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemis...,easy,easy,data/set4/a10
1,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Alessandro Volta did invent the remotely opera...,easy,easy,data/set4/a10
2,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy,data/set4/a10
3,Alessandro_Volta,Who did Alessandro Volta marry?,Alessandro Volta married Teresa Peregrini.,medium,medium,data/set4/a10
4,Alessandro_Volta,What did Alessandro Volta invent in 1800?,"In 1800, Alessandro Volta invented the voltaic...",medium,easy,data/set4/a10


In [20]:
import re

# Sample text mixing words, digits, punctuation and repeated spaces.
test_str = "Gfg, is best : for ! Geeks ;? 123 a 9..."

# Show the input before any cleaning.
print("The original string is : " + test_str)

# Step 1 (inner call): drop every character that is neither a word character
# nor whitespace. Step 2 (outer call): squeeze each run of spaces down to one.
res = re.sub(' +', ' ', re.sub(r'[^\w\s]', '', test_str))

# Show the cleaned text split into tokens on single spaces.
print("The string after punctuation filter : ", res.strip().split(" "))


The original string is : Gfg, is best : for ! Geeks ;? 123 a 9...
The string after punctuation filter :  ['Gfg', 'is', 'best', 'for', 'Geeks', '123', 'a', '9']


## Component 2: train the word2vec model

In [21]:
# Train the word2vec model on every question AND every answer.

# Build one flat list of raw sentences: the first len(df) entries are the
# questions and the last len(df) entries are the answers. Downstream cells
# split this list at df.shape[0] to build the question and answer vectors,
# so the two halves must hold different text (the previous version repeated
# the questions, which made the "answer" vectors copies of the question ones).
# Missing cells are replaced by "" and everything is cast to str so that the
# regex-based tokenizer never receives a float NaN.
sentences = (
    df["Question"].fillna("").astype(str).to_list()
    + df["Answer"].fillna("").astype(str).to_list()
)

def token(text):
    """Turn a raw sentence into a list of lower-case tokens.

    Every character that is neither a word character nor whitespace is
    removed, runs of spaces are squeezed to a single space, the text is
    lower-cased and stripped, and the result is split on single spaces.
    An empty (or punctuation-only) input therefore yields [''].
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    single_spaced = re.sub(' +', ' ', without_punctuation)
    return single_spaced.lower().strip().split(" ")

# Replace each raw sentence string with its list of tokens (order preserved).
sentences = list(map(token, sentences))

In [22]:
# Train a skip-gram word2vec model on the tokenized sentences.
# seed + workers=1 make repeated runs of this cell produce comparable vectors;
# NOTE(review): gensim documents that exact reproducibility across interpreter
# launches additionally needs PYTHONHASHSEED to be fixed — confirm if required.
w2v = gensim.models.Word2Vec(
    sentences=sentences,  # input data: list of token lists
    vector_size=128,  # size of the vectors
    window=5,  # window size
    min_count=1,  # keep every word, even those seen only once
    epochs=3,  # number of passes over the corpus
    hs=0,  # Turn off hierarchical softmax and use negative sampling
    sg=1,  # Use skip-gram instead of CBOW
    seed=42,  # fixed RNG seed for reproducible initialisation/sampling
    workers=1,  # single worker thread removes thread-scheduling jitter
)

# Persist the trained model next to the dataset.
w2v.save("/workspaces/word2vec-qa-chatbot-2-miku-fa/data/w2v-advance.model")

## Component 3: generate embeddings

In [23]:
def sentence_vec(sent):
    """Embed a tokenized sentence as the mean of its word2vec word vectors.

    Args:
        sent: iterable of string tokens.

    Returns:
        A 1-D numpy array of length ``w2v.wv.vector_size``. Tokens missing
        from the model vocabulary are ignored. If no token is in the
        vocabulary (including an empty ``sent``), an all-zero vector is
        returned instead of raising, so callers that stack the results keep
        one row per sentence.
    """
    # Look up the embedding of every in-vocabulary word of the sentence.
    word_vecs = [w2v.wv[tm] for tm in sent if tm in w2v.wv]
    if not word_vecs:
        # np.vstack([]) would raise ValueError; fall back to a neutral vector
        # (float32, like the word vectors stored by gensim).
        return np.zeros(w2v.wv.vector_size, dtype=np.float32)
    # Average the word vectors to obtain the sentence vector.
    return np.mean(np.vstack(word_vecs), axis=0)

In [24]:
# The first df.shape[0] tokenized sentences feed ques_vec and the remaining
# ones feed ans_vec; each sentence becomes one row of the resulting array.
ques_vec = np.array(list(map(sentence_vec, sentences[:df.shape[0]])))
ans_vec = np.array(list(map(sentence_vec, sentences[df.shape[0]:])))

# Store both arrays in a single .npz archive under the keys "x" and "y".
np.savez(
    "/workspaces/word2vec-qa-chatbot-2-miku-fa/data/vector-advance.npz",
    x=ques_vec,
    y=ans_vec,
)

In [25]:
# Sanity check: expect (number of dataset rows, embedding size).
ques_vec.shape

(596, 128)

In [26]:
# Sanity check: should equal ques_vec.shape so both arrays stay row-aligned.
ans_vec.shape

(596, 128)

## Component 4: retrieve answers with FAISS

In [29]:
def trained_sentence_vec(sent):
    """Embed a tokenized query as the mean of its word2vec word vectors.

    Args:
        sent: iterable of string tokens (the tokenized query sentence).

    Returns:
        A 1-D numpy array of length ``w2v.wv.vector_size``. Tokens that the
        trained model does not know are skipped. When none of the tokens is
        known (or ``sent`` is empty) an all-zero vector is returned rather
        than letting ``np.vstack`` fail on an empty list; callers should
        treat an all-zero result as "no usable query words".
    """
    # Keep only the embeddings of words present in the trained vocabulary.
    word_vecs = [w2v.wv[tm] for tm in sent if tm in w2v.wv]
    if not word_vecs:
        # Nothing to average: return a neutral float32 vector of the right size.
        return np.zeros(w2v.wv.vector_size, dtype=np.float32)
    # Average the word vectors to get the vector of the whole query.
    return np.mean(np.vstack(word_vecs), axis=0)


def find_answer(qr_sentence, ques_vec, ans_vec):
    """Return the dataset row index that best matches a free-text query.

    The query is embedded, then compared by cosine similarity (L2-normalised
    vectors + inner product) against every question vector and every answer
    vector; the row of the single best hit is returned.

    Args:
        qr_sentence: the query as a plain string.
        ques_vec: 2-D array, one embedding per question (row i <-> pair i).
        ans_vec: 2-D array, one embedding per answer, row-aligned with
            ``ques_vec``.

    Returns:
        The integer index (as produced by FAISS) of the best matching
        question-answer pair. Hits on an answer vector are mapped back to
        the index of their pair.
    """
    # Tokenize the query with the same `token` helper that produced the
    # training/index sentences. gensim's simple_preprocess drops digits and
    # one-character words, which made the query embedding differ from the
    # indexed embedding of the very same sentence.
    qr_tokens = token(qr_sentence)
    qr_sent_vec = trained_sentence_vec(qr_tokens)

    # Number of question-answer pairs and embedding dimensionality.
    n_q_a, n_dim = ques_vec.shape[0], ques_vec.shape[1]

    # Work on float32 copies: FAISS requires float32 and normalize_L2 works
    # in place, so the caller's arrays must not be touched.
    x = np.vstack(ques_vec).astype(np.float32)
    y = np.vstack(ans_vec).astype(np.float32)
    # Shape the query as a (1, n_dim) contiguous float32 matrix for FAISS.
    q = np.ascontiguousarray(qr_sent_vec, dtype=np.float32).reshape(1, -1)

    # Exact (flat) inner-product index; with unit-length vectors the inner
    # product equals the cosine similarity.
    index = faiss.index_factory(n_dim, "Flat", faiss.METRIC_INNER_PRODUCT)

    # Questions occupy ids [0, n_q_a) ...
    faiss.normalize_L2(x)
    index.add(x)

    # ... and answers are appended after them, starting at id n_q_a.
    faiss.normalize_L2(y)
    index.add(y)

    # Only the best hit is used, so ask for a single neighbour instead of
    # ranking the whole index.
    faiss.normalize_L2(q)
    _, idx = index.search(q, k=1)
    ans_idx = idx[0][0]

    # An id at or beyond n_q_a refers to an answer vector; shift it back so
    # it addresses the corresponding row of the question-answer dataset.
    if ans_idx >= n_q_a:
        ans_idx -= n_q_a

    return ans_idx

In [30]:
# Run one example query through the retriever and show the matched pair.
qr_sentence = "What did Alessandro Volta invent in 1800?"
ans_idx = find_answer(qr_sentence, ques_vec, ans_vec)

print("Query: ", qr_sentence)
# Look up the matched row by its index label in the dataset.
print("Question: ", df.loc[ans_idx, "Question"])
print("Answer: ", df.loc[ans_idx, "Answer"])

Query:  What did Alessandro Volta invent in 1800?
Question:  What did Alessandro Volta invent in 1800?
Answer:  In 1800, Alessandro Volta invented the voltaic pile.
