In [None]:
import os
import pandas as pd
import numpy as np
import re
import nltk
import unicodedata
import nltk
import gensim
import math
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

In [None]:
# Question/answer pair files (tab separated), one per semester.
# Each entry pairs a path with the extra read_csv options it needs;
# only the S10 file is decoded explicitly as ISO-8859-1.
qa_sources = [
    ('./NLP-QuestionAnswerSystem/dataset/S08_question_answer_pairs.txt', {}),
    ('./NLP-QuestionAnswerSystem/dataset/S09_question_answer_pairs.txt', {}),
    ('./NLP-QuestionAnswerSystem/dataset/S10_question_answer_pairs.txt', {'encoding': 'ISO-8859-1'}),
]
frames = [pd.read_csv(path, sep='\t', **read_options) for path, read_options in qa_sources]
df1, df2, df3 = frames

# Stack the three frames row-wise; the original per-file row labels are kept.
df = pd.concat(frames)

def getArticleText(file, base_dir='./NLP-QuestionAnswerSystem/dataset/text_data/'):
  """Return the full text of one cleaned article file.

  The file is decoded as UTF-8 first; if that fails with a
  UnicodeDecodeError it is re-read as ISO-8859-1. Both attempts use a
  context manager so the file handle is always closed.

  Args:
    file: Article file stem; the suffix '.txt.clean' is appended to it.
    base_dir: Directory that holds the '*.txt.clean' files. Defaults to the
      dataset location used by this notebook.

  Returns:
    The decoded file contents as a single string.

  Raises:
    OSError: If the file cannot be opened.
  """
  fpath = os.path.join(base_dir, file + '.txt.clean')
  try:
    # Explicit UTF-8 keeps the first attempt independent of the platform's
    # default encoding.
    with open(fpath, 'r', encoding='utf-8') as f:
      return f.read()
  except UnicodeDecodeError:
    # ISO-8859-1 maps every byte to a character, so this read cannot fail
    # on decoding.
    with open(fpath, 'r', encoding='ISO-8859-1') as f:
      return f.read()

# Rows with a missing ArticleFile are dropped before any text is loaded.
df = df.dropna(subset=['ArticleFile'])

# Load every article, then turn each run of newlines into '. ' so that
# line breaks act as sentence boundaries.
article_text = df['ArticleFile'].apply(getArticleText)
df['ArticleText'] = article_text.apply(lambda raw: re.sub(r'(\n)+', '. ', raw))

# The difficulty ratings and the file reference are not used after this point.
df = df.drop(columns=['DifficultyFromQuestioner', 'DifficultyFromAnswerer', 'ArticleFile'])

def cleanQuestion(text):
  """Normalize a question: lowercase it and strip punctuation.

  The input is passed through `str()` first, so non-string values are
  stringified rather than rejected (for example None becomes 'none').
  Every character that is neither a word character nor whitespace is
  removed, and runs of whitespace collapse to single spaces.

  Args:
    text: The raw question (any object; converted with `str()`).

  Returns:
    The cleaned, space-joined string.
  """
  lowered = str(text).lower()
  # split() + join drops leading/trailing whitespace and collapses runs.
  return " ".join(re.sub(r'[^\w\s]', '', lowered).split())

def cleanAnswer(text):
  """Normalize an answer: lowercase it and strip punctuation.

  The input is passed through `str()` first, so non-string values are
  stringified rather than rejected (for example None becomes 'none').
  Every character that is neither a word character nor whitespace is
  removed, and runs of whitespace collapse to single spaces.

  Args:
    text: The raw answer (any object; converted with `str()`).

  Returns:
    The cleaned, space-joined string.
  """
  lowered = str(text).lower()
  # split() + join drops leading/trailing whitespace and collapses runs.
  return " ".join(re.sub(r'[^\w\s]', '', lowered).split())

def cleanText(text):
  """Normalize article text while keeping sentence punctuation.

  The input is passed through `str()` and lowercased. Every character
  that is not a word character, whitespace, '.' or '?' is removed, so
  full stops and question marks survive. Runs of whitespace collapse to
  single spaces.

  Args:
    text: The raw article text (any object; converted with `str()`).

  Returns:
    The cleaned, space-joined string.
  """
  lowered = str(text).lower()
  # split() + join drops leading/trailing whitespace and collapses runs.
  return " ".join(re.sub(r'[^\w\s\.\?]', '', lowered).split())

# Run each text column through its own cleaning function, in place.
column_cleaners = (
    ('Question', cleanQuestion),
    ('Answer', cleanAnswer),
    ('ArticleText', cleanText),
)
for column_name, cleaner in column_cleaners:
    df[column_name] = df[column_name].apply(cleaner)

In [None]:
# Build the training corpus: a list of token lists.
dataset = []
title = ""
# Only every second row is visited (step of 2).
for row_pos in range(0, len(df), 2):
    row = df.iloc[row_pos]
    if row['ArticleTitle'] != title:
        # First visit to this article: add each non-empty '.'-separated
        # piece of its text as one tokenised sentence.
        title = row['ArticleTitle']
        dataset.extend(
            piece.split()
            for piece in row['ArticleText'].split(sep='.')
            if piece != ''
        )
    # The question and its answer are each added as a sentence of their own.
    dataset.append(row['Question'].split())
    dataset.append(row['Answer'].split())

In [None]:
# Word2Vec over the tokenised sentences: 100-dimensional vectors, a context
# window of 8, and min_count=1 so that no token is discarded for being rare.
# sg = {0, 1} – Training algorithm: 1 for skip-gram; otherwise CBOW
# workers=8 was chosen for the original author's 8 CPU cores.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0;
# later releases name it `vector_size` — confirm against the installed version.
model = gensim.models.Word2Vec(
    dataset,
    size=100,
    window=8,
    min_count=1,
    sg=0,
    workers=8,
)

In [None]:
# Train for 50 epochs over the same sentences, with loss tracking enabled.
# NOTE(review): the constructor was already given `dataset`, which presumably
# triggered an initial training pass — confirm these epochs are meant to be
# additional.
# Left as the trailing expression so the notebook shows the returned tuple.
model.train(
    dataset,
    total_examples=len(dataset),
    compute_loss=True,
    epochs=50,
)

(25700542, 32458150)

In [None]:
def get_embedding(sentence, embedding_model=None):
  """Return the mean word vector of `sentence` as a NumPy array.

  The sentence is split on whitespace; words that have no vector are
  skipped. The result is always a float64 ndarray, including the all-zero
  fallback used when no word has a vector, so callers can do arithmetic on
  it unconditionally.

  Args:
    sentence: Whitespace-separated text.
    embedding_model: Optional object exposing `.wv`, a word -> vector mapping
      that raises KeyError for unknown words. Defaults to the notebook's
      global `model`.

  Returns:
    A 1-D ndarray holding the average of the found word vectors, or zeros
    if none were found.
  """
  vectors = (model if embedding_model is None else embedding_model).wv
  # Take the dimensionality from the vectors when they expose it; fall back
  # to 100, the value this notebook trains with.
  dim = getattr(vectors, 'vector_size', 100)
  total = np.zeros(dim, dtype=np.float64)
  found = 0
  for word in sentence.split():
    try:
      vector = vectors[word]
    except KeyError:
      # Out-of-vocabulary word: it contributes nothing to the average.
      continue
    total += vector
    found += 1
  if found:
    total /= found
  return total

def get_answer(question, answer_para):
  """Return the sentence of `answer_para` closest to `question`.

  Both texts have stop words removed with `rem_stop` and are averaged into
  one vector with `get_embedding`. Candidates are compared by Euclidean
  distance; on ties the earliest candidate wins.

  Args:
    question: Question string.
    answer_para: Sequence of candidate sentence strings.

  Returns:
    The element of `answer_para` with the smallest distance to the question.

  Raises:
    IndexError: If `answer_para` is empty.
  """
  # The embedding may come back as a plain list when no word has a vector;
  # coercing to an ndarray keeps the subtraction below element-wise instead
  # of raising TypeError on list - list.
  question_embedding = np.asarray(get_embedding(rem_stop(question)), dtype=float)
  best_index = 0
  min_distance = math.inf
  for position, candidate in enumerate(answer_para):
    candidate_embedding = np.asarray(get_embedding(rem_stop(candidate)), dtype=float)
    distance = np.linalg.norm(question_embedding - candidate_embedding)
    if distance < min_distance:
      best_index = position
      min_distance = distance
  return answer_para[best_index]

def rem_stop(sentence, stop_words=None):
    """Return `sentence` with stop words removed.

    The sentence is split on whitespace and the surviving tokens are joined
    with single spaces. Matching is exact and case-sensitive.

    Args:
        sentence: Whitespace-separated text.
        stop_words: Optional iterable of words to drop. Defaults to NLTK's
            English stop-word list.

    Returns:
        The filtered sentence; an empty string if nothing survives.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call instead of re-reading the stop-word
    # list for every token.
    stop_set = set(stop_words)
    return ' '.join(token for token in sentence.split() if token not in stop_set)

def get_answer_cosine(question, answer_para):
  """Return the sentence of `answer_para` most similar to `question`.

  Both texts have stop words removed with `rem_stop` and are embedded with
  `get_embedding`. Candidates are ranked by cosine similarity; on ties the
  earliest candidate wins.

  Args:
    question: Question string.
    answer_para: Sequence of candidate sentence strings.

  Returns:
    The element of `answer_para` with the highest cosine similarity.
  """
  # cosine_similarity expects 2-D input, so each embedding becomes one row.
  question_row = np.expand_dims(get_embedding(rem_stop(question)), 0)
  best_index, max_similarity = 0, -math.inf
  for position, candidate in enumerate(answer_para):
    candidate_row = np.expand_dims(get_embedding(rem_stop(candidate)), 0)
    # The result is a 1x1 matrix; take its single entry.
    similarity = cosine_similarity(question_row, candidate_row)[0][0]
    if similarity > max_similarity:
      best_index, max_similarity = position, similarity
  return answer_para[best_index]

In [None]:
# Pick one row to try the model on.
index = 296
sample_row = df.iloc[index]
my_text = sample_row['ArticleText']

# Candidate answers are the non-empty '.'-separated pieces of the article.
sentences = [piece for piece in my_text.split(sep='.') if piece != '']
my_question = sample_row['Question']

In [None]:
expected_answer = df.iloc[index]['Answer']
print(my_question)            # The question from the selected row
print(rem_stop(my_question))  # The same question with stop words removed
print(expected_answer)        # The reference answer from the selected row

what was the consitution act formerly called
consitution act formerly called
british north america act


In [None]:
# Compute both predictions for the sample question, then show them
# separated by blank lines.
euclidean_prediction = get_answer(my_question, sentences)         # nearest by Euclidean distance
cosine_prediction = get_answer_cosine(my_question, sentences)     # highest cosine similarity
print(euclidean_prediction)
print("\n")
print(cosine_prediction)

0
1
3
6
7
45
 to accommodate englishspeaking loyalists in quebec the constitutional act of 1791 divided the province into frenchspeaking lower canada and englishspeaking upper canada granting each their own elected legislative assembly


 later it was split into two british colonies called upper canada and lower canada until their union as the british province of canada in 1841
