# Question Answer System

In [1]:
#All the imports

import re
import nltk
import requests
import pandas as pd

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from bs4 import BeautifulSoup




### All the required functions 

In [2]:
#Get the content from the wiki url and store it in a single data variable.
 
def getDataFromURL(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch (e.g. a Wikipedia article).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A single string in which every paragraph is preceded by a newline,
        with numeric reference markers such as "[12]" removed.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    print("Fetching Data from url")
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # Remove all the references (e.g. "[12]") from each paragraph, then join
    # once rather than growing the string inside the loop.
    paragraphs = [re.sub(r'\[\d+\]', '', item.text) for item in soup.find_all("p")]
    return "".join("\n" + paragraph for paragraph in paragraphs)

In [3]:
# pre processing data
def cleanData(sentence):
    """Normalise a sentence for model training and similarity scoring.

    The input is converted to a string, lower-cased, stripped of every
    character that is not a letter, digit or whitespace, and finally has
    English stop words removed.

    Args:
        sentence: Text to clean; any object is accepted and passed through
            ``str()`` first.

    Returns:
        The remaining words joined by single spaces (possibly an empty
        string when nothing survives the filtering).
    """
    # convert words to lowercase, ignore all special characters: data only
    # contains alpha-numericals and spaces
    sentence = re.sub(r'[^A-Za-z0-9\s]', r'', str(sentence).lower())

    # Build the stop-word lookup once per call; evaluating
    # stopwords.words('english') inside the comprehension condition reloads
    # the whole list for every single word.
    stop_words = set(stopwords.words('english'))

    # remove stop words
    return " ".join(word for word in sentence.split() if word not in stop_words)

In [4]:
#Build Model for given text
def buildModel(sentences):
    """Train a Word2Vec model on the given sentences and reload it from disk.

    Each sentence is cleaned with ``cleanData`` and split on single spaces to
    form the training corpus.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        The Word2Vec model as loaded back from ``model.bin``.

    Side effects:
        Writes the trained model to ``model.bin`` in the current working
        directory, overwriting any existing file of that name.
    """
    print("Building the model")
    data = [cleanData(x).split(" ") for x in sentences]
    # train model; min_count=1 keeps every word, since the corpus is a single
    # page and most words are rare
    model = Word2Vec(data, min_count=1)
    # NOTE: the former unused `list(model.wv.vocab)` lookup was dropped; the
    # `vocab` attribute no longer exists on gensim >= 4.0 and made this
    # function raise there.
    # save model
    model.save('model.bin')
    # load model
    new_model = Word2Vec.load('model.bin')
    return new_model

In [5]:
#Compute similarity between two sentences
def compute_sentence_similarity(sentence_1, sentence_2, model_wv):
    """Return the similarity between two sentences using word vectors.

    Both sentences are tokenised with ``word_tokenize``; tokens missing from
    the vocabulary are ignored, and the remaining token lists are compared
    with ``n_similarity``.

    Args:
        sentence_1: First sentence (expected to be already cleaned).
        sentence_2: Second sentence (expected to be already cleaned).
        model_wv: Keyed vectors providing ``n_similarity`` and ``in``
            membership tests, or a model object exposing them as ``.wv``.

    Returns:
        The value returned by ``n_similarity``, or ``0.0`` when either
        sentence has no in-vocabulary tokens to compare.
    """
    # Accept either the keyed vectors themselves or a full model wrapping
    # them, so callers passing the trained model keep working.
    vectors = getattr(model_wv, "wv", model_wv)

    # n_similarity raises on unknown words, so drop them up front.
    tokens_1 = [token for token in word_tokenize(sentence_1) if token in vectors]
    tokens_2 = [token for token in word_tokenize(sentence_2) if token in vectors]

    # n_similarity also raises on an empty list; treat "nothing to compare"
    # as no similarity.
    if not tokens_1 or not tokens_2:
        return 0.0
    return vectors.n_similarity(tokens_1, tokens_2)

In [9]:
#Predict the sentence with the highest probability of being the answer to the query.
def predictAnswer(url,query):
    """Find the sentence on a web page that best matches a query.

    The page is downloaded, split into sentences, and a Word2Vec model is
    trained on those sentences. Every sentence is then scored against the
    query and the highest-scoring one is returned.

    Args:
        url: Address of the page to search.
        query: Question or phrase to look for.

    Returns:
        A tuple ``(answer, max_similarity)``: the original (uncleaned)
        sentence with the highest similarity and that similarity. If no
        sentence scores above 0 the result is ``("", 0)``.
    """
    # fetch text and split in sentences.
    data = getDataFromURL(url)
    sentences = sent_tokenize(data)

    #build new model
    new_model = buildModel(sentences)

    print("Predicting.......")
    max_similarity=0
    answer=""

    # The cleaned query is the same for every sentence, so compute it once.
    cleaned_query = cleanData(query)
    # Similarity is computed on the keyed vectors; calling n_similarity on
    # the model object itself is deprecated in gensim 3.x and gone in 4.0.
    word_vectors = new_model.wv

    #Loop all the sentences to get the best match
    for sentence in sentences:
        cleaned_sentence = cleanData(sentence)
        # A sentence made only of stop words/punctuation cleans to "" and
        # has nothing to compare; skip it rather than let scoring fail.
        if not cleaned_sentence:
            continue
        similarity = compute_sentence_similarity(cleaned_sentence, cleaned_query, word_vectors)
        if max_similarity < similarity:
            max_similarity = similarity
            answer = sentence
    print("Prediction completed")
    return answer,max_similarity
    

### Main Code 

In [7]:
#Initialize url and query
#Change them as needed
url = "https://en.wikipedia.org/wiki/R2-D2"
query="When was R2-D2 inducted into the Robot Hall of Fame"

In [10]:
# Run the pipeline for the configured url/query; predictAnswer returns the
# best-matching sentence together with its similarity score.
answer,sim = predictAnswer(url,query)

Fetching Data from url
Building the model


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  """


Predicting.......
Prediction completed


In [12]:
# Show the selected sentence followed by its similarity rounded to 3 decimals.
print(answer," ", round(sim,3))

R2-D2 was inducted into the Robot Hall of Fame in 2003.   0.929
