# Question Answering System

##### Importing libraries

In [7]:
#for webpage and snippet extraction
import urllib
import requests
from bs4 import BeautifulSoup
from requests import get
import re
import collections
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#for answer filtering
import io
import random
import string # to process standard python strings
import warnings
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
from nltk.stem import WordNetLemmatizer
nltk.download('popular', quiet=True)
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/nbuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nbuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nbuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

In [17]:
# For Keyword Matching
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]

### Web Page Extraction

In [22]:
def webpageextraction(query):
    query=query.replace(' ','+')
    URL = f"https://google.com/search?q={query}"
    ## user agent specification, as google returns different results for mobile and desktop
    ### desktop user-agent
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    ### mobile user-agent
    MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"

    ## user-agent must be in the headers. We pass in a dictionary for the headers in requests.
    headers = {"user-agent" : USER_AGENT}
    response = requests.get(URL, headers=headers)

    ## checking if the request was successful (if we obtain 200 in return, it is successful), then we put it into BeautifulSoup for content parsing
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

    ## parsing the data and extracting all anchor links from the page. storing the results in the results list.
    results = []
    for g in soup.find_all('div', class_='r'):
        anchors = g.find_all('a')
        if anchors:
            link = anchors[0]['href']
            title = g.find('h3').text
            item = {
                "title": title,
                "link": link
            }
            results.append(item['link'])
    return results

### Snippet Extraction

In [60]:
def snippetextraction(results,i):
    # Snippet Extraction from the obtained web page urls
    try:
        raw = get(results[i]).text ###this is how we can extract raw html from web pages
    except:
        return 'error604'
    html = requests.get(results[i]).content
    #1 Recoding
    unicode_str = html.decode("utf8")
    encoded_str = unicode_str.encode("ascii",'ignore')
    news_soup = BeautifulSoup(encoded_str, "html.parser")
    a_text = news_soup.find_all('p')
    #2 Removing
    y=[re.sub(r'<.+?>',r'',str(a)) for a in a_text]
    y=[x.replace('\n','') for x in y]
    y=[x.strip() for x in y]
    y=' '.join(y)
    return y

### Tokenisation

In [13]:
def tokenise(raw):
    #Tokenisation
    sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences 
    #print(sent_tokens)
    word_tokens = nltk.word_tokenize(raw)# converts to list of words
    return sent_tokens

### Preprocessing

In [15]:
def LemTokens(tokens):
    lemmer = WordNetLemmatizer()
    return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

### Generating Response

In [66]:
def response(query,sent_tokens):
    robo_response=''
    sent_tokens.append(query)
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx=vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if(req_tfidf==0):
        robo_response=robo_response+"I am sorry! I don't understand you"
        print(robo_response)
        return 'error502'
    else:
        robo_response = robo_response+sent_tokens[idx]
        return robo_response

### Edulexa Greeting & Bye

In [57]:
# for greeting the user
def greeting(query):
    """If user's input is a greeting, return a greeting response"""
    for word in query.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)
        
# for saying bye and ending the program
def byee():
    print("Edulexa: Bye! take care..")
    return False

#for setting flag to true and introduction
def introduce():
    #as we are taking a variable flag (true until user keeps asking questions)
    print("Edulexa: Hi! My name is Edulexa. I will answer all your queries. If you want to exit, type something like 'Bye' !")
    return True

## Final Operation

In [67]:
#Edulexa introduces itself
flag=introduce()
#site index
print('May I know your name?')
#User introduces
user_name=input()
print('Edulexa: Hello ',user_name)

while(flag==True):
    i=0
    #taking user question
    query = input(user_name+': ')
    query=query.lower()
    
    if('bye' not in query.split(' ')):
        if(query=='thanks' or query=='thank you' ):
            flag=False
            print("Edulexa: You are welcome..")
        else:
            if(greeting(query)!=None):
                print("Edulexa: "+greeting(query))
            else:
                result=webpageextraction(query)
                raw=''
                while((not raw.strip()) or (('access' in raw.lower()) and ('denied' in raw.lower()))):
                    raw=snippetextraction(result,i)
                    if raw=='error604':
                        break
                    i+=1
                if raw=='error604':
                    print("Sorry I couldn't find an answer.")
                    continue
                if raw:
                    sent_tokens=tokenise(raw)
                else:
                    print("Sorry I couldn't find an answer.")
                print("Edulexa: ",end='')
                answer=response(query,sent_tokens)
                if answer=='error502':
                    continue
                print(answer)
                
                
                #now checking satisfaction
                satisfied=False
                while(not satisfied):
                    print('Are you satisfied by my answer? Enter "yes" or "no".')
                    if (input().lower()=='yes'):
                        break
                    else:
                        i+=1
                        raw=snippetextraction(result,i)
                        while((not raw.strip()) or (('access' in raw.lower()) and ('denied' in raw.lower()))):
                            raw=snippetextraction(result,i)
                            i+=1
                        if raw=='error604':
                            print('Sorry, I am out of answers now.')
                            print("Anything else you would like to ask?")
                            break
                        if raw:
                            sent_tokens=tokenise(raw)
                        else:
                            print("Sorry I couldn't find an answer.")
                        print("Edulexa: ",end='')
                        print(response(query,sent_tokens))
                        
                    
                sent_tokens.remove(query)
                
    elif ('bye' in query.split(' ')):
        flag=False
        byee()

Edulexa: Hi! My name is Edulexa. I will answer all your queries. If you want to exit, type something like 'Bye' !
May I know your name?
Saloni
Edulexa: Hello  Saloni
Saloni: what's in a name?
Edulexa: I am sorry! I don't understand you
Saloni: How is gold?
Edulexa: Of course, here at Scottsdale Bullion and Coin, we can help you find all the gold that you want, whether its in the form of gold bullion bars or numismatic gold coins.
Are you satisfied by my answer? Enter "yes" or "no".
no
Edulexa: But there was still no gold in the Universe.
Are you satisfied by my answer? Enter "yes" or "no".
no
Edulexa: It is the modern gold pan method used in the gold rush.
Are you satisfied by my answer? Enter "yes" or "no".
no
Edulexa: While transmuting mercury into gold is easiest, gold can be made from other elementseven lead!
Are you satisfied by my answer? Enter "yes" or "no".
yes
Saloni: what is mercury?
Edulexa: Mercury poisoning can result from exposure to water-soluble forms of mercury (such a