# Web scraping and inserting into a database (SQLite) to build a knowledge base

In [4]:
import sqlite3
import urllib
from bs4 import BeautifulSoup
import requests

In [7]:
# Open (or create) the SQLite file that backs the knowledge base.
conn = sqlite3.connect('knowledgeBase.db')
c = conn.cursor()
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises
# sqlite3.OperationalError once the table is already present in the file.
c.execute("CREATE TABLE IF NOT EXISTS knowledgeBase (question TEXT, answer TEXT)")

<sqlite3.Cursor at 0x7fde5dcedce0>

In [8]:
def insertDB(q_and_a):
    """Insert question/answer pairs into the knowledgeBase table and commit.

    Uses the module-level cursor ``c`` and connection ``conn``.

    Args:
        q_and_a: Iterable of ``(question, answer)`` pairs; each pair is bound
            to the two ``?`` placeholders of the INSERT statement.
    """
    c.executemany("insert into knowledgeBase(question, answer) values (?,?)", q_and_a)
    # Without a commit the inserted rows live only in this connection's open
    # transaction and are discarded when the connection is closed.
    conn.commit()

In [9]:
def get_questions_answers(URL, timeout=10):
    """Scrape question/answer pairs from an FAQ page.

    Downloads ``URL`` and collects the text of every element with class
    ``question`` and every element with class ``answer``, pairing them by
    their position in the page.

    Args:
        URL: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of ``(question_text, answer_text)`` tuples in page order.
        If the two element lists differ in length, the surplus elements of
        the longer one are dropped.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    page = requests.get(URL, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page
    # and silently returning no pairs.
    page.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page.content, 'html.parser')
    question_tags = soup.find_all(class_='question')
    answer_tags = soup.find_all(class_='answer')
    return [(q_tag.text, a_tag.text)
            for q_tag, a_tag in zip(question_tags, answer_tags)]

In [10]:
# Scrape the vat19.com FAQ page into a list of (question, answer) tuples.
q_and_a = get_questions_answers('https://www.vat19.com/faq')

In [11]:
# Bulk-insert the scraped pairs into the knowledgeBase table.
insertDB(q_and_a)

# Creating the model and querying the database to check similarity

In [75]:
from gensim import corpora, models, similarities
import json

In [71]:
# Load both columns from the knowledge base. Every row arrives as a
# one-element tuple, e.g. ('How much is shipping?',).
questions = list(c.execute('SELECT question FROM knowledgeBase'))
answers = list(c.execute('SELECT answer FROM knowledgeBase'))

In [72]:
# Unwrap the one-element row tuples into plain values.
# Only tuples are unwrapped, so re-running this cell is harmless; indexing
# an already-unwrapped string with [0] would cut it down to its first character.
questions = [row[0] if isinstance(row, tuple) else row for row in questions]
answers = [row[0] if isinstance(row, tuple) else row for row in answers]

In [50]:
# Tokenize each question: lower-case it, then split on whitespace.
# NOTE: punctuation stays attached to its word ('shipping?' != 'shipping').
texts = [question.lower().split() for question in questions]

In [51]:
# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(texts)
# Bag-of-words vector for each tokenized question.
corpus = list(map(dictionary.doc2bow, texts))
# Fit the latent semantic indexing model on the bag-of-words corpus.
lsi = models.LsiModel(corpus, id2word=dictionary)

In [55]:
# Example query used to try out the similarity lookup below.
data = 'When will my order ship'

In [56]:
# bag-of-words vector of the example query (lower-cased, split on whitespace)
vec_bow = dictionary.doc2bow(data.lower().split())
# convert the query to LSI space
vec_lsi = lsi[vec_bow]
# index every corpus document by its LSI vector
index = similarities.MatrixSimilarity(lsi[corpus])
# perform a similarity query against the corpus
sims = index[vec_lsi]
# keep the five best (position, score) pairs, highest score first
sims = sorted(enumerate(sims), key=lambda item: -item[1])[0:5]

In [65]:
# Each entry is (index, similarity score); the index is the position of the
# matching question in the questions list.
sims

[(0, 0.83871585),
 (3, 0.76563936),
 (12, 0.44204205),
 (6, 0.36092582),
 (9, 0.27957195)]

# Functions that take a query question and return the top 5 most similar entries from the database

In [105]:
def query_to_output_similarity(query, dictionary, lsi):
    """Rank every corpus document by similarity to ``query``.

    The query is lower-cased, split on whitespace, converted to a
    bag-of-words vector with ``dictionary`` and projected into LSI space
    with ``lsi``. The similarity index is built from the module-level
    ``corpus`` on every call.

    Args:
        query: Free-text question to look up.
        dictionary: Object providing ``doc2bow`` for the token list.
        lsi: Model indexed with a bag-of-words vector or a whole corpus.

    Returns:
        A list of ``(position, score)`` pairs covering the whole corpus,
        sorted by descending score.
    """
    tokens = query.lower().split()
    query_lsi = lsi[dictionary.doc2bow(tokens)]
    sim_index = similarities.MatrixSimilarity(lsi[corpus])
    scores = sim_index[query_lsi]

    def by_descending_score(pair):
        # Negating the score makes the ascending sort put the best match first.
        return -pair[1]

    return sorted(enumerate(scores), key=by_descending_score)

In [106]:
def response_with_data(query, dictionary, lsi, num_resp = 5):
    """Return the stored questions and answers most similar to ``query``.

    Args:
        query: Free-text question to look up.
        dictionary: Passed through to ``query_to_output_similarity``.
        lsi: Passed through to ``query_to_output_similarity``.
        num_resp: Maximum number of matches to return.

    Returns:
        A dict with keys ``'questions'``, ``'answers'`` and
        ``'similarity_score'``; the three lists are aligned and ordered
        from best to worst match.
    """
    ranked = query_to_output_similarity(query, dictionary, lsi)
    top_hits = ranked[:num_resp]
    # Split the (position, score) pairs into two parallel lists.
    positions = [hit[0] for hit in top_hits]
    scores = [hit[1] for hit in top_hits]
    # Positions index into the module-level answers / questions lists.
    matched_answers = [answers[pos] for pos in positions]
    matched_questions = [questions[pos] for pos in positions]
    return {
        'questions': matched_questions,
        'answers': matched_answers,
        'similarity_score': scores,
    }

In [107]:
# Run the full lookup on a sample query; as the last expression in the cell,
# the returned dict is displayed as the cell output.
query = 'How much is shipping'
response_with_data(query, dictionary, lsi)

{'questions': ['How much is shipping?',
  'What is your return policy?',
  'How do I track my order?',
  'I made a mistake with my order. How do I change it?',
  'Can I cancel my order?'],
 'answers': ['Shipping costs depend on the weight (and size) of your order, your destination, and the shipping method you choose.\nFor orders shipped within the US, however, you can choose our flat rate shipping method. This means your order ships for the same low rate no matter how much you order!\nFor all other methods, you can determine prices by using the shipping calculator on any product page or the shopping cart page.',
  'Instructions for what you should do in the event  that something is wrong with your order are available here.',
  'You should have received an email from us with instructions on how to track your order after it shipped.You can also view details of your order by visiting our order tracking page.',
  "If your order has not yet shipped or isn't in the process of being packed, w