In [1]:
#import the required libraries
import spacy
import pandas as pd
import re
import string
import os
from transformers import BertTokenizer, BertForQuestionAnswering, AutoTokenizer
from transformers import pipeline
import time
import io
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pretrained question-answering model.
# The tokenizer and the model weights are both loaded from the same checkpoint
# name, then wrapped in a single 'question-answering' pipeline (QAPipeline),
# which bertLookup() calls later.
# NOTE(review): from_pretrained presumably fetches the checkpoint on first use;
# confirm network access / local caching before relying on Restart & Run All.
modelname = 'deepset/bert-base-cased-squad2'
tokenizer = AutoTokenizer.from_pretrained(modelname)
bModel = BertForQuestionAnswering.from_pretrained(modelname)
QAPipeline = pipeline('question-answering', model=bModel, tokenizer=tokenizer)

In [3]:
# Ask the user which topic they want to chat about.
# The answer is used as the base name of the article file opened in the next
# cell (inputTopic + ".txt"), so it must be entered without the extension.
print("Enter the topic you want to chat about")
inputTopic = input()

Enter the topic you want to chat about


In [4]:
# Read the article file for the chosen topic; its text becomes the context
# that both the TF-IDF matcher and the BERT pipeline work from.
a_file = f"{inputTopic}.txt"
with open(a_file, encoding="ISO-8859-1") as f:
    text = f.readlines()

# Lines keep their trailing newline; they are glued together with one space.
context = ' '.join(map(str, text))

In [5]:
# WordNet-based lemmatizer shared by the tokenizing helpers (LemTokens).
lemmer = WordNetLemmatizer()

# Lower-case the whole article, then split it into a list of sentences.
# NOTE(review): `context` is also what the *cased* BERT checkpoint receives in
# response(), so that model sees lower-cased text - confirm this is intended.
context = context.lower()
sentenceTokens = nltk.sent_tokenize(context)

In [6]:
# Helpers that normalise text into lemmatized word tokens.
def LemTokens(tokens):
    """Return a list with every token in ``tokens`` passed through ``lemmer.lemmatize``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list of the lemmatized tokens, in the same order.
    """
    return list(map(lemmer.lemmatize, tokens))
# Translation table for str.translate(): maps the code point of every
# punctuation character to None, i.e. deletes it.
punctuationDictionary = str.maketrans('', '', string.punctuation)

def LemNormalize(text):
    """Turn raw text into a list of lemmatized, lower-case, punctuation-free tokens.

    Args:
        text: The string to normalise.

    Returns:
        The result of ``LemTokens`` applied to the word tokens of the cleaned text.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(punctuationDictionary)
    words = nltk.word_tokenize(without_punctuation)
    return LemTokens(words)

In [14]:
# TF-IDF + cosine-similarity lookup with a BERT fallback.
# If the best sentence match scores below the threshold, the pretrained BERT
# question-answering model is asked instead.
def response(userQuery, threshold=0.6):
    """Answer ``userQuery`` from the article, preferring a direct sentence match.

    The query is appended to the module-level ``sentenceTokens`` list, all
    entries are TF-IDF vectorised (tokenised with ``LemNormalize``), and the
    query vector is compared with every *article* sentence by cosine
    similarity. The query's own row is excluded from the ranking, so a sentence
    that ties with the query can never cause the query itself to be returned.

    Args:
        userQuery: The question text.
        threshold: Minimum cosine similarity the best article sentence must
            reach to be returned directly. Defaults to 0.6.

    Returns:
        The best-matching entry of ``sentenceTokens`` when its score is at
        least ``threshold``; otherwise the value of
        ``bertLookup(userQuery, context)``.

    Side effects:
        ``userQuery`` is left as the last element of ``sentenceTokens``; the
        caller is expected to remove it afterwards. Prints "Please wait..."
        before falling back to BERT.
    """
    sentenceTokens.append(userQuery)

    best_index = None
    best_score = 0.0
    # With no article sentences there is nothing to rank (and nothing valid to
    # pass to cosine_similarity), so go straight to the fallback.
    if len(sentenceTokens) > 1:
        vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
        vectors = vectors_all = vectorizer.fit_transform(sentenceTokens)
        # Last row is the query; every row before it is an article sentence.
        scores = cosine_similarity(vectors_all[-1], vectors[:-1]).ravel()
        best_index = int(scores.argmax())
        best_score = scores[best_index]

    if best_index is None or best_score < threshold:
        print("Please wait...")
        return bertLookup(userQuery, context)
    return sentenceTokens[best_index]

In [8]:
# Thin wrapper around the BERT question-answering pipeline.
def bertLookup(userQuery, context):
    """Ask ``QAPipeline`` to answer ``userQuery`` using ``context``.

    Args:
        userQuery: The question text.
        context: The passage the answer should be taken from.

    Returns:
        The value stored under ``'answer'`` in the pipeline's result.
    """
    payload = {'question': userQuery, 'context': context}
    return QAPipeline(payload)['answer']

In [15]:
# Run the bot iteratively until the user types 'exit' (case-insensitive).
flag = True
while flag:
    userQuery = input().strip().lower()
    if userQuery == 'exit':
        print("Thank you, bye!")
        flag = False
    elif userQuery:  # a blank line has nothing to look up, so it is skipped
        print(userQuery)
        try:
            print(response(userQuery))
        finally:
            # response() appends the query to sentenceTokens. Drop that trailing
            # entry even if the lookup failed, so a stale query can never be
            # matched as an "article sentence" on a later turn. Popping the last
            # element (instead of remove()) also guarantees that an article
            # sentence identical to the query is not the one that gets deleted.
            if sentenceTokens and sentenceTokens[-1] == userQuery:
                sentenceTokens.pop()

what is canada's national unemployment rate?
Please wait...
5.9%
