In [1]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words
import sqlite3

In [2]:
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect('Z:/chatbot/chatbot/db.sqlite3')
df = pd.read_sql_query("SELECT * from chatbotApp_text_keyword", con)
df = df.rename(columns={'desc':'Text Response','keyword_value':'Context'})
df = df.reindex(columns=['Context', 'Text Response'])

df.head(20)

Unnamed: 0,Context,Text Response
0,,"Sorry, I don't have an answer for this right n..."
1,How to view the manual for calculating indicat...,From the Guides icon at the top of the page or...
2,What is the accreditation mechanism in the EQA...,From the Forms Department - Approval of a Form...
3,What is the endorsement mechanism in EQAUP in ...,The form is sent from the coordinator - the co...
4,What is the endorsement mechanism in EQAUP for...,The form is sent from the coordinator --- the ...
5,What is the accreditation mechanism in the EQA...,The form is sent from the coordinator --- the ...
6,Is the EQAUP form sent for approval after fill...,The form is sent for approval once after filli...
7,When will the EQAUP models be activated?,The forms for filling out are activated annual...
8,When entering the EQAUP system from the unifie...,Allow popups to appear.
9,How is the EQAUP system accessed?,The system is accessed through:\r\n1. The univ...


In [3]:
def text_normalization(text):
    text=str(text).lower() # text to lower case
    spl_char_text=re.sub(r'[^ a-z]','',text) # removing special characters
    tokens=nltk.word_tokenize(spl_char_text) # word tokenizing
    lema=wordnet.WordNetLemmatizer() # intializing lemmatization
    tags_list=pos_tag(tokens,tagset=None) # parts of speech
    lema_words=[]   # empty list 
    for token,pos_token in tags_list:
        if pos_token.startswith('V'):  # Verb
            pos_val='v'
        elif pos_token.startswith('J'): # Adjective
            pos_val='a'
        elif pos_token.startswith('R'): # Adverb
            pos_val='r'
        else:
            pos_val='n' # Noun
        lema_token=lema.lemmatize(token,pos_val) # performing lemmatization
        lema_words.append(lema_token) # appending the lemmatized token into a list
    
    return " ".join(lema_words) # returns the lemmatized tokens as a sentence

In [4]:
df['lemmatized_text']=df['Context'].apply(text_normalization) # applying the fuction to the dataset to get clean text
df.tail(15)

Unnamed: 0,Context,Text Response,lemmatized_text
36,Is it necessary for each learning outcome to b...,"No, each learning outcome has a number of cour...",be it necessary for each learn outcome to be c...
37,How is the alignment of the learning outcomes ...,It must be ensured that the learning outcomes ...,how be the alignment of the learn outcome of t...
38,How are learning strategies defined?,It must be taken into account that the learnin...,how be learn strategy define
39,How are strategies for measuring learning outc...,It must be taken into account that the strateg...,how be strategy for measure learn outcome iden...
40,What is the difference between a program descr...,The program description is the program plan an...,what be the difference between a program descr...
41,How to apply for academic accreditation?,The application is made by submitting an elect...,how to apply for academic accreditation
42,Can I get ISO 9001?,"Yes, but in view of the intellectual property ...",can i get iso
43,What are the most prominent quality characteri...,1- Quality management system according to ISO ...,what be the most prominent quality characteris...
44,What are the steps for establishing a quality ...,1- The preparation stage includes spreading th...,what be the step for establish a quality system
45,What is the training schedule for the deanship?,You can view the training schedule on the Dean...,what be the training schedule for the deanship


In [5]:
# all the stop words we have 

stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
cv = CountVectorizer() # intializing the count vectorizer
X = cv.fit_transform(df['lemmatized_text']).toarray()

In [7]:
# returns all the unique word from data 

features = cv.get_feature_names()
df_bow = pd.DataFrame(X, columns = features)
df_bow.head()

Unnamed: 0,absence,academic,access,accord,accordance,accreditation,achieve,activate,actual,after,...,view,we,website,what,when,where,who,will,with,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [8]:

# Function that removes stop words and process the text

def stopword_(text):   
    tag_list=pos_tag(nltk.word_tokenize(text),tagset=None)
    stop=stopwords.words('english')
    lema=wordnet.WordNetLemmatizer()
    lema_word=[]
    for token,pos_token in tag_list:
        if token in stop:
            continue
        if pos_token.startswith('V'):
            pos_val='v'
        elif pos_token.startswith('J'):
            pos_val='a'
        elif pos_token.startswith('R'):
            pos_val='r'
        else:
            pos_val='n'
        lema_token=lema.lemmatize(token,pos_val)
        lema_word.append(lema_token)
    return " ".join(lema_word)

In [40]:

# defining a function that returns response to query using bow

def chat_bow(text):
    s=stopword_(text)
    lemma=text_normalization(s) # calling the function to perform text normalization
    bow=cv.transform([lemma]).toarray() # applying bow
    cosine_value = 1- pairwise_distances(df_bow,bow, metric = 'cosine' )
    df['similarity_bow']=cosine_value 
    df_simi = pd.DataFrame(df, columns=['Text Response','similarity_bow']) # taking similarity value of responses for the question we took
    df_simi_sort = df_simi.sort_values(by='similarity_bow', ascending=False) # sorting the values
#     print(df_simi_sort.head())
    threshold = 0.5 # considering the value of p=smiliarity to be greater than 0.2
    df_threshold = df_simi_sort[df_simi_sort['similarity_bow'] > threshold] 
#     print(df_threshold)
    index_value=cosine_value.argmax() # getting index value
#     return df_threshold
    if index_value == 0:
        return "Sorry, I don\'t have an answer for this right now.."
    else:
        return df_threshold['Text Response'][index_value]

In [51]:
inputStr = input('Enter question : ')
chat_bow(inputStr)

Enter question : endorsement mechanism in EQUAP


KeyError: 4