In [1]:
# Importing dependencies
import pandas as pd
import numpy as np
from numpy import array
import nltk 
#nltk.download('punkt') - need to be downloaded for the first instance
import re
import heapq
from nltk.stem import wordnet # to perform lemmitization
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words
from nltk import pos_tag # for parts of speech
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity

In [2]:
# Load the corpus.
# NOTE(review): pandas documents None as the "unlimited" value for display.max_colwidth;
# confirm that 1 really yields untruncated text on the pandas version in use.
pd.set_option('display.max_colwidth', 1) # to display the full text in a column
# The workbook path is relative to the notebook's working directory; later cells read
# its 'Inputs', 'Details' and 'Short Link' columns.
df_corpus = pd.read_excel("Infobot9.xlsx")

In [3]:
# Data cleaning
# Forward-fill down the rows: each null cell takes the last non-null value above it
# in the same column. Rebinding the name keeps the cell free of in-place mutation.
df_corpus = df_corpus.ffill(axis=0)

In [4]:
# Stopwords removal

# English stop-word list taken from NLTK; stopword() below filters words against it.
# presumably requires the NLTK 'stopwords' corpus to be available locally — confirm
stop = stopwords.words('english')
def stopword(text, stop_words=None):
    """Return ``text`` with every stop word removed.

    The text is split on whitespace; words found in the stop-word collection
    are dropped and the remaining words are re-joined with single spaces.

    Parameters
    ----------
    text : str
        The sentence to filter.
    stop_words : collection of str, optional
        Words to drop. Defaults to the module-level ``stop`` list.

    Returns
    -------
    str
        The filtered sentence ("" when nothing is left or ``text`` is empty).
    """
    if stop_words is None:
        stop_words = stop  # module-level English stop-word list
    # Collect the kept words and join them once, after the whole text has been scanned.
    return " ".join(word for word in text.split() if word not in stop_words)

In [5]:
# Function for normalizing the data

def text_normalization(text):
    """Lower-case, strip, tokenize and lemmatize ``text``.

    Every character other than a space or a-z is removed after lower-casing,
    the remainder is tokenized and POS-tagged with NLTK, and each token is
    lemmatized with the WordNet part of speech derived from its tag.
    Returns the lemmas joined by single spaces.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^ a-z]', '', lowered)  # keep only spaces and a-z
    tokens = nltk.word_tokenize(letters_only)
    lemmatizer = wordnet.WordNetLemmatizer()

    # First letter of the POS tag -> WordNet POS; anything else is treated as a noun.
    wordnet_pos_by_prefix = {'V': 'v', 'J': 'a', 'R': 'r'}

    lemmas = [
        lemmatizer.lemmatize(token, wordnet_pos_by_prefix.get(tag[:1], 'n'))
        for token, tag in pos_tag(tokens, tagset=None)
    ]
    return " ".join(lemmas)

In [6]:
# Run text_normalization over every entry of the 'Inputs' column and keep the result in 'normalized_text'.
df_corpus['normalized_text']=df_corpus['Inputs'].apply(text_normalization) # applying the function to the dataset to get clean text

In [7]:
cv = CountVectorizer() # initializing the count vectorizer
# Fit the vectorizer on the normalized text and convert the result to a dense array
# (one row per corpus entry); `cv` is reused later to vectorize incoming queries.
X = cv.fit_transform(df_corpus['normalized_text']).toarray()

In [8]:
# Column labels for the bag-of-words frame: the vocabulary learned by `cv`.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its
# replacement, with a fallback kept for installs that predate it.
if hasattr(cv, "get_feature_names_out"):
    features = cv.get_feature_names_out()
else:
    features = cv.get_feature_names()
df_bow = pd.DataFrame(X, columns = features)


In [9]:
# Defining the question
# Module-level example query; scoring() reads this name when it is called in the cells below.
question ='NRT Key deals report' # considering an example query

In [10]:
# defining a function that returns response to query using bow

def chat_bow(text):
    """Rank every corpus row by bag-of-words cosine similarity to ``text``.

    The query is normalized with ``text_normalization`` and vectorized with the
    already fitted ``cv``. Its cosine similarity to each row of ``df_bow`` is
    computed and, as a side effect, stored in ``df_corpus['similarity_bow']``
    (overwritten on every call).

    Parameters
    ----------
    text : the user query; anything ``text_normalization`` accepts.

    Returns
    -------
    pandas.DataFrame
        Columns 'Details', 'Short Link' and 'similarity_bow', sorted by
        'similarity_bow' in descending order.
    """
    lemma = text_normalization(text)  # same normalization as the corpus
    question_bow = cv.transform([lemma]).toarray()  # applying bow
    # pairwise_distances yields one distance per corpus row in an (n_rows, 1) matrix;
    # flatten it so a plain 1-D vector of similarities is stored in the column.
    cosine_value = 1 - pairwise_distances(df_bow, question_bow, metric='cosine').ravel()
    df_corpus['similarity_bow'] = cosine_value
    df_simi = pd.DataFrame(df_corpus, columns=['Details', 'Short Link', 'similarity_bow'])
    return df_simi.sort_values(by='similarity_bow', ascending=False)

In [11]:

def _details_as_lines(rows):
    """Split the comma-separated 'Details' text of ``rows`` into one entry per field.

    Returns a pandas Series whose index labels are blanked so only the text shows when printed.
    """
    lines = rows['Details'].str.split(",", expand=True).stack()
    lines.index = [''] * len(lines)
    return lines


def scoring(query=None):
    """Answer a query from the corpus according to its best similarity score.

    Parameters
    ----------
    query : str, optional
        The question to answer. Defaults to the module-level ``question``.

    Returns
    -------
    pandas.Series or str
        * best score > 0.5: the 'Details' fields of the top two candidates
          (a Series with a blank index);
        * 0.2 < best score <= 0.5: a string holding the "no exact match"
          notice followed by the fields of the single closest candidate;
        * no score above 0.2: a short "no match" message.
    """
    match_threshold = 0.2      # below this a row is not considered a candidate at all
    confident_threshold = 0.5  # above this the best row is treated as a real answer

    if query is None:
        query = question
    ranked = chat_bow(query)
    candidates = ranked[ranked['similarity_bow'] > match_threshold]

    # Nothing similar enough: say so explicitly instead of falling through with None.
    if candidates.empty:
        return 'Sorry, no match was found for the question'

    # chat_bow returns the rows sorted by similarity, so the first candidate is the best one.
    best_score = candidates['similarity_bow'].iloc[0]
    if best_score > confident_threshold:
        return _details_as_lines(candidates.head(2))

    # Weak match: return the notice together with the closest result it announces.
    close_match = _details_as_lines(candidates.head(1))
    notice = 'There is no exact match for the question, However the below results are a close match'
    return notice + '\n' + '\n'.join(str(line) for line in close_match)


In [12]:
# Answer the example query held in `question`; the returned value is shown as the cell output.
scoring()

    Report name : EnC Key Deals NRT Report with Node (SMS9020_01)       
     Report number : SMS9020_01                                         
     Cognos package : Opt 5 Roadmap (Daily) and F&P Actuals and Budgets 
     Report level : Global                                              
    Below Market                                                        
     Content : RoadMap                                                  
     View : Global View                                                 
    Geo View                                                            
    Market View                                                         
    Sub Market View                                                     
     Cognos report link : http://ibm.biz/SMS9020_01                     
    Report name : NRT Key Deals Report (SMS9689_05)                     
     Report number : SMS9689_05                                         
     Cognos package : Opt 5 Roadmap (Daily) and F&P

In [15]:
pip install ChatterBot

Note: you may need to restart the kernel to use updated packages.


In [23]:
import tkinter
from tkinter import *




def _reply_details(rows):
    """Split the comma-separated 'Details' text of ``rows`` into a blank-indexed Series of fields."""
    fields = rows['Details'].str.split(",", expand=True).stack()
    fields.index = [''] * len(fields)
    return fields


def _infobot_reply(query):
    """Build the bot's answer to ``query`` from the ranking produced by ``chat_bow``.

    Returns the fields of the top two rows when the best similarity exceeds 0.5,
    a notice plus the single closest row when it lies in (0.2, 0.5], and a
    "no match" message when no row scores above 0.2.
    """
    ranked = chat_bow(query)
    candidates = ranked[ranked['similarity_bow'] > 0.2]
    if candidates.empty:
        return 'Sorry, no match was found for the question'
    # chat_bow returns rows sorted by similarity, so the first candidate is the best one
    if candidates['similarity_bow'].iloc[0] > 0.5:
        return _reply_details(candidates.head(2))
    closest = _reply_details(candidates.head(1))
    notice = 'There is no exact match for the question, However the below results are a close match'
    return notice + '\n' + '\n'.join(str(field) for field in closest)


def send(event=None):
    """Handle a submitted message: show it, answer it, and log the exchange.

    Used both as the Send button command (called with no arguments) and as the
    ``<Return>`` key binding (called with a Tk event), hence the optional
    ``event`` parameter. The text of ``EntryBox`` is read and cleared; a
    non-empty message is echoed into ``ChatBox`` together with the bot's reply
    and appended to ``chatlog1.txt``.

    Returns
    -------
    str
        Always "break", so that when invoked from the key binding Tk does not
        go on to insert a newline into the entry box.
    """
    user_text = EntryBox.get("1.0", 'end-1c').strip()
    EntryBox.delete("0.0", END)

    if user_text != '':
        ChatBox.config(state=NORMAL)  # the chat box is read-only except while we write to it
        try:
            ChatBox.insert(END, "You: " + user_text + '\n\n')
            ChatBox.config(foreground="#446665", font=("Verdana", 12))

            res = str(_infobot_reply(user_text))
            # Trailing blank line keeps the next "You:" entry from running into this reply.
            ChatBox.insert(END, "Infobot: " + res + '\n\n')

            # The with-block closes the file; end the entry with a newline so log records stay separate.
            with open("chatlog1.txt", 'a', encoding="utf-8") as f:
                f.write("User: " + user_text + '\n' + "Infobot: " + res + '\n')
        finally:
            # Re-lock the chat box even if answering or logging failed.
            ChatBox.config(state=DISABLED)
        ChatBox.yview(END)

    return "break"
        

        

            

# Build the main window.
# NOTE(review): the window starts 550 px wide, while the chat box below is placed 600 px wide
# and the scrollbar at x=600 — they extend past the initial window until it is resized; confirm intended.
root = Tk()
root.title("Infobot")
root.geometry("550x500")
root.resizable(width=TRUE, height=TRUE)

#Create Chat window
ChatBox = Text(root, bd=0, bg="white", height="8", width="20", font="Arial",)

# Start read-only so the conversation history cannot be typed into directly.
ChatBox.config(state=DISABLED)

#Bind scrollbar to Chat window
scrollbar = Scrollbar(root, command=ChatBox.yview, cursor="Arrow")
ChatBox['yscrollcommand'] = scrollbar.set

#Create Button to send message
SendButton = Button(root, font=("Verdana",12,'bold'), text="Send", width="11", height=5,
                    bd=0, bg="green", activebackground="#3c9d9b",fg='#000000',
                    command= send )

#Create the box to enter message
EntryBox = Text(root, bd=0, bg="white",width="50", height="5", font="Arial")
# Pressing Return in the entry box triggers send; Tk invokes bound callbacks with an event argument.
EntryBox.bind("<Return>", send)


#Place all components on the screen
# Absolute pixel placement: chat area on top, Send button and entry box side by side below it.
scrollbar.place(x=600,y=6, height=386)
ChatBox.place(x=6,y=6, height=386, width=600)
EntryBox.place(x=128, y=401, height=90, width=400)
SendButton.place(x=6, y=401, height=90)




# Enter the Tk event loop; this call blocks until the window is closed.
root.mainloop()


            