In [35]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization

from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [36]:
# Load the dialog dataset from the Excel workbook (path is relative to the working directory).
df=pd.read_excel('dialog_chatbot.xlsx')
df.head(20) # preview the first 20 rows

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
3,Describe yourself,
4,tell me about yourself,
5,all about you,
6,tell me some stuff about you,
7,talk some stuff about you,
8,talk about yourself,
9,about yourself,


In [37]:
df.shape[0] # returns the number of rows in dataset

1649

In [38]:
# Several consecutive questions share one answer, and the sheet leaves the answer cell
# blank on the follow-up rows (see the preview above), so each null is filled with the
# value from the row above it. Rebinding instead of inplace=True keeps the result
# explicit and avoids relying on in-place mutation.
df = df.ffill(axis=0)
df

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
3,Describe yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
4,tell me about yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
5,all about you,"Hello, I am the chatbot of Pentol Resto Medan...."
6,tell me some stuff about you,"Hello, I am the chatbot of Pentol Resto Medan...."
7,talk some stuff about you,"Hello, I am the chatbot of Pentol Resto Medan...."
8,talk about yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
9,about yourself,"Hello, I am the chatbot of Pentol Resto Medan...."


In [39]:
df1=df.head(10) # first ten rows of the dataset (head() result, no explicit .copy()), used as a small sample for the demo cell below

In [40]:
# function that converts text into lower case and removes special characters

def step1(x):
    """Print a cleaned version of every item in ``x``, one per line.

    Each item is converted with ``str()``, lower-cased, and every character
    other than ``a-z`` / ``0-9`` (spaces included) is replaced by a single
    space. Nothing is returned; the cleaned text is only written to stdout.
    """
    non_alnum = re.compile(r'[^a-z0-9]')
    for item in x:
        print(non_alnum.sub(' ', str(item).lower()))

In [41]:
step1(df1['Context'])

tell me about your personality
i want to know you better
define yourself
describe yourself
tell me about yourself
all about you
tell me some stuff about you
talk some stuff about you
talk about yourself
about yourself


In [42]:
 # word tokenizing
    
s='tell me about your personality' # sample sentence, also reused by the part-of-speech demo cell
words=word_tokenize(s) # split the sentence into word tokens
print(words)

['tell', 'me', 'about', 'your', 'personality']


In [43]:
lemma = wordnet.WordNetLemmatizer() # initializing the lemmatizer
lemma.lemmatize('absorbed', pos = 'v') # lemmatize the word as a verb

'absorb'

In [44]:
pos_tag(nltk.word_tokenize(s),tagset = None) # returns the parts of speech of every word

[('tell', 'VB'),
 ('me', 'PRP'),
 ('about', 'IN'),
 ('your', 'PRP$'),
 ('personality', 'NN')]

In [45]:
# function that performs text normalization steps

def text_normalization(text):
    """Return ``text`` as a lower-cased, lemmatized, space-joined sentence.

    Steps:
      1. ``str(text)`` is lower-cased.
      2. Every character that is not a space or ``a-z`` is deleted outright
         (not replaced by a space), so digits and punctuation disappear and
         e.g. an apostrophe simply fuses the letters around it.
      3. The remainder is tokenized and part-of-speech tagged.
      4. Each token is lemmatized with a WordNet POS code chosen from the
         first letter of its tag: ``V`` -> verb, ``J`` -> adjective,
         ``R`` -> adverb, anything else -> noun.
    """
    # First letter of the POS tag -> WordNet POS code; unlisted letters fall back to noun.
    pos_by_tag_initial = {'V': 'v', 'J': 'a', 'R': 'r'}

    cleaned = re.sub(r'[^ a-z]', '', str(text).lower())
    word_tokens = nltk.word_tokenize(cleaned)
    lemmatizer = wordnet.WordNetLemmatizer()
    tagged = pos_tag(word_tokens, tagset=None)

    return " ".join(
        lemmatizer.lemmatize(word, pos_by_tag_initial.get(tag[:1], 'n'))
        for word, tag in tagged
    )

In [46]:
text_normalization('telling you some stuff about me')

'tell you some stuff about me'

In [47]:
df['lemmatized_text']=df['Context'].apply(text_normalization) # applying the function to every question in the dataset to get clean text
df.tail(15) # show the last 15 rows, including the new column

Unnamed: 0,Context,Text Response,lemmatized_text
1634,What is the ingredients of Mie Goreng?,The basic ingredients of the Mie Goreng consis...,what be the ingredient of mie goreng
1635,What is the ingredients of Soto?,The basic ingredients of the Soto consist of:\...,what be the ingredient of soto
1636,What is the ingredients of Sup Ayam?,The basic ingredients of the Sup Ayam consist ...,what be the ingredient of sup ayam
1637,What is the ingredients of Mie Ayam?,The basic ingredients of the Mie Ayam consist ...,what be the ingredient of mie ayam
1638,What is the ingredients of Mie Ayam?,The basic ingredients of the Mie Ayam consist ...,what be the ingredient of mie ayam
1639,what are the nutritional facts of Pentol Ayam?,The nutritional facts of Pentol Ayam are as fo...,what be the nutritional fact of pentol ayam
1640,what are the nutritional facts of Pentol Puyuh?,The nutritional facts of Pentol Puyuh are as f...,what be the nutritional fact of pentol puyuh
1641,what are the nutritional facts of Pentol Sapi?,The nutritional facts of Pentol Sapi are as fo...,what be the nutritional fact of pentol sapi
1642,what are the nutritional facts of Bakso?,The nutritional facts of Bakso are as follows:...,what be the nutritional fact of bakso
1643,what are the nutritional facts of Siomay?,The nutritional facts of Siomay are as follows...,what be the nutritional fact of siomay


In [48]:
# all the stop words we have 

# NOTE(review): `stop` is only printed here; none of the later cells shown reference it,
# and the TfidfVectorizer below is created without a stop-word list. Confirm whether
# stop-word filtering was intended.
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# tf-idf

In [49]:
Question1 ='Tell me about yourself.'

In [50]:
# using tf-idf

tfidf=TfidfVectorizer() # initializing the tf-idf vectorizer with its default settings
x_tfidf=tfidf.fit_transform(df['lemmatized_text']).toarray() # fit on all normalized questions; .toarray() makes the result a dense array

In [51]:
Question_lemma1 = text_normalization(Question1) # normalize the query with the same function used for the dataset questions
Question_tfidf = tfidf.transform([Question_lemma1]).toarray() # applying the already-fitted tf-idf (transform only, no refit) to the single query

In [52]:
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2, so calling it raises AttributeError on current releases. Prefer
# get_feature_names_out() and fall back to the old name only where it is missing.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# One row per dataset question, one column per vocabulary term.
df_tfidf=pd.DataFrame(x_tfidf,columns=feature_names)
df_tfidf.head()

Unnamed: 0,abort,about,absolutely,abysmal,account,actually,address,adore,advice,advise,...,yeh,yep,yes,yet,you,your,youre,yours,yourself,yup
0,0.0,0.404803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.337976,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.220068,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642125,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642125,0.0
4,0.0,0.45286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.623363,0.0


SIMILARITY

In [53]:
cos=1-pairwise_distances(df_tfidf,Question_tfidf,metric='cosine')  # cosine similarity = 1 - cosine distance, between every dataset question and the query
cos

array([[ 0.54653468],
       [ 0.        ],
       [ 0.40027722],
       ..., 
       [ 0.        ],
       [ 0.        ],
       [ 0.        ]])

In [54]:
df['similarity_tfidf']=cos # store each question's similarity to the query as a new column on df
df_simi_tfidf = pd.DataFrame(df, columns=['Text Response','similarity_tfidf']) # keep only the response text and its similarity score
df_simi_tfidf 

Unnamed: 0,Text Response,similarity_tfidf
0,Just think of me as the ace up your sleeve.,0.546535
1,I can help you work smarter instead of harder,0.000000
2,"Hello, I am the chatbot of Pentol Resto Medan....",0.400277
3,"Hello, I am the chatbot of Pentol Resto Medan....",0.400277
4,"Hello, I am the chatbot of Pentol Resto Medan....",1.000000
5,"Hello, I am the chatbot of Pentol Resto Medan....",0.277750
6,"Hello, I am the chatbot of Pentol Resto Medan....",0.496083
7,"Hello, I am the chatbot of Pentol Resto Medan....",0.177876
8,"Hello, I am the chatbot of Pentol Resto Medan....",0.659519
9,"Hello, I am the chatbot of Pentol Resto Medan....",0.770496


In [55]:
df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False) # highest similarity first
df_simi_tfidf_sort.head(10) # ten best-matching responses

Unnamed: 0,Text Response,similarity_tfidf
4,"Hello, I am the chatbot of Pentol Resto Medan....",1.0
9,"Hello, I am the chatbot of Pentol Resto Medan....",0.770496
16,"Hello, I am the chatbot of Pentol Resto Medan....",0.758843
8,"Hello, I am the chatbot of Pentol Resto Medan....",0.659519
379,I should get one. It's all work and no play la...,0.575922
500,The virtual world is my playground. I'm always...,0.571904
1606,The menus available at Pentol Resto Medan are ...,0.567168
0,Just think of me as the ace up your sleeve.,0.546535
1600,Pentol Resto Medan is a restaurant that was fo...,0.503471
6,"Hello, I am the chatbot of Pentol Resto Medan....",0.496083


In [56]:
threshold = 0.2 # keep only responses whose similarity is strictly greater than 0.2
# NOTE(review): the lookup cells shown later pick the answer via cos.argmax() and never
# consult df_threshold, so a query with no good match still gets an answer. Confirm
# whether a low-similarity fallback was intended.
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold] 
df_threshold

Unnamed: 0,Text Response,similarity_tfidf
4,"Hello, I am the chatbot of Pentol Resto Medan....",1.0
9,"Hello, I am the chatbot of Pentol Resto Medan....",0.770496
16,"Hello, I am the chatbot of Pentol Resto Medan....",0.758843
8,"Hello, I am the chatbot of Pentol Resto Medan....",0.659519
379,I should get one. It's all work and no play la...,0.575922
500,The virtual world is my playground. I'm always...,0.571904
1606,The menus available at Pentol Resto Medan are ...,0.567168
0,Just think of me as the ace up your sleeve.,0.546535
1600,Pentol Resto Medan is a restaurant that was fo...,0.503471
6,"Hello, I am the chatbot of Pentol Resto Medan....",0.496083


In [57]:
index_value1 = cos.argmax() # position of the highest similarity; cos has a single column, so this is the row position
index_value1

4

In [58]:
Question1

'Tell me about yourself.'

In [59]:
# index_value1 comes from cos.argmax(), i.e. it is a row *position*, so it must be looked
# up positionally with .iloc. Label-based .loc only returns the same row while df still
# has its default 0..n-1 index.
df['Text Response'].iloc[index_value1]  # returns the response text at that position

'Hello, I am the chatbot of Pentol Resto Medan. I will help you get to know more about Pentol Resto Medan. Please ask me a question.'

# Model using tf-idf