In [1]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
#from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from sklearn.metrics.pairwise import cosine_similarity
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [4]:
df=pd.read_excel('dialog_chatbot.xlsx')
df.head(20)

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
3,Describe yourself,
4,tell me about yourself,
5,all about you,
6,tell me some stuff about you,
7,talk some stuff about you,
8,talk about yourself,
9,about yourself,


In [5]:
df.shape[0] # returns the number of rows in dataset

1649

In [6]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

In [7]:
df.ffill(axis = 0,inplace=True) # fills the null value with the previous value.
df

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
3,Describe yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
4,tell me about yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
5,all about you,"Hello, I am the chatbot of Pentol Resto Medan...."
6,tell me some stuff about you,"Hello, I am the chatbot of Pentol Resto Medan...."
7,talk some stuff about you,"Hello, I am the chatbot of Pentol Resto Medan...."
8,talk about yourself,"Hello, I am the chatbot of Pentol Resto Medan...."
9,about yourself,"Hello, I am the chatbot of Pentol Resto Medan...."


In [8]:
def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    tags_list=pos_tag(tokens,tagset=None)
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    for token,pos_token in tags_list:
        if pos_token.startswith('V'):  # Verb
            pos_val='v'
        elif pos_token.startswith('J'): # Adjective
            pos_val='a'
        elif pos_token.startswith('R'): # Adverb
            pos_val='r'
        else:
            pos_val='n' # Noun
        lema_token=lemmatizer.lemmatize(token,pos_val) # performing lemmatization
        lemma_words.append(lema_token) # appending the lemmatized token into a list
    return " ".join(filtered_words)


In [9]:
df['lemmatized_text']=df['Context'].apply(preprocess) # applying the fuction to the dataset to get clean text
df.tail(15)

Unnamed: 0,Context,Text Response,lemmatized_text
1634,What is the ingredients of Mie Goreng?,The basic ingredients of the Mie Goreng consis...,ingredients mie goreng
1635,What is the ingredients of Soto?,The basic ingredients of the Soto consist of:\...,ingredients soto
1636,What is the ingredients of Sup Ayam?,The basic ingredients of the Sup Ayam consist ...,ingredients sup ayam
1637,What is the ingredients of Mie Ayam?,The basic ingredients of the Mie Ayam consist ...,ingredients mie ayam
1638,What is the ingredients of Mie Ayam?,The basic ingredients of the Mie Ayam consist ...,ingredients mie ayam
1639,what are the nutritional facts of Pentol Ayam?,The nutritional facts of Pentol Ayam are as fo...,nutritional facts pentol ayam
1640,what are the nutritional facts of Pentol Puyuh?,The nutritional facts of Pentol Puyuh are as f...,nutritional facts pentol puyuh
1641,what are the nutritional facts of Pentol Sapi?,The nutritional facts of Pentol Sapi are as fo...,nutritional facts pentol sapi
1642,what are the nutritional facts of Bakso?,The nutritional facts of Bakso are as follows:...,nutritional facts bakso
1643,what are the nutritional facts of Siomay?,The nutritional facts of Siomay are as follows...,nutritional facts siomay


# tf-idf

In [10]:
# using tf-idf

tfidf=TfidfVectorizer() # intializing tf-id 
x_tfidf=tfidf.fit_transform(df['lemmatized_text']).toarray()

In [11]:
df_tfidf=pd.DataFrame(x_tfidf,columns=tfidf.get_feature_names()) 
df_tfidf.head()

Unnamed: 0,abort,absolutely,abysmal,account,actually,address,adore,advice,advise,affirmative,...,wrong,yap,yea,yeah,years,yeh,yep,yes,yet,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


SIMILARITY

In [13]:
cos=1-pairwise_distances(df_tfidf,x_tfidf,metric='cosine')  # applying cosine similarity
cos

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.50060585,
        0.51585297],
       [0.        , 0.        , 0.        , ..., 0.50060585, 1.        ,
        0.69825137],
       [0.        , 0.        , 0.        , ..., 0.51585297, 0.69825137,
        1.        ]])

In [17]:
df['similarity_tfidf']=cos # creating a new column 
df_simi_tfidf = pd.DataFrame(df, columns=['Text Response','similarity_tfidf']) # taking similarity value of responses for the question we took
df_simi_tfidf 

ValueError: Wrong number of items passed 1649, placement implies 1

In [18]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    lemma=preprocess(text) # calling the function to perform text normalization
    tf=tfidf.transform([lemma]).toarray() # applying tf-idf
    cos=1-pairwise_distances(df_tfidf,tf,metric='cosine') # applying cosine similarity
    index_value=cos.argmax() # getting index value 
    return df['Text Response'].loc[index_value]

In [19]:
chat_tfidf('hi')

'Just think of me as the ace up your sleeve.'

In [36]:
chat_tfidf('how are you')

'Just think of me as the ace up your sleeve.'

In [37]:
chat_tfidf('how about the operation hour?')

'Pentol Resto Medan opens at 10.00 WIB and closes at 22.00 WIB every day.'