In [1]:
# Building a conversational chatbot that answers a store's customer questions with TF-IDF
# Reference: "Python Deep Learning Projects", M. Lamons, R. Kumar, A. Nagaraja

In [2]:
#step 1: prepare the dataset and preprocessing

In [3]:
import pandas as pd
import numpy as np
import operator ,os
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Read the question/answer pairs from the CSV file into a DataFrame and print them.
# NOTE: despite its name, csv_reader is the DataFrame returned by pd.read_csv.
filepath = 'sample_data.csv'
csv_reader = pd.read_csv(filepath)
print(csv_reader)

                            Question  \
0          When does your shop open?   
1           What is today's special?   
2  What is the cost of an americano?   
3            Do you sell Ice-creams?   

                                              Answer  
0  Our shop timings are 9:00 am - 9:00 pm on week...  
1  Today we have variety of Italian pasta, with s...  
2  Americano with a single shot will cost 1.4$ an...  
3  We do have desserts like ice-cream, brownies, ...  


In [5]:
# Split the frame into parallel lists by column position: the first column holds
# the questions and the second the answers, so index i in one list pairs with
# index i in the other.
question_list = csv_reader.iloc[:, 0].to_numpy().tolist()
answers_list = csv_reader.iloc[:, 1].to_numpy().tolist()

In [6]:
# Show the extracted questions to confirm the first column was read as expected.
print(question_list)

['When does your shop open?', "What is today's special?", 'What is the cost of an americano?', 'Do you sell Ice-creams?']


In [7]:
# Sample customer query that will be matched against the stored questions.
query ='can I get an Americano, btw how much it will cost ?'

In [8]:
# Create the TF-IDF vectorizer over word n-grams of length 2 to 4, with accents
# stripped and each row L2-normalised.
# min_df must be an int >= 1 or a float in [0.0, 1.0]; recent scikit-learn
# releases reject the integer 0 when fitting. min_df=1 still keeps every n-gram,
# so the resulting vocabulary is the same as before.
vectorizer = TfidfVectorizer(
    min_df=1,
    ngram_range=(2, 4),
    strip_accents='unicode',
    norm='l2',
    encoding='ISO-8859-1',
)
print(vectorizer)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='ISO-8859-1',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0, ngram_range=(2, 4), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents='unicode',
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)


In [9]:
#step 2: train the model on the questions

In [10]:
# Learn the n-gram vocabulary and IDF weights from the stored questions and
# encode them as a sparse matrix with one row per question.
# The questions are already plain strings, so they are passed straight to the
# vectorizer: ''.join() on a single string returns it unchanged and the extra
# NumPy array wrapper added nothing.
X_train = vectorizer.fit_transform(question_list)
print(X_train)

  (0, 9)	0.3333333333333333
  (0, 32)	0.3333333333333333
  (0, 37)	0.3333333333333333
  (0, 8)	0.3333333333333333
  (0, 31)	0.3333333333333333
  (0, 20)	0.3333333333333333
  (0, 36)	0.3333333333333333
  (0, 7)	0.3333333333333333
  (0, 30)	0.3333333333333333
  (1, 29)	0.4217647821447532
  (1, 15)	0.4217647821447532
  (1, 28)	0.4217647821447532
  (1, 24)	0.4217647821447532
  (1, 14)	0.4217647821447532
  (1, 25)	0.3325241986862672
  (2, 3)	0.26151864623057924
  (2, 23)	0.26151864623057924
  (2, 13)	0.26151864623057924
  (2, 27)	0.26151864623057924
  (2, 17)	0.26151864623057924
  (2, 2)	0.26151864623057924
  (2, 22)	0.26151864623057924
  (2, 12)	0.26151864623057924
  (2, 26)	0.26151864623057924
  (2, 0)	0.26151864623057924
  (2, 16)	0.26151864623057924
  (2, 1)	0.26151864623057924
  (2, 21)	0.26151864623057924
  (2, 11)	0.26151864623057924
  (2, 25)	0.20618430452425712
  (3, 35)	0.3333333333333333
  (3, 6)	0.3333333333333333
  (3, 19)	0.3333333333333333
  (3, 34)	0.3333333333333333
  (3, 5

In [11]:
# step 3: transform the query with the fitted vectorizer

In [12]:
# Encode the query with the vectorizer that was already fitted on the questions
# (transform only, no re-fitting). The printed output shows a single non-zero
# entry for this query.
X_query=vectorizer.transform([query])
print(X_query)

  (0, 0)	1.0


In [13]:
#step 4: computing similarity score for the query

In [14]:
# Score every stored question against the query: one dot product per row of X_train.
# The vectorizer was created with norm='l2', so each dot product is the cosine
# similarity between a question and the query.
# The product is taken on the sparse matrices and only the small result column is
# converted to a dense ndarray; this avoids densifying the whole training matrix
# and the legacy np.matrix type that todense() returns.
XX_similarity = X_train.dot(X_query.transpose()).toarray()
XX_sim_scores = XX_similarity.flatten().tolist()
print(XX_sim_scores)

[0.0, 0.0, 0.26151864623057924, 0.0]


In [15]:
#step 5: ranking results

In [16]:
# Map each question's row index to its similarity score, then order the
# (index, score) pairs from highest to lowest score. The sort is stable, so
# questions with equal scores keep their original row order.
dict_sim = {row_index: score for row_index, score in enumerate(XX_sim_scores)}
sorted_dict_sim = sorted(dict_sim.items(), key=lambda pair: pair[1], reverse=True)

In [17]:
# step 6: retrieve the answer result

In [18]:
# Pick the response for the best-ranked question.
# sorted_dict_sim[0] is the (row index, score) pair with the highest similarity;
# the row index selects the matching entry in answers_list.
best_index, best_score = sorted_dict_sim[0]
if best_score > 0:
    resp = answers_list[best_index]
else:
    # Any non-positive (or NaN) score falls back to the apology, so resp is
    # always assigned and never left over from an earlier run.
    resp = "Sorry I have no answer, please try asking again in a nicer way :)"
print(resp)

Americano with a single shot will cost 1.4$ and the double shot will cost 2.3$.


In [19]:
# Second example: run the same retrieval pipeline end to end for another query.
query ='do you have fruits ?'

# Encode the query with the already-fitted vectorizer.
X_query = vectorizer.transform([query])

# Dot product of every stored question with the query. The product is taken on
# the sparse matrices and only the small result column is made dense, instead of
# densifying both operands into legacy np.matrix objects.
XX_similarity = X_train.dot(X_query.transpose()).toarray()
XX_sim_scores = XX_similarity.flatten().tolist()

# Rank (row index, score) pairs from highest to lowest score.
dict_sim = dict(enumerate(XX_sim_scores))
sorted_dict_sim = sorted(dict_sim.items(), key=operator.itemgetter(1), reverse=True)

# NOTE(review): any positive score is accepted as a match. The recorded output for
# this fruit query is the desserts answer, so a minimum-score threshold may be
# worth considering.
best_index, best_score = sorted_dict_sim[0]
if best_score > 0:
    resp = answers_list[best_index]
else:
    # Non-positive (or NaN) scores fall back to the apology so resp is always set.
    resp = "Sorry I have no answer, please try asking again in a nicer way :)"
print(resp)

We do have desserts like ice-cream, brownies, and pastries.
