<a href="https://colab.research.google.com/github/raynerz/nlp/blob/main/Chatbot_Excercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chatbot

In [16]:
# Example partially based on: https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#Tfidfvectorizer-Usage

import nltk
nltk.download('punkt')
import numpy
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [98]:
#Preparing Data

sent_tokens = []


f = open('Switzerland.txt', mode='r')

for line in f.readlines():
  line = nltk.sent_tokenize(line)
  for a in line:
    sent_tokens.append(a)

sent_tokens.append("What happened in year 1200")  # add another sentence


In [99]:
## TF

cv=CountVectorizer()
word_count_vector=cv.fit_transform(sent_tokens)
print(word_count_vector.shape)




(654, 3376)


In [100]:
## IDF

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# print idf values
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(), columns=["idf_weights"])

print(df_idf)
 


            idf_weights
000            4.776585
003            6.791488
018            6.791488
049            6.791488
06             6.791488
...                 ...
züriputsch     6.791488
ˈzvittsera     6.791488
ˈʃvaɪts        6.791488
ˈʒviːtsrɐ      6.791488
ˈʒviːtsʁɐ      6.791488

[3376 rows x 1 columns]


In [101]:
## TF-IDF

tf_idf_vector=tfidf_transformer.transform(word_count_vector)  # computes tfidf as tf*idf

## Printing the results for the first document
feature_names = cv.get_feature_names()

# get tfidf vector for first document
first_document_vector = tf_idf_vector[0]

#print the scores
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

print(df)



            tfidf
000           0.0
003           0.0
018           0.0
049           0.0
06            0.0
...           ...
züriputsch    0.0
ˈzvittsera    0.0
ˈʃvaɪts       0.0
ˈʒviːtsrɐ     0.0
ˈʒviːtsʁɐ     0.0

[3376 rows x 1 columns]


In [102]:
## Comparing the newly added sentence to the existing sentences

# compare the first element from the right to the rest of the documents
vals = cosine_similarity(tf_idf_vector[-1], tf_idf_vector)
vals = vals.flatten()  # returns a copy of the array collapsed into one dimension
closest = numpy.amax(vals[:-1])  # skip last one, since it is itself (similarity = 1)
closestIndex = int(numpy.where(vals == numpy.amax(vals[:-1]))[0])  # index of the max element
print("Newly added sentence: ", sent_tokens[-1])
print("The closest sentence is: ", sent_tokens[closestIndex])

Newly added sentence:  What happened in year 1200
The closest sentence is:  In the 2000s, domestic and international institutions expressed concern about what was perceived as an increase in xenophobia, particularly in some political campaigns.


In [113]:
class Chatbot:
  sent_tokens = []
  word_count_vector = None
  tf_idf_vector = None

  def __init__(self,path="Switzerland.txt"):

    #Preparing Data
    f = open(path, mode='r')

    for line in f.readlines():
      line = nltk.sent_tokenize(line)
      for a in line:
        self.sent_tokens.append(a)

    f.close()


  def answer_question(self, question=""):
    self.sent_tokens.append(question)  # add another sentence

      # Calculating TF
    cv=CountVectorizer()
    self.word_count_vector=cv.fit_transform(self.sent_tokens)

    # Calculating the tdidf Vector
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(self.word_count_vector)
    self.tf_idf_vector=tfidf_transformer.transform(self.word_count_vector)
    
    ## Comparing the newly added sentence to the existing sentences

    # compare the first element from the right to the rest of the documents
    vals = cosine_similarity(self.tf_idf_vector[-1], self.tf_idf_vector)
    vals = vals.flatten()  # returns a copy of the array collapsed into one dimension
    closest = numpy.amax(vals[:-1])  # skip last one, since it is itself (similarity = 1)
    print("Response Cosine Similarity Score: " + str(closest))
    
    if closest < 0.4:
      return "Sorry, I can't understand your question"

    closestIndex = int(numpy.where(vals == numpy.amax(vals[:-1]))[0])  # index of the max element
    
    reply = self.sent_tokens[closestIndex]
    self.sent_tokens.pop()
    return reply

  def get_reply(self, message):
    if 'Hi' in message or 'Hey' in message:
        reply = "Hi there"
    else:
        reply = self.answer_question(message)
    print('Bot: ' + reply)

  def chatbot(self):
    print('Bot: Hi there, I will answers all your questions about Switzerland!! Ask me whatever you want')
    message = ""
    while ('bye' not in message):
      message = input('Human: ')
      if message not in 'bye':
        self.get_reply(message)
    print("Bot: Bye Human")


Chatbot().chatbot()


Bot: Hi there, I will answers all your questions about Switzerland!! Ask me whatever you want
Human: What is the most popular drink of Switzerland
Response Cosine Similarity Score: 0.601714765170884
Bot: The most popular alcoholic drink in Switzerland is wine.
Human: bye
Bot: Bye Human
