<a href="https://colab.research.google.com/github/sam4410/Chatbot-from-Scratch-in-Python/blob/main/Chatbot_from_Scratch_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mounting the Google Drive
# The mount point must be the absolute path '/content/drive'; a relative
# 'content/drive' would be resolved against the current working directory
# instead of the location Colab uses for Drive.
from google.colab import drive
drive.mount('/content/drive')
# Folder on Drive that holds the chatbot assets (intents.json is read from here).
data_root = '/content/drive/My Drive/Chatbot'

In [5]:
# Import libraries
import json
import string
import random
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
# Fetch the NLTK resources this notebook relies on:
#   - "punkt" / "punkt_tab": tokenizer models used by nltk.word_tokenize
#     (newer NLTK releases load "punkt_tab" instead of the pickled "punkt"
#     models, so both are requested to keep the cell working on either).
#   - "wordnet": corpus used by WordNetLemmatizer.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Loading dataset
# Read intents.json from the Drive folder. The context manager closes the
# file handle as soon as the text has been read (the bare open().read() left
# it open until garbage collection), and the encoding is stated explicitly.
with open(data_root + '/intents.json', encoding='utf-8') as intents_file:
  data_file = intents_file.read()
data = json.loads(data_file)

In [4]:
# Creating data_X and data_Y
words = []   # for BoW model/Vocab for patterns
classes = [] # for BoW model/Vocab for tags
data_X = [] # for storing each pattern
data_Y = [] # for storing each tag corresponding to each pattern in data_X

# Iterating over all the intents.
# The top-level key is "intents", the same key get_response() reads from the
# loaded JSON. NOTE(review): confirm against the actual intents.json schema.
for intent in data['intents']:
  for pattern in intent['patterns']:
    tokens = nltk.word_tokenize(pattern)    # tokenize each pattern
    words.extend(tokens)                    # append pattern tokens to words list
    data_X.append(pattern)                  # appending patterns to data_X
    data_Y.append(intent['tag'])            # appending the corresponding tag to data_Y

    # adding the tag to classes (only once per distinct tag)
    if intent["tag"] not in classes:
      classes.append(intent["tag"])

# initialize lemmatizer to get stem of words
lemmatizer = WordNetLemmatizer()

# lemmatize all the words in vocab and convert them to lowercase,
# dropping tokens that are punctuation characters
words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in string.punctuation]

# de-duplicate, then sort the vocab and classes in alphabetical order so that
# their index positions are stable for the bag-of-words / one-hot encodings
words = sorted(set(words))
classes = sorted(set(classes))

In [6]:
# Making the data machine friendly by converting text into numbers using BoW model
training = []
out_empty = [0]*len(classes)

# creating the bag of words model
for idx, doc in enumerate(data_X):
  # Tokenize, lowercase and lemmatize the pattern exactly as the vocabulary
  # was built. Membership is then tested against whole tokens: a substring
  # test on the raw sentence would wrongly fire for e.g. "hi" inside "this",
  # and would disagree with the token-equality test in bag_of_words().
  doc_tokens = {lemmatizer.lemmatize(token.lower()) for token in nltk.word_tokenize(doc)}
  bow = [1 if word in doc_tokens else 0 for word in words]

  # mark the index of class that the current pattern is associated to
  output_row = list(out_empty)
  output_row[classes.index(data_Y[idx])] = 1

  # add the 1 hot encoded BoW and associated classes to training
  training.append([bow, output_row])

# shuffle the data and convert it into array
random.shuffle(training)
training = np.array(training, dtype=object)

# split the features and target labels
train_X = np.array(list(training[:, 0]))
train_Y = np.array(list(training[:, 1]))

In [7]:
# Building the neural network model
# Feed-forward classifier: BoW vector -> 128 -> 64 -> one softmax unit per class.
model = Sequential()
model.add(Dense(128, input_shape = (len(train_X[0]),), activation="relu"))
# Dropout is a layer that must be instantiated; `model.add(Dropout=0.5)`
# passes an unknown keyword argument to add() and fails.
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(len(train_Y[0]), activation="softmax"))
# NOTE(review): `decay` is a legacy optimizer argument; depending on the
# installed TensorFlow/Keras version it may be rejected or ignored - confirm.
adam = tf.keras.optimizers.Adam(learning_rate=0.01, decay=1e-6)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])
# summary() prints the layer table itself; print(model.summary) only showed
# the bound-method repr.
model.summary()
model.fit(x=train_X, y=train_Y, epochs=150, verbose=1)

In [8]:
# Preprocessing the input - User's query
def clean_text(text):
  """Tokenize a user query and reduce each token to its lowercase lemma.

  Args:
    text: The raw query string.

  Returns:
    A list of lemmatized, lowercased tokens, in the order they appear.
  """
  # nltk.word_tokenize is the tokenizer entry point (`nltk.word.tokenize`
  # does not exist and raised AttributeError).
  tokens = nltk.word_tokenize(text)
  # Lowercase before lemmatizing so tokens compare equal to the vocabulary,
  # which is built from lowercased lemmas.
  return [lemmatizer.lemmatize(token.lower()) for token in tokens]

def bag_of_words(text, vocab):
  """Encode `text` as a binary bag-of-words vector over `vocab`.

  Args:
    text: The raw query string; it is tokenized via clean_text().
    vocab: Sequence of vocabulary words; position i of the result
      corresponds to vocab[i].

  Returns:
    A numpy array with one entry per vocabulary word: 1 where that word
    equals one of the cleaned tokens, 0 otherwise.
  """
  present = set(clean_text(text))
  encoded = [1 if entry in present else 0 for entry in vocab]
  return np.array(encoded)

def pred_class(text, vocab, labels, thresh=0.5):
  """Predict the intent tags for a user query.

  Args:
    text: The raw query string.
    vocab: Vocabulary used to build the bag-of-words vector.
    labels: Class labels; labels[i] names the model's i-th output unit.
    thresh: Minimum probability (exclusive) a class must reach to be
      returned. Defaults to 0.5, the value previously hard-coded.

  Returns:
    A list of labels whose predicted probability exceeds `thresh`, ordered
    from most to least probable. Empty when no class clears the threshold.
  """
  bow = bag_of_words(text, vocab)
  # verbose=0 keeps Keras' per-call progress bar out of the chat transcript.
  probabilities = model.predict(np.array([bow]), verbose=0)[0]
  ranked = sorted(
    ((idx, prob) for idx, prob in enumerate(probabilities) if prob > thresh),
    key=lambda pair: pair[1],
    reverse=True,
  )
  return [labels[idx] for idx, _ in ranked]

def get_response(intents_list, intents_json):
  """Pick a reply for the top predicted intent.

  Args:
    intents_list: Predicted tags, most probable first (may be empty).
    intents_json: Parsed intents data; its "intents" entry is a list of
      dicts that each carry a "tag" and a list of "responses".

  Returns:
    A randomly chosen response for the first tag in `intents_list`, or the
    fallback string when the list is empty or the tag is not found.
  """
  fallback = "Sorry! I don't understand"
  # Previously the `return` sat inside the else-branch, so this case fell
  # through and returned None instead of the fallback message.
  if len(intents_list) == 0:
    return fallback

  tag = intents_list[0]
  for intent in intents_json["intents"]:
    if intent["tag"] == tag:
      return random.choice(intent["responses"])
  # No intent carries this tag: answer with the fallback rather than failing
  # on an unbound result variable.
  return fallback

`clean_text(text)`: This function receives text (a string) as input and tokenizes it using `nltk.word_tokenize()`. Each token is then converted into its root form using a lemmatizer. The output is a list of words in their root form.

`bag_of_words(text, vocab)`: This function calls `clean_text()`, encodes the resulting tokens as a bag-of-words array over the input vocabulary (1 where a vocabulary word occurs in the text, 0 otherwise), and returns that array.

`pred_class(text, vocab, labels)`: This function takes text, vocab, and labels as input and returns a list of the tags whose predicted probability exceeds the threshold, ordered from the highest probability to the lowest.

`get_response(intents_list, intents_json)`: This function takes the list of tags returned by the previous function and uses its first (most probable) tag to randomly choose a response for that tag from intents.json. If `intents_list` is empty, that is, when no probability crosses the threshold, the string "Sorry! I don't understand" is used as the chatbot's response.

In [9]:
# Interactive session: read a message, predict its intent, print the reply.
# Typing "0" ends the conversation.
print("Press 0 if you don't want to chat with our Chatbot")
message = input("")
while message != "0":
  intents = pred_class(message, words, classes)
  result = get_response(intents, data)
  print(result)
  message = input("")