In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import random

import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import json
import pickle

In [None]:
# Accumulators filled in while the intents file is parsed further down.
words = []                  # every token seen across all patterns
classes = []                # distinct intent tags
documents = []              # (token list, intent tag) pairs
ignore_words = ['?', '!']   # tokens excluded from the vocabulary
# Read the intent definitions. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('data.json') as intents_file:
    data = intents_file.read()
intents = json.loads(data)

In [None]:
# Fetch the Punkt tokenizer data that nltk.word_tokenize relies on below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Walk every pattern of every intent, collecting the tokens, the labelled
# documents and the list of distinct intent tags.
for intent in intents['intents']:
    for pattern in intent['patterns']:

        # split the pattern into tokens and grow the running vocabulary
        tokens = nltk.word_tokenize(pattern)
        words += tokens
        # remember which intent this tokenized pattern belongs to
        tag = intent['tag']
        documents.append((tokens, tag))

        # register the tag the first time it is seen
        if tag in classes:
            continue
        classes.append(tag)

In [None]:
# Fetch the WordNet corpus and the Open Multilingual WordNet data
# used by the WordNetLemmatizer created in the imports cell.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
!pip freeze

absl-py==1.3.0
aeppl==0.0.33
aesara==2.7.9
aiohttp==3.8.3
aiosignal==1.3.1
alabaster==0.7.12
albumentations==1.2.1
altair==4.2.0
appdirs==1.4.4
arviz==0.12.1
astor==0.8.1
astropy==4.3.1
astunparse==1.6.3
async-timeout==4.0.2
atari-py==0.2.9
atomicwrites==1.4.1
attrs==22.2.0
audioread==3.0.0
autograd==1.5
Babel==2.11.0
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==5.0.1
blis==0.7.9
bokeh==2.3.3
branca==0.6.0
bs4==0.0.1
CacheControl==0.12.11
cachetools==5.2.0
catalogue==2.0.8
certifi==2022.12.7
cffi==1.15.1
cftime==1.6.2
chardet==4.0.0
charset-normalizer==2.1.1
click==7.1.2
clikit==0.6.2
cloudpickle==1.5.0
cmake==3.22.6
cmdstanpy==1.0.8
colorcet==3.0.1
colorlover==0.3.0
community==1.0.0b1
confection==0.0.3
cons==0.4.5
contextlib2==0.5.5
convertdate==2.4.0
crashtest==0.3.1
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.3.0
cvxpy==1.2.2
cycler==0.11.0
cymem==2.0.7
Cython==0.29.32
daft==0.0.4
dask==2022.2.1
datascience==0.17.5
db-dtypes==1.0.5
debugpy==1.0.0
decorator==4.4.2
defusedxml==0.7.1
desc

In [None]:
# Lemmatize and lowercase each token, drop the ignored punctuation, then
# deduplicate and sort so the vocabulary order is stable.
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(set(words))
# sort classes
classes = sorted(set(classes))
# documents = combination between patterns and intents
print (len(documents), "documents")
# classes = intents
print (len(classes), "classes", classes)
# words = all words, vocabulary
print (len(words), "unique lemmatized words", words)
# Persist the vocabulary and class list. The context managers guarantee the
# pickle files are flushed and closed before any later cell reads them back.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

140 documents
14 classes ['Asking about car colors', 'Asking about car delivery', 'Asking about car parts', 'Asking about car warranties', 'about', 'available banks', 'can I credit', 'car', 'goodbye', 'greeting', 'how to credit', 'payment option', 'payment process', 'thanks']
129 unique lemmatized words ["'m", "'s", ',', 'a', 'about', 'accept', 'aftermarket', 'an', 'any', 'are', 'available', 'bank', 'black', 'blue', 'bye', 'can', 'car', 'card', 'color', 'colour', 'come', 'cover', 'credit', 'custom', 'dealership', 'debit', 'deliver', 'delivered', 'delivering', 'delivery', 'describe', 'discount', 'do', 'doe', 'engine', 'enter', 'extended', 'financing', 'for', 'form', 'goodbye', 'gray', 'green', 'have', 'hay', 'hello', 'help', 'helpful', 'hey', 'hi', 'home', 'hood', 'how', 'i', 'in', 'information', 'interested', 'into', 'is', 'it', 'job', 'later', 'list', 'looking', 'make', 'me', 'mod', 'modification', 'my', 'nationwide', 'need', 'oem', 'of', 'offer', 'ok', 'okay', 'on', 'online', 'option

In [None]:
# initializing our training data
training = []
output_empty = [0] * len(classes)
for doc in documents:
    # initialize bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # lemmatize each word
    pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words]
    # create our bag of words array with 1, if word match found in current pattern
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])
# here shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training)
# create train and test lists. X_patterns, Y_intents
train_x = list(training[:,0])
train_y = list(training[:,1])
print("Training data has created")

Training data has created


  training = np.array(training)


In [None]:
# Feed-forward classifier: 128-unit and 64-unit ReLU layers, each followed by
# dropout, then a softmax output with one unit per intent.
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

# Compile with stochastic gradient descent using Nesterov momentum.
# `learning_rate` replaces the deprecated `lr` keyword (same value).
# NOTE(review): `decay` is also rejected by newer Keras optimizers; it is kept
# here to preserve the training schedule - confirm against the installed version.
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

# Fit and save the model.
hist = model.fit(np.array(train_x), np.array(train_y), epochs=10000, batch_size=10, verbose=1)
# Model.save takes `overwrite` as its second positional parameter, so the
# History object must not be passed there; the training history is not stored
# in the saved file either way.
model.save('chatbot_model.h5')

print("model created")

  super(SGD, self).__init__(name, **kwargs)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 7502/10000
Epoch 7503/10000
Epoch 7504/10000
Epoch 7505/10000
Epoch 7506/10000
Epoch 7507/10000
Epoch 7508/10000
Epoch 7509/10000
Epoch 7510/10000
Epoch 7511/10000
Epoch 7512/10000
Epoch 7513/10000
Epoch 7514/10000
Epoch 7515/10000
Epoch 7516/10000
Epoch 7517/10000
Epoch 7518/10000
Epoch 7519/10000
Epoch 7520/10000
Epoch 7521/10000
Epoch 7522/10000
Epoch 7523/10000
Epoch 7524/10000
Epoch 7525/10000
Epoch 7526/10000
Epoch 7527/10000
Epoch 7528/10000
Epoch 7529/10000
Epoch 7530/10000
Epoch 7531/10000
Epoch 7532/10000
Epoch 7533/10000
Epoch 7534/10000
Epoch 7535/10000
Epoch 7536/10000
Epoch 7537/10000
Epoch 7538/10000
Epoch 7539/10000
Epoch 7540/10000
Epoch 7541/10000
Epoch 7542/10000
Epoch 7543/10000
Epoch 7544/10000
Epoch 7545/10000
Epoch 7546/10000
Epoch 7547/10000
Epoch 7548/10000
Epoch 7549/10000
Epoch 7550/10000
Epoch 7551/10000
Epoch 7552/10000
Epoch 7553/10000
Epoch 7554/10000
Epoch 7555/10000
Epoch 7556/10000


In [None]:
#@title Cleaning
# Reload the trained model and the preprocessing artifacts from disk.
from keras.models import load_model
model = load_model('chatbot_model.h5')
import json
import random
# Context managers close each file handle as soon as it has been read.
with open('data.json') as intents_file:
    intents = json.load(intents_file)
# NOTE: pickle.load can execute arbitrary code from the file. That is acceptable
# only because words.pkl and classes.pkl are written by this notebook itself;
# never point these at files from an untrusted source.
with open('words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)
with open('classes.pkl', 'rb') as classes_file:
    classes = pickle.load(classes_file)

In [None]:
# Bundle the saved model and the pickled class/vocabulary lists into zip archives.
!zip -r chat_model.zip chatbot_model.h5
!zip -r class.zip classes.pkl
!zip -r words.zip words.pkl

  adding: chatbot_model.h5 (deflated 28%)
  adding: classes.pkl (deflated 34%)
  adding: words.pkl (deflated 45%)
