In [5]:
# Imports and one-time NLP setup for the intent-classifier training cells.
import os
import nltk
# Fetch NLTK resources at import time; presumably 'punkt' backs
# nltk.word_tokenize and 'wordnet' backs WordNetLemmatizer -- TODO confirm.
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance shared by the vocabulary and bag-of-words steps below.
lemmatizer = WordNetLemmatizer()
import json
import pickle

import numpy as np
# NOTE(review): layers/models come from `keras` while the optimizer comes from
# `tensorflow.keras`; confirm both resolve to the same Keras installation.
# `Activation` is not referenced in the cells visible here.
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD
import random

# Directory that holds the intents JSON and receives the pickled artifacts.
intents_path = '/content/drive/MyDrive/Colab Notebooks/chatbot-flask-simple/data/intents'

# inference model variables
inference_load_intents_from = os.path.join(intents_path, 'intents_job_intents.json')

words = []        # every token seen in any pattern (lemmatized/deduplicated below)
classes = []      # distinct intent tags
documents = []    # (token list, intent tag) pairs, one per pattern
ignore_words = ['?', '!']

# Read the intents file inside a context manager so the handle is always
# closed (the previous bare open(...).read() leaked it).
with open(inference_load_intents_from, encoding='cp1252') as intents_file:
    data_file = intents_file.read()
intents = json.loads(data_file)

for intent in intents['intents']:
    for pattern in intent['patterns']:
        # Tokenize each example utterance and remember which tag it belongs to.
        w = nltk.word_tokenize(pattern)
        words.extend(w)

        documents.append((w, intent['tag']))

        if intent['tag'] not in classes:
            classes.append(intent['tag'])

# Drop the ignored punctuation tokens, then lowercase + lemmatize and
# collapse to a sorted, duplicate-free vocabulary.
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(set(words))

classes = sorted(set(classes))

print(len(documents), 'documents')
print(len(classes), 'classes', classes)
print(len(words), 'unique lemmatized words', words)

# Persist vocabulary and class list next to the intents file; `with` makes
# sure each pickle is flushed and closed before the next step runs.
with open(os.path.join(intents_path, 'intents_words.pkl'), 'wb') as words_file:
    pickle.dump(words, words_file)
with open(os.path.join(intents_path, 'intents_classes.pkl'), 'wb') as classes_file:
    pickle.dump(classes, classes_file)

# init training data
from sklearn.model_selection import train_test_split

# Each entry of `training` is a [bag, output_row] pair:
#   bag        - 0/1 vector over `words` marking which vocabulary items occur
#                in the (lemmatized, lowercased) pattern
#   output_row - one-hot vector over `classes` for the pattern's intent tag
training = []
output_empty = [0] * len(classes)
for doc in documents:
    # A set gives O(1) membership checks while building the bag.
    pattern_words = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}
    bag = [1 if w in pattern_words else 0 for w in words]

    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# NOTE(review): this shuffle is unseeded, so the split below is not
# reproducible across runs even though train_test_split uses random_state=1.
random.shuffle(training)

# create train and test lists.  X - patterns, y - intents
# The pairs are unzipped directly instead of going through np.array(training):
# bag and output_row have different lengths, and NumPy >= 1.24 refuses to
# build an array from such ragged rows (ValueError: inhomogeneous shape).
train_x = [features for features, _ in training]
train_y = [label for _, label in training]
print('Training data created')
print(f'Train_x: {train_x}')
print(f'Train_y: {train_y}')

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y,
                                                    test_size=0.25,
                                                    random_state=1)
print(f'X_train: {X_train}')
print(f'X_train.shape: {len(X_train)}')
print(f'y_train: {y_train}')
print(f'y_train.shape: {len(y_train)}')
print(f' X_test:  {X_test}')
print(f' X_test.shape: {len(X_test)}')
print(f' y_test:  {y_test}')
print(f' y_test.shape: {len(y_test)}')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
61 documents
11 classes ['compliment', 'feeling', 'goodbye', 'greeting', 'manager', 'name', 'package_tracking', 'profane', 'return_product', 'thanks', 'tracking']
88 unique lemmatized words ["'", "'s", ',', '.', '0983834298342341', '1234509873234323', '1983-2343-2343-2343', 'a', 'am', 'are', 'back', 'broken', 'bye', 'can', 'computer', 'dead', 'device', 'die', 'do', 'doing', 'drop', 'equipment', 'feel', 'feeling', 'fix', 'fixed', 'get', 'give', 'go', 'going', 'good', 'goodbye', 'hate', 'have', 'hear', 'hello', 'help', 'helpful', 'here', 'hi', 'how', 'i', 'is', 'it', 'later', 'like', 'long', 'manager', 'me', 'money', 'my', 'name', 'need', 'not', 'now', 'package', 'product', 'refund', 'report', 'return', 'see', 'shipment', 'some', 'something', 'speak', 'speaking', 's



In [10]:


# Create model with 3 layers.  First layer 128 neurons, second layer 64 neurons
# and 3rd output layer contains number of neurons equal to number of intents to
# predict
# output intent with softmax
#
# The input is a fixed-length bag-of-words vector, so the network starts
# directly with a Dense layer. The dangling `model.add(Embedding` line was
# removed: it was an unclosed call (SyntaxError) to a name that is never
# imported.
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

# Compile model.  Stochastic gradient descent with Nesterov accelerated
# gradient gives good
# results for this model
# NOTE(review): the `decay` argument is rejected or ignored by newer Keras
# optimizers; confirm against the installed version before upgrading.
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

# fitting and saving the model
hist = model.fit(np.array(X_train), np.array(y_train),
                 epochs=1000, batch_size=5, verbose=1,
                 validation_data=(np.array(X_test), np.array(y_test)))
# Only the target path is passed to save(): the training History object was
# previously handed over as a second positional argument, where it was
# interpreted as the `overwrite` flag rather than being stored.
model.save(os.path.join(intents_path, 'intents_chatbot_model.h5'))

print('model created')



Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E