# Download Embeddings

In [23]:
import os
import tqdm
import requests
import zipfile

URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'

def fetch_data(url=URL, target_file='/content/drive/MyDrive/Colab Notebooks/chatbot-flask-simple/embeddings/glove.zip', delete_zip=False):
    # if dataset exists exit
    if os.path.isfile(target_file):
        print('datasets already downloaded')
        return

        #download (large) zip file
    #for large https request on stream mode to avoid out of memory issues
    #see : http://masnun.com/2016/09/18/python-using-the-requests-module-to-download-large-files-efficiently.html
    print("**************************")
    print("  Downloading zip file")
    print("  >_<  Please wait >_< ")
    print("**************************")
    response = requests.get(url, stream=True)
    #read chunk by chunk
    handle = open(target_file, "wb")
    for chunk in tqdm.tqdm(response.iter_content(chunk_size=512)):
        if chunk:  
            handle.write(chunk)
    handle.close()  
    print("  Download completed ;) :") 
    #extract zip_file
    zf = zipfile.ZipFile(target_file)
    print("1. Extracting {} file".format(target_file))
    zf.extractall(path='/content/drive/MyDrive/Colab Notebooks/chatbot-flask-simple/embeddings')
    if delete_zip:
        print("2. Deleting {} file".format(dataset_name+".zip"))
        os.remove(path=zip_file)

fetch_data()

**************************
  Downloading zip file
  >_<  Please wait >_< 
**************************


4251502it [06:51, 10337.96it/s]


  Download completed ;) :
1. Extracting /content/drive/MyDrive/Colab Notebooks/chatbot-flask-simple/embeddings/glove.zip file


# Imports

In [143]:
import os
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import json
import pickle

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD
import random
from sklearn import preprocessing


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Set Variables

In [163]:
MODEL_NAME1 = 'best_model_scratch.h5'
MODEL_NAME2 = 'best_model_pretrained.h5'
model_path = '/content/drive/MyDrive/Colab Notebooks/chatbot-flask-simple/models'

model_path_scratch = os.path.join(model_path, MODEL_NAME1)
model_path_pretrained = os.path.join(model_path, MODEL_NAME2)

intents_path = '/content/drive/MyDrive/Colab Notebooks/chatbot-flask-simple/data/intents'

# inference model variables
inference_load_intents_from = os.path.join(intents_path, 'intents_job_intents.json')

words = []
tags = []
documents = []
all_patterns = []
all_tags = []
label_encoded_Y = []
x_tr_seq = []
x_val_seq = []
y_tr = []
y_val = []
ignore_words = ['?', '!']

# Load JSON

In [164]:
data_file = open(inference_load_intents_from, encoding='cp1252').read()
intents = json.loads(data_file)

## Read in patterns and tags

Patterns are the user input (i.e., 'Hi,' 'How are you?').

Nothing is tokenized here.

In [165]:
# print classes
for intent in intents['intents']:
    all_patterns.extend(intent['patterns'])
    for pattern in intent['patterns']:
        all_tags.append(intent['tag'])

print(all_tags)
print(all_patterns)

['greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'feeling', 'feeling', 'feeling', 'feeling', 'feeling', 'feeling', 'goodbye', 'goodbye', 'goodbye', 'goodbye', 'goodbye', 'compliment', 'thanks', 'thanks', 'name', 'name', 'name', 'name', 'manager', 'manager', 'manager', 'manager', 'manager', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'return_product', 'package_tracking', 'package_tracking', 'package_tracking', 'package_tracking', 'package_tracking', 'package_tracking', 'profane', 'profane', 'profane', 'tracking', 'tracking', 'tracking']
['Hello', 'Hi', 'Good to see you.', 'Hello, there!', 'Can you hear me?', 'Where are you?', 'How are you today?', 'How are you doing toda

## Encode Tags



### Fit

In [168]:
# create label encoder
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# fit on all tags from JSON file
le.fit(all_tags)
print(f'Number of classes: {len(list(le.classes_))}')

Number of classes: 11


### Transform

In [169]:
label_encoded_Y = le.transform(all_tags)
print(f'Label_encoded_Y: {label_encoded_Y}')
print(f'Label_encoded_Y bincount: {np.bincount(label_encoded_Y)}')

Label_encoded_Y: [ 3  3  3  3  3  3  1  1  1  1  1  1  2  2  2  2  2  0  9  9  5  5  5  5
  4  4  4  4  4  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
  8  6  6  6  6  6  6  7  7  7 10 10 10]
Label_encoded_Y bincount: [ 1  6  5  6  5  4  6  3 20  2  3]


## Create x_all, y_all

In [170]:
X_all = np.asarray(all_patterns)
y_all = np.asarray(label_encoded_Y)
print(f'X all shape: {X_all.shape}')
print(f'Y all shape: {y_all.shape}')

X all shape: (61,)
Y all shape: (61,)


# Tokenize

In [171]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(X_all))
X_all_seq = tokenizer.texts_to_sequences(X_all)
print(X_all_seq)
print(type(X_all_seq))

[[27], [43], [44, 3, 28, 2], [27, 45], [15, 2, 46, 14], [29, 9, 2], [6, 9, 2, 30], [6, 9, 2, 47, 30], [48, 31, 18, 49], [6, 9, 2, 50], [6, 32, 2, 51], [6, 10, 31, 18], [1, 33, 3, 52], [53], [54], [28, 2, 34], [55, 3, 2, 34], [2, 9, 56, 57], [58], [59, 2], [60, 10, 7, 35], [35], [61, 9, 2], [36, 62, 19, 1, 63], [15, 1, 64, 3, 7, 37], [38, 14, 65, 37], [1, 5, 7, 39], [1, 19, 18, 3, 66, 2], [20, 14, 3, 7, 39], [1, 19, 67, 36, 7, 11], [1, 12, 3, 16, 8, 11], [1, 12, 8, 21], [1, 12, 3, 20, 68, 40], [1, 5, 4, 22, 23], [4, 24, 10, 25], [4, 69, 10, 25], [4, 24, 70, 71], [1, 12, 2, 3, 41, 72], [1, 32, 73, 74, 7, 11], [1, 5, 4, 22, 23], [1, 5, 8, 21, 75], [38, 14, 4, 22, 23], [1, 42, 7, 11], [4, 76, 10, 25], [1, 12, 2, 3, 41, 4, 24], [1, 5, 8, 21], [1, 12, 3, 16, 8, 13], [1, 5, 3, 16, 8, 11], [1, 33, 8, 11, 77, 1, 5, 3, 16], [6, 15, 1, 17, 4, 13], [29, 10, 4, 13], [6, 78, 79, 4, 13, 80, 3, 20, 81], [1, 5, 3, 17, 4, 13], [15, 2, 40, 14, 17, 4, 13], [1, 5, 3, 17, 4, 82], [83, 84], [1, 42, 2], [1, 5

### Pad

In [205]:
# padding to prepare sequences of same length
X_all_seq = pad_sequences(X_all_seq, maxlen=10)
print(X_all_seq)
#type is now a numpy.ndarray
print(type(X_all_seq))
print(f'Shape (X_all): {X_all_seq.shape}')

[[ 0  0  0  0  0  0  0  0  0 27]
 [ 0  0  0  0  0  0  0  0  0 43]
 [ 0  0  0  0  0  0 44  3 28  2]
 [ 0  0  0  0  0  0  0  0 27 45]
 [ 0  0  0  0  0  0 15  2 46 14]
 [ 0  0  0  0  0  0  0 29  9  2]
 [ 0  0  0  0  0  0  6  9  2 30]
 [ 0  0  0  0  0  6  9  2 47 30]
 [ 0  0  0  0  0  0 48 31 18 49]
 [ 0  0  0  0  0  0  6  9  2 50]
 [ 0  0  0  0  0  0  6 32  2 51]
 [ 0  0  0  0  0  0  6 10 31 18]
 [ 0  0  0  0  0  0  1 33  3 52]
 [ 0  0  0  0  0  0  0  0  0 53]
 [ 0  0  0  0  0  0  0  0  0 54]
 [ 0  0  0  0  0  0  0 28  2 34]
 [ 0  0  0  0  0  0 55  3  2 34]
 [ 0  0  0  0  0  0  2  9 56 57]
 [ 0  0  0  0  0  0  0  0  0 58]
 [ 0  0  0  0  0  0  0  0 59  2]
 [ 0  0  0  0  0  0 60 10  7 35]
 [ 0  0  0  0  0  0  0  0  0 35]
 [ 0  0  0  0  0  0  0 61  9  2]
 [ 0  0  0  0  0 36 62 19  1 63]
 [ 0  0  0  0 15  1 64  3  7 37]
 [ 0  0  0  0  0  0 38 14 65 37]
 [ 0  0  0  0  0  0  1  5  7 39]
 [ 0  0  0  0  1 19 18  3 66  2]
 [ 0  0  0  0  0 20 14  3  7 39]
 [ 0  0  0  0  1 19 67 36  7 11]
 [ 0  0  0

# Vocab Size

In [206]:
size_of_vocabulary = len(tokenizer.word_index) + 1 #+1 for padding
print(tokenizer.word_index)
print(f'Size of vocab: {size_of_vocabulary}')

{'i': 1, 'you': 2, 'to': 3, 'my': 4, 'want': 5, 'how': 6, 'your': 7, 'a': 8, 'are': 9, 'is': 10, 'product': 11, 'need': 12, 'package': 13, 'me': 14, 'can': 15, 'return': 16, 'track': 17, 'going': 18, 'am': 19, 'get': 20, 'refund': 21, 'money': 22, 'back': 23, 'computer': 24, 'broken': 25, '2343': 26, 'hello': 27, 'see': 28, 'where': 29, 'today': 30, 'it': 31, 'do': 32, 'have': 33, 'later': 34, 'name': 35, 'with': 36, 'manager': 37, 'give': 38, 'supervisor': 39, 'help': 40, 'fix': 41, 'hate': 42, 'hi': 43, 'good': 44, 'there': 45, 'hear': 46, 'doing': 47, "how's": 48, "'": 49, 'feeling': 50, 'feel': 51, 'go': 52, 'bye': 53, 'goodbye': 54, 'talk': 55, 'very': 56, 'helpful': 57, 'thanks': 58, 'thank': 59, 'what': 60, 'who': 61, 'whom': 62, 'speaking': 63, 'speak': 64, 'the': 65, 'report': 66, 'unhappy': 67, 'some': 68, 'equipment': 69, 'needs': 70, 'fixed': 71, 'something': 72, 'not': 73, 'like': 74, 'now': 75, 'device': 76, 'that': 77, 'long': 78, 'will': 79, 'take': 80, 'here': 81, 'shi

# Build Model

In [212]:
from keras.models import *
from keras.layers import *
from keras.callbacks import *


model = Sequential()
#embedding layer
model.add(Embedding(size_of_vocabulary,300,input_length=100,trainable=True))
#lstm layer
model.add(LSTM(128,return_sequences=True,dropout=0.2))
#Global Maxpooling
model.add(GlobalMaxPooling1D())
#Dense Layer
model.add(Dense(64,activation='relu'))
model.add(Dense(len(list(le.classes_)),activation='softmax'))
#Add loss function, metrics, optimizer
model.compile(optimizer='adam',loss='categorical_crossentropy',
              metrics=['acc'])
#addingcallbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)
mc = ModelCheckpoint(model_path_scratch, monitor='val_acc', mode='max', 
                         save_best_only=True, verbose=1)
print(model.summary())


Model: "sequential_37"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_36 (Embedding)     (None, 100, 300)          26700     
_________________________________________________________________
lstm_36 (LSTM)               (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d_36 (Glo (None, 128)               0         
_________________________________________________________________
dense_72 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_73 (Dense)             (None, 11)                715       
Total params: 255,319
Trainable params: 255,319
Non-trainable params: 0
_________________________________________________________________
None


# Fit Model

In [214]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=1)
lst_accu_stratified = []
for train_index, test_index in skf.split(X_all_seq, y_all):
    x_train_fold, x_test_fold = X_all_seq[train_index], X_all_seq[test_index]
    y_train_fold, y_test_fold = y_all[train_index], y_all[test_index]
    print(x_train_fold, y_train_fold)
    model.fit(x_train_fold,y_train_fold,batch_size=128,epochs=10,
              validation_data=(x_test_fold, y_test_fold),
              verbose=1)


[[ 0  0  0  0  0  0  0  0  0 27]
 [ 0  0  0  0  0  0 44  3 28  2]
 [ 0  0  0  0  0  0 15  2 46 14]
 [ 0  0  0  0  0  0  0 29  9  2]
 [ 0  0  0  0  0  0  6  9  2 30]
 [ 0  0  0  0  0  6  9  2 47 30]
 [ 0  0  0  0  0  0  6  9  2 50]
 [ 0  0  0  0  0  0  6 32  2 51]
 [ 0  0  0  0  0  0  6 10 31 18]
 [ 0  0  0  0  0  0  1 33  3 52]
 [ 0  0  0  0  0  0  0  0  0 53]
 [ 0  0  0  0  0  0  0  0  0 54]
 [ 0  0  0  0  0  0 55  3  2 34]
 [ 0  0  0  0  0  0  2  9 56 57]
 [ 0  0  0  0  0  0  0  0  0 58]
 [ 0  0  0  0  0  0  0  0 59  2]
 [ 0  0  0  0  0  0 60 10  7 35]
 [ 0  0  0  0  0  0  0 61  9  2]
 [ 0  0  0  0  0 36 62 19  1 63]
 [ 0  0  0  0 15  1 64  3  7 37]
 [ 0  0  0  0  0  0 38 14 65 37]
 [ 0  0  0  0  0  0  1  5  7 39]
 [ 0  0  0  0  0 20 14  3  7 39]
 [ 0  0  0  0  0  0  1 12  8 21]
 [ 0  0  0  0  1 12  3 20 68 40]
 [ 0  0  0  0  0  1  5  4 22 23]
 [ 0  0  0  0  0  0  4 24 10 25]
 [ 0  0  0  0  0  0  4 69 10 25]
 [ 0  0  0  0  0  0  4 24 70 71]
 [ 0  0  0  0  1 12  2  3 41 72]
 [ 0  0  0



ValueError: ignored

In [None]:

# convert for k-fold sampling
X, y = X_all, y_all

from sklearn.model_selection import cross_val_score

model 
# Tokenize the sentences
#tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
for train, val in skf.split(X, y):
    print(f'train  - {np.bincount(y[train])} | test - {np.bincount(y[test])}')
    # preparing vocabulary
    tokenizer.fit_on_texts(list(X[train]))
    # convert text into integer sequences
    x_tr_seq.extend(tokenizer.texts_to_sequences(X[train]))
    x_val_seq.extend(tokenizer.texts_to_sequences(X[val]))
    y_tr.extend(y[train])
    y_val.extend(y[val])
   
    #print(x_tr_seq[:5])
    #print(x_val_seq[:5])
    #print(x_tr_seq)
    # padding to prepare sequences of same length
    #all_sequences = pad_sequences(all_sequences, maxlen=100)
x_tr_seq = pad_sequences(x_tr_seq, maxlen=100)
x_val_seq = pad_sequences(x_val_seq, maxlen=100)
size_of_vocabulary = len(tokenizer.word_index) + 1
print(tokenizer.word_index)
print(f' Size of vocab: {size_of_vocabulary}')
    #print(all_patterns.shape)
    #print(label_encoded_Y.shape)


In [67]:
print(len(x_tr_seq))
print(len(y_tr))
print(len(x_val_seq))
print(len(y_val))

122
122
61
61


# Load the Whole Embedding into Memory

In [27]:
# load the whole embedding into memory
path_to_glove_file = '/content/drive/MyDrive/Colab Notebooks/chatbot-flask-simple/embeddings/glove.840B.300d.txt'

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

  


Found 2195884 word vectors.


In [74]:
# create a weight matrix for words in training docs
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((size_of_vocabulary, 300))
hits = 0
misses = 0
missedWords = []
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and embedding_vector.shape[0] != 0:       
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
        missedWords.append(word)
print(f'Converted {hits} words ({misses} misses)')
print(missedWords)       


Converted 83 words (5 misses)
['to', 'is', "how's", '1234509873234323', '0983834298342341']


In [75]:
MODEL_NAME1 = 'best_model_scratch.h5'
MODEL_NAME2 = 'best_model_pretrained.h5'
model_path = '/content/drive/MyDrive/Colab Notebooks/chatbot-flask-simple/models'

# build two different NLP models of the same architecture.  The first learns
# embeddings from scratch the second uses pretrained word embeddings
from keras.models import *
from keras.layers import *
from keras.callbacks import *

#training = os.path.join(data_path, TRAIN_CSV)
#validation = os.path.join(data_path, VALID_CSV)
model_path_scratch = os.path.join(model_path, MODEL_NAME1)
model_path_pretrained = os.path.join(model_path, MODEL_NAME2)

model = Sequential()
# Create model with 3 layers.  First layer 128 neurons, second layer 64 neurons
# and 3rd output layer contains number of neurons equal to number of intents to
# predict
# output intent with softmax
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

# Compile model.  Stochastic gradient descent with Nesterov accelerated
# gradient gives good
# results for this model
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

#embedding layer
model.add(Embedding(size_of_vocabulary,300,
                    weights=[embedding_matrix],
                    input_length=100,trainable=False))

#lstm layer
model.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
model.add(GlobalMaxPooling1D())

model.add(Dense(64,activation='relu'))
model.add(Dense(len(,activation='softmax'))

# add loss, metrics, optimizer
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['acc'])

# adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)
mc = ModelCheckpoint(model_path_pretrained, monitor='val_acc', mode='max', 
                     save_best_only=True,verbose=1)

#print summary of model
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 300)          26700     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 65        
Total params: 254,669
Trainable params: 227,969
Non-trainable params: 26,700
_________________________________________________________________
None


In [77]:
y_tr = np.array(y_tr)
y_val = np.array(y_val)


In [78]:
history = model.fit(np.array(x_tr_seq),np.array(y_tr),batch_size=128,epochs=10,
                    validation_data=(np.array(x_val_seq),np.array(y_val)),
                    verbose=1,callbacks=[es,mc])




Epoch 1/10

Epoch 00001: val_acc did not improve from 0.09836
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.09836
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.09836
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.09836
Epoch 00004: early stopping


In [17]:

from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np
X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5))
print(X)
print(y)


[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 1 1 1]


In [16]:
#tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenize the sentences
tokenizer = Tokenizer()
# preparing vocabulary
tokenizer.fit_on_texts(list(all_patterns))
# convert text into integer sequences
all_sequences = tokenizer.texts_to_sequences(all_patterns)
print(all_sequences)
# padding to prepare sequences of same length
all_sequences = pad_sequences(all_sequences, maxlen=100)
size_of_vocabulary = len(tokenizer.word_index) + 1
print(f' Size of vocab: {size_of_vocabulary}')

[[27], [43], [44, 3, 28, 2], [27, 45], [15, 2, 46, 14], [29, 9, 2], [6, 9, 2, 30], [6, 9, 2, 47, 30], [48, 31, 18, 49], [6, 9, 2, 50], [6, 32, 2, 51], [6, 10, 31, 18], [1, 33, 3, 52], [53], [54], [28, 2, 34], [55, 3, 2, 34], [2, 9, 56, 57], [58], [59, 2], [60, 10, 7, 35], [35], [61, 9, 2], [36, 62, 19, 1, 63], [15, 1, 64, 3, 7, 37], [38, 14, 65, 37], [1, 5, 7, 39], [1, 19, 18, 3, 66, 2], [20, 14, 3, 7, 39], [1, 19, 67, 36, 7, 11], [1, 12, 3, 16, 8, 11], [1, 12, 8, 21], [1, 12, 3, 20, 68, 40], [1, 5, 4, 22, 23], [4, 24, 10, 25], [4, 69, 10, 25], [4, 24, 70, 71], [1, 12, 2, 3, 41, 72], [1, 32, 73, 74, 7, 11], [1, 5, 4, 22, 23], [1, 5, 8, 21, 75], [38, 14, 4, 22, 23], [1, 42, 7, 11], [4, 76, 10, 25], [1, 12, 2, 3, 41, 4, 24], [1, 5, 8, 21], [1, 12, 3, 16, 8, 13], [1, 5, 3, 16, 8, 11], [1, 33, 8, 11, 77, 1, 5, 3, 16], [6, 15, 1, 17, 4, 13], [29, 10, 4, 13], [6, 78, 79, 4, 13, 80, 3, 20, 81], [1, 5, 3, 17, 4, 13], [15, 2, 40, 14, 17, 4, 13], [1, 5, 3, 17, 4, 82], [83, 84], [1, 42, 2], [1, 5

In [None]:


     for pattern in intent['patterns']:
          w = nltk.word_tokenize(pattern)
          words.extend(w)

          documents.append((w, intent['tag']))

          if intent['tag'] not in classes:
               classes.append(intent['tag'])

words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))

classes = sorted(list(set(classes)))

print(len(documents), 'documents')
print(len(classes), 'classes', classes)
print(len(words), 'unique lemmatized words', words)

pickle.dump(words,open(os.path.join(intents_path, 'intents_words.pkl'),'wb'))
pickle.dump(classes,open(os.path.join(intents_path, 'intents_classes.pkl'),'wb'))

# init training data
training = []
output_empty = [0] * len(classes)
for doc in documents:
    bag = []
    # english representation of words
    pattern_words = doc[0]
    # convert to lowercase and lemmatized versions
    pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words]

    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences

    #Tokenize the sentence 
    for w in words:
         bag.append(1) if w in pattern_words else bag.append(0)

    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

random.shuffle(training)
training = np.array(training)
# create train and test lists.  X - patterns, y - intents
train_x = list(training[:,0])
train_y = list(training[:,1])
print('Training data created')

# Create model with 3 layers.  First layer 128 neurons, second layer 64 neurons
# and 3rd output layer contains number of neurons equal to number of intents to
# predict
# output intent with softmax
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

# Compile model.  Stochastic gradient descent with Nesterov accelerated
# gradient gives good
# results for this model
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

# fitting and saving the model
hist = model.fit(np.array(train_x), np.array(train_y), epochs=1000, batch_size=5, verbose=1)
model.save(os.path.join(intents_path, 'intents_chatbot_model.h5'), hist)

print('model created')

