In [1]:
import nltk
from nltk.stem.lancaster import LancasterStemmer

# Shared stemmer instance; later cells call stemmer.stem() on lower-cased tokens.
stemmer = LancasterStemmer()

# Fetch the 'punkt' tokenizer data.
# NOTE(review): presumably required by nltk.word_tokenize used further down — confirm.
nltk.download('punkt')

import numpy as np
import tflearn
import tensorflow as tf
import random

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sonicname\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Instructions for updating:
non-resource variables are not supported in the long term
curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [2]:
import json
# Load the intent definitions; later cells read intents['intents'] and, for each
# entry, its 'tag' and 'patterns' keys. The file is decoded explicitly as UTF-8.
with open('data/intents.json', encoding="utf8") as json_data:
    intents = json.load(json_data)

In [3]:
def ngrams(str, n):
    """Return one space-joined n-gram string per token of ``str``.

    The input is split on single spaces. For every token position ``i`` one
    gram of ``n`` items is produced; positions that run past the end of the
    token list are filled with the placeholder ``'_'``.

    When ``n > 1`` the gram for position 0 is *left-padded*: it consists of
    ``'_'`` followed by the first ``n - 1`` tokens (e.g. ``'_ a b c'`` for
    ``n == 4``). Note that, as a consequence, the unpadded gram starting at
    the first token (``'a b c d'``) is never emitted; this matches the
    historical output of this function and is kept for compatibility.

    Args:
        str: Text to split into tokens (name kept for backward compatibility
            even though it shadows the builtin inside this function).
        n: Number of items per gram.

    Returns:
        A list with ``len(str.split(' '))`` gram strings.
    """
    tokens = str.split(' ')
    count = len(tokens)
    arr = []
    for i in range(count):
        if i == 0 and n > 1:
            # Leading gram: one pad symbol, then the first n-1 tokens.
            parts = ['_']
            offsets = range(n - 1)
        else:
            parts = []
            offsets = range(n)
        for j in offsets:
            # Strict '<' is required: index == count is already out of range.
            # (The previous '<=' raised IndexError for inputs shorter than n-1.)
            parts.append(tokens[i + j] if i + j < count else '_')
        arr.append(' '.join(parts))
    return arr

In [4]:
# Quick demonstration of ngrams() on an 8-token string with n=4.
ngrams('a b c d e f g h', 4)

['_ a b c',
 'b c d e',
 'c d e f',
 'd e f g',
 'e f g h',
 'f g h _',
 'g h _ _',
 'h _ _ _']

In [5]:
# Collect the vocabulary, the intent classes and the (tokens, tag) documents.
words = []
classes = []
documents = []
# Tokens to leave out of the vocabulary (all entries are lower-case).
ignore_words = ['?', 'và', 'à', 'ừ', 'ạ', 'vì', 'từng', 'một_cách']

for intent in intents['intents']:
    tag = intent['tag']
    for pattern in intent['patterns']:
        # Tokenize each example sentence and remember which intent it belongs to.
        tokens = nltk.word_tokenize(pattern)
        words.extend(tokens)
        documents.append((tokens, tag))
        if tag not in classes:
            classes.append(tag)

# Stem the lower-cased tokens and de-duplicate them. The ignore-list check is
# done on the lower-cased token as well, so capitalised variants (e.g. at the
# start of a sentence) are filtered consistently with the stemming step.
words = sorted({stemmer.stem(w.lower()) for w in words if w.lower() not in ignore_words})

classes = sorted(set(classes))

print (len(documents), "documents")
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)

33 documents
7 classes ['goodbye', 'greeting', 'hours', 'menu', 'opentoday', 'payments', 'thanks']
56 unique stemmed words ['ai', 'biệt', 'bye', 'bạn', 'cho', 'chào', 'chấp', 'các', 'còn', 'có', 'cảm', 'của', 'cửa', 'dụng', 'gian', 'giúp', 'goodby', 'gì', 'gặp', 'hay', 'hiện', 'hoạt', 'hàng', 'hôm', 'hẹn', 'hữu', 'không', 'kiểu', 'làm', 'lại', 'mở', 'nay', 'nhận', 'nào', 'rất', 'thanh', 'thank', 'thật', 'thể', 'thời', 'toán', 'tôi', 'tại', 'tạm', 'với', 'xin', 'you', 'ích', 'đang', 'đi', 'điều', 'đây', 'đó', 'động', 'ơn', 'ở']


In [6]:
# Build the training set: one [bag-of-words, one-hot label] pair per document.
training = []
output = []

# All-zero label template with one slot per class; copied for every document.
output_empty = [0] * len(classes)

for tokens, tag in documents:
    # Stem the document's tokens the same way the vocabulary was stemmed.
    stems = [stemmer.stem(token.lower()) for token in tokens]

    # 1 where the vocabulary entry occurs in this document, 0 otherwise.
    bag = [1 if vocab_word in stems else 0 for vocab_word in words]

    # One-hot row marking the document's intent tag.
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

# Shuffle the examples, then split the pairs into features and labels.
random.shuffle(training)
training = np.array(training, dtype=object)

train_x = list(training[:, 0])
train_y = list(training[:, 1])

In [7]:
# Inspect one (shuffled) training example: its bag-of-words vector and its one-hot label row.
print(train_x[1])
print(train_y[1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 1, 0]


In [8]:
# Start from a fresh default graph before defining the network.
tf.compat.v1.reset_default_graph()

# Input width = length of one bag-of-words vector; batch dimension left open (None).
net = tflearn.input_data(shape=[None, len(train_x[0])])
# Two fully connected layers of 8 units each.
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
# Output layer: one unit per entry of a label row, with softmax activation.
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

# Wrap the network in a DNN model; logs are directed to 'tflearn_logs'.
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')

# Train for 1000 epochs with batches of 8, then save the model.
# NOTE(review): assumes the 'models_save' directory already exists — confirm.
model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)
model.save('models_save/model.tflearn')

Training Step: 4999  | total loss: [1m[32m0.12835[0m[0m | time: 0.005s
| Adam | epoch: 1000 | loss: 0.12835 - acc: 0.9897 -- iter: 32/33
Training Step: 5000  | total loss: [1m[32m0.11658[0m[0m | time: 0.007s
| Adam | epoch: 1000 | loss: 0.11658 - acc: 0.9907 -- iter: 33/33
--
INFO:tensorflow:E:\Github\chatbot-AI\AI\models_save\model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [9]:
import pickle

# Persist the vocabulary, class labels and training vectors next to the model.
# The context manager guarantees the file is flushed and closed even if
# pickling fails (the previous inline open() left the handle unclosed).
with open("models_save/training_data", "wb") as training_file:
    pickle.dump(
        {'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y},
        training_file,
    )