In [None]:
import nltk
from nltk.stem.lancaster import LancasterStemmer

# Single stemmer instance reused by the vocabulary and training-data cells below.
stemmer = LancasterStemmer()

# Download NLTK's 'punkt' resource; presumably needed by nltk.word_tokenize,
# which is called when tokenizing the intent patterns -- TODO confirm.
nltk.download('punkt')

import numpy as np
import tflearn
import tensorflow as tf
import random

In [2]:
import json
# Read the intent definitions; the file is opened as UTF-8 explicitly.
with open('data/intents.json', encoding="utf8") as intents_file:
    intents = json.load(intents_file)

In [3]:
def ngrams(str, n):
    """Return one padded, space-joined n-gram per token of ``str``.

    The input is split on single spaces. For every token position ``i`` one
    string is produced:

    * ``i == 0`` and ``n > 1``: a leading ``'_'`` pad followed by the first
      ``n - 1`` tokens (right-padded with ``'_'`` when the input is shorter).
    * otherwise: the ``n`` tokens starting at position ``i``, right-padded
      with ``'_'`` when the window runs past the end of the input.

    Note that, as in the original implementation, the un-padded window that
    starts at the first token is not emitted when ``n > 1``.

    Args:
        str: Text to split into tokens (name kept for caller compatibility,
            although it shadows the builtin inside this function).
        n: Size of each n-gram. Values <= 0 yield empty strings.

    Returns:
        list[str]: One n-gram string per token, in token order.
    """
    pad = '_'
    tokens = str.split(' ')
    # Negative sizes behave like 0 (an empty window), matching range(n).
    width = max(n, 0)

    arr = []
    for i in range(len(tokens)):
        if i == 0 and n > 1:
            # Slicing never raises, unlike the previous `(i + j) <= len(tokens)`
            # bound, which indexed one past the end for short inputs.
            window = tokens[:n - 1]
            parts = [pad] + window + [pad] * (n - 1 - len(window))
        else:
            window = tokens[i:i + width]
            parts = window + [pad] * (width - len(window))
        arr.append(' '.join(parts))
    return arr

In [4]:
# Build the vocabulary (`words`), the label list (`classes`) and the list of
# (tokens, tag) pairs (`documents`) from the loaded intents.
words = []
classes = []
documents = []
# Tokens excluded from the vocabulary; all entries are lower-case.
ignore_words = ['?', 'và', 'à', 'ừ', 'ạ', 'vì', 'từng', 'một_cách']

for intent in intents['intents']:
    for pattern in intent['patterns']:
        # Tokenize the example sentence and remember which tag it belongs to.
        w = nltk.word_tokenize(pattern)
        words.extend(w)
        documents.append((w, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

# Lower-case before checking the ignore list: its entries are lower-case, so
# comparing the raw token let capitalised forms slip into the vocabulary.
words = [stemmer.stem(w.lower()) for w in words if w.lower() not in ignore_words]
words = sorted(set(words))

classes = sorted(set(classes))

print (len(documents), "documents")
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)

151 documents
37 classes ['Acer', 'Asus', 'DELL', 'Lenovo', 'MSI', 'Ram', 'Ram_16_32', 'Ram_4_8', 'Ram_under_4', 'address', 'cpu_intel', 'gaming', 'goodbye', 'greeting', 'hours', 'menu', 'office', 'opentoday', 'others', 'payments', 'platform', 'platform_Mac_Os', 'platform_windows', 'platform_windows_10', 'platform_windows_11', 'platform_windows_7', 'size', 'size_11_12', 'size_13_14', 'size_15_16', 'size_17', 'storage', 'storage_HHD', 'storage_SSD', 'thanks', 'trademark', 'type_usage']
143 unique stemmed words ['10', '11', '12', '13', '14', '15', '16', '16gb', '17', '2gb', '32gb', '4gb', '7', '8gb', 'ac', 'ai', 'as', 'biệt', 'bye', 'bán', 'bên', 'bạn', 'bị', 'chi', 'chip', 'cho', 'chào', 'chơi', 'chấp', 'chỉ', 'chọn', 'cung', 'các', 'còn', 'có', 'cảm', 'cấp', 'của', 'cứng', 'cửa', 'del', 'dụng', 'gam', 'gian', 'giá', 'giúp', 'goodby', 'gì', 'gặp', 'hay', 'hhd', 'hiểu', 'hiện', 'hiệu', 'hoạt', 'hàng', 'hành', 'hôm', 'hẹn', 'hệ', 'học', 'hỏi', 'hợp', 'hữu', 'inch', 'intel', 'khác', 'không

In [5]:
# Turn every (tokens, tag) document into a bag-of-words vector over `words`
# and a one-hot label vector over `classes`.
training = []
output = []

output_empty = [0] * len(classes)

for pattern_tokens, tag in documents:
    # Stem the pattern the same way the vocabulary was stemmed.
    stemmed_tokens = [stemmer.stem(token.lower()) for token in pattern_tokens]

    # 1 where the vocabulary word occurs in this pattern, 0 otherwise.
    bag = [1 if vocab_word in stemmed_tokens else 0 for vocab_word in words]

    # One-hot row: copy the all-zero template and mark this document's tag.
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

# Shuffle the samples, then split the two columns into features and labels.
random.shuffle(training)
training = np.array(training, dtype=object)

train_x = list(training[:, 0])
train_y = list(training[:, 1])

In [6]:
# Clear the default TensorFlow graph before defining the network.
tf.compat.v1.reset_default_graph()

# Input width follows the bag-of-words vectors, output width the one-hot labels.
feature_count = len(train_x[0])
class_count = len(train_y[0])

# Two 8-unit fully connected layers followed by a softmax layer over the classes.
net = tflearn.input_data(shape=[None, feature_count])
for _ in range(2):
    net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, class_count, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')

# Train on the prepared data, then write the trained model to disk.
model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)
model.save('models_save/model.tflearn')

Training Step: 18999  | total loss: [1m[32m0.01657[0m[0m | time: 0.021s
| Adam | epoch: 1000 | loss: 0.01657 - acc: 0.9794 -- iter: 144/151
Training Step: 19000  | total loss: [1m[32m0.01502[0m[0m | time: 0.022s
| Adam | epoch: 1000 | loss: 0.01502 - acc: 0.9815 -- iter: 151/151
--
INFO:tensorflow:E:\Github\chatbot-AI\AI\models_save\model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [7]:
import pickle
# Persist the vocabulary, class labels and training matrices next to the model.
# The `with` block closes the file deterministically so the pickle is fully
# flushed to disk; the previous inline open() never closed the handle.
with open("models_save/training_data", "wb") as training_data_file:
    pickle.dump(
        {'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y},
        training_data_file,
    )