In [10]:
import nltk
import numpy as np
import json
import pickle
import random

from nltk.stem import WordNetLemmatizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

In [11]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
with open('/kaggle/input/intents-dataset/intents.json','r',  encoding='utf-8') as file:
    intents_data = json.load(file)
intents_data['intents'][:2]

[{'tag': 'greeting',
  'patterns': ['Hi',
   'Hey',
   'Is anyone there?',
   'Hi there',
   'Hello',
   'Hey there',
   'Howdy',
   'Hola',
   'Bonjour',
   'Hay',
   'Sasa',
   'Good Evening',
   'Good afternoon',
   "What's up?",
   'Hey bot',
   'Good day',
   'Yo',
   'Hi, how are you?',
   "What's going on?",
   "How's it going?",
   "What's happening?",
   'Good to see you',
   "What's new?",
   'How are you doing?',
   "How's your day?",
   'How are things?',
   'Hey, how are you?',
   "Hi, what's up?",
   'How’s life?',
   'What’s up with you?',
   'Hey, good to see you',
   'Hi, how’s it going?',
   'Hello, how are you?',
   'Hey, how’s everything?',
   "What's good?",
   'What’s going on with you?',
   'Good morning, how are you?',
   'Greetings',
   'Hi there, how’s everything?',
   'Hey, what’s happening?',
   'How are you feeling?',
   'Hey there, how’s life?',
   'Yo, how’s it going?',
   'Sup?',
   'Hey, how’s your day going?',
   'Morning',
   'Evening',
   'Good after

In [19]:
import unicodedata
from nltk.stem import WordNetLemmatizer
import nltk, pickle, json

def normalize_text(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

lemmatizer = WordNetLemmatizer()
vocabulary = []
intent_tags = []
training_samples = []
ignore_symbols = ['?', '!']

for intent in intents_data['intents']:
    tag = intent['tag']
    if tag not in intent_tags:
        intent_tags.append(tag)
    for pattern in intent['patterns']:
        pattern_clean = normalize_text(pattern)
        tokens = nltk.word_tokenize(pattern_clean)
        vocabulary.extend(tokens)
        training_samples.append((tokens, tag))

vocabulary = [lemmatizer.lemmatize(w.lower()) for w in vocabulary if w not in ignore_symbols]
vocabulary = sorted(set(vocabulary))
intent_tags = sorted(set(intent_tags))

pickle.dump(vocabulary, open("chat_vocab.pkl", "wb"))
pickle.dump(intent_tags, open("chat_labels.pkl", "wb"))

print(f"Total tags: {len(intent_tags)}")
print(f"Vocabulary size: {len(vocabulary)}")


Total tags: 18
Vocabulary size: 368


In [20]:
def build_training_data(samples, vocab_list, tags):
    X_data = []
    y_data = []
    base_output = [0] * len(tags)

    for words, label in samples:
        bag_vector = []
        word_list = [lemmatizer.lemmatize(word.lower()) for word in words]
        for vocab_word in vocab_list:
            bag_vector.append(1 if vocab_word in word_list else 0)

        output_row = base_output[:]
        output_row[tags.index(label)] = 1

        X_data.append(bag_vector)
        y_data.append(output_row)

    return np.array(X_data), np.array(y_data)

X, y = build_training_data(training_samples, vocabulary, intent_tags)

combined_data = list(zip(X, y))
random.shuffle(combined_data)
X, y = zip(*combined_data)

X = np.array(X)
y = np.array(y)

In [21]:
from keras.layers import BatchNormalization

model = Sequential()
model.add(Dense(128, input_shape=(len(X[0]),), activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(len(y[0]), activation='softmax'))

optimizer = Adam(learning_rate=0.004)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

In [22]:
history = model.fit(X, y, epochs=500, batch_size=8, verbose=1)

Epoch 1/500
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.1837 - loss: 3.1895 
Epoch 2/500
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5642 - loss: 1.5109
Epoch 3/500
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7115 - loss: 1.0057
Epoch 4/500
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6837 - loss: 0.8805
Epoch 5/500
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8009 - loss: 0.7411
Epoch 6/500
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8632 - loss: 0.5130
Epoch 7/500
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8947 - loss: 0.4161
Epoch 8/500
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8729 - loss: 0.4104
Epoch 9/500
[1m39/39[0m [32m━━━━━━━━━━━━━━━

In [23]:
model.save("chatbot_model.h5")