In [2]:
import json
import random
import re

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
def clean_text(text):
    r"""Return ``text`` normalised to lower-case, punctuation-free words.

    Every character that is neither a word character (``\w``) nor whitespace
    is deleted, the remainder is lower-cased, and each run of whitespace is
    collapsed to a single space (leading/trailing whitespace is dropped).
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    words = without_punctuation.lower().split()
    return ' '.join(words)

In [4]:
# Load the intents file and flatten it into parallel training lists.
# The encoding is passed explicitly: JSON files are UTF-8, and relying on the
# platform default can mis-decode non-ASCII patterns/responses.
with open('/content/chatbot.json', encoding='utf-8') as file:
    data = json.load(file)

training_sentences = []  # one raw pattern string per training example
training_labels = []     # intent tag for the pattern at the same index
labels = []              # distinct intent tags, in first-seen order
responses = []           # each intent's list of responses, in file order

for intent in data['intents']:
    tag = intent['tag']
    for pattern in intent['patterns']:
        # NOTE(review): the patterns are stored raw on purpose. The previous
        # code called clean_text() here but discarded the result, and the chat
        # cell also feeds raw text to the tokenizer. If clean_text() is ever
        # adopted it must be applied to BOTH the training patterns and the
        # user input, otherwise the two sides tokenize differently.
        training_sentences.append(pattern)
        training_labels.append(tag)
    responses.append(intent['responses'])

    if tag not in labels:
        labels.append(tag)

# Width of the softmax output layer.
num_classes = len(labels)

In [5]:
# Turn the intent tag strings into integer class ids. The fitted encoder is
# kept so predicted ids can be mapped back to tags later on.
lbl_encoder = LabelEncoder().fit(training_labels)
training_labels = lbl_encoder.transform(training_labels)

In [6]:
# Text-vectorisation settings shared by the tokenizer and the model.
vocab_size = 1000      # vocabulary cap (tokenizer num_words / embedding rows)
embedding_dim = 16     # size of each word vector
max_len = 20           # fixed sequence length after padding/truncation
oov_token = "<OOV>"    # placeholder for words outside the vocabulary

# Learn the vocabulary from the training sentences.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Sentences -> lists of token ids -> fixed-width integer matrix.
sequences = tokenizer.texts_to_sequences(training_sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, truncating='post')

In [7]:
# Data Augmentation
#
# Builds extra training rows from the padded token-id sequences. Every row
# produced here is still an integer id vector of the same length as the input
# row, so it can be stacked with the originals and fed to the embedding layer.
#
# Changes from the previous version of this cell:
#   * No Gaussian noise: token ids are categorical indices, so adding noise
#     yields float rows, and an int cast by truncation turns an id into the
#     neighbouring vocabulary entry. It is replaced by masking one word with
#     the OOV id.
#   * Only the real (non-zero) tokens are shuffled; padding keeps its position
#     so augmented rows have the same layout as rows seen at prediction time.
#   * The unmodified rows are not appended again; the combining step already
#     stacks `padded_sequences` on top of this list.
#   * A dedicated seeded generator makes the augmentation reproducible.
rng = random.Random(42)
oov_index = tokenizer.word_index[oov_token]

# Restrict to the original sentences so re-running this cell after the
# combining cell does not augment already-augmented rows.
n_original = len(sequences)

augmented_sentences = []
augmented_labels = []

for sequence, label in zip(padded_sequences[:n_original], training_labels[:n_original]):
    token_positions = np.flatnonzero(sequence)  # indices of non-padding ids
    if token_positions.size < 2:
        # A single-word sentence cannot be reordered, and masking its only
        # word would leave a row with no information about the intent.
        continue

    original_tokens = sequence[token_positions].tolist()

    # Shuffle words in the sentence
    shuffled_tokens = original_tokens[:]
    rng.shuffle(shuffled_tokens)
    if shuffled_tokens != original_tokens:  # skip no-op shuffles (duplicates)
        shuffled_sequence = sequence.copy()
        shuffled_sequence[token_positions] = shuffled_tokens
        augmented_sentences.append(shuffled_sequence)
        augmented_labels.append(label)

    # Replace one randomly chosen word with the OOV id
    masked_sequence = sequence.copy()
    masked_sequence[rng.choice(token_positions.tolist())] = oov_index
    augmented_sentences.append(masked_sequence)
    augmented_labels.append(label)


In [8]:
# Combine original and augmented data
#
# Only the first `len(sequences)` rows (the original sentences) are used as
# the base, so re-running this cell rebuilds the same arrays instead of
# stacking the augmented rows on top of themselves again.
n_original = len(sequences)

# Token ids are integer indices. Round to the nearest id and cast back to the
# integer dtype of the padded matrix so that float rows in the augmented list
# cannot upcast the whole matrix or shift ids when later truncated to int.
# The reshape keeps the stack valid even when no augmented rows exist.
augmented_matrix = (
    np.rint(np.asarray(augmented_sentences, dtype=float))
    .astype(padded_sequences.dtype)
    .reshape(-1, padded_sequences.shape[1])
)
augmented_label_array = np.asarray(augmented_labels, dtype=training_labels.dtype)

padded_sequences = np.vstack((padded_sequences[:n_original], augmented_matrix))
training_labels = np.concatenate((training_labels[:n_original], augmented_label_array))

In [9]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
# NOTE(review): the split happens after augmentation, so variants of the same
# sentence can land on both sides and inflate validation accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    training_labels,
    random_state=42,
    test_size=0.2,
)


In [10]:

# Intent classifier: word embeddings -> two stacked LSTMs -> dense head with
# one softmax unit per intent.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(64, return_sequences=True),  # emit every timestep for the next LSTM
    LSTM(64),                         # keep only the final state
    Dense(16, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Labels are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 16)            16000     
                                                                 
 lstm (LSTM)                 (None, 20, 64)            20736     
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 16)                1040      
                                                                 
 dense_1 (Dense)             (None, 17)                289       
                                                                 
Total params: 71089 (277.69 KB)
Trainable params: 71089 (277.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:

# Train for a fixed number of passes, reporting metrics on the held-out split
# after every epoch.
epochs = 15
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [12]:
# Interactive chat loop: read a line, classify its intent, print one of that
# intent's responses. Typing "quit" (any case, surrounding spaces ignored)
# ends the loop.

# Build the tag -> responses lookup once instead of scanning the intents list
# on every turn. setdefault keeps the first intent for a duplicated tag, which
# matches a first-match linear search.
responses_by_tag = {}
for intent in data['intents']:
    responses_by_tag.setdefault(intent['tag'], intent['responses'])

while True:
    user_input = input("You: ")
    stripped_input = user_input.strip()
    if stripped_input.lower() == 'quit':
        break
    if not stripped_input:
        # Nothing to classify; an all-padding row would still yield a label.
        continue

    # Preprocess the user input
    # NOTE(review): the raw text is tokenized because the tokenizer was fitted
    # on raw training patterns; clean_text() was previously called here but
    # its result was never used. Apply it on both sides or not at all.
    user_input_sequence = tokenizer.texts_to_sequences([user_input])
    user_input_padded = pad_sequences(user_input_sequence, truncating='post', maxlen=max_len)

    # Get the model's prediction (verbose=0 keeps the progress bar out of the chat)
    prediction = model.predict(user_input_padded, verbose=0)
    # inverse_transform returns a length-1 array; take the scalar tag so it can
    # be used as a plain dictionary key.
    predicted_label = lbl_encoder.inverse_transform([np.argmax(prediction)])[0]

    # Find the appropriate response
    candidate_responses = responses_by_tag.get(predicted_label)
    if candidate_responses:
        response = np.random.choice(candidate_responses)
    else:
        # Unknown tag or an intent without responses: answer instead of
        # staying silent or raising on an empty choice.
        response = "Sorry, I don't have an answer for that yet."
    print("Chatbot:", response)


You: hi
Chatbot: Hey there, how can I make your interaction with NIT-Trichy more productive?
You: pragyan
Chatbot: Pragyan is NIT Trichy's annual techno-managerial festival, typically conducted during the even semester. It showcases a diverse range of technical, managerial, and cultural events, attracting participants from all over the country. Pragyan aims to provide a platform for students to showcase their talents, learn from experts, and engage in various enriching activities. For the latest updates and detailed information about Pragyan, you can visit the official Pragyan website or page.
You: quit
