<a href="https://colab.research.google.com/github/saiteja-namani-11/chatbot/blob/main/ChatBot_Seq2Seq2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the libraries
import numpy as np
import tensorflow as tf
import re
import time

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Importing the dataset
lines = open('/content/drive/MyDrive/movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('/content/drive/MyDrive/movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')

In [None]:
# Creating a dictionary that maps each line and its id
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [None]:
# Creating a list of all of the conversations
conversations_ids = []
for conversation in conversations[:-1]:
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(','))

In [None]:
# Getting separately the questions and the answers
questions = []
answers = []
for conversation in conversations_ids:
    for i in range(len(conversation) - 1):
        questions.append(id2line[conversation[i]])
        answers.append(id2line[conversation[i+1]])

In [None]:
# Doing a first cleaning of the texts
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;$:<>{}`+=~|.!?,]", "", text)
    return text

In [None]:
# Cleaning the questions
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))

In [None]:
# Cleaning the answers
clean_answers = []
for answer in answers:
    clean_answers.append(clean_text(answer))

In [None]:
# Filtering out the questions and answers that are too short or too long
short_questions = []
short_answers = []
i = 0
for question in clean_questions:
    if 2 <= len(question.split()) <= 25:
        short_questions.append(question)
        short_answers.append(clean_answers[i])
    i += 1
clean_questions = []
clean_answers = []
i = 0
for answer in short_answers:
    if 2 <= len(answer.split()) <= 25:
        clean_answers.append(answer)
        clean_questions.append(short_questions[i])
    i += 1

In [None]:
# Creating a dictionary that maps each word to its number of occurrences
word2count = {}
for question in clean_questions:
    for word in question.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1
for answer in clean_answers:
    for word in answer.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [None]:
# Creating two dictionaries that map the questions words and the answers words to a unique integer
threshold_questions = 15
questionswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_questions:
        questionswords2int[word] = word_number
        word_number += 1
threshold_answers = 15
answerswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_answers:
        answerswords2int[word] = word_number
        word_number += 1

In [None]:
# Adding the last tokens to these two dictionaries
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    questionswords2int[token] = len(questionswords2int) + 1
for token in tokens:
    answerswords2int[token] = len(answerswords2int) + 1

In [None]:
# Creating the inverse dictionary of the answerswords2int dictionary
answersints2word = {w_i: w for w, w_i in answerswords2int.items()}

In [None]:
# Adding the End Of String token to the end of every answer
for i in range(len(clean_answers)):
    clean_answers[i] += ' <EOS>'

In [None]:
# Translating all the questions and the answers into integers
# and Replacing all the words that were filtered out by <OUT> 
questions_into_int = []
for question in clean_questions:
    ints = []
    for word in question.split():
        if word not in questionswords2int:
            ints.append(questionswords2int['<OUT>'])
        else:
            ints.append(questionswords2int[word])
    questions_into_int.append(ints)
answers_into_int = []
for answer in clean_answers:
    ints = []
    for word in answer.split():
        if word not in answerswords2int:
            ints.append(answerswords2int['<OUT>'])
        else:
            ints.append(answerswords2int[word])
    answers_into_int.append(ints)

In [None]:
# Sorting questions and answers by the length of questions
sorted_clean_questions = []
sorted_clean_answers = []
for length in range(1, 25 + 1):
    for i in enumerate(questions_into_int):
        if len(i[1]) == length:
            sorted_clean_questions.append(questions_into_int[i[0]])
            sorted_clean_answers.append(answers_into_int[i[0]])


In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import numpy as np
import tensorflow as tf
import re
import time
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import random
import gc
from tensorflow.keras.optimizers import Adam

In [None]:
# 1. Reduce the threshold for word occurrence
threshold_questions = 10
threshold_answers = 10

In [None]:
# Creating the Seq2Seq model

# Set the parameters for the model
input_vocab_size = len(questionswords2int) + 1
output_vocab_size = len(answerswords2int) + 1
max_length = 25


In [None]:
# Define the encoder input
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_vocab_size, 128)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(128, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

In [None]:
# Define the decoder input
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(output_vocab_size, 128)(decoder_inputs)
decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

In [None]:
# Define the output layer
decoder_dense = Dense(output_vocab_size, activation='softmax')
decoder_outputs = Dense(output_vocab_size, activation='softmax')(decoder_outputs)

In [None]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')


In [None]:
# Split the dataset into training and testing sets
import random

sample_size = 10000  # Adjust this value according to your needs
sample_indices = random.sample(range(len(sorted_clean_questions)), sample_size)

sample_questions = [sorted_clean_questions[i] for i in sample_indices]
sample_answers = [sorted_clean_answers[i] for i in sample_indices]

X_train, X_test, y_train, y_test = train_test_split(sample_questions, sample_answers, test_size=0.1, random_state=42)

In [None]:
# Prepare the data for training
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
#  Free up memory
del lines, conversations, questions, answers, word2count
gc.collect()

7116

In [None]:
def preprocess_data(questions, answers, max_length):
    encoder_input_data = pad_sequences(questions, maxlen=max_length, padding='post')
    decoder_input_data = pad_sequences(answers, maxlen=max_length, padding='post')
    decoder_target_data = np.zeros((len(answers), max_length, output_vocab_size), dtype='float32')
    
    for i, answer in enumerate(answers):
        for t, word_int in enumerate(answer):
            if t > 0:
                decoder_target_data[i, t - 1, word_int] = 1.0
                
    return encoder_input_data, decoder_input_data, decoder_target_data

encoder_input_train, decoder_input_train, decoder_target_train = preprocess_data(X_train, y_train, max_length)
encoder_input_test, decoder_input_test, decoder_target_test = preprocess_data(X_test, y_test, max_length)

In [None]:
#  Train the model with a smaller batch size and gradient clipping
epochs = 50
batch_size = 32
optimizer = Adam(clipnorm=1.0)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

model.fit([encoder_input_train, decoder_input_train], decoder_target_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f7c2811f3a0>

In [None]:
#  Test the model and evaluate its performance
loss = model.evaluate([encoder_input_test, decoder_input_test], decoder_target_test, batch_size=batch_size)
print('Test loss:', loss)


Test loss: 2.0192832946777344


In [None]:
#  (Optional) Save the trained model
model.save('seq2seq_chatbot.h5')

In [None]:

# Load the saved model
from tensorflow.keras.models import load_model
model = load_model('seq2seq_chatbot.h5')

# Define functions for encoding and decoding input/output sequences
def encode_input_sequence(input_sequence, word2int):
    encoded_sequence = [word2int['<SOS>']]
    for word in input_sequence.split():
        if word in word2int:
            encoded_sequence.append(word2int[word])
        else:
            encoded_sequence.append(word2int['<OUT>'])
    return encoded_sequence

def decode_output_sequence(output_sequence, int2word):
    decoded_sequence = ''
    for integer in output_sequence:
        if integer > 0:
            decoded_sequence += int2word[integer] + ' '
    return decoded_sequence

# Define a function to generate a response from the chatbot
def generate_response(input_text):
    # Encode the input sequence
    input_sequence = encode_input_sequence(input_text, questionswords2int)
    # Pad the input sequence
    padded_input_sequence = pad_sequences([input_sequence], maxlen=max_length, padding='post')
    # Predict the output sequence
    output_sequence = model.predict([padded_input_sequence, np.zeros((1, max_length))])[0]
    # Decode the output sequence
    decoded_output_sequence = decode_output_sequence(np.argmax(output_sequence, axis=1), answersints2word)
    # Remove the <EOS> token from the output sequence
    if '<EOS>' in decoded_output_sequence:
        decoded_output_sequence = decoded_output_sequence[:decoded_output_sequence.index('<EOS>')]
    return decoded_output_sequence

# Start the chat
while True:
    input_text = input('You: ')
    if input_text.lower() == 'quit':
        break
    response = generate_response(input_text)
    print('Bot:', response)


You: hi
Bot: mexico riddle do do is is is is <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> 
You: Hello
Bot: do <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> 
You: you know french?
Bot: <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> 
You: how is the movie
Bot: mexico mexico mexico riddle do do is is is is is is <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> <OUT> 
You: quit
