<a href="https://colab.research.google.com/github/ravinnd3/Chatbot-without-LLM-s/blob/main/Chatbot_without_using_LLM's.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade nltk


In [None]:
!pip install contractions

In [None]:
import kagglehub
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split

import re
import contractions
from collections import Counter

import pickle
import nltk

nltk.download('punkt')                    # Tokenizer
nltk.download('tagsets')                    #tag info
nltk.download('punkt_tab') # Download punkt_tab for tokenization
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')  # POS tagger


from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

from nltk.corpus import wordnet




In [None]:
# Fetch the dialog dataset with kagglehub; `path` is whatever local location
# the download call returns.
path = kagglehub.dataset_download("grafstor/simple-dialogs-for-chatbot")

# Define the desired destination path
destination_path = "/content/chatbot"

# Create the destination directory if it doesn't exist
os.makedirs(destination_path, exist_ok=True)

# Copy the contents of the downloaded dataset to the destination path.
# shutil is used instead of `!cp` so that a path containing quotes or shell
# metacharacters cannot break the command; dirs_exist_ok=True keeps the cell
# re-runnable when the destination already holds the files.
import shutil
shutil.copytree(path, destination_path, dirs_exist_ok=True)

print(f"Dataset copied to: {destination_path}")

# List files in the destination directory to confirm
print("\n".join(sorted(os.listdir(destination_path))))

In [None]:
# Read the tab-separated dialog file; it has no header row, so the two columns
# are named explicitly.
data = pd.read_csv(
    "/content/chatbot/dialogs.txt",
    sep='\t',
    header=None,
    names=['question', 'answer'],
)


In [None]:
data

In [None]:
data.info()

In [None]:
data.isna().sum()

In [None]:
data.duplicated().sum()

**Checking Sentence length**

In [None]:
# Number of whitespace-separated words in every question and every answer.
for text_col, len_col in (('question', 'q_len'), ('answer', 'a_len')):
    data[len_col] = data[text_col].apply(lambda sentence: len(sentence.split()))


In [None]:
data[['q_len','a_len']].hist(bins=20)


**Vocabulary analysis**

In [None]:

# Pool all questions and all answers into one string and count its
# whitespace-separated words.
question_text = ' '.join(data['question'])
answer_text = ' '.join(data['answer'])
all_text = question_text + ' ' + answer_text
word_counts = Counter(all_text.split())

print("Vocabulary size:", len(word_counts))
print("Most common words:", word_counts.most_common(20))


In [None]:
# Remove the helper length columns. Reassigning with errors='ignore' (instead
# of an in-place drop) keeps the cell idempotent: re-running it after the
# columns are gone no longer raises KeyError.
data = data.drop(columns=['q_len', 'a_len'], errors='ignore')

**Cleaning Text**

In [None]:
def clean_text(text):
    """Normalise one sentence.

    Lower-cases the text, passes it through ``contractions.fix``, replaces every
    character other than ASCII letters, digits, ``?``, ``!``, ``.`` and ``'``
    with a space, then collapses runs of whitespace and trims both ends.
    """
    lowered = contractions.fix(text.lower())
    kept = re.sub(r"[^a-zA-Z0-9?!.']", " ", lowered)
    # After the substitution the only whitespace left is the plain space, so
    # split/join both collapses repeated spaces and strips the ends.
    return ' '.join(kept.split())

# Run the same normalisation over both sides of every dialog pair.
for column in ('question', 'answer'):
    data[column] = data[column].apply(clean_text)

In [None]:
data

METHOD 1: Retrieval-Based Chatbot (No Deep Learning Needed)

When a user asks something, the chatbot:

Finds the most similar question in your dataset.

Returns the paired answer.

In [None]:
# Create TF-IDF model over the (cleaned) questions
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data['question'])

def chatbot_response(user_input):
    """Return the answer paired with the stored question most similar to ``user_input``.

    The input is first passed through ``clean_text`` so it is normalised the
    same way as the questions the vectorizer was fitted on; similarity is the
    cosine similarity between TF-IDF vectors.

    Returns:
        The matching answer string, or a fallback message when the input shares
        no vocabulary with any stored question. In that case every similarity
        is zero and ``argmax`` would otherwise pick row 0, an unrelated answer.
    """
    user_tfidf = vectorizer.transform([clean_text(user_input)])
    similarities = cosine_similarity(user_tfidf, tfidf)
    idx = similarities.argmax()
    if similarities.flat[idx] <= 0:
        return "Sorry, I did not understand that."
    return data.iloc[idx]['answer']

# Interactive loop: answer each lower-cased line the user types until it is 'bye'.
print("Chatbot: Hi! Type 'bye' to exit.")
user_input = input("You: ").lower()
while user_input != 'bye':
    response = chatbot_response(user_input)
    print("Chatbot:", response)
    user_input = input("You: ").lower()
print("Chatbot: Goodbye!")


```
METHOD 2: Generative Chatbot (Seq2Seq using LSTM)
If you want the chatbot to generate new sentences (not just pick from known ones), use a Seq2Seq neural network.
🧮 Steps
Step 1: Preprocess
Tokenize questions and answers
Add <start> and <end> tokens to answers
Pad sequences to same length
Step 2: Train Encoder–Decoder LSTM
Encoder: reads the question
Decoder: generates the answer, word by word
Step 3: Predict response
Given a user query → encode it → decode word by word until <end>



```



In [None]:
# Add start and end tokens so the decoder can tell where an answer begins and ends.
# Answers that already carry both markers are left untouched, so re-running
# this cell no longer yields '<start> <start> ... <end> <end>'.
data['answer'] = data['answer'].apply(
    lambda x: x if x.startswith('<start> ') and x.endswith(' <end>')
    else '<start> ' + x + ' <end>'
)



In [None]:
data

In [None]:
# Tokenizer shared by questions and answers. filters='' disables character
# filtering, so punctuation and the '<start>'/'<end>' markers stay in the tokens.
tokenizer = Tokenizer(filters='', oov_token=None)
tokenizer.fit_on_texts(data['question'].tolist() + data['answer'].tolist())
vocab_size = 1 + len(tokenizer.word_index)


In [None]:
vocab_size



```
🚀 Which Should You Choose?
Approach	Good For	Needs GPU?	Learns new replies?
Retrieval (TF-IDF)	Small data, FAQs	❌ No	❌ No
Seq2Seq (LSTM)	Conversational data	⚙️ Optional (faster with GPU)	✅ Yes
```



In [None]:
# Convert to sequences of integer word ids (questions -> X, answers -> y)
X, y = (tokenizer.texts_to_sequences(data[column]) for column in ('question', 'answer'))



In [None]:
# Padding: bring every question and answer to the length of the longest
# sequence on either side, with zeros added at the end.
longest_question = max(map(len, X))
longest_answer = max(map(len, y))
max_len = max(longest_question, longest_answer)
X, y = (pad_sequences(seqs, maxlen=max_len, padding='post') for seqs in (X, y))



In [None]:
# Decoder inputs/outputs: the input drops the last position of each answer and
# the target drops the first, so target[t] is the token that follows input[t].
y_input, y_output = y[:, :-1], y[:, 1:]


**Build seq2seq model**

In [None]:
# Layer sizes used by both the encoder and the decoder.
embed_dim = 128
lstm_units = 256

# Encoder: embed the padded question and keep only the LSTM's final hidden and
# cell states, which are handed to the decoder as its initial state.
encoder_inputs = Input(shape=(max_len,), name="encoder_input")
enc_emb = Embedding(
    input_dim=vocab_size, output_dim=embed_dim, name="encoder_embedding"
)(encoder_inputs)
encoder_lstm = LSTM(units=lstm_units, return_state=True, name="encoder_lstm")
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]


In [None]:
# Decoder: embeds the shifted answer tokens (length max_len - 1), runs an LSTM
# started from the encoder states, and scores every vocabulary word at each
# step with a softmax layer.
decoder_inputs = Input(shape=(max_len - 1,), name="decoder_input")
dec_emb = Embedding(
    input_dim=vocab_size, output_dim=embed_dim, name="decoder_embedding"
)(decoder_inputs)
decoder_lstm = LSTM(
    units=lstm_units, return_sequences=True, return_state=True, name="decoder_lstm"
)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(units=vocab_size, activation='softmax', name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: (question ids, shifted answer ids) -> per-step word probabilities.
# The sparse loss takes integer word ids as targets.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.summary()

In [None]:
# model training (silent); the targets get a trailing axis of size 1
model.fit(
    x=[X, y_input],
    y=y_output[..., np.newaxis],
    batch_size=64,
    epochs=200,
    verbose=0,
)


In [None]:
# Save tokenizer and model
# The context manager closes (and therefore flushes) the file deterministically
# instead of leaving an open handle for the garbage collector.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [None]:
model.save("seq2seq_chatbot.h5")

 **Inference Models (seq2seq)**


In [None]:
# Load model and tokenizer
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer.pkl that
# this notebook wrote itself. The context manager closes the file handle.
with open("tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
model = load_model("seq2seq_chatbot.h5")

In [None]:
# Encoder inference: reuse the loaded model's encoder input, embedding and LSTM
# layers to map a padded question to the LSTM's two final states.
encoder_inputs_inf = model.input[0]
encoder_emb_layer, encoder_lstm_layer = (
    model.get_layer(layer_name) for layer_name in ("encoder_embedding", "encoder_lstm")
)
enc_emb_inf = encoder_emb_layer(encoder_inputs_inf)
_, state_h_enc, state_c_enc = encoder_lstm_layer(enc_emb_inf)
encoder_model = Model(inputs=encoder_inputs_inf, outputs=[state_h_enc, state_c_enc])

In [None]:
# Decoder inference: a single-step decoder built from the loaded model's layers.
# Inputs: the previous token id plus the previous LSTM states.
# Outputs: the word scores for the next step plus the updated states.
decoder_state_input_h = Input(shape=(lstm_units,), name="decoder_state_input_h")
decoder_state_input_c = Input(shape=(lstm_units,), name="decoder_state_input_c")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_inf = Input(shape=(1,), name="decoder_input_infer")
decoder_emb_layer = model.get_layer("decoder_embedding")
decoder_lstm_layer = model.get_layer("decoder_lstm")
decoder_dense_layer = model.get_layer("decoder_dense")

dec_emb2 = decoder_emb_layer(decoder_inputs_inf)
decoder_outputs2, state_h2, state_c2 = decoder_lstm_layer(
    dec_emb2, initial_state=decoder_states_inputs
)
decoder_outputs2 = decoder_dense_layer(decoder_outputs2)
decoder_model = Model(
    inputs=[decoder_inputs_inf, *decoder_states_inputs],
    outputs=[decoder_outputs2, state_h2, state_c2],
)

In [None]:
# Reverse tokenizer: integer id -> word, used to turn predicted ids back into text
reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}

In [None]:

def sentence_to_seq(sentence, max_len=max_len):
    """Turn a raw sentence into a padded id sequence for the encoder.

    The sentence goes through ``clean_text``, the same normalisation applied to
    the training questions, before it is tokenised. Lower-casing alone left
    contractions and punctuation-attached words (e.g. ``"hi,"``) in forms the
    tokenizer was never fitted on.

    Args:
        sentence: Raw user text.
        max_len: Length to pad to; defaults to the value of ``max_len`` at the
            time this function was defined.

    Returns:
        The result of ``pad_sequences`` on the single tokenised sentence,
        zero-padded at the end.
    """
    seq = tokenizer.texts_to_sequences([clean_text(sentence)])
    return pad_sequences(seq, maxlen=max_len, padding='post')

def decode_sequence(input_text, max_output_len=15):
    """Greedily decode a reply to ``input_text`` with the inference models.

    The question is encoded once. The decoder is then stepped one token at a
    time, starting from ``<start>`` and always taking the highest-scoring word,
    until it produces ``<end>``, an id with no word in ``reverse_word_index``,
    or ``max_output_len`` words.

    Args:
        input_text: The user's sentence.
        max_output_len: Upper bound on the number of generated words.

    Returns:
        The generated words joined by spaces, passed through ``str.capitalize``.

    Raises:
        KeyError: If ``'<start>'`` is not in the tokenizer's vocabulary.
    """
    # verbose=0: predict() otherwise prints a progress bar on every call, i.e.
    # once per generated word, which floods the chat output.
    states_value = list(encoder_model.predict(sentence_to_seq(input_text), verbose=0))
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']

    words = []
    for _ in range(max_output_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, 0, :])
        sampled_word = reverse_word_index.get(sampled_token_index, '')

        if sampled_word in ('<end>', ''):
            break

        words.append(sampled_word)
        # Feed the chosen word and the updated states into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(words).strip().capitalize()




In [None]:
# Chat: reply to every line until the user types 'bye' (in any letter case)
print("Chatbot is ready! Type 'bye' to exit.")
user_input = input("You: ")
while user_input.lower() != 'bye':
    reply = decode_sequence(user_input)
    print("Bot:", reply)
    user_input = input("You: ")
print("Bot: Goodbye!")

In [None]:
# A few fixed prompts to eyeball the model's replies without typing.
samples = [
    "how are you?",
    "are you right handed?",
    "i am pretty good. thanks for asking.",
    "no problem. so how have you been?",
    "but i do all my writing with my right hand.",
]

for s in samples:
    print("You: " + s)
    bot_reply = decode_sequence(s)
    print("Bot: " + bot_reply)
    print("---")


With Context Window – to remember the last 3 conversation messages

In [None]:
reverse_word_index = {v: k for k, v in tokenizer.word_index.items()}
# Read the encoder length from the model itself rather than hard-coding 21, so
# padded inputs always match the input shape the encoder was built with.
encoder_input_len = int(encoder_model.inputs[0].shape[1])
context_window = 3  # remember last 3 messages (user+bot)

def sentence_to_seq(sentence):
    """Tokenise the lower-cased sentence and pad it (zeros at the end) to ``encoder_input_len`` ids."""
    token_ids = tokenizer.texts_to_sequences([sentence.lower()])
    padded = pad_sequences(token_ids, maxlen=encoder_input_len, padding='post')
    return padded

def decode_sequence(input_text):
    """Greedily generate a reply for ``input_text`` (which may include context).

    The text is encoded once. The decoder then runs one token per step,
    starting from ``<start>`` (id 1 is used if the marker is missing from the
    vocabulary) and always picking the highest-scoring word. It stops on
    ``<end>``/``end``, on an id with no word in ``reverse_word_index``, or
    after ``encoder_input_len - 1`` words.

    Returns:
        The generated words joined by spaces, passed through ``str.capitalize``.
    """
    # Encode input text. verbose=0 keeps predict() from printing a progress
    # bar on every call, i.e. once per generated word.
    states_value = list(encoder_model.predict(sentence_to_seq(input_text), verbose=0))

    # Initialize target sequence with start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index.get('<start>', 1)

    words = []
    for _ in range(encoder_input_len - 1):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_word_index.get(sampled_token_index, '')

        # Break if <end> or nothing predicted
        if sampled_word.lower() in ('<end>', 'end', ''):
            break

        words.append(sampled_word)
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    # Clean and return nicely formatted output
    return ' '.join(words).strip().capitalize()

#Context Memory
context_history = []  # stores conversation turns

# The banner used to advertise 'quit' while only 'bye' ended the loop; both
# words are accepted now and the banner says so.
print("Context-Aware Chatbot is ready! Type 'bye' or 'quit' to exit.\n")
while True:
    user_input = input("You: ")
    if user_input.strip().lower() in ('bye', 'quit'):
        print("Bot: Goodbye!")
        break

    # Build contextual input using last few turns
    context_str = " ".join(context_history[-context_window:]) + " " + user_input
    reply = decode_sequence(context_str)

    print("Bot:", reply)

    # Save context (keep conversation memory)
    context_history.append(f"user: {user_input}")
    context_history.append(f"bot: {reply}")


**With POS Tagging**

In [None]:
!pip install --upgrade nltk

In [None]:
all_sentences = list(data['question']) + list(data['answer'])

In [None]:
# pos_tags_set = set()
# for sent in all_sentences:
#     tags = [tag for word, tag in nltk.pos_tag(nltk.word_tokenize(sent))]
#     pos_tags_set.update(tags)

# pos2idx = {tag: i+1 for i, tag in enumerate(sorted(pos_tags_set))}
# pos_vocab_size = len(pos2idx) + 1  # +1 for padding
# pos_vocab_size

In [None]:
# Plain Python lists of the dialog pairs, in DataFrame row order.
questions, answers = data['question'].tolist(), data['answer'].tolist()


In [None]:
# Add start and end tokens
# data['answer'] was already wrapped in '<start> ... <end>' for the first
# seq2seq model, so wrapping unconditionally doubled the markers. Only answers
# that do not carry both markers are wrapped, which also makes the cell
# safe to re-run.
answers = [
    ans if ans.startswith("<start> ") and ans.endswith(" <end>")
    else "<start> " + ans + " <end>"
    for ans in answers
]


In [None]:
data

In [None]:
# --- Word tokenizer ---
# The two marker strings are fitted explicitly alongside the questions and
# answers; filters='' keeps their angle brackets intact.
tokenizer = Tokenizer(filters='', oov_token=None)
tokenizer.fit_on_texts(["<start>", "<end>", *questions, *answers])
vocab_size = 1 + len(tokenizer.word_index)


In [None]:
# POS tokenizer: collect every tag NLTK assigns anywhere in the questions and
# answers, then number the sorted tags from 1 upwards (0 is never assigned).
all_sentences = questions + answers
pos_tags_set = {
    tag
    for sent in all_sentences
    for _word, tag in nltk.pos_tag(nltk.word_tokenize(sent))
}

pos2idx = {tag: position for position, tag in enumerate(sorted(pos_tags_set), start=1)}
pos_vocab_size = len(pos2idx) + 1
pos_embed_dim = 32

In [None]:

# Sequence length and layer sizes for the POS-aware model.
max_len, embed_dim, lstm_units = 20, 128, 256

In [None]:
def sentence_to_seq_and_pos(sentence):
    """Encode a sentence as position-aligned word-id and POS-id sequences.

    Both sequences are now built from the same token list. Before, word ids
    came from the Keras tokenizer (lower-cased, split on spaces, unknown words
    skipped) while POS ids came from ``nltk.word_tokenize`` on the un-lowered
    text, which also splits off punctuation. The two sequences therefore had
    different lengths and position ``i`` described different tokens.

    Args:
        sentence: Raw sentence text.

    Returns:
        ``(seq, pos_seq)``: the padded word ids and the padded POS ids, each
        zero-padded at the end to ``max_len``. Tags missing from ``pos2idx``
        map to 0.
    """
    # Same split the tokenizer is configured with here (filters=''):
    # lower-case, split on single spaces, drop empty strings.
    tokens = [tok for tok in sentence.lower().split(' ') if tok]
    # Tag the full token list so the tagger sees the whole sentence ...
    tagged = nltk.pos_tag(tokens) if tokens else []
    # ... then keep only words the tokenizer knows, so every kept word id has
    # its own tag at the same position.
    known = [(word, tag) for word, tag in tagged if word in tokenizer.word_index]

    word_ids = [tokenizer.word_index[word] for word, _tag in known]
    pos_ids = [pos2idx.get(tag, 0) for _word, tag in known]

    seq = pad_sequences([word_ids], maxlen=max_len, padding='post')
    pos_seq = pad_sequences([pos_ids], maxlen=max_len, padding='post')

    return seq, pos_seq

In [None]:

# Build the training arrays: encoder word/POS ids for every question, plus the
# decoder input and its one-step-shifted target for every answer.
encoder_input_words_array = []
encoder_input_pos_array   = []
decoder_input_array       = []
decoder_target_array      = []

for question, answer in zip(questions, answers):
    w_seq, p_seq = sentence_to_seq_and_pos(question)
    encoder_input_words_array.append(w_seq[0])
    encoder_input_pos_array.append(p_seq[0])

    ans_seq = tokenizer.texts_to_sequences([answer])[0]
    decoder_input_array.append(ans_seq[:-1])   # input: <start> ... last word
    decoder_target_array.append(ans_seq[1:])   # target: first word ... <end>

encoder_input_words_array = pad_sequences(encoder_input_words_array, maxlen=max_len, padding='post')
encoder_input_pos_array   = pad_sequences(encoder_input_pos_array, maxlen=max_len, padding='post')
# truncating='post': pad_sequences truncates from the front by default, which
# removed the leading <start> token from answers longer than max_len. Cutting
# the tail keeps every decoder input starting with <start>; input and target
# are cut identically, so they stay aligned.
decoder_input_array       = pad_sequences(decoder_input_array, maxlen=max_len,
                                          padding='post', truncating='post')
decoder_target_array      = pad_sequences(decoder_target_array, maxlen=max_len,
                                          padding='post', truncating='post')
# Add a trailing axis of size 1 to the targets.
decoder_target_array      = np.expand_dims(decoder_target_array, -1)

#Encoder: word ids and POS ids are embedded separately, concatenated, and fed
# to an LSTM whose final hidden/cell states initialise the decoder.
encoder_input_words = Input(shape=(max_len,), name="encoder_words_input")
encoder_input_pos   = Input(shape=(max_len,), name="encoder_pos_input")

word_emb = Embedding(
    input_dim=vocab_size, output_dim=embed_dim, mask_zero=False
)(encoder_input_words)
pos_emb = Embedding(
    input_dim=pos_vocab_size, output_dim=pos_embed_dim, mask_zero=False
)(encoder_input_pos)

encoder_emb = Concatenate()([word_emb, pos_emb])
encoder_outputs, state_h, state_c = LSTM(units=lstm_units, return_state=True)(encoder_emb)
encoder_states = [state_h, state_c]

In [None]:
#Decoder: embeds the answer tokens, runs an LSTM started from the encoder
# states, and scores every vocabulary word at each step with a softmax layer.
decoder_inputs = Input(shape=(max_len,), name="decoder_input")
decoder_emb = Embedding(
    input_dim=vocab_size, output_dim=embed_dim, mask_zero=False, name="decoder_embedding"
)(decoder_inputs)
decoder_lstm = LSTM(units=lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)
decoder_dense = Dense(units=vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: (question word ids, question POS ids, decoder input ids)
# -> per-step word probabilities.
model = Model(
    inputs=[encoder_input_words, encoder_input_pos, decoder_inputs],
    outputs=decoder_outputs,
)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Training: validation_split=0.1 sets aside a tenth of the samples for validation
training_inputs = [encoder_input_words_array, encoder_input_pos_array, decoder_input_array]
model.fit(
    x=training_inputs,
    y=decoder_target_array,
    batch_size=32,
    epochs=100,
    validation_split=0.1,
)

In [None]:
# Inference Models
# Encoder: question word ids + POS ids -> final LSTM states.
encoder_model = Model([encoder_input_words, encoder_input_pos], encoder_states)

# The single-step decoder receives the previous LSTM states as explicit inputs.
decoder_state_input_h = Input(shape=(lstm_units,), name="decoder_state_input_h")
decoder_state_input_c = Input(shape=(lstm_units,), name="decoder_state_input_c")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_infer = Input(shape=(1,), name="decoder_input_infer")
decoder_emb_infer = model.get_layer("decoder_embedding")(decoder_inputs_infer)
# Reuse the *trained* decoder LSTM. Constructing a new LSTM layer here gave the
# inference decoder freshly initialised weights that were never trained, so
# generated replies did not reflect what the model had learned.
decoder_lstm_infer = decoder_lstm
dec_outputs, state_h2, state_c2 = decoder_lstm_infer(decoder_emb_infer, initial_state=decoder_states_inputs)
decoder_states = [state_h2, state_c2]
# decoder_lstm was built with return_sequences=True, so with a one-token input
# the scores carry a time axis of length 1; a flattened argmax over the single
# sample still yields the word id.
decoder_outputs = decoder_dense(dec_outputs)
decoder_model = Model([decoder_inputs_infer] + decoder_states_inputs, [decoder_outputs] + decoder_states)

#Reverse word index
reverse_word_index = {v:k for k,v in tokenizer.word_index.items()}

In [None]:
#Decode sequence
def decode_sequence(input_text):
    """Greedily decode a reply to ``input_text`` with the POS-aware models.

    The text is converted to word ids and POS ids and encoded once. The decoder
    is then stepped one token at a time from ``<start>``, taking the
    highest-scoring word each step, until it produces ``<end>``, an id with no
    word in ``reverse_word_index``, or ``max_len`` steps have run. A predicted
    ``<start>`` is fed back to the decoder but left out of the reply.

    Returns:
        The generated words joined by spaces, passed through ``str.capitalize``.

    Raises:
        KeyError: If ``'<start>'`` is not in the tokenizer's vocabulary.
    """
    w_seq, p_seq = sentence_to_seq_and_pos(input_text)
    # verbose=0: predict() otherwise prints a progress bar on every call, i.e.
    # once per generated word, which floods the chat output.
    states_value = list(encoder_model.predict([w_seq, p_seq], verbose=0))

    target_seq = np.zeros((1,1))
    target_seq[0,0] = tokenizer.word_index['<start>']

    words = []
    for _ in range(max_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        # Flattened argmax over the scores of the single sample in the batch.
        sampled_token_index = np.argmax(output_tokens[0])
        sampled_word = reverse_word_index.get(sampled_token_index, '')
        if sampled_word in ("<end>", ""):
            break
        if sampled_word != "<start>":
            words.append(sampled_word)
        target_seq[0,0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(words).strip().capitalize()




In [None]:
# Chat
context_history = []  # alternating user inputs and bot replies
context_window = 3    # how many trailing history entries are prepended to the input

# The banner used to advertise 'quit' while only 'bye' ended the loop; both
# words are accepted now and the banner says so.
print("Chatbot is ready! Type 'bye' or 'quit' to exit.")
while True:
    user_input = input("You: ")
    if user_input.strip().lower() in ('bye', 'quit'):
        print("Bot: Goodbye!")
        break
    # Prepend the most recent history entries to the new message.
    context_str = " ".join(context_history[-context_window:]) + " " + user_input
    reply = decode_sequence(context_str)
    print("Bot:", reply)
    context_history.append(user_input)
    context_history.append(reply)

In [None]:
data