## Chatbot

In [None]:
import pandas as pd



# Load the tab-separated dialogue file: the file has no header row, so the two
# columns are named explicitly.
df = pd.read_csv(
    '/content/dialogs.txt',
    sep='\t',
    header=None,
    names=['Context', 'Response'],
)




In [None]:
# Display the freshly loaded Context/Response pairs (pandas truncates the middle rows).
df

Unnamed: 0,Context,Response
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


## Preprocessing

In [None]:

import re



def clean_text(text):
    """Return ``text`` lower-cased with punctuation collapsed to single spaces.

    Every run of characters other than ASCII letters and digits is replaced
    by one space, and leading/trailing whitespace is removed.
    """
    return re.sub(r"[^a-zA-Z0-9]+", " ", text.lower()).strip()

# Run the same normalisation over both sides of every dialogue pair.
for column in ('Context', 'Response'):
    df[column] = df[column].apply(clean_text)


In [None]:
# Display the frame again to check the effect of clean_text on both columns.
df

Unnamed: 0,Context,Response
0,hi how are you doing,i m fine how about yourself
1,i m fine how about yourself,i m pretty good thanks for asking
2,i m pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i ve been great what about you
4,i ve been great what about you,i ve been good i m in school right now
...,...,...
3720,that s a good question maybe it s not old age,are you right handed
3721,are you right handed,yes all my life
3722,yes all my life,you re wearing out your right hand stop using ...
3723,you re wearing out your right hand stop using ...,but i do all my writing with my right hand


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

start_token = 'startseq'
end_token = 'endseq'


def _wrap_response(text):
    """Surround ``text`` with the start/end markers unless it already has them.

    The guard keeps this cell idempotent: re-running it must not produce
    responses such as ``startseq startseq ... endseq endseq``.
    """
    if text.startswith(f"{start_token} ") and text.endswith(f" {end_token}"):
        return text
    return f"{start_token} {text} {end_token}"


# Add start and end tokens to each response so the decoder can learn where a
# reply begins and ends.
df['Response'] = df['Response'].apply(_wrap_response)

# Fit one shared vocabulary over contexts and (wrapped) responses.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([df['Context'], df['Response']]))

# Convert every sentence into its list of integer token ids.
df['Context_seq'] = tokenizer.texts_to_sequences(df['Context'])
df['Response_seq'] = tokenizer.texts_to_sequences(df['Response'])


In [None]:
# Display the frame with the new Context_seq / Response_seq token-id columns.
df

Unnamed: 0,Context,Response,Context_seq,Response_seq
0,hi how are you doing,startseq i m fine how about yourself endseq,"[1498, 40, 19, 4, 175]","[1, 3, 33, 603, 40, 36, 551, 2]"
1,i m fine how about yourself,startseq i m pretty good thanks for asking endseq,"[3, 33, 603, 40, 36, 551]","[1, 3, 33, 159, 47, 243, 28, 485, 2]"
2,i m pretty good thanks for asking,startseq no problem so how have you been endseq,"[3, 33, 159, 47, 243, 28, 485]","[1, 32, 171, 24, 40, 17, 4, 100, 2]"
3,no problem so how have you been,startseq i ve been great what about you endseq,"[32, 171, 24, 40, 17, 4, 100]","[1, 3, 70, 100, 104, 12, 36, 4, 2]"
4,i ve been great what about you,startseq i ve been good i m in school right no...,"[3, 70, 100, 104, 12, 36, 4]","[1, 3, 70, 100, 47, 3, 33, 20, 94, 69, 113, 2]"
...,...,...,...,...
3720,that s a good question maybe it s not old age,startseq are you right handed endseq,"[10, 9, 8, 47, 563, 80, 7, 9, 42, 133, 584]","[1, 19, 4, 69, 2283, 2]"
3721,are you right handed,startseq yes all my life endseq,"[19, 4, 69, 2283]","[1, 27, 59, 29, 286, 2]"
3722,yes all my life,startseq you re wearing out your right hand st...,"[27, 59, 29, 286]","[1, 4, 51, 1375, 74, 46, 69, 421, 216, 418, 7,..."
3723,you re wearing out your right hand stop using ...,startseq but i do all my writing with my right...,"[4, 51, 1375, 74, 46, 69, 421, 216, 418, 7, 24...","[1, 31, 3, 13, 59, 29, 762, 52, 29, 69, 421, 2]"


In [None]:

# Pad both id columns on the right with zeros up to the longest sequence found
# in either column, so contexts and responses share one fixed length.
sequence_columns = ['Context_seq', 'Response_seq']
max_len = max(df[name].apply(len).max() for name in sequence_columns)

for name in sequence_columns:
    df[name] = pad_sequences(df[name], maxlen=max_len, padding='post').tolist()


In [None]:
# Display the frame after padding: both *_seq columns are now zero-padded on the right.
df

Unnamed: 0,Context,Response,Context_seq,Response_seq
0,hi how are you doing,startseq i m fine how about yourself endseq,"[1498, 40, 19, 4, 175, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 3, 33, 603, 40, 36, 551, 2, 0, 0, 0, 0, 0,..."
1,i m fine how about yourself,startseq i m pretty good thanks for asking endseq,"[3, 33, 603, 40, 36, 551, 0, 0, 0, 0, 0, 0, 0,...","[1, 3, 33, 159, 47, 243, 28, 485, 2, 0, 0, 0, ..."
2,i m pretty good thanks for asking,startseq no problem so how have you been endseq,"[3, 33, 159, 47, 243, 28, 485, 0, 0, 0, 0, 0, ...","[1, 32, 171, 24, 40, 17, 4, 100, 2, 0, 0, 0, 0..."
3,no problem so how have you been,startseq i ve been great what about you endseq,"[32, 171, 24, 40, 17, 4, 100, 0, 0, 0, 0, 0, 0...","[1, 3, 70, 100, 104, 12, 36, 4, 2, 0, 0, 0, 0,..."
4,i ve been great what about you,startseq i ve been good i m in school right no...,"[3, 70, 100, 104, 12, 36, 4, 0, 0, 0, 0, 0, 0,...","[1, 3, 70, 100, 47, 3, 33, 20, 94, 69, 113, 2,..."
...,...,...,...,...
3720,that s a good question maybe it s not old age,startseq are you right handed endseq,"[10, 9, 8, 47, 563, 80, 7, 9, 42, 133, 584, 0,...","[1, 19, 4, 69, 2283, 2, 0, 0, 0, 0, 0, 0, 0, 0..."
3721,are you right handed,startseq yes all my life endseq,"[19, 4, 69, 2283, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 27, 59, 29, 286, 2, 0, 0, 0, 0, 0, 0, 0, 0..."
3722,yes all my life,startseq you re wearing out your right hand st...,"[27, 59, 29, 286, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 4, 51, 1375, 74, 46, 69, 421, 216, 418, 7,..."
3723,you re wearing out your right hand stop using ...,startseq but i do all my writing with my right...,"[4, 51, 1375, 74, 46, 69, 421, 216, 418, 7, 24...","[1, 31, 3, 13, 59, 29, 762, 52, 29, 69, 421, 2..."


In [None]:

# Forward lookup (word -> id) comes straight from the fitted tokenizer.
word_to_index = tokenizer.word_index

# Reverse lookup (id -> word), used to turn sampled ids back into text.
index_to_word = dict((idx, token) for token, idx in word_to_index.items())

# One extra slot on top of the known words, for the zero id used as padding.
vocab_size = len(word_to_index) + 1


In [None]:
# Show the vocabulary size (number of tokenizer words plus one).
vocab_size

2458

In [None]:
import numpy as np


# Turn the padded id lists into dense integer matrices, one row per dialogue pair.
encoder_input_data, decoder_input_data = (
    np.array(df[column].tolist()) for column in ('Context_seq', 'Response_seq')
)

# Teacher-forcing targets: the decoder input shifted one step to the left, with
# the vacated last column left at zero.
shifted = decoder_input_data[:, 1:]
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :shifted.shape[1]] = shifted


In [None]:
# Inspect the encoder input matrix built from the padded Context_seq column.
encoder_input_data

array([[1498,   40,   19, ...,    0,    0,    0],
       [   3,   33,  603, ...,    0,    0,    0],
       [   3,   33,  159, ...,    0,    0,    0],
       ...,
       [  27,   59,   29, ...,    0,    0,    0],
       [   4,   51, 1375, ...,    0,    0,    0],
       [  31,    3,   13, ...,    0,    0,    0]])

In [None]:
# First decoder input row, taken from the padded Response_seq column.
decoder_input_data[0]

array([  1,   3,  33, 603,  40,  36, 551,   2,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])

In [None]:
# First decoder target row: the decoder input row shifted left by one position.
decoder_target_data[0]

array([  3,  33, 603,  40,  36, 551,   2,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])

## Model

In [None]:
import tensorflow as tf
class LuongAttention(tf.keras.Model):
    """Luong-style attention of decoder states over encoder outputs.

    Args:
        rnn_size: Output width of the projection layer created for the
            ``'general'`` and ``'concat'`` score functions.
        attention_func: Score function to use: ``'dot'``, ``'general'`` or
            ``'concat'``.

    Raises:
        ValueError: If ``attention_func`` is not one of the supported names.
    """

    def __init__(self, rnn_size, attention_func):
        super(LuongAttention, self).__init__()

        # Reject unknown score functions before any layer is created.
        if attention_func not in ['dot', 'general', 'concat']:
            raise ValueError(
                'Attention score must be either dot, general or concat.')
        self.attention_func = attention_func

        if attention_func == 'general':
            # score = decoder . (Wa . encoder)
            self.wa = tf.keras.layers.Dense(rnn_size)
        elif attention_func == 'concat':
            # score = va . tanh(Wa . [decoder; encoder])
            self.wa = tf.keras.layers.Dense(rnn_size, activation='tanh')
            self.va = tf.keras.layers.Dense(1)

    def call(self, decoder_output, encoder_output):
        """Attend from every decoder step to every encoder step.

        Args:
            decoder_output: Rank-3 tensor ``(batch, target_steps, features)``.
            encoder_output: Rank-3 tensor ``(batch, source_steps, features)``.

        Returns:
            A ``(context, alignment)`` tuple. ``alignment`` holds the softmax
            weights over the source steps, shaped
            ``(batch, target_steps, source_steps)``; ``context`` is the
            alignment-weighted sum of ``encoder_output``.
        """
        if self.attention_func == 'dot':
            score = tf.matmul(decoder_output, encoder_output, transpose_b=True)
        elif self.attention_func == 'general':
            score = tf.matmul(decoder_output, self.wa(
                encoder_output), transpose_b=True)
        elif self.attention_func == 'concat':
            # Use dynamic shapes: the static time dimension is None for
            # variable-length inputs, and the decoder may carry several steps.
            target_steps = tf.shape(decoder_output)[1]
            source_steps = tf.shape(encoder_output)[1]

            # Pair every decoder step with every encoder step:
            # (batch, target, 1, d) and (batch, 1, source, d) -> (batch, target, source, d).
            decoder_tiled = tf.tile(
                tf.expand_dims(decoder_output, 2), [1, 1, source_steps, 1])
            encoder_tiled = tf.tile(
                tf.expand_dims(encoder_output, 1), [1, target_steps, 1, 1])

            score = self.va(
                self.wa(tf.concat((decoder_tiled, encoder_tiled), axis=-1)))

            # Drop the trailing unit axis -> (batch, target, source).
            score = tf.squeeze(score, axis=-1)

        # Normalise the scores over the source steps.
        alignment = tf.keras.activations.softmax(score, axis=-1)

        # Weighted sum of encoder outputs for each decoder step.
        context = tf.matmul(alignment, encoder_output)

        return context, alignment

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model

# Hyper-parameters of the seq2seq network.
embedding_dim = 256
units = 512

# ---- Encoder: embed the context and keep every step for the attention ----
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim)
encoder_emb = encoder_embedding(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(units, return_sequences=True, return_state=True)(encoder_emb)

# ---- Decoder: starts from the encoder's final state ----
# The embedding and LSTM layers are kept as named objects so that their
# trained weights can be reused when building an inference-time decoder.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim)
decoder_emb = decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=[state_h, state_c])

# ---- Attention of each decoder step over all encoder steps ----
attention = LuongAttention(units, attention_func='dot')
context_vector, alignment = attention(decoder_outputs, encoder_outputs)

# Predict the next word from the attention context joined with the decoder state.
decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_outputs])

decoder_dense = Dense(vocab_size, activation='softmax')
outputs = decoder_dense(decoder_combined_context)

model = Model([encoder_inputs, decoder_inputs], outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

model.summary()


In [None]:
# Teacher forcing: the decoder sees the reply and is trained to emit the same
# reply shifted by one token. The sparse loss wants a trailing unit axis on the targets.
train_inputs = [encoder_input_data, decoder_input_data]
train_targets = np.expand_dims(decoder_target_data, axis=-1)

model.fit(
    train_inputs,
    train_targets,
    batch_size=64,
    epochs=50,
    validation_split=0.2,
)

Epoch 1/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - accuracy: 0.5964 - loss: 3.7384 - val_accuracy: 0.6773 - val_loss: 2.1878
Epoch 2/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.6976 - loss: 1.9193 - val_accuracy: 0.6788 - val_loss: 2.1650
Epoch 3/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.7045 - loss: 1.8326 - val_accuracy: 0.6849 - val_loss: 2.1224
Epoch 4/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.7120 - loss: 1.7471 - val_accuracy: 0.6900 - val_loss: 2.0616
Epoch 5/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.7136 - loss: 1.6734 - val_accuracy: 0.6947 - val_loss: 2.0293
Epoch 6/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.7215 - loss: 1.5859 - val_accuracy: 0.7009 - val_loss: 2.0011
Epoch 7/50
[1m47/47[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7c9731ead030>

## Inference Model

In [None]:

# Stand-alone encoder for inference: returns the per-step outputs (for the
# attention) together with the final hidden and cell states.
encoder_model = Model(
    inputs=encoder_inputs,
    outputs=[encoder_outputs, state_h, state_c],
)


In [None]:

# ---- Step-wise decoder used at inference time ----
# Placeholders for everything the decoder receives from outside at each step:
# the previously generated token, the encoder outputs (for the attention) and
# the LSTM state carried over from the previous step.
decoder_inputs_single = Input(shape=(None,))
encoder_outputs_input = Input(shape=(None, units))
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))

# Reuse the *trained* decoder embedding. A freshly created Embedding layer
# would carry random weights and make the decoder emit noise. The trained layer
# is the Embedding in `model` that was applied to `decoder_inputs`.
decoder_emb_layer = next(
    (layer for layer in model.layers
     if isinstance(layer, Embedding) and layer.input is decoder_inputs),
    None,
)
if decoder_emb_layer is None:
    raise RuntimeError(
        'Could not locate the trained decoder embedding layer in `model`.')
decoder_emb2 = decoder_emb_layer(decoder_inputs_single)

# Same LSTM, attention and output layers as in training, now fed from the placeholders.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    decoder_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# Separate names keep the training graph's context_vector / alignment intact.
context_vector2, alignment2 = attention(decoder_outputs2, encoder_outputs_input)

decoder_combined_context2 = Concatenate(axis=-1)([context_vector2, decoder_outputs2])

decoder_outputs2 = decoder_dense(decoder_combined_context2)

# Input order: [token, encoder outputs, h, c]; outputs: [probabilities, h, c, alignment].
decoder_model = Model(
    [decoder_inputs_single, encoder_outputs_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2, state_h2, state_c2, alignment2])


In [None]:
def decode_sequence(input_seq, temperature=1.0):
    """Generate a reply for one encoded context by sampling word by word.

    Args:
        input_seq: Batch holding a single padded token-id sequence, passed
            unchanged to ``encoder_model.predict``.
        temperature: Strictly positive sampling temperature. Values below 1
            sharpen the decoder's distribution, values above 1 flatten it.

    Returns:
        The sampled words joined by single spaces. The end marker is not
        included, and at most ``max_len`` words are produced.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be strictly positive.')

    # verbose=0 keeps Keras from printing one progress bar per generated token.
    encoder_outs, h, c = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_to_index[start_token]
    decoded_words = []

    # Stop after max_len words in case the end marker is never sampled.
    while len(decoded_words) < max_len:
        output_tokens, h, c, _ = decoder_model.predict(
            [target_seq, encoder_outs, h, c], verbose=0)

        # The decoder's final layer is a softmax, so it returns probabilities.
        # Temperature has to be applied in log space: exponentiating the
        # probabilities directly yields an almost uniform distribution.
        probs = np.asarray(output_tokens[0, -1, :], dtype=np.float64)
        logits = np.log(np.maximum(probs, 1e-12)) / temperature
        logits[0] = -np.inf  # id 0 is padding, never a real word
        logits -= np.max(logits)  # numerical stability before exp
        preds = np.exp(logits)
        preds /= np.sum(preds)

        sampled_token_index = int(np.random.choice(len(preds), p=preds))
        sampled_word = index_to_word.get(sampled_token_index, '')

        if sampled_word == end_token:
            break
        decoded_words.append(sampled_word)

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ' '.join(decoded_words)


## Prediction

In [None]:
input_text = 'what are you doing'

# Apply the same normalisation that was used on the training contexts, so that
# punctuation and casing in user input map onto the trained vocabulary.
input_ids = tokenizer.texts_to_sequences([clean_text(input_text)])

# Pad to the fixed length used during training.
input_seq = pad_sequences(input_ids, maxlen=max_len, padding='post')

decoded_sentence = decode_sequence(input_seq)
print("Decoded sentence:", decoded_sentence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28

## Evaluating its performance

In [None]:
!pip install numpy tensorflow transformers nltk sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [None]:
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, BertModel
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util
import torch
# Pretrained encoders used to score generated replies: a BERT tokenizer/model
# pair and a sentence-transformers model.
BERT_CHECKPOINT = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')



# BERT-based similarity
def compute_bert_similarity(sent1, sent2):
    """Cosine similarity between mean-pooled BERT embeddings of two texts.

    Each text is tokenized on its own, passed through ``bert_model`` without
    gradient tracking, and averaged over the token axis of the last hidden
    state. The similarity is returned as a Python float.
    """
    encoded_pair = [
        bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        for text in (sent1, sent2)
    ]

    with torch.no_grad():
        pooled_first, pooled_second = [
            bert_model(**encoded).last_hidden_state.mean(dim=1)
            for encoded in encoded_pair
        ]

    return torch.nn.functional.cosine_similarity(pooled_first, pooled_second).item()

# Sentence-transformer similarity
def compute_sentence_transformer_similarity(sent1, sent2):
    """Cosine similarity between sentence-transformer embeddings of two texts.

    Both texts are encoded separately to tensors and compared with
    ``util.pytorch_cos_sim``; the single score is returned as a Python float.
    """
    first_vec, second_vec = (
        sentence_transformer.encode(text, convert_to_tensor=True)
        for text in (sent1, sent2)
    )
    score = util.pytorch_cos_sim(first_vec, second_vec)
    return score.item()

# Responses to compare: the reply sampled by the chatbot and a reference reply.
generated_response = decoded_sentence
# NOTE(review): the reference is a hard-coded string; presumably it should be
# the expected reply for the prompt that was decoded — confirm.
reference_response = "responding you"

# BERT-based similarity
bert_similarity = compute_bert_similarity(reference_response, generated_response)
print(f"BERT-based similarity: {bert_similarity:.4f}")

# Sentence Transformer similarity
st_similarity = compute_sentence_transformer_similarity(reference_response, generated_response)
print(f"Sentence Transformer similarity: {st_similarity:.4f}")

# Print both responses next to the scores for manual inspection.
print(f"\nGenerated Response: {generated_response}")
print(f"Reference Response: {reference_response}")



BERT-based similarity: 0.5611
Sentence Transformer similarity: 0.1133

Generated Response: playing twice fill wipe immediately predictable handed joking though iron hours score clears solve meet 2003 types rubber selling barbara tape correct sing
Reference Response: responding you
