<img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## LSTM Bot QA

### Datos
El objecto es utilizar datos disponibles del challenge ConvAI2 (Conversational Intelligence Challenge 2) de conversaciones en inglés. Se construirá un BOT para responder a preguntas del usuario (QA).\
[LINK](http://convai.io/data/)

In [1]:
!pip install --upgrade --no-cache-dir gdown --quiet

In [2]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import os
import gdown
import json

In [3]:
# Descargar la carpeta de dataset
if os.access('data_volunteers.json', os.F_OK) is False:
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    output = 'data_volunteers.json'
    gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")

El dataset ya se encuentra descargado


In [5]:
# dataset_file
text_file = "data_volunteers.json"
with open(text_file) as f:
    data = json.load(f) # la variable data será un diccionario

In [6]:
# Observar los campos disponibles en cada linea del dataset
data[0].keys()

dict_keys(['dialog', 'start_time', 'end_time', 'bot_profile', 'user_profile', 'eval_score', 'profile_match', 'participant1_id', 'participant2_id'])

In [7]:
chat_in = []
chat_out = []
input_sentences = []
output_sentences = []
output_sentences_inputs = []
max_len = 30

def clean_text(txt):
    txt = txt.lower()
    txt.replace("\'d", " had")
    txt.replace("\'s", " is")
    txt.replace("\'m", " am")
    txt.replace("don't", "do not")
    txt = re.sub(r'\W+', ' ', txt)

    return txt

for line in data:
    for i in range(len(line['dialog'])-1):
        # vamos separando el texto en "preguntas" (chat_in)
        # y "respuestas" (chat_out)
        chat_in = clean_text(line['dialog'][i]['text'])
        chat_out = clean_text(line['dialog'][i+1]['text'])

        if len(chat_in) >= max_len or len(chat_out) >= max_len:
            continue

        input_sentence, output = chat_in, chat_out

        # output sentence (decoder_output) tiene <eos>
        output_sentence = output + ' <eos>'
        # output sentence input (decoder_input) tiene <sos>
        output_sentence_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("Cantidad de rows utilizadas:", len(input_sentences))

Cantidad de rows utilizadas: 6033


In [8]:
input_sentences[1], output_sentences[1], output_sentences_inputs[1]

('hi how are you ', 'not bad and you  <eos>', '<sos> not bad and you ')

### 2 - Preprocesamiento
Realizar el preprocesamiento necesario para obtener:
- word2idx_inputs, max_input_len
- word2idx_outputs, max_out_len, num_words_output
- encoder_input_sequences, decoder_output_sequences, decoder_targets

2.1: Tokenización y Creación de Diccionarios

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

MAX_VOCAB_SIZE = 8000
MAX_SEQUENCE_LENGTH = 10

# Tokenizador para las entradas (preguntas)
tokenizer_inputs = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer_inputs.fit_on_texts(input_sentences)
input_sequences = tokenizer_inputs.texts_to_sequences(input_sentences)

# Diccionario palabra a índice
word2idx_inputs = tokenizer_inputs.word_index
print(f"Total de palabras en el vocabulario de entrada: {len(word2idx_inputs)}")

# Longitud máxima de entrada
max_input_len = max(len(s) for s in input_sequences)
print(f"Longitud máxima de secuencia de entrada: {max_input_len}")

# Tokenizador para las salidas (respuestas)
tokenizer_outputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer_outputs.fit_on_texts(output_sentences + output_sentences_inputs)
output_sequences = tokenizer_outputs.texts_to_sequences(output_sentences)
output_sequences_inputs = tokenizer_outputs.texts_to_sequences(output_sentences_inputs)

# Diccionario palabra a índice para las salidas
word2idx_outputs = tokenizer_outputs.word_index
num_words_output = len(word2idx_outputs) + 1
print(f"Total de palabras en el vocabulario de salida: {num_words_output}")

# Longitud máxima de salida
max_out_len = max(len(s) for s in output_sequences)
print(f"Longitud máxima de secuencia de salida: {max_out_len}")

Total de palabras en el vocabulario de entrada: 1799
Longitud máxima de secuencia de entrada: 9
Total de palabras en el vocabulario de salida: 1807
Longitud máxima de secuencia de salida: 10


2.2: Padding de Secuencias

In [10]:
encoder_input_sequences = pad_sequences(input_sequences, maxlen=max_input_len)
decoder_input_sequences = pad_sequences(output_sequences_inputs, maxlen=max_out_len)
decoder_output_sequences = pad_sequences(output_sequences, maxlen=max_out_len)

### 3 - Preparar los embeddings
Utilizar los embeddings de Glove o FastText para transformar los tokens de entrada en vectores

In [11]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!gunzip cc.en.300.vec.gz

--2024-08-08 17:31:45--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.65.229.46, 18.65.229.89, 18.65.229.121, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.65.229.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz’


2024-08-08 17:31:51 (188 MB/s) - ‘cc.en.300.vec.gz’ saved [1325960915/1325960915]

gzip: cc.en.300.vec already exists; do you wish to overwrite (y or n)? y
y
yes
exit


In [13]:
EMBEDDING_DIM = 300
embedding_file = 'cc.en.300.vec'
embedding_matrix = np.zeros((len(word2idx_inputs) + 1, EMBEDDING_DIM))

# Cargar los embeddings de FastText
with open(embedding_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        if word in word2idx_inputs:
            idx = word2idx_inputs[word]
            embedding_matrix[idx] = vector


embedding_layer = Embedding(
    len(word2idx_inputs) + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_input_len,
    trainable=False
)



In [14]:
encoder_inputs = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs)
encoder_lstm = LSTM(128, return_state=True, dropout=0.2)
encoder_outputs, state_h, state_c = encoder_lstm(x)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_out_len,))
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM)
decoder_embedding = decoder_embedding(decoder_inputs)

decoder_lstm = LSTM(128, return_sequences=True, return_state=True, dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

### 4 - Entrenar el modelo
Entrenar un modelo basado en el esquema encoder-decoder utilizando los datos generados en los puntos anteriores. Utilce como referencias los ejemplos vistos en clase.

In [15]:
decoder_target_data = np.expand_dims(decoder_output_sequences, -1)
r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_target_data,
    batch_size=64,
    epochs=50,
    validation_split=0.2
)

Epoch 1/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 24ms/step - accuracy: 0.4682 - loss: 4.6087 - val_accuracy: 0.6083 - val_loss: 2.3649
Epoch 2/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.6022 - loss: 2.2474 - val_accuracy: 0.6476 - val_loss: 2.1646
Epoch 3/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.6419 - loss: 2.0410 - val_accuracy: 0.6661 - val_loss: 2.0274
Epoch 4/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6596 - loss: 1.9038 - val_accuracy: 0.6738 - val_loss: 1.9357
Epoch 5/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6795 - loss: 1.7843 - val_accuracy: 0.6879 - val_loss: 1.8599
Epoch 6/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6984 - loss: 1.6773 - val_accuracy: 0.6965 - val_loss: 1.7792
Epoch 7/50
[1m76/76[0m [32m━━

### 5 - Inferencia
Experimentar el funcionamiento de su modelo. Recuerde que debe realizar la inferencia de los modelos por separado de encoder y decoder.

In [16]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

reverse_word_map = dict(map(reversed, word2idx_outputs.items()))

# Función de Decodificación
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_word_map[sampled_token_index]

        if sampled_word == '<eos>' or len(decoded_sentence.split()) > max_out_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence.strip()

# Probar el BOT con preguntas
questions = ["do you read?", "do you have any pet?", "where are you from?"]
for question in questions:
    input_seq = tokenizer_inputs.texts_to_sequences([question])
    input_seq = pad_sequences(input_seq, maxlen=max_input_len)
    decoded_answer = decode_sequence(input_seq)
    print(f'Q: {question}\nA: {decoded_answer}\n')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Q: do you read?
A: what do you do for a living for a living

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
