In [None]:
import numpy as np
import codecs
import tensorflow as tf
import pandas as pd
import ast
import tqdm
print(tf.__version__)

#### We are faced with the task of writing a simple chatbot using a neural network. We'll be using the Cornell Movie-Dialogs Corpus dataset. The cleaned version used here contains more than 130 thousand lines of dialogue from 617 films.
#### What does "cleaned" mean? We removed all dialogues that contain long lines. Important: we do not remove only the long lines themselves, but the entire dialogues containing them, so that the remaining dialogues stay coherent. That is why we end up with a little over 130 thousand lines rather than 220 thousand.
#### Our model will implement the attention mechanism.

## Preparing for training


### Loading the data.

In [None]:
# Load the question/answer pairs; the `question` and `answer` columns are encoded further down.
dialogs = pd.read_csv('../input/cleaned-data-for-the-chatbot-collected-from-movies/dialogs_expanded.csv')

In [None]:
# Preview the first rows of the loaded dialogues.
dialogs.head()

In [None]:
# (rows, columns) of the dialogues frame.
dialogs.shape

### Text encoding.
We are now ready to encode our sequences numerically, reusing the functions from the practical assignment.
First, let's build the character dictionaries (vocabularies) for both sequences.

In [None]:
# Input and target texts; their `text` columns are what the character vocabularies are built from below.
input_texts = pd.read_csv('../input/cleaned-data-for-the-chatbot-collected-from-movies/input3.csv')
target_texts = pd.read_csv('../input/cleaned-data-for-the-chatbot-collected-from-movies/target3.csv')

In [None]:
# Replace every float entry of the `text` column (presumably NaN produced by
# empty CSV cells — confirm) with a single space, so that all entries are strings.
#
# The assignment goes through `.loc` on the frame itself. The previous chained
# form `frame.text[i] = ' '` writes into a temporary Series, which is not
# propagated back to the frame when pandas Copy-on-Write is active, and it
# needed one Python-level lookup per row.
for _texts in (input_texts, target_texts):
    _is_float = _texts['text'].map(lambda value: isinstance(value, float))
    _texts.loc[_is_float, 'text'] = ' '

In [None]:
def prepare_vocab(texts):
    """Build a character-level vocabulary from an iterable of strings.

    The vocabulary holds every distinct character of `texts` in sorted order,
    followed by the two special tokens '<START>' and '<END>'.

    Returns:
        (vocab_size, char2idx, idx2char) where `char2idx` maps a token to its
        integer id and `idx2char` is a NumPy array mapping an id back to its token.
    """
    symbols = list(set(''.join(texts)))
    symbols.sort()
    # The special tokens take the two highest ids.
    symbols.extend(['<START>', '<END>'])

    lookup = dict(zip(symbols, range(len(symbols))))
    reverse_lookup = np.array(symbols)
    return len(symbols), lookup, reverse_lookup

# Flatten the `text` columns into plain Python lists ...
input_texts_for_vocabs = input_texts['text'].values.tolist()
target_texts_for_vocabs = target_texts['text'].values.tolist()

# ... and build one independent character vocabulary per side.
(INPUT_VOCAB_SIZE,
 input_char2idx,
 input_idx2char) = prepare_vocab(input_texts_for_vocabs)
(TARGET_VOCAB_SIZE,
 target_char2idx,
 target_idx2char) = prepare_vocab(target_texts_for_vocabs)

In [None]:
def _encode_texts(texts, char2idx):
    """Turn every text into the list of integer ids of its characters."""
    return [[char2idx[ch] for ch in text] for text in texts]


# NOTE(review): the vocabularies were built from input3.csv / target3.csv, while
# the texts encoded here come from `dialogs`; a character missing from a
# vocabulary raises KeyError — confirm both sources cover the same characters.
input_texts_as_int = _encode_texts(dialogs['question'], input_char2idx)
target_texts_as_int = _encode_texts(dialogs['answer'], target_char2idx)

Now let's encode the sequences themselves using the dictionaries.
Our model needs an input for the Encoder, plus an input and an expected output (target) for the Decoder, so we will prepare three sequences of numbers.

In [None]:
# Encoder input: the encoded question as-is.
encoder_input_seqs = [np.array(ids) for ids in input_texts_as_int]

# Decoder input is the answer prefixed with <START>; decoder target is the same
# answer followed by <END>, i.e. the input shifted one step to the left.
decoder_input_seqs = [
    np.array([target_char2idx['<START>']] + ids) for ids in target_texts_as_int
]
decoder_target_seqs = [
    np.array(ids + [target_char2idx['<END>']]) for ids in target_texts_as_int
]

Let's add padding.

In [None]:
max_enc_seq_length = 100
max_dec_seq_length = 100


def _pad_to_length(sequences, pad_id, length):
    """Pad `sequences` at the end with `pad_id` up to `length` ids.

    Sequences longer than `length` are cut by pad_sequences' default truncation.
    """
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=length, padding='post', value=pad_id)


# The id of the space character serves as the padding value on both sides.
encoder_input_seqs = _pad_to_length(
    encoder_input_seqs, input_char2idx[' '], max_enc_seq_length)
decoder_input_seqs = _pad_to_length(
    decoder_input_seqs, target_char2idx[' '], max_dec_seq_length)
decoder_target_seqs = _pad_to_length(
    decoder_target_seqs, target_char2idx[' '], max_dec_seq_length)

In [None]:
# Padded lengths used for the encoder and decoder sequences.
max_enc_seq_length, max_dec_seq_length

In [None]:
# Shapes of the three padded arrays.
encoder_input_seqs.shape, decoder_input_seqs.shape, decoder_target_seqs.shape

In [None]:
# Sanity check: map the first padded encoder sequence back to text.
''.join(input_idx2char[encoder_input_seqs[0]])

## Create and train the model.
Let's create the model. The encoder has three bidirectional LSTM layers; the decoder has three LSTM layers followed by an attention mechanism. On our data, the model needs to train for a long time to give a good result.

In [None]:
H_SIZE = 512  # units per direction of each encoder LSTM; the decoder LSTMs use H_SIZE*2
EMB_SIZE = 512  # output dimension of both embedding layers

class Encoder_att(tf.keras.Model):
    """Encoder: an embedding followed by three stacked bidirectional LSTM layers."""

    def __init__(self):
        super().__init__()

        def make_bi_lstm():
            # Each layer returns the full output sequence plus its final states.
            return tf.keras.layers.Bidirectional(
                tf.keras.layers.LSTM(H_SIZE, return_sequences=True, return_state=True))

        self.embed = tf.keras.layers.Embedding(INPUT_VOCAB_SIZE, EMB_SIZE)
        self.lstm_1 = make_bi_lstm()
        self.lstm_2 = make_bi_lstm()
        self.lstm_3 = make_bi_lstm()

    def call(self, x):
        """Encode a batch of id sequences.

        Returns:
            A pair `(sequence_output, states)`: the output sequence of the last
            layer, and a 3-tuple with one `(h, c)` pair per LSTM layer. Each
            `h` / `c` is the forward final state concatenated with the
            backward final state of that layer.
        """
        seq = self.embed(x)
        per_layer_states = []
        for bi_lstm in (self.lstm_1, self.lstm_2, self.lstm_3):
            seq, fwd_h, fwd_c, bwd_h, bwd_c = bi_lstm(seq)
            hidden = tf.keras.layers.Concatenate()([fwd_h, bwd_h])
            cell = tf.keras.layers.Concatenate()([fwd_c, bwd_c])
            per_layer_states.append((hidden, cell))
        return seq, tuple(per_layer_states)

class Decoder_att(tf.keras.Model):
    """Decoder: embedding, three stacked LSTM layers, attention over the encoder
    outputs, and a softmax layer over the target vocabulary."""

    def __init__(self):
        super().__init__()

        def make_lstm():
            # H_SIZE*2 units, the size of the encoder's concatenated
            # forward/backward states that initialise these layers.
            return tf.keras.layers.LSTM(H_SIZE*2, return_sequences=True, return_state=True)

        self.embed = tf.keras.layers.Embedding(TARGET_VOCAB_SIZE, EMB_SIZE)
        self.lstm_1 = make_lstm()
        self.lstm_2 = make_lstm()
        self.lstm_3 = make_lstm()
        self.attention = tf.keras.layers.Attention()
        self.fc = tf.keras.layers.Dense(TARGET_VOCAB_SIZE, activation='softmax')

    def call(self, x, init_state, encoder_outputs, training=True):
        """Run the decoder over a batch of target-side id sequences.

        Args:
            x: decoder input ids.
            init_state: indexable with one `(h, c)` initial state per LSTM
                layer (indices 0, 1 and 2 are read).
            encoder_outputs: encoder output sequence the attention layer attends to.
            training: forwarded to the attention layer.

        Returns:
            A pair `(probabilities, states)`: the softmax output for every
            step, and a 3-tuple with the final `(h, c)` of each LSTM layer.
        """
        seq = self.embed(x)
        final_states = []
        for idx, lstm in enumerate((self.lstm_1, self.lstm_2, self.lstm_3)):
            seq, hidden, cell = lstm(seq, initial_state=init_state[idx])
            final_states.append((hidden, cell))
        # The LSTM output is the attention query; the encoder outputs are the values.
        context = self.attention([seq, encoder_outputs], training=training)
        probabilities = self.fc(context)
        return probabilities, tuple(final_states)

# Instantiate the two sub-models; they are also called directly at inference time.
encoder_model_att = Encoder_att()
decoder_model_att = Decoder_att()

# Symbolic inputs of unspecified sequence length for the encoder and decoder ids.
encoder_inputs_att = tf.keras.layers.Input(shape=(None,))
decoder_inputs_att = tf.keras.layers.Input(shape=(None,))

# The decoder is initialised with the encoder states and attends to the encoder outputs.
encoder_outputs_att, enc_state_att = encoder_model_att(encoder_inputs_att)
decoder_outputs_att, _ = decoder_model_att(decoder_inputs_att, enc_state_att, encoder_outputs_att)

# End-to-end training model: (encoder ids, decoder ids) -> decoder output probabilities.
seq2seq = tf.keras.Model([encoder_inputs_att, decoder_inputs_att], decoder_outputs_att)

In [None]:
# Print the layer and parameter overview of the combined model.
seq2seq.summary()

In [None]:
BATCH_SIZE = 64
EPOCHS = 50

loss = tf.losses.SparseCategoricalCrossentropy()
seq2seq.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])

# Prompts decoded after every training round to eyeball the model's progress.
# NOTE(review): the first prompt looks like an accidental double paste — confirm.
SAMPLE_PROMPTS = (
    'Tell me about itTell me about it',
    'What are you thinking about?',
    'Close the door!',
    'What is your name?',
    'How about we have lunch together?',
    'What time is it?',
)

# NOTE: next_line() is defined in the Inference section further down; on a fresh
# kernel that cell has to be run before this one, otherwise the first round
# ends with a NameError.
for iterate in range(40):
    seq2seq.fit(
        [encoder_input_seqs, decoder_input_seqs],
        decoder_target_seqs,
        batch_size=BATCH_SIZE,
        steps_per_epoch=50,
        epochs=EPOCHS,
    )
    for prompt in SAMPLE_PROMPTS:
        print(next_line(prompt))
    # One weights file per round, so any round can be restored later.
    seq2seq.save_weights(f'model_att{iterate}iter_expanded')

> Training from scratch takes a very long time, so we load the weights of the model I have already trained.

In [None]:
# Restore previously saved weights (the file name matches the ones written by the training loop, round 29).
seq2seq.load_weights('../input/cleaned-data-for-the-chatbot-collected-from-movies/model_att29iter_expanded')

## Inference

In [None]:
def seq2seq_att_inference(input_seq):
    """Greedily decode a reply for an encoded input sequence.

    The input is run through the encoder once; the decoder is then called one
    token at a time, each time fed its own previous prediction and state.

    Args:
        input_seq: encoded input passed straight to the encoder
            (presumably of shape (1, sequence_length) — confirm against callers).

    Returns:
        The decoded text. It ends with the literal '<END>' when the model
        emitted that token; otherwise decoding stops once the text is longer
        than `max_dec_seq_length` characters.
    """
    enc_outputs, dec_state = encoder_model_att(input_seq)

    # Decoding starts from a batch of one that holds only the <START> token.
    step_input = np.array([[target_char2idx['<START>']]])

    reply = ''
    finished = False
    while not finished:
        step_probs, dec_state = decoder_model_att(
            x=step_input, init_state=dec_state, encoder_outputs=enc_outputs, training=False)

        # Greedy choice: the most probable token of the last time step.
        best_index = np.argmax(np.array(step_probs[0, -1, :]))
        best_char = target_idx2char[best_index]
        reply += best_char

        finished = best_char == '<END>' or len(reply) > max_dec_seq_length
        # The prediction becomes the decoder input of the next step.
        step_input = np.array([[best_index]])

    return reply

In [None]:
def next_line(line):
    """Return the model's reply to `line`.

    The text is encoded with the input vocabulary, padded the same way as the
    training data, decoded with `seq2seq_att_inference`, and stripped of the
    trailing '<END>' marker and trailing whitespace.

    Args:
        line: the prompt text. Characters missing from the input vocabulary
            are treated as spaces instead of raising KeyError. If the text is
            longer than `max_enc_seq_length`, only its last
            `max_enc_seq_length` characters are kept, mirroring the default
            truncation of `pad_sequences` used for the training data.

    Returns:
        The decoded reply as a string.
    """
    # Pad with the id of ' ', exactly like the training sequences; id 0 is only
    # the same thing when ' ' happens to sort first in the vocabulary.
    pad_id = input_char2idx[' ']
    ids = [input_char2idx.get(ch, pad_id) for ch in line]
    ids = ids[-max_enc_seq_length:]

    # Integer ids in a (1, max_enc_seq_length) batch of one.
    int_seq_pad = np.full((1, max_enc_seq_length), pad_id, dtype=np.int32)
    int_seq_pad[0, :len(ids)] = ids

    decoded = seq2seq_att_inference(int_seq_pad)
    if decoded.endswith('<END>'):
        decoded = decoded[:-len('<END>')]
    return decoded.rstrip()

In [None]:
# Let the model talk to itself: every reply is fed back in as the next prompt.
line = 'Hi, how are you?'
print(line)
for _ in range(10):
    line = next_line(line)
    print(line)

In [None]:
def own_dialog(len_of_conversation):
    """Chat with the model interactively.

    Reads `len_of_conversation` lines from standard input and prints the
    model's reply after each one.
    """
    turns_left = len_of_conversation
    while turns_left > 0:
        user_line = str(input())
        print(next_line(user_line))
        turns_left -= 1


own_dialog(10)

For the answers to be meaningful, you would need to plug in BERT or a similar pretrained language model.

Most of the answers are at least grammatically correct