# chatbot.py

In [3]:
# importing the libraries
import numpy as np
import tensorflow as tf
import re
import time

# Part 1. Data Preprocessing

### Importing the dataset

In [17]:
# Read each corpus file completely and split it on newlines. The context
# managers guarantee the file handles are closed once the text is in memory.
with open("movie_lines.txt", encoding="utf-8", errors="ignore") as lines_file:
    lines = lines_file.read().split("\n")
with open("movie_conversations.txt", encoding="utf-8", errors="ignore") as conversations_file:
    conversations = conversations_file.read().split("\n")

In [9]:
# Preview the first three raw records of each file, each under its label.
print("lines", lines[:3], "conversations", conversations[:3], sep="\n")

lines
['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!', 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!', 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.']
conversations
["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']"]


### Creating a dictionary that maps each line and its id

In [12]:

# Map each line id (first field) to its text (fifth field).
# Records that do not split into exactly five fields are skipped.
id2line = {}
for line in lines:
    fields = line.split(" +++$+++ ")
    if len(fields) != 5:
        continue
    line_id, line_text = fields[0], fields[4]
    id2line[line_id] = line_text

### Creating a list of all of the conversations

In [20]:
# For every record except the last one, take the final field (a textual list
# such as "['L194', 'L195']"), strip the brackets, quotes and spaces, and
# split it into the individual line ids.
conversations_ids = [
    conversation.split(" +++$+++ ")[-1][1:-1].replace("'", "").replace(" ", "").split(",")
    for conversation in conversations[:-1]
]

In [21]:
# Preview the first three parsed conversations.
print(conversations_ids[:3])

[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199'], ['L200', 'L201', 'L202', 'L203']]


### Getting separately the questions and the answers

In [22]:
# Every pair of consecutive lines in a conversation yields one
# (question, answer) sample: the earlier line is the question and the
# line that follows it is the answer.
questions = []
answers = []
for conversation in conversations_ids:
    for question_id, answer_id in zip(conversation, conversation[1:]):
        questions.append(id2line[question_id])
        answers.append(id2line[answer_id])

In [26]:
# Show the first question/answer pair, each under its label.
print("questions:", questions[0], "answers:", answers[0], sep="\n")

questions:
Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
answers:
Well, I thought we'd start with pronunciation, if that's okay with you.


### Doing a first cleaning of the texts

In [29]:
def clean_text(text):
    """Lower-case *text*, expand common contractions and strip punctuation.

    The substitutions are regular-expression replacements applied strictly
    in the order listed below; the final one deletes the punctuation
    characters in the bracketed set (characters outside that set, such as
    "!" or a remaining apostrophe, are kept).

    Args:
        text: The raw sentence.

    Returns:
        The cleaned sentence as a new string.
    """
    # Ordered (pattern, replacement) pairs -- applied top to bottom.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

### Cleaning the questions

In [30]:
# Run every raw question through clean_text, keeping the original order.
clean_questions = [clean_text(question) for question in questions]

### Cleaning the answers

In [31]:
# Run every raw answer through clean_text, keeping the original order.
clean_answers = [clean_text(answer) for answer in answers]

In [32]:
# Show the first cleaned question and the first cleaned answer, one per line.
print(clean_questions[0], clean_answers[0], sep="\n")

can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again
well i thought we would start with pronunciation if that is okay with you


### Creating a dictionary that maps each word to its number of occurrences

In [33]:
# Count how often each whitespace-separated word occurs, scanning all
# cleaned questions first and then all cleaned answers (so keys appear in
# first-seen order across that combined sequence).
word2count = {}
for corpus in (clean_questions, clean_answers):
    for sentence in corpus:
        for word in sentence.split():
            word2count[word] = word2count.get(word, 0) + 1

In [34]:
# Spot-check: the number of times "he" was counted across the cleaned questions and answers.
print(word2count["he"])

39498


### Creating two dictionaries that map the questions words and the answers words to a unique integer

In [36]:
# Minimum number of occurrences a word needs to receive an id.
threshold_questions = 20
threshold_answers = 20

# Words that reach the threshold are numbered 0, 1, 2, ... following the
# iteration order of word2count; rarer words receive no id.
questionswords2int = {
    word: index
    for index, word in enumerate(
        word for word, count in word2count.items() if count >= threshold_questions
    )
}
answerswords2int = {
    word: index
    for index, word in enumerate(
        word for word, count in word2count.items() if count >= threshold_answers
    )
}
# Number of ids handed out for the answers vocabulary.
word_number = len(answerswords2int)

In [40]:
# Spot-check: the id of "he" in the questions vocabulary, then in the answers vocabulary.
for vocabulary in (questionswords2int, answerswords2int):
    print(vocabulary["he"])

180
180


### Adding the last tokens to these two dictionaries

In [42]:
# Special tokens that are added to both vocabularies.
tokens = [
    "<PAD>",  # presumably used for padding -- not used in the cells shown here
    "<EOS>",  # appended to the end of every answer
    "<OUT>",  # stands in for words that were filtered out of the vocabulary
    "<SOS>",  # start marker prepended to the targets
]

In [44]:
# Give every special token an id in both vocabularies.
# NOTE(review): regular words already occupy ids 0..n-1, so "len(...) + 1"
# starts the tokens at n + 1 and leaves id n unused. Anything sized from the
# vocabulary must allow for a largest id equal to len(vocabulary) -- confirm
# against the code that builds the embeddings.
for vocabulary in (questionswords2int, answerswords2int):
    for token in tokens:
        vocabulary[token] = len(vocabulary) + 1

### Creating the inverse dictionary of the answerswords2int dictionary

In [45]:
# Reverse lookup for the answers vocabulary: id -> word.
answersints2word = {index: word for word, index in answerswords2int.items()}

In [47]:
# Spot-check the inverse mapping: print the word stored under id 0.
print(answersints2word[0])

can


### Adding the End Of String token to the end of every answer

In [49]:
# Append the end-of-string marker to every cleaned answer, in place.
# Running this more than once appends the marker again each time.
for i, answer in enumerate(clean_answers):
    clean_answers[i] = answer + " <EOS>"

In [50]:
# Show the first cleaned answer in its current state.
print(clean_answers[0])

well i thought we would start with pronunciation if that is okay with you <EOS>


### Translating all the questions and the answers into integers and Replacing all the words that were filtered out by `<OUT>`

In [52]:
# Encode every sentence as a list of word ids. A word that has no id in the
# corresponding vocabulary is encoded with that vocabulary's "<OUT>" id.
questions_into_int = [
    [questionswords2int.get(word, questionswords2int["<OUT>"]) for word in question.split()]
    for question in clean_questions
]
answers_into_int = [
    [answerswords2int.get(word, answerswords2int["<OUT>"]) for word in answer.split()]
    for answer in clean_answers
]

In [53]:
# Show the first encoded question and the first encoded answer, one per line.
print(questions_into_int[0], answers_into_int[0], sep="\n")

[0, 1, 2, 3, 4, 8824, 8824, 5, 6, 8824, 7, 8, 9, 10, 8824, 11, 12, 13, 14, 15, 8824, 16]
[17, 18, 19, 1, 20, 21, 22, 8824, 23, 24, 25, 26, 22, 27, 8823]


### Sorting questions and answers by the length of questions

In [60]:
# Keep only the pairs whose question has between 1 and 25 ids and order them
# by question length. sorted() is stable, so pairs with equally long
# questions keep their original relative order. The answer lists follow the
# ordering of their questions.
kept_indices = sorted(
    (index for index, question in enumerate(questions_into_int) if 1 <= len(question) <= 25),
    key=lambda index: len(questions_into_int[index]),
)
sorted_clean_questions = [questions_into_int[index] for index in kept_indices]
sorted_clean_answers = [answers_into_int[index] for index in kept_indices]

In [61]:
# Show the first (shortest-question) pair after sorting, one list per line.
print(sorted_clean_questions[0], sorted_clean_answers[0], sep="\n")

[47]
[15, 48, 25, 47, 18, 49, 50, 15, 51, 52, 45, 53, 8824, 54, 52, 55, 41, 56, 18, 57, 58, 59, 60, 61, 8823]


# Part 2. Building The Seq2Seq Model

### Creating placeholders for the inputs and the targets

In [62]:
def model_inputs():
    """Create the placeholders that feed the model.

    Returns:
        A tuple ``(inputs, targets, lr, keep_prob)``:
        ``inputs`` and ``targets`` are int32 placeholders of shape
        ``[None, None]`` named "input" and "target"; ``lr`` and
        ``keep_prob`` are float32 placeholders named "learning_rate" and
        "keep_prob".
    """
    two_unknown_dims = [None, None]
    inputs = tf.placeholder(dtype=tf.int32, shape=two_unknown_dims, name="input")
    targets = tf.placeholder(dtype=tf.int32, shape=two_unknown_dims, name="target")
    lr = tf.placeholder(dtype=tf.float32, name="learning_rate")
    keep_prob = tf.placeholder(dtype=tf.float32, name="keep_prob")
    return (inputs, targets, lr, keep_prob)

- tf.placeholder
    - Inserts a placeholder for a tensor that will always be fed
    - https://www.tensorflow.org/api_docs/python/tf/placeholder

### Preprocessing the targets

In [63]:
def preprocess_targets(targets, word2int, batch_size):
    """Prepend the <SOS> id to every target row and drop each row's last column.

    Args:
        targets: Tensor of target ids; assumed to be 2-D with
            ``batch_size`` rows -- TODO confirm against the caller.
        word2int: Mapping that contains the id of the "<SOS>" token.
        batch_size: Number of rows in ``targets``.

    Returns:
        The concatenation, along axis 1, of a ``[batch_size, 1]`` column
        filled with the <SOS> id and ``targets`` without its last column.
    """
    sos_column = tf.fill([batch_size, 1], word2int["<SOS>"])
    # An end index of -1 on axis 1 stops before the last column of each row.
    all_but_last = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
    return tf.concat([sos_column, all_but_last], 1)  # horizontal concat

- tf.fill
    - Creates a tensor filled with a scalar value
    - This operation creates a tensor of shape dims and fills it with value
    - https://www.tensorflow.org/api_docs/python/tf/fill
- tf.strided_slice
    - Extracts a strided slice of a tensor (generalized Python array indexing)
    - https://www.tensorflow.org/api_docs/python/tf/strided_slice
- tf.concat
    - Concatenates tensors along one dimension
    - https://www.tensorflow.org/api_docs/python/tf/concat

### Creating the Encoder RNN Layer

In [64]:
def encoder_rnn_layer(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    """Build a stacked, bidirectional LSTM encoder and return its state.

    Args:
        rnn_inputs: Input tensor handed to the bidirectional RNN.
        rnn_size: Number of units of the basic LSTM cell.
        num_layers: How many times the dropout-wrapped cell is stacked.
        keep_prob: Keep probability applied to the cell inputs.
        sequence_length: Sequence lengths handed to the bidirectional RNN.

    Returns:
        The state element of the pair returned by
        ``tf.nn.bidirectional_dynamic_rnn``; the outputs are discarded.
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    dropout_cell = tf.contrib.rnn.DropoutWrapper(base_cell, input_keep_prob=keep_prob)
    # NOTE(review): the very same wrapped cell object is repeated num_layers
    # times, and the same stacked cell serves both directions -- confirm this
    # sharing is intended for the TensorFlow version in use.
    stacked_cell = tf.contrib.rnn.MultiRNNCell([dropout_cell] * num_layers)
    _, encoder_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=stacked_cell,
        cell_bw=stacked_cell,
        inputs=rnn_inputs,
        sequence_length=sequence_length,
        dtype=tf.float32,
    )
    return encoder_state

### Decoding the training set

In [66]:
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input,
                        sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Decode the training inputs with an attention decoder.

    Args:
        encoder_state: Encoder state; only its first element is used to
            initialise the decoder.
        decoder_cell: RNN cell used by the decoder.
        decoder_embedded_input: Embedded decoder inputs fed to the decoder.
        sequence_length: Sequence lengths handed to the decoder.
        decoding_scope: Variable scope in which the decoder is built.
        output_function: Callable applied to the dropped-out decoder output.
        keep_prob: Keep probability for the dropout on the decoder output.
        batch_size: Batch size, used to shape the zero attention states.

    Returns:
        ``output_function`` applied to the decoder output after dropout.
    """
    # Attention states start as zeros of shape [batch_size, 1, output_size].
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    (attention_keys,
     attention_values,
     attention_score_function,
     attention_construct_function) = tf.contrib.seq2seq.prepare_attention(
         attention_states,
         attention_option="bahdanau",
         num_units=decoder_cell.output_size)
    training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(
        encoder_state[0],
        attention_keys,
        attention_values,
        attention_score_function,
        attention_construct_function,
        name="aatn_dec_train")
    # dynamic_rnn_decoder is the decoder entry point that accepts a decoder
    # function; the embedded inputs must be passed before sequence_length,
    # otherwise the lengths would be taken as the inputs.
    decoder_output, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        training_decoder_function,
        decoder_embedded_input,
        sequence_length,
        scope=decoding_scope)
    decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
    return output_function(decoder_output_dropout)

### Decoding the test/validation set

In [None]:
def decode_test_set(encoder_state, decoder_cell, decoder_embedded_input,
                        sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Decode the test/validation inputs with an attention decoder.

    NOTE(review): this body performs the same steps as the training decoder
    (training decoder function, dropout on the output). Inference normally
    uses ``tf.contrib.seq2seq.attention_decoder_fn_inference``, which needs
    arguments this signature does not provide -- confirm whether this
    function is still a work in progress.

    Args:
        encoder_state: Encoder state; only its first element is used to
            initialise the decoder.
        decoder_cell: RNN cell used by the decoder.
        decoder_embedded_input: Embedded decoder inputs fed to the decoder.
        sequence_length: Sequence lengths handed to the decoder.
        decoding_scope: Variable scope in which the decoder is built.
        output_function: Callable applied to the dropped-out decoder output.
        keep_prob: Keep probability for the dropout on the decoder output.
        batch_size: Batch size, used to shape the zero attention states.

    Returns:
        ``output_function`` applied to the decoder output after dropout.
    """
    # Attention states start as zeros of shape [batch_size, 1, output_size].
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    (attention_keys,
     attention_values,
     attention_score_function,
     attention_construct_function) = tf.contrib.seq2seq.prepare_attention(
         attention_states,
         attention_option="bahdanau",
         num_units=decoder_cell.output_size)
    decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(
        encoder_state[0],
        attention_keys,
        attention_values,
        attention_score_function,
        attention_construct_function,
        name="aatn_dec_train")
    # dynamic_rnn_decoder is the decoder entry point that accepts a decoder
    # function; the embedded inputs must be passed before sequence_length,
    # otherwise the lengths would be taken as the inputs.
    decoder_output, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        decoder_function,
        decoder_embedded_input,
        sequence_length,
        scope=decoding_scope)
    decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
    return output_function(decoder_output_dropout)