In [125]:
# libraries
import pickle
import codecs
import re
import time
import numpy as np

from __future__ import print_function # for backward compatibility
from gensim.models import word2vec, KeyedVectors

import random

#needed for tensorflow to access gpu
import os
os.add_dll_directory("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3/bin")
import tensorflow as tf
#pip install tensorflow-addons==0.11.2
import tensorflow_addons as tfa


# Data Parser
This parser cleans and preprocesses the dataset.

In [126]:
# This function parses all words into script
def parse_all_words(all_words_path):
    raw_movie_lines = open('data/movie_lines.txt', 'r', encoding='utf-8', errors='ignore').read().split('\n')[:-1]

    with codecs.open(all_words_path, "w", encoding='utf-8', errors='ignore') as f:
        for line in raw_movie_lines:
            line = line.split(' +++$+++ ')
            utterance = line[-1]
            f.write(utterance + '\n')

In [127]:
#This function filter words by preprocessing word counts and creating vocab based on word count theresholds.
def preProBuildWordVocab(word_count_threshold=5, all_words_path='data/all_words.txt'):
    # borrowed this function from NeuralTalk

    if not os.path.exists(all_words_path):
        parse_all_words(all_words_path)

    corpus = open(all_words_path, 'r').read().split('\n')[:-1]
    captions = np.asarray(corpus, dtype=object) #np.object_ depreciated  changed to object

    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    captions = map(lambda x: x.replace('"', ''), captions)
    captions = map(lambda x: x.replace('\n', ''), captions)
    captions = map(lambda x: x.replace('?', ''), captions)
    captions = map(lambda x: x.replace('!', ''), captions)
    captions = map(lambda x: x.replace('\\', ''), captions)
    captions = map(lambda x: x.replace('/', ''), captions)
	
	
    print('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold))
    word_counts = {}
    # iterate through the captions and create vocab
    nsents = 0
    for sent in captions:
        nsents += 1
        for w in sent.lower().split(' '):
            
            word_counts[w] = word_counts.get(w, 0) + 1
    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('filtered words from %d to %d' % (len(word_counts), len(vocab)))

    ixtoword = {}
    ixtoword[0] = '<pad>'
    ixtoword[1] = '<bos>'
    ixtoword[2] = '<eos>'
    ixtoword[3] = '<unk>'

    wordtoix = {}
    wordtoix['<pad>'] = 0
    wordtoix['<bos>'] = 1
    wordtoix['<eos>'] = 2
    wordtoix['<unk>'] = 3

    for idx, w in enumerate(vocab):
        wordtoix[w] = idx+4
        ixtoword[idx+4] = w

    word_counts['<pad>'] = nsents
    word_counts['<bos>'] = nsents
    word_counts['<eos>'] = nsents
    word_counts['<unk>'] = nsents

    bias_init_vector = np.array([1.0 * word_counts[ixtoword[i]] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range

    return wordtoix, ixtoword, bias_init_vector

In [128]:
# This function extracts only the defined vocab from the data
def refine(data):
    words = re.findall("[a-zA-Z'-]+", data)
    words = ["".join(word.split("'")) for word in words]
    # words = ["".join(word.split("-")) for word in words]
    data = ' '.join(words)
    return data

In [129]:
#create and store utterance dictionary
def data_parser():
    parse_all_words('data/all_words.txt')

    raw_movie_lines = open('data/movie_lines.txt', 'r', encoding='utf-8', errors='ignore').read().split('\n')[:-1]
    
    utterance_dict = {}
    with codecs.open('data/tokenized_all_words.txt', "w", encoding='utf-8', errors='ignore') as f:
        for line in raw_movie_lines:
            line = line.split(' +++$+++ ')
            line_ID = line[0]
            utterance = line[-1]
            utterance_dict[line_ID] = utterance
            utterance = " ".join([refine(w) for w in utterance.lower().split()])
            f.write(utterance + '\n')
    pickle.dump(utterance_dict, open('data/utterance_dict', 'wb'), True)

#run data parser
    data_parser()

# Feature Extractor
Extracts vocabulary from movie conversation data

In [130]:
ts = time.time() # get current time

#get files
raw_movie_conversations = open('data/movie_conversations.txt', 'r').read().split('\n')[:-1]
utterance_dict = pickle.load(open('data/utterance_dict', 'rb'))
corpus = word2vec.Text8Corpus("data/tokenized_all_words.txt")

#create and save word vector
word_vector_size = 300
word_vector = word2vec.Word2Vec(corpus, vector_size=word_vector_size)
word_vector.wv.save_word2vec_format(u"model/word_vector.bin", binary=True)

#load work vector
word_vector = KeyedVectors.load_word2vec_format('model/word_vector.bin', binary=True)

#output how long it took
print("Time Elapsed: {} secs\n".format(time.time() - ts))

Time Elapsed: 13.607280254364014 secs



In [131]:
""" Extract only the vocabulary part of the data """
def refine(data):
    words = re.findall("[a-zA-Z'-]+", data)
    words = ["".join(word.split("'")) for word in words]
    # words = ["".join(word.split("-")) for word in words]
    data = ' '.join(words)
    return data

In [132]:
ts = time.time()
conversations = []
print('len conversation', len(raw_movie_conversations))
con_count = 0
traindata_count = 0

for conversation in raw_movie_conversations:
    conversation = conversation.split(' +++$+++ ')[-1]
    conversation = conversation.replace('[', '')
    conversation = conversation.replace(']', '')
    conversation = conversation.replace('\'', '')
    conversation = conversation.split(', ')
    assert len(conversation) > 1 #test length of conversation and raise error if failed

    for i in range(len(conversation)-1):
        con_a = utterance_dict[conversation[i+1]].strip()
        con_b = utterance_dict[conversation[i]].strip()
        if len(con_a.split()) <= 22 and len(con_b.split()) <= 22:
            con_a = [refine(w) for w in con_a.lower().split()]
            # con_a = [word_vector[w] if w in word_vector else np.zeros(WORD_VECTOR_SIZE) for w in con_a]
            conversations.append((con_a, con_b))
            traindata_count += 1
    con_count += 1
    if con_count % 1000 == 0:
        print('con_count {}, traindata_count {}'.format(con_count, traindata_count))
pickle.dump(conversations, open('data/reversed_conversations_lenmax22', 'wb'), True)
print("Time Elapsed: {} secs\n".format(time.time() - ts))

len conversation 83097
con_count 1000, traindata_count 2049
con_count 2000, traindata_count 3996
con_count 3000, traindata_count 6425
con_count 4000, traindata_count 8353
con_count 5000, traindata_count 10654
con_count 6000, traindata_count 12707
con_count 7000, traindata_count 14666
con_count 8000, traindata_count 16673
con_count 9000, traindata_count 18578
con_count 10000, traindata_count 20317
con_count 11000, traindata_count 22826
con_count 12000, traindata_count 25611
con_count 13000, traindata_count 27879
con_count 14000, traindata_count 30057
con_count 15000, traindata_count 32631
con_count 16000, traindata_count 34686
con_count 17000, traindata_count 36849
con_count 18000, traindata_count 38890
con_count 19000, traindata_count 41103
con_count 20000, traindata_count 43175
con_count 21000, traindata_count 45123
con_count 22000, traindata_count 47305
con_count 23000, traindata_count 48998
con_count 24000, traindata_count 51571
con_count 25000, traindata_count 53672
con_count 26000

In [133]:
#statistics of data
max_a = -1
max_b = -1
max_a_ind = -1
max_b_ind = -1
sum_a = 0.
sum_b = 0.

len_a_list = []
len_b_list = []

for i in range(len(conversations)):
    len_a = len(conversations[i][0])
    len_b = len(conversations[i][1].split())
    if len_a > max_a:
        max_a = len_a
        max_a_ind = i
    if len_b > max_b:
        max_b = len_b
        max_b_ind = i
    sum_a += len_a
    sum_b += len_b

    len_a_list.append(len_a)
    len_b_list.append(len_b)

np.save("data/reversed_lenmax22_a_list", np.array(len_a_list))
np.save("data/reversed_lenmax22_b_list", np.array(len_b_list))

print("max_a_ind {}, max_b_ind {}".format(max_a_ind, max_b_ind))
print("max_a {}, max_b {}, avg_a {}, avg_b {}".format(max_a, max_b, sum_a/len(conversations), sum_b/len(conversations)))

max_a_ind 10, max_b_ind 0
max_a 22, max_b 22, avg_a 7.424128170416531, avg_b 7.383465165974871


In [134]:
ts = time.time()
conversations = []
# former_sents = []
print('len conversation', len(raw_movie_conversations))
con_count = 0
traindata_count = 0
for conversation in raw_movie_conversations:
    conversation = conversation.split(' +++$+++ ')[-1]
    conversation = conversation.replace('[', '')
    conversation = conversation.replace(']', '')
    conversation = conversation.replace('\'', '')
    conversation = conversation.split(', ')
    assert len(conversation) > 1
    con_a_1 = ''
    for i in range(len(conversation)-1):
        con_a_2 = utterance_dict[conversation[i]]
        con_b = utterance_dict[conversation[i+1]]
        if len(con_a_1.split()) <= 22 and len(con_a_2.split()) <= 22 and len(con_b.split()) <= 22:
            con_a = "{} {}".format(con_a_1, con_a_2)
            con_a = [refine(w) for w in con_a.lower().split()]
            # con_a = [word_vector[w] if w in word_vector else np.zeros(WORD_VECTOR_SIZE) for w in con_a]
            conversations.append((con_a, con_b, con_a_2))
            # former_sents.append(con_a_2)
            traindata_count += 1
        con_a_1 = con_a_2
    con_count += 1
    if con_count % 1000 == 0:
        print('con_count {}, traindata_count {}'.format(con_count, traindata_count))
pickle.dump(conversations, open('data/conversations_lenmax22_formersents2_with_former', 'wb'), True)
# pickle.dump(former_sents, open('data/conversations_lenmax22_former_sents', 'wb'), True)
print("Time Elapsed: {} secs\n".format(time.time() - ts))

len conversation 83097
con_count 1000, traindata_count 1942
con_count 2000, traindata_count 3782
con_count 3000, traindata_count 6095
con_count 4000, traindata_count 7888
con_count 5000, traindata_count 10079
con_count 6000, traindata_count 12042
con_count 7000, traindata_count 13890
con_count 8000, traindata_count 15781
con_count 9000, traindata_count 17571
con_count 10000, traindata_count 19183
con_count 11000, traindata_count 21543
con_count 12000, traindata_count 24160
con_count 13000, traindata_count 26267
con_count 14000, traindata_count 28381
con_count 15000, traindata_count 30743
con_count 16000, traindata_count 32700
con_count 17000, traindata_count 34766
con_count 18000, traindata_count 36692
con_count 19000, traindata_count 38768
con_count 20000, traindata_count 40717
con_count 21000, traindata_count 42566
con_count 22000, traindata_count 44584
con_count 23000, traindata_count 46159
con_count 24000, traindata_count 48564
con_count 25000, traindata_count 50524
con_count 26000

In [135]:
ts = time.time()
conversations = []
# former_sents = []
print('len conversation', len(raw_movie_conversations))
con_count = 0
traindata_count = 0
for conversation in raw_movie_conversations:
    conversation = conversation.split(' +++$+++ ')[-1]
    conversation = conversation.replace('[', '')
    conversation = conversation.replace(']', '')
    conversation = conversation.replace('\'', '')
    conversation = conversation.split(', ')
    assert len(conversation) > 1
    con_a_1 = ''
    for i in range(len(conversation)-1):
        con_a_2 = utterance_dict[conversation[i]]
        con_b = utterance_dict[conversation[i+1]]
        if len(con_a_1.split()) <= 22 and len(con_a_2.split()) <= 22 and len(con_b.split()) <= 22:
            con_a = "{} {}".format(con_a_1, con_a_2)
            con_a = [refine(w) for w in con_a.lower().split()]
            # con_a = [word_vector[w] if w in word_vector else np.zeros(WORD_VECTOR_SIZE) for w in con_a]
            conversations.append((con_a, con_b))
            # former_sents.append(con_a_2)
            traindata_count += 1
        con_a_1 = con_a_2
    con_count += 1
    if con_count % 1000 == 0:
        print('con_count {}, traindata_count {}'.format(con_count, traindata_count))
pickle.dump(conversations, open('data/conversations_lenmax22_former_sents2', 'wb'), True)
print("Time Elapsed: {} secs\n".format(time.time() - ts))

len conversation 83097
con_count 1000, traindata_count 1942
con_count 2000, traindata_count 3782
con_count 3000, traindata_count 6095
con_count 4000, traindata_count 7888
con_count 5000, traindata_count 10079
con_count 6000, traindata_count 12042
con_count 7000, traindata_count 13890
con_count 8000, traindata_count 15781
con_count 9000, traindata_count 17571
con_count 10000, traindata_count 19183
con_count 11000, traindata_count 21543
con_count 12000, traindata_count 24160
con_count 13000, traindata_count 26267
con_count 14000, traindata_count 28381
con_count 15000, traindata_count 30743
con_count 16000, traindata_count 32700
con_count 17000, traindata_count 34766
con_count 18000, traindata_count 36692
con_count 19000, traindata_count 38768
con_count 20000, traindata_count 40717
con_count 21000, traindata_count 42566
con_count 22000, traindata_count 44584
con_count 23000, traindata_count 46159
con_count 24000, traindata_count 48564
con_count 25000, traindata_count 50524
con_count 26000

In [136]:
ts = time.time()
conversations = []
print('len conversation', len(raw_movie_conversations))
con_count = 0
traindata_count = 0
for conversation in raw_movie_conversations:
    conversation = conversation.split(' +++$+++ ')[-1]
    conversation = conversation.replace('[', '')
    conversation = conversation.replace(']', '')
    conversation = conversation.replace('\'', '')
    conversation = conversation.split(', ')
    assert len(conversation) > 1
    for i in range(len(conversation)-1):
        con_a = utterance_dict[conversation[i]]
        con_b = utterance_dict[conversation[i+1]]
        if len(con_a.split()) <= 22 and len(con_b.split()) <= 22:
            con_a = [refine(w) for w in con_a.lower().split()]
            # con_a = [word_vector[w] if w in word_vector else np.zeros(WORD_VECTOR_SIZE) for w in con_a]
            conversations.append((con_a, con_b))
            traindata_count += 1
    con_count += 1
    if con_count % 1000 == 0:
        print('con_count {}, traindata_count {}'.format(con_count, traindata_count))
pickle.dump(conversations, open('data/conversations_lenmax22', 'wb'), True)
print("Time Elapsed: {} secs\n".format(time.time() - ts))

len conversation 83097
con_count 1000, traindata_count 2049
con_count 2000, traindata_count 3996
con_count 3000, traindata_count 6425
con_count 4000, traindata_count 8353
con_count 5000, traindata_count 10654
con_count 6000, traindata_count 12707
con_count 7000, traindata_count 14666
con_count 8000, traindata_count 16673
con_count 9000, traindata_count 18578
con_count 10000, traindata_count 20317
con_count 11000, traindata_count 22826
con_count 12000, traindata_count 25611
con_count 13000, traindata_count 27879
con_count 14000, traindata_count 30057
con_count 15000, traindata_count 32631
con_count 16000, traindata_count 34686
con_count 17000, traindata_count 36849
con_count 18000, traindata_count 38890
con_count 19000, traindata_count 41103
con_count 20000, traindata_count 43175
con_count 21000, traindata_count 45123
con_count 22000, traindata_count 47305
con_count 23000, traindata_count 48998
con_count 24000, traindata_count 51571
con_count 25000, traindata_count 53672
con_count 26000

# Data Reader
This reader generates trainable batches from the preprocessed training text from the data parser script.

In [137]:
class Data_Reader:
    def __init__(self, cur_train_index=0, load_list=False):
        self.training_data = pickle.load(open('data/conversations_lenmax22_formersents2_with_former', 'rb'))
        self.data_size = len(self.training_data)
        if load_list:
            self.shuffle_list = pickle.load(open('data/shuffle_index_list', 'rb'))
        else:    
            self.shuffle_list = self.shuffle_index()
        self.train_index = cur_train_index

    # get batch number from data
    def get_batch_num(self, batch_size):
        return self.data_size // batch_size

    #shuffle index from data
    def shuffle_index(self):
        shuffle_index_list = random.sample(range(self.data_size), self.data_size)
        pickle.dump(shuffle_index_list, open('data/shuffle_index_list', 'wb'), True)
        return shuffle_index_list

    #generate batch indeices based on batch numbers
    def generate_batch_index(self, batch_size):
        if self.train_index + batch_size > self.data_size:
            batch_index = self.shuffle_list[self.train_index:self.data_size]
            self.shuffle_list = self.shuffle_index()
            remain_size = batch_size - (self.data_size - self.train_index)
            batch_index += self.shuffle_list[:remain_size]
            self.train_index = remain_size
        else:
            batch_index = self.shuffle_list[self.train_index:self.train_index+batch_size]
            self.train_index += batch_size
        return batch_index

    # generate training batches
    def generate_training_batch(self, batch_size):
        batch_index = self.generate_batch_index(batch_size)
        batch_X = [self.training_data[i][0] for i in batch_index]   # batch_size of conv_a
        batch_Y = [self.training_data[i][1] for i in batch_index]   # batch_size of conv_b
        return batch_X, batch_Y

    # generate training batches with previous former batches
    def generate_training_batch_with_former(self, batch_size):
        batch_index = self.generate_batch_index(batch_size)
        batch_X = [self.training_data[i][0] for i in batch_index]   # batch_size of conv_a
        batch_Y = [self.training_data[i][1] for i in batch_index]   # batch_size of conv_b
        former = [self.training_data[i][2] for i in batch_index]    # batch_size of former utterance
        return batch_X, batch_Y, former

    # generate testing batches
    def generate_testing_batch(self, batch_size):
        batch_index = self.generate_batch_index(batch_size)
        batch_X = [self.training_data[i][0] for i in batch_index]   # batch_size of conv_a
        return batch_X

# Helper
seq2seq dialog generator is used for the reverse model of backward entropy loss. This determines the reward for semantic coherence in the policy gradient dialogue. In other words it helps represent future reward based on LSTM (i.e encoding, decoding and generating builds). Feature extraction script gets features and characteristics from data to help improve training.

In [138]:
# if RL is set to true a scaler is computed based on semantic coherence and ease of answering loss caption.
@tf.function
def model_inputs(embed_dim, reinforcement= False):#tensorflow v1 issues.
    word_vectors = tf.placeholder(tf.float32, [None, None, embed_dim], name = "word_vectors")
    reward = tf.placeholder(tf.float32, shape = (), name = "rewards")
    caption = tf.placeholder(tf.int32, [None, None], name = "captions")
    caption_mask = tf.placeholder(tf.float32, [None, None], name = "caption_masks")
    if reinforcement: # Normal training returns only the word_vectors, caption and caption_mask placeholders, 
        # With reinforcement learning, there is an extra placeholder for rewards
        return word_vectors, caption, caption_mask, reward
    else:
        return word_vectors, caption, caption_mask

In [139]:
# performs encoding for sequence to sequence network. The input sequence is passed into the encoder and returns both an ouput RNN and the state
def encoding_layer(word_vectors, lstm_size, num_layers, keep_prob, vocab_size):
    cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(lstm_size), keep_prob) for _ in range(num_layers)])
    outputs, state = tf.nn.rnn_cell.dynamic_rnn(cells, word_vectors, dtype=tf.float32)
    return outputs, state

In [140]:
# training process for decoder using LSTM cells and encoder states 
def decode_train(enc_state, dec_cell, dec_input, 
                         target_sequence_length,output_sequence_length,
                         output_layer, keep_prob):
    #Apply dropout to the LSTM cell
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob=keep_prob)
    
    #Training helper for decoder
    helper = tf.contrib.seq2seq.TrainingHelper(dec_input, target_sequence_length)

    decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, enc_state, output_layer)

    # unrolling the decoder layer
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, impute_finished=True, maximum_iterations=output_sequence_length)
    return outputs


In [141]:
# generates an inference decoder that makes use of greedy helpter which feeds last output of decoder as next decoder input.
# returns training logits and sample id
def decode_generate(encoder_state, dec_cell, dec_embeddings,
                         target_sequence_length,output_sequence_length,
                         vocab_size, output_layer, batch_size, keep_prob):
    #dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob=keep_prob)
    dec_cell = tf.compat.v1.nn.rnn_cell.DropoutWrapper(dec_cell, output_keep_prob=keep_prob)
    
    #Decoder helper for inference
    #helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings, tf.fill([batch_size], 1), 2)
    helper = tfa.seq2seq.GreedyEmbeddingSampler(dec_embeddings, tf.fill([batch_size], 1), 2)
    
    #decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, encoder_state, output_layer)
    decoder = tfa.seq2seq.BasicDecoder(dec_cell, helper, encoder_state, output_layer)
    
    #outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, impute_finished=True, maximum_iterations=output_sequence_length)
    outputs, _, _ = tfa.seq2seq.dynamic_decode(decoder, impute_finished=True, maximum_iterations=output_sequence_length)
    return outputs

In [142]:
# decodes the encoded layer
def decoding_layer(dec_input, enc_state,
                   target_sequence_length,output_sequence_length,
                   lstm_size,
                   num_layers,n_words,
                   batch_size, keep_prob,embedding_size, Train = True):
    target_vocab_size = n_words
    with tf.device("/cpu:0"):
        dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size,embedding_size], -0.1, 0.1), name='Wemb')
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(lstm_size) for _ in range(num_layers)])
    with tf.variable_scope("decode"):
        output_layer = tf.layers.Dense(target_vocab_size)
    
    if Train:
        with tf.variable_scope("decode"):
            train_output = decode_train(enc_state, 
                                                cells, 
                                                dec_embed_input, 
                                                target_sequence_length, output_sequence_length,
                                                output_layer, 
                                                keep_prob)

    with tf.variable_scope("decode", reuse=tf.AUTO_REUSE):
        infer_output = decode_generate(enc_state, 
                                            cells, 
                                            dec_embeddings, target_sequence_length,
                                           output_sequence_length,
                                            target_vocab_size, 
                                            output_layer,
                                            batch_size,
                                            keep_prob)
    if Train:
        return train_output, infer_output
    return infer_output


In [143]:
# appends the index corresponding to <bos> which is the beginning of a sentence to the first index of the capton tensor for every batch.
def bos_inclusion(caption,batch_size):
 
    sliced_target = tf.strided_slice(caption, [0,0], [batch_size, -1], [1,1])
    concat = tf.concat([tf.fill([batch_size, 1],1), sliced_target],1)
    return concat


In [144]:
# creates an array of size maxlen from every question by padding with zeros or trucating where applicable
def pad_sequences(questions, sequence_length =22):
    lengths = [len(x) for x in questions]
    num_samples = len(questions)
    x = np.zeros((num_samples, sequence_length)).astype(int)
    for idx, sequence in enumerate(questions):
        if not len(sequence):
            continue  # empty list/array was found
        truncated  = sequence[-sequence_length:]

        truncated = np.asarray(truncated, dtype=int)

        x[idx, :len(truncated)] = truncated

In [145]:
# only takes non numerical data
def refine(data):
    words = re.findall("[a-zA-Z'-]+", data)
    words = ["".join(word.split("'")) for word in words]
    data = ' '.join(words)
    return data

In [146]:
# create batches to feed into network from word vector representation
def make_batch_input(batch_input, input_sequence_length, embed_dims, word2vec):
    
    for i in range(len(batch_input)):
        
        batch_input[i] = [word2vec[w] if w in word2vec else np.zeros(embed_dims) for w in batch_input[i]]
        if len(batch_input[i]) >input_sequence_length:
            batch_input[i] = batch_input[i][:input_sequence_length]
        else:
            for _ in range(input_sequence_length - len(batch_input[i])):
                batch_input[i].append(np.zeros(embed_dims))
    return np.array(batch_input)


In [147]:
def replace(target,symbols):  #Remove symbols from sequence
    for symbol in symbols:
        target = list(map(lambda x: x.replace(symbol,''),target))
    return target

In [148]:
def make_batch_target(batch_target, word_to_index, target_sequence_length):
    target = batch_target
    target = list(map(lambda x: '<bos> ' + x, target))
    symbols = ['.', ',', '"', '\n','?','!','\\','/']
    target = replace(target, symbols)

    for idx, each_cap in enumerate(target):
        word = each_cap.lower().split(' ')
        if len(word) < target_sequence_length:
            target[idx] = target[idx] + ' <eos>'  #Append the end of symbol symbol 
        else:
            new_word = ''
            for i in range(target_sequence_length-1):
                new_word = new_word + word[i] + ' '
            target[idx] = new_word + '<eos>'
            
    target_index = [[word_to_index[word] if word in word_to_index else word_to_index['<unk>'] for word in 
                          sequence.lower().split(' ')] for sequence in target]
    #print(target_index[0])
    
    caption_matrix = pad_sequences(target_index,target_sequence_length)
    caption_matrix = np.hstack([caption_matrix, np.zeros([len(caption_matrix), 1])]).astype(int)
    caption_masks = np.zeros((caption_matrix.shape[0], caption_matrix.shape[1]))
    nonzeros = np.array(list(map(lambda x: (x != 0).sum(), caption_matrix)))
    #print(nonzeros)
    #print(caption_matrix[1])
    
    for ind, row in enumerate(caption_masks): #Set the masks as an array of ones where actual words exist and zeros otherwise
        row[:nonzeros[ind]] = 1                 
        #print(row)
    print(caption_masks[0])
    print(caption_matrix[0])
    return caption_matrix,caption_masks   


In [149]:
def generic_batch(generic_responses, batch_size, word_to_index, target_sequence_length):
    size = len(generic_responses) 
    if size > batch_size:
        generic_responses = generic_responses[:batch_size]
    else:
        for j in range(batch_size - size):
            generic_responses.append('')
    return make_batch_target(generic_responses, word_to_index, target_sequence_length)


In [150]:
# generate sentences from the predicted indices of word with the next highest probability
def index2sentence(generated_word_index, prob_logit, ixtoword):
    generated_word_index = list(generated_word_index)
    for i in range(len(generated_word_index)):
        if generated_word_index[i] == 3 or generated_word_index[i] == 0:
            sort_prob_logit = sorted(prob_logit[i])
            curindex = np.where(prob_logit[i] == sort_prob_logit[-2])[0][0]
            count = 1
            while curindex <= 3:
                curindex = np.where(prob_logit[i] == sort_prob_logit[(-2)-count])[0][0]
                count += 1

            generated_word_index[i] = curindex

    generated_words = []
    for ind in generated_word_index:
        generated_words.append(ixtoword[ind])    
    generated_sentence = ' '.join(generated_words)
    generated_sentence = generated_sentence.replace('<bos> ', '')  #Replace the beginning of sentence tag
    generated_sentence = generated_sentence.replace('<eos>', '')   #Replace the end of sentence tag
    generated_sentence = generated_sentence.replace('--', '')      #Replace the other symbols predicted
    generated_sentence = generated_sentence.split('  ')
    for i in range(len(generated_sentence)):       #Begin sentences with Upper case 
        generated_sentence[i] = generated_sentence[i].strip()
        if len(generated_sentence[i]) > 1:
            generated_sentence[i] = generated_sentence[i][0].upper() + generated_sentence[i][1:] + '.'
        else:
            generated_sentence[i] = generated_sentence[i].upper()
    generated_sentence = ' '.join(generated_sentence)
    generated_sentence = generated_sentence.replace(' i ', ' I ')
    generated_sentence = generated_sentence.replace("i'm", "I'm")
    generated_sentence = generated_sentence.replace("i'd", "I'd")

    return generated_sentence

# Seq2Seq Chatbot
This is the policy gradient model which combines reinforcement learning with cross-entropy loss. Policy gradient is based on LSTM encoder-decoder. policy gradient is stochastic based on a probability distribution of actions depending on the state. The policy gradient loss to minimise is also defined here

In [151]:
# our chatbot class for our model
class Chatbot():
    def __init__(self, embed_dim, vocab_size, lstm_size, batch_size, input_sequence_length, target_sequence_length, learning_rate =0.0001, keep_prob = 0.5, num_layers = 1, policy_gradients = False, Training = True):
        self.embed_dim = embed_dim
        self.lstm_size = lstm_size
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.input_sequence_length = tf.fill([self.batch_size],input_sequence_length+1)
        self.target_sequence_length = tf.fill([self.batch_size],target_sequence_length+1)
        self.output_sequence_length = target_sequence_length +1
        self.learning_rate = learning_rate
        self.keep_prob = keep_prob
        self.num_layers = num_layers
        self.policy_gradients = policy_gradients
        self.Training = Training

    # if policy gradient requested we input accordingly
    def build_model(self):
        if self.policy_gradients:
            word_vectors, caption, caption_mask, rewards = model_inputs(self.embed_dim, True)
            place_holders = {'word_vectors': word_vectors,
                'caption': caption,
                'caption_mask': caption_mask, "rewards": rewards
                             }
        else:
            word_vectors, caption, caption_mask = model_inputs(self.embed_dim)
            
            place_holders = {'word_vectors': word_vectors,
                'caption': caption,
                'caption_mask': caption_mask}
        enc_output, enc_state = encoding_layer(word_vectors, self.lstm_size, self.num_layers,
                                         self.keep_prob, self.vocab_size)
        #dec_inp = bos_inclusion(caption, self.batch_size)
        dec_inp = caption

        # gets our inference layer
        if not self.Training:
            print("Test mode")
            inference_out = decoding_layer(dec_inp, enc_state,self.target_sequence_length, 
                                                    self.output_sequence_length,
                                                    self.lstm_size, self.num_layers,
                                                    self.vocab_size, self.batch_size,
                                                  self.keep_prob, self.embed_dim, False)
            logits = tf.identity(inference_out.rnn_output, name = "train_logits")
            predictions = tf.identity(inference_out.sample_id, name = "predictions")
            return place_holders, predictions, logits 
        #gets our loss layer
        train_out, inference_out = decoding_layer(dec_inp, enc_state,self.target_sequence_length, 
                                                    self.output_sequence_length,
                                                    self.lstm_size, self.num_layers,
                                                    self.vocab_size, self.batch_size,
                                                  self.keep_prob, self.embed_dim)

        training_logits = tf.identity(train_out.rnn_output, name = "train_logits")
        prediction_logits = tf.identity(inference_out.sample_id, name = "predictions")
        
        cross_entropy = tf.contrib.seq2seq.sequence_loss(training_logits, caption, caption_mask)
        losses = {"entropy": cross_entropy}

        # will either minimise cross entropy loss or policy gradient loss depending on our state
        if self.policy_gradients:
            pg_loss = tf.contrib.seq2seq.sequence_loss(training_logits, caption, caption_mask*rewards)
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(pg_loss)
            losses.update({"pg":pg_loss}) 
        else:
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(cross_entropy)
                
        return optimizer, place_holders,prediction_logits,training_logits, losses

# Train Model
This is the policy gradient model which combines reinforcement learning with cross-entropy loss. Policy gradient is based on LSTM encoder-decoder. policy gradient is stochastic based on a probability distribution of actions depending on the state. The policy gradient loss to minimise is also defined here.

In [152]:
#load either the foward or revese sequence to sequence model
def train(type_, epochs=50, checkpoint=False):
    tf.compat.v1.reset_default_graph()
    if type_ == "forward":
        path = "model/forward/seq2seq"
        dr = Data_Reader(load_list=False) #forward
    else:
        dr = Data_Reader(load_list=True) #reverse
        path = "model/reverse/seq2seq"
    # create vocab
    word_to_index, index_to_word, _ = preProBuildWordVocab(word_count_threshold=word_count_threshold)
    #load wor to vector model from gensim
    word_vector = KeyedVectors.load_word2vec_format('model/word_vector.bin', binary=True)

    #build chatbot model and pass in variables defined earlier.
    model = Chatbot(dim_wordvec, len(word_to_index), dim_hidden, batch_size,
                    input_sequence_length, output_sequence_length, learning_rate)
    optimizer, place_holders, predictions, logits, losses = model.build_model()
    saver = tf.train.Saver()
    sess = tf.InteractiveSession()

    #restore checkpoint if continuing from a previous run or initialise the graph
    if checkpoint:
        saver.restore(sess, path)
        print("checkpoint restored at path: {}".format(path))
    else:
        tf.global_variables_initializer().run()

    #start iterating through episodes and batches to train
    for epoch in range(epochs):
        n_batch = dr.get_batch_num(batch_size=batch_size)
        for batch in range(n_batch):

            batch_input, batch_target = dr.generate_training_batch(batch_size)

            #the batch input has the list of words from the training sete. The batch target has a list of sentences for the input which will be the target.
            # the list of words is conerted to vector form using helper it then makes the feed dictionary for the graph
            inputs_ = make_batch_input(batch_input, input_sequence_length, dim_wordvec, word_vector)

            targets, masks = make_batch_target(batch_target, word_to_index, output_sequence_length)
            feed_dict = {
                place_holders['word_vectors']: inputs_,
                place_holders['caption']: targets,
                place_holders['caption_mask']: masks
            }
            #calls optimiser by feeding in training data and logs loss value to see progression
            _, loss_val, preds = sess.run([optimizer, losses["entropy"], predictions],
                                          feed_dict=feed_dict)

            if batch % display_interval == 0:
                print(preds.shape)
                print("Epoch: {}, batch: {}, loss: {}".format(epoch, batch, loss_val))
                print("===========================================================")

        saver.save(sess, path)
        print("Model saved at {}".format(path))
    print("Training Complete.")
    sess.close()

In [153]:
#modes are restored and trainined again to create the chatbot
def pg_train(epochs=1, checkpoint=False):
    tf.reset_default_graph()
    path = "model/reinforcement/seq2seq"
    word_to_index, index_to_word, _ = preProBuildWordVocab(word_count_threshold=word_count_threshold)
    word_vector = KeyedVectors.load_word2vec_format('model/word_vector.bin', binary=True)
    generic_caption, generic_mask = generic_batch(generic_responses, batch_size, word_to_index,
                                                    output_sequence_length)
    dr = Data_Reader()
    forward_graph = tf.Graph()
    reverse_graph = tf.Graph()
    default_graph = tf.get_default_graph()

    # create two graphs to load the trained model
    with forward_graph.as_default():
        pg_model = Chatbot(dim_wordvec, len(word_to_index), dim_hidden, batch_size,
                           input_sequence_length, output_sequence_length, learning_rate, policy_gradients=True)
        optimizer, place_holders, predictions, logits, losses = pg_model.build_model()
        sess = tf.InteractiveSession()
        saver = tf.train.Saver()
        if checkpoint:
            saver.restore(sess, path)
            print("checkpoint restored at path: {}".format(path))
        else:
            tf.global_variables_initializer().run()
            saver.restore(sess, 'model/forward/seq2seq')
        # tf.global_variables_initializer().run()
        with reverse_graph.as_default():
            model = Chatbot(dim_wordvec, len(word_to_index), dim_hidden, batch_size,
                            input_sequence_length, output_sequence_length, learning_rate)
            _, rev_place_holders, _, _, reverse_loss = model.build_model()
            sess2 = tf.InteractiveSession()
            saver2 = tf.train.Saver()

            saver2.restore(sess2, "model/reverse/seq2seq")
            print("reverse model restored")

        dr = Data_Reader(load_list=True)

    # load data to train in batches
    for epoch in range(epochs):
        n_batch = dr.get_batch_num(batch_size=batch_size)
        for batch in range(n_batch):

            batch_input, batch_caption, prev_utterance = dr.generate_training_batch_with_former(batch_size)
            targets, masks = make_batch_target(batch_caption, word_to_index, output_sequence_length)
            inputs_ = make_batch_input(batch_input, input_sequence_length, dim_wordvec, word_vector)

            word_indices, probabilities = sess.run([predictions, logits],
                                                   feed_dict={place_holders['word_vectors']: inputs_

                                                       , place_holders["caption"]: targets})

            sentence = [index2sentence(generated_word, probability, index_to_word) for
                        generated_word, probability in zip(word_indices, probabilities)]

            word_list = [word.split() for word in sentence]

            generic_test_input = make_batch_input(word_list, input_sequence_length, dim_wordvec, word_vector)

            forward_coherence_target, forward_coherence_masks = make_batch_target(sentence,
                                                                                    word_to_index,
                                                                                    output_sequence_length)

            generic_loss = 0.0

            #learns when to say generic texts
            for response in generic_test_input:
                sentence_input = np.array([response] * batch_size)
                feed_dict = {place_holders['word_vectors']: sentence_input,
                            place_holders['caption']: generic_caption,
                            place_holders['caption_mask']: generic_mask,
                            }
                generic_loss_i = sess.run(losses["entropy"], feed_dict=feed_dict)
                generic_loss -= generic_loss_i / batch_size

            # print("generic loss work: {}".format(generic_loss))

            feed_dict = {place_holders['word_vectors']: inputs_,
                        place_holders['caption']: forward_coherence_target,
                        place_holders['caption_mask']: forward_coherence_masks,
                        }

            forward_entropy = sess.run(losses["entropy"], feed_dict=feed_dict)

            previous_utterance, previous_mask = make_batch_target(prev_utterance,
                                                                    word_to_index, output_sequence_length)

            feed_dict = {rev_place_holders['word_vectors']: generic_test_input,
                        rev_place_holders['caption']: previous_utterance,
                        rev_place_holders['caption_mask']: previous_mask,}
            reverse_entropy = sess2.run(reverse_loss["entropy"], feed_dict=feed_dict)

            rewards = 1 / (1 + np.exp(-reverse_entropy - forward_entropy - generic_loss))

            feed_dict = {place_holders['word_vectors']: inputs_,
                        place_holders['caption']: targets,
                        place_holders['caption_mask']: masks,
                        place_holders['rewards']: rewards}
            _, loss_pg, loss_ent = sess.run([optimizer, losses["pg"], losses["entropy"]], feed_dict=feed_dict)

            if batch % display_interval == 0:
                print("Epoch: {}, batch: {}, Entropy loss: {}, Policy gradient loss: {}".format(epoch, batch, loss_ent, loss_pg))                                                              
                print("rewards: {}".format(rewards))
                print("===========================================================")
        saver.save(sess, path)
        print("Model saved at {}".format(path))
    print("Training done")

In [154]:
#policy gradient trained to avoid.
generic_responses = [
    "I don't know what you're talking about.", 
    "I don't know.", 
    "You don't know.",
    "You know what I mean.", 
    "I know what you mean.", 
    "You know what I'm saying.",
    "You don't know anything."
]

#training parameters
checkpoint = True

forward_model_path = 'model/forward'
reversed_model_path = 'model/reversed'
rl_model_path = "model/rl"
model_name = 'seq2seq'
word_count_threshold = 20
reversed_word_count_threshold = 6
dim_wordvec = 300
dim_hidden = 1000
input_sequence_length = 22
output_sequence_length = 22
learning_rate = 0.0001
epochs = 1
batch_size = 200

display_interval = 100

In [155]:
#train a feed forward model then a reverse model
#direction type, #of epochs
train('forward', 50, False)
train('reverse', 50, False)
pg_train(100, False)

#https://researchdatapod.com/how-to-solve-python-attributeerror-module-tensorflow-has-no-attribute-placeholder/

preprocessing word counts and creating vocab based on word count threshold 20
filtered words from 76029 to 6847


  cells = tf.compat.v1.nn.rnn_cell.MultiRNNCell([tf.compat.v1.nn.rnn_cell.DropoutWrapper(tf.compat.v1.nn.rnn_cell.LSTMCell(lstm_size), keep_prob) for _ in range(num_layers)])


ValueError: as_list() is not defined on an unknown TensorShape.

# Test Model
Once trained we test the chatbot on unseen dialogue. The bot is trying to be reasonably coherent. Context is important, depending on dataset used, responses will be in that context. Performance is measured as informativeness, high coherence and simplicity in answering.

In [None]:
#load data and model
def test(model_path=forward_model_path):
    testing_data = open(path_to_questions, 'r').read().split('\n')
    word_vector = KeyedVectors.load_word2vec_format('model/word_vector.bin', binary=True)

    _, index_to_word, _ = preProBuildWordVocab(word_count_threshold=word_count_threshold)

    model = Chatbot(dim_wordvec, len(index_to_word), dim_hidden, batch_size,
                            input_sequence_length, target_sequence_length, Training=False)

    place_holders, predictions, logits = model.build_model()

    sess = tf.InteractiveSession()

    saver = tf.train.Saver()

    saver.restore(sess, model_path)

#open response file and prepare list of questions
    with open(responses_path, 'w') as out:

        for idx, question in enumerate(testing_data):
            print('question =>', question)

            question = [refine(w) for w in question.lower().split()]
            question = [word_vector[w] if w in word_vector else np.zeros(dim_wordvec) for w in question]
            question.insert(0, np.random.normal(size=(dim_wordvec,)))  # insert random normal at the first step

            if len(question) > input_sequence_length:
                question = question[:input_sequence_length]
            else:
                for _ in range(input_sequence_length - len(question)):
                    question.append(np.zeros(dim_wordvec))

            question = np.array([question])

            feed_dict = {place_holders["word_vectors"]: np.concatenate([question] * 2, 0),
                         }

            word_indices, prob_logit = sess.run([predictions, logits], feed_dict=feed_dict)

            # print(word_indices[0].shape)
            generated_sentence = index2sentence(word_indices[0], prob_logit[0], index_to_word)

            print('generated_sentence =>', generated_sentence)
            out.write(generated_sentence + '\n')

In [None]:
# reference path to trained models
reinforcement_model_path = "model/reinforcement/seq2seq"
forward_model_path = "model/forward/seq2seq"
reverse_model_path = "model/reverse/seq2seq"

# reference to sample questions and responses
path_to_questions = 'results/sample_input.txt'
responses_path = 'results/sample_output_RL.txt'

#define model hyper parameters
word_count_threshold = 20
dim_wordvec = 300
dim_hidden = 1000
input_sequence_length = 25
target_sequence_length = 22
batch_size = 2

In [None]:
test(reinforcement_model_path)