In [29]:
import numpy as np
import nltk
import os
import ast
import pickle
import tensorflow as tf
import random
import configparser
from tqdm import tqdm
import string

In [2]:
cwd = os.getcwd()
#print(cwd)
corpusDir = os.path.join(cwd, 'data/cornell')
#print(corpusDir)

In [3]:
lines = {}
with open(os.path.join(corpusDir, 'movie_lines.txt'), 'r', encoding='iso-8859-1') as f:
    for line in f:
        #print(line)
        fields = line.split(' +++$+++ ')
        #print(fields)
        obj = {}
        obj['lineID'] = fields[0]
        obj['characterID'] = fields[1]
        obj['movieID'] = fields[2]
        obj['characterName'] = fields[3]
        obj['text'] = fields[4]
        lines[fields[0]] = obj
#print(lines)

In [4]:
conversations = []
with open(os.path.join(corpusDir, 'movie_conversations.txt'), 'r', encoding='iso-8859-1') as f:
    for line in f:
        #print(line)
        fields = line.split(' +++$+++ ')
        #print(fields)
        obj = {}
        obj['character1ID'] = fields[0]
        obj['character2ID'] = fields[1]
        obj['movieID'] = fields[2]
        #obj['lineIDs'] = fields[3]
        #print(obj)
        lineIDs = ast.literal_eval(fields[3])
        #print(lineIDs)
        obj['lineIDs'] = lineIDs
        #print(obj)
        obj['lines'] = []
        for lineID in lineIDs:
            #print(lineID, "--", lines[lineID])
            obj['lines'].append(lines[lineID])
        conversations.append(obj)
#print(conversations)

In [5]:
wordIDMap = {}
IDWordMap = {}
unknownToken = -1
trainingSamples = []
goToken = -1
eosToken = -1
padToken = -1
sentMaxLength = 10 #maximum length of an input or output sentence
encoderMaxLength = sentMaxLength
decoderMaxLength = sentMaxLength + 2

In [6]:
def getWordID(word, shouldAddToDict=True):
    word = word.lower()
    wordID = wordIDMap.get(word, -1)
    if wordID == -1:
        if shouldAddToDict:
            wordID = len(wordIDMap)
            wordIDMap[word] = wordID
            IDWordMap[wordID] = word
        else:
            wordID = unknownToken
    return wordID

In [7]:
def getWordsFromLine(line, isReply=False):
    '''Returns the word IDs from the vovabulary'''
    words = []
    sentences = nltk.sent_tokenize(line)
    #print(sentences)
    # Since we are limited by a maxmimum length of sentences, we keep the last lines if the statement is a question/input
    # and the first few lines if the statement is an answer/reply
    for i in range(len(sentences)):
        if not isReply:
            i = len(sentences) - 1 - i
        tokensFromCurrSent = nltk.word_tokenize(sentences[i])
        #print(tokensFromCurrSent)
        if len(words) + len(tokensFromCurrSent) > sentMaxLength:
            break
        else:
            temp = []
            for token in tokensFromCurrSent:
                temp.append(getWordID(token))
            if isReply:
                words = words + temp
            else:
                words = temp + words # Append in the reverse order because we're considering the last few lines
    return words

In [8]:
padToken = getWordID('<pad>')
unknownToken = getWordID('<unknown>')
eosToken = getWordID('<eos>')
goToken = getWordID('<go>')
for conversation in conversations:
    #print(conversation)
    for i in range(len(conversation['lines']) - 1):
        #print(conversation['lines'][i])
        inputStatement = conversation['lines'][i]
        #print(inputStatement)
        replyStatement = conversation['lines'][i + 1]
        inputWords = getWordsFromLine(inputStatement['text'])
        replyWords = getWordsFromLine(replyStatement['text'], True)
        #print(inputWords)
        #print(replyWords)
        
        if inputWords and replyWords:
            trainingSamples.append([inputWords, replyWords])
#print(trainingSamples)

In [9]:
print("Saving dataset samples ...")
with open(os.path.join(cwd, 'data/samples', 'sampleData.pkl'), 'wb') as f:
    data = {
        'wordIDMap': wordIDMap,
        'IDWordMap': IDWordMap,
        'trainingSamples': trainingSamples
    }
    pickle.dump(data, f, -1)
print('Done')

Saving dataset samples ...
Done


In [10]:
#Parameters
globalStep = 85
cellUnitCount = 512
numOfLayers = 2
embeddingSize = 64
learningRate = 0.02
batchSize = 256
dropout = 0.9
softmaxSamples = 0
numOfEpochs = 2

In [11]:
def make_lstm_cell():
    encoderDecoderCell = tf.contrib.rnn.BasicLSTMCell(cellUnitCount)
    encoderDecoderCell = tf.contrib.rnn.DropoutWrapper(encoderDecoderCell, input_keep_prob=1.0, output_keep_prob=dropout)
    return encoderDecoderCell

In [12]:
#Expand the list comprehension below
encoderDecoderCell = tf.contrib.rnn.MultiRNNCell(
    [make_lstm_cell() for _ in range(numOfLayers)],
)

In [13]:
with tf.name_scope('encoder'):
    encoderInputs = [tf.placeholder(tf.int32, [None, ]) for _ in range(sentMaxLength)]
with tf.name_scope('decoder'):
    decoderInputs = [tf.placeholder(tf.int32, [None, ], name="inputs") for _ in range(sentMaxLength + 2)]
    decoderTargets = [tf.placeholder(tf.int32, [None, ], name="targets") for _ in range(sentMaxLength + 2)]
    decoderWeights = [tf.placeholder(tf.float32, [None, ], name="weights") for _ in range(sentMaxLength + 2)]

In [14]:
#Verify this - is different from the existing
decoderOutput, state = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
    encoderInputs,
    decoderInputs,
    encoderDecoderCell,
    len(wordIDMap),
    len(wordIDMap),
    embeddingSize,
    output_projection=None,
    feed_previous=False
)

In [15]:
lossFunc = tf.contrib.legacy_seq2seq.sequence_loss(
    decoderOutput,
    decoderTargets,
    decoderWeights,
    len(wordIDMap),
    softmax_loss_function=None
)
tf.summary.scalar('loss', lossFunc)

<tf.Tensor 'loss:0' shape=() dtype=string>

In [16]:
optimizer = tf.train.AdamOptimizer(
    learning_rate=learningRate,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-08
)
optimizationOperation = optimizer.minimize(lossFunc)

In [17]:
writer = tf.summary.FileWriter('seq2seq')
saver = tf.train.Saver(max_to_keep=200)

In [18]:
sess = tf.Session(
    config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False
    )
)
sess.run(tf.global_variables_initializer())

In [19]:
#Change variable scope name
with tf.variable_scope("embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True):
    in_embedding = tf.get_variable("embedding")
with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True):
    out_embedding = tf.get_variable("embedding")

embedding_vars = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)
embedding_vars.remove(in_embedding)
embedding_vars.remove(out_embedding)

'''
if globalStep != 0:
    return
'''

'\nif globalStep != 0:\n    return\n'

In [20]:
with open(os.path.join(cwd, 'data/word2vec/GoogleNews-vectors-negative300.bin'), "rb", 0) as f:
    header = f.readline().split()
    #print(header)
    vocabulary_size = int(header[0])
    word_vector_size = int(header[1])
    #print('{}, {}'.format(vocabulary_size, word_vector_size))
    binary_length = np.dtype('float32').itemsize * word_vector_size
    #print(binary_length)
    initial_weights = np.random.uniform(-0.25, 0.25, (len(wordIDMap), word_vector_size))
    #print(initial_weights)
    for line in range(word_vector_size):
        word = []
        while True:
            ch = f.read(1)
            if ch == b' ':
                word = b''.join(word).decode('utf-8')
                break
            if ch != b'\n':
                word.append(ch)
        if word in wordIDMap:
            initial_weights[wordIDMap[word]] = np.fromstring(f.read(binary_length), dtype='float32')
        else:
            f.read(binary_length)

In [21]:
if embeddingSize < word_vector_size:
    u, s, vt = np.linalg.svd(initial_weights, full_matrices=False)
    S = np.zeros((word_vector_size, word_vector_size), dtype=complex)
    S[:word_vector_size, :word_vector_size] = np.diag(s)
    initial_weights = np.dot(u[:, :embeddingSize], S[:embeddingSize, :embeddingSize])

In [22]:
sess.run(in_embedding.assign(initial_weights))
sess.run(out_embedding.assign(initial_weights))

  nparray = values.astype(dtype.as_numpy_dtype)


array([[ 0.15700914, -0.08681116,  0.06051316, ...,  0.23852083,
        -0.16272853,  0.01847846],
       [ 0.2192065 ,  0.24926345,  0.01855532, ..., -0.17493464,
         0.03990545,  0.01945812],
       [ 0.07839889, -0.28233093,  0.29489046, ..., -0.09639359,
        -0.01002899,  0.09629203],
       ..., 
       [-0.10829171,  0.24631451,  0.12743433, ...,  0.08865245,
        -0.30084032, -0.26996762],
       [ 0.03106795, -0.21142645,  0.02663859, ..., -0.07136977,
         0.14422165,  0.14177349],
       [ 0.14966476,  0.06425916,  0.28310198, ..., -0.0233477 ,
        -0.12494318, -0.10956229]], dtype=float32)

In [23]:
def generateNextSample():
    for i in range(0, len(trainingSamples), batchSize):
        yield trainingSamples[i:min(i + batchSize, len(trainingSamples))]

In [24]:
class Batch:
    def __init__(self):
        self.encoderSeqs = []
        self.decoderSeqs = []
        self.targetSeqs = []
        self.weights = []

In [27]:
def saveModel():
    print('Saving model checkpoint...')
    model_name = 'model_' + globalStep + '.ckpt'
    if globalStep == 30:
        model_name = 'model.ckpt'
    saver.save(sess, os.path.join(cwd, 'saved_model', model_name))
    print('Done')

In [28]:
def makeBatch(samples):
    batch = Batch()
    batchSize = len(samples)
    for i in range(batchSize):
        sample = samples[i]
        #print(sample)
        batch.encoderSeqs.append(list(reversed(sample[0])))
        batch.decoderSeqs.append([goToken] + sample[1] + [eosToken])
        batch.targetSeqs.append(batch.decoderSeqs[-1][1:])

        batch.encoderSeqs[i] = [padToken] * (encoderMaxLength - len(batch.encoderSeqs[i])) + batch.encoderSeqs[i]
        batch.weights.append([1.0] * len(batch.targetSeqs[i]) + [0.0] * (decoderMaxLength - len(batch.targetSeqs[i])))
        batch.decoderSeqs[i] = batch.decoderSeqs[i] + [padToken] * (decoderMaxLength - len(batch.decoderSeqs[i]))
        batch.targetSeqs[i] = batch.targetSeqs[i] + [padToken] * (decoderMaxLength - len(batch.targetSeqs[i]))

    encoderSeqListT = []
    for i in range(encoderMaxLength):
        encoderSeqT = []
        for j in range(batchSize):
            encoderSeqT.append(batch.encoderSeqs[j][i])
        encoderSeqListT.append(encoderSeqT)
    batch.encoderSeqs = encoderSeqListT

    decoderSeqListT = []
    targetSeqListT = []
    weightListT = []
    for i in range(decoderMaxLength):
        decoderSeqT = []
        targetSeqT = []
        weightT = []
        for j in range(batchSize):
            #print('j: {}, i:{}'.format(j,i))
            decoderSeqT.append(batch.decoderSeqs[j][i])
            targetSeqT.append(batch.targetSeqs[j][i])
            weightT.append(batch.weights[j][i])
        decoderSeqListT.append(decoderSeqT)
        targetSeqListT.append(targetSeqT)
        weightListT.append(weightT)
    batch.decoderSeqs = decoderSeqListT
    batch.targetSeqs = targetSeqListT
    batch.weights = weightListT
    return batch

In [None]:
# Training Loop
completeSummary = tf.summary.merge_all()
if globalStep == 0:
    writer.add_graph(sess.graph)
try:
    for epoch in range(numOfEpochs):
        print("\nEpoch {}".format(epoch+1))
        random.shuffle(trainingSamples)
        
        batches = []
        for samples in generateNextSample():
            makeBatch(samples)
            batches.append(batch)
        
        for batch in tqdm(batches, desc="Training"):
            feedDict = {}
            ops = None
            for i in range(encoderMaxLength):
                feedDict[encoderInputs[i]] = batch.encoderSeqs[i]
            for i in range(decoderMaxLength):
                feedDict[decoderInputs[i]] = batch.decoderSeqs[i]
                feedDict[decoderTargets[i]] = batch.targetSeqs[i]
                feedDict[decoderWeights[i]] = batch.weights[i]
            ops = (optimizationOperation, lossFunc)
            assert len(ops) == 2
            #print(feedDict)
            _, loss, summary = sess.run(ops + (completeSummary,), feedDict)
            writer.add_summary(summary, globalStep)
            globalStep =+ 1
            if globalStep % 100 == 0:
                perplexity = math.exp(float(loss))
                print("Step %d " % (globalStep))
                print("Loss %.2f" % (loss))
                print("Perplexity %.2f" % (perplexity))
            if globalStep % 10 == 0:
                saveModel()
except (KeyboardInterrupt, SystemExit):
    print('Exiting')
saveModel()
sess.close()


Epoch 1


Training:  10%|▉         | 67/690 [1:10:47<14:05:56, 81.47s/it]

In [32]:
saved_model_dir = 'model-pretrainedv2'
model_name = 'model.ckpt'
if os.path.exists(os.path.join(cwd, saved_model_dir, model_name)):
    print('Restoring model {}'.format(model_name))
    saver.restore(sess, os.path.join(cwd, 'model-pretrainedv2', 'model.ckpt'))
    print('Welcome to DeepChat! I am Alex. You can ask me questions and ponder on the answers I provide.')
    print('Type \'exit\' to end the chat.')
    while True:
        user_input = input('You: ')
        if user_input == '':
            print('Alex: Please say something! I don\'t like silence!')
        if user_input == 'exit':
            break
        inputSequence = []
        tokens = nltk.word_tokenize(user_input)
        if len(tokens) > sentMaxLength:
            print('I didn\'t understand! Please try a smaller sentence')
            continue
        wordIDs = []
        for token in tokens:
            wordIDs.append(getWordID(token, shouldAddToDict=False))
        batch = makeBatch([[wordIDs,[]]])
        inputSequence.extend(batch.encoderSeqs)
        feedDict = {}
        ops = None
        for i in range(encoderMaxLength):
            feedDict[encoderInputs[i]] = batch.encoderSeqs[i]
        feedDict[decoderInputs[0]] = [goToken]
        ops = (decoderOutput,)
        outputs = sess.run(ops[0], feedDict)
        outputSequence = []
        for output in outputs:
            outputSequence.append(np.argmax(output))
        responseTokens = []
        for wordID in outputSequence:
            if wordID == eosToken:
                break
            elif wordID != padToken and wordID != goToken:
                responseTokens.append(IDWordMap[wordID])
        response = ''
        for t in responseTokens:
            if not t.startswith('\'') and t not in string.punctuation:
                response.join(' ' + t)
            else:
                response.join(t)
        response = response.strip().capitalize()
        print(response)
        print()

Restoring model model.ckpt
Welcome to DeepChat! I am Alex. You can ask me questions and ponder on the answers I provide.
Type 'exit' to end the chat.
You: Hello


InvalidArgumentError: You must feed a value for placeholder tensor 'decoder/inputs_7' with dtype int32
	 [[Node: decoder/inputs_7 = Placeholder[dtype=DT_INT32, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'decoder/inputs_7', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-334e20f529c3>", line 4, in <module>
    decoderInputs = [tf.placeholder(tf.int32, [None, ], name="inputs") for _ in range(sentMaxLength + 2)]
  File "<ipython-input-13-334e20f529c3>", line 4, in <listcomp>
    decoderInputs = [tf.placeholder(tf.int32, [None, ], name="inputs") for _ in range(sentMaxLength + 2)]
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 1502, in placeholder
    name=name)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 2149, in _placeholder
    name=name)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2327, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/sayak/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1226, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'decoder/inputs_7' with dtype int32
	 [[Node: decoder/inputs_7 = Placeholder[dtype=DT_INT32, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
