In [1]:
import numpy as np
import nltk
import os
import ast
import pickle
import tensorflow as tf

In [2]:
# All data locations are resolved relative to the directory the notebook is launched from.
cwd = os.getcwd()
# Folder the loading cells read 'movie_lines.txt' and 'movie_conversations.txt' from.
corpusDir = os.path.join(cwd, 'data/lightweight')

In [3]:
# Index every utterance in movie_lines.txt by its line ID.
# Each record holds five ' +++$+++ '-separated fields, in this order:
#   lineID, characterID, movieID, characterName, text
# A record with fewer than five fields still raises IndexError, as before.
lines = {}
with open(os.path.join(corpusDir, 'movie_lines.txt'), 'r', encoding='iso-8859-1') as f:
    for line in f:
        # Remove only the line terminator. Stripping all whitespace would also eat
        # the trailing space of the last separator when the utterance text is empty,
        # which would leave the record one field short.
        line = line.rstrip('\n')
        if not line:
            continue  # tolerate blank lines instead of failing on fields[1]
        # maxsplit=4 keeps the utterance intact even if it contains the separator.
        fields = line.split(' +++$+++ ', 4)
        obj = {
            'lineID': fields[0],
            'characterID': fields[1],
            'movieID': fields[2],
            'characterName': fields[3],
            'text': fields[4],
        }
        lines[fields[0]] = obj

In [4]:
# Build the list of conversations from movie_conversations.txt.
# Each record holds four ' +++$+++ '-separated fields:
#   character1ID, character2ID, movieID, and a Python-literal list of line IDs.
# The line IDs are resolved against `lines`, so every conversation carries the
# full utterance records in the order they are listed.
conversations = []
conversationsPath = os.path.join(corpusDir, 'movie_conversations.txt')
with open(conversationsPath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        fields = line.split(' +++$+++ ')
        # literal_eval parses the "['L1', 'L2', ...]" text without executing code.
        lineIDs = ast.literal_eval(fields[3])
        conversations.append({
            'character1ID': fields[0],
            'character2ID': fields[1],
            'movieID': fields[2],
            'lineIDs': lineIDs,
            'lines': [lines[lineID] for lineID in lineIDs],
        })

In [5]:
# Vocabulary tables filled in by getWordID(): word -> ID and the inverse ID -> word.
wordIDMap, IDWordMap = {}, {}
# ID returned by getWordID() for an unseen word when it is told not to grow the vocabulary.
unknownToken = -1
# [inputWordIDs, replyWordIDs] pairs collected from consecutive conversation lines.
trainingSamples = []

In [6]:
def getWordID(word, shouldAddToDict=True):
    """Return the vocabulary ID of `word`, compared case-insensitively.

    Args:
        word: token to look up; it is lower-cased before the lookup.
        shouldAddToDict: when True (default) an unseen word is registered in
            both `wordIDMap` and `IDWordMap` under the next free ID
            (the current vocabulary size). When False the vocabulary is left
            untouched and `unknownToken` is returned for unseen words.

    Returns:
        The integer ID of the word, or `unknownToken`.
    """
    key = word.lower()
    knownID = wordIDMap.get(key, -1)
    if knownID != -1:
        return knownID
    if not shouldAddToDict:
        return unknownToken
    # IDs are handed out densely: the next ID is the current vocabulary size.
    newID = len(wordIDMap)
    wordIDMap[key] = newID
    IDWordMap[newID] = key
    return newID

In [7]:
sentMaxLength = 10  # maximum number of tokens kept from an input or output statement
def getWordsFromLine(line, isReply=False):
    """Convert a statement into a list of vocabulary word IDs.

    The statement is split into sentences and whole sentences are kept as long
    as the running token count stays within `sentMaxLength`; the first sentence
    that would exceed the limit stops the scan.

    Args:
        line: raw text of the statement.
        isReply: False (default) treats the statement as an input/question and
            keeps its LAST sentences; True treats it as a reply/answer and
            keeps its FIRST sentences.

    Returns:
        Word IDs of the kept sentences in their original reading order
        (an empty list when even the first sentence considered is too long).
    """
    sentences = nltk.sent_tokenize(line)
    # Inputs are scanned from the end of the statement, replies from the start.
    scanOrder = sentences if isReply else reversed(sentences)

    wordIDs = []
    for sentence in scanOrder:
        tokens = nltk.word_tokenize(sentence)
        if len(wordIDs) + len(tokens) > sentMaxLength:
            break
        sentenceIDs = [getWordID(token) for token in tokens]
        if isReply:
            wordIDs = wordIDs + sentenceIDs
        else:
            # Scanning backwards, so earlier sentences go in front.
            wordIDs = sentenceIDs + wordIDs
    return wordIDs

In [8]:
# Turn every pair of consecutive lines in a conversation into one training sample:
# the earlier line is the input, the following line is the reply.
for conversation in conversations:
    convLines = conversation['lines']
    for inputStatement, replyStatement in zip(convLines, convLines[1:]):
        inputWords = getWordsFromLine(inputStatement['text'])
        replyWords = getWordsFromLine(replyStatement['text'], True)

        # getWordsFromLine returns [] when nothing fits the length limit;
        # such pairs are not usable as samples.
        if inputWords and replyWords:
            trainingSamples.append([inputWords, replyWords])

In [9]:
# Persist the vocabulary tables and the training pairs in a single pickle file.
print("Saving dataset samples ...")
samplesDir = os.path.join(cwd, 'data/samples')
# open(..., 'wb') does not create missing folders, so make sure the target exists.
os.makedirs(samplesDir, exist_ok=True)
with open(os.path.join(samplesDir, 'sampleData.pkl'), 'wb') as f:
    data = {
        'wordIDMap': wordIDMap,
        'IDWordMap': IDWordMap,
        'trainingSamples': trainingSamples
    }
    # Same protocol the former literal -1 selected: the highest one available.
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
print('Done')

Saving dataset samples ...
Done


In [10]:
# Hyperparameters

# Network shape
cellUnitCount = 512  # unit count passed to BasicLSTMCell in make_lstm_cell()
numOfLayers = 2      # number of cells stacked into the MultiRNNCell
embeddingSize = 64   # embedding size handed to embedding_rnn_seq2seq

# Optimisation
learningRate = 0.02  # learning rate of the Adam optimizer
dropout = 0.9        # used as output_keep_prob: the probability of KEEPING an output

# Not referenced by the cells visible in this part of the notebook
globalStep = 85
batchSize = 256
softmaxSamples = 0

In [11]:
def make_lstm_cell():
    """Create one LSTM layer: a BasicLSTMCell wrapped in output dropout.

    The cell has `cellUnitCount` units. Inputs are never dropped
    (input_keep_prob=1.0); outputs are kept with probability `dropout`.

    Returns:
        A new DropoutWrapper around a new BasicLSTMCell on every call.
    """
    return tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.BasicLSTMCell(cellUnitCount),
        input_keep_prob=1.0,
        output_keep_prob=dropout,
    )

In [12]:
# Stack `numOfLayers` layers; make_lstm_cell() is called once per layer so that
# every layer gets its own cell object.
layerCells = [make_lstm_cell() for _ in range(numOfLayers)]
encoderDecoderCell = tf.contrib.rnn.MultiRNNCell(layerCells)

In [20]:
def _placeholderList(dtype, count, name=None):
    """Return `count` new placeholders of `dtype`, each of shape [None]."""
    return [tf.placeholder(dtype, [None], name=name) for _ in range(count)]

# The decoder side is two steps longer than the encoder side
# (presumably room for start/end tokens -- TODO confirm).
decoderLength = sentMaxLength + 2

# One placeholder per time step.
with tf.name_scope('encoder'):
    encoderInputs = _placeholderList(tf.int32, sentMaxLength)
with tf.name_scope('decoder'):
    decoderInputs = _placeholderList(tf.int32, decoderLength, name="inputs")
    decoderTargets = _placeholderList(tf.int32, decoderLength, name="targets")
    decoderWeights = _placeholderList(tf.float32, decoderLength, name="weights")

In [14]:
# NOTE(review): the original author flagged this call as "different from the
# existing" implementation and still to be verified.
# The same vocabulary is used on the encoder and the decoder side.
vocabSize = len(wordIDMap)
decoderOutput, state = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
    encoder_inputs=encoderInputs,
    decoder_inputs=decoderInputs,
    cell=encoderDecoderCell,
    num_encoder_symbols=vocabSize,
    num_decoder_symbols=vocabSize,
    embedding_size=embeddingSize,
    output_projection=None,
    feed_previous=False,
)

In [21]:
# Weighted cross-entropy over the decoder time steps.
# The 4th positional parameter of sequence_loss is `average_across_timesteps`
# (a boolean). The vocabulary size used to be passed there and only worked
# because a non-empty vocabulary is truthy; the intended flag is now explicit.
lossFunc = tf.contrib.legacy_seq2seq.sequence_loss(
    decoderOutput,
    decoderTargets,
    decoderWeights,
    average_across_timesteps=True,
    softmax_loss_function=None
)
# Register the loss as a scalar summary under the tag 'loss'.
tf.summary.scalar('loss', lossFunc)

<tf.Tensor 'loss_2:0' shape=() dtype=string>

In [22]:
# Adam settings; only the learning rate comes from the hyperparameter cell.
adamSettings = {
    'learning_rate': learningRate,
    'beta1': 0.9,
    'beta2': 0.999,
    'epsilon': 1e-08,
}
optimizer = tf.train.AdamOptimizer(**adamSettings)
# Op that minimizes the sequence loss.
optimizationOperation = optimizer.minimize(lossFunc)