# LSTM for dialogue system

In [3]:
import tensorflow as tf
import numpy as np

# preprocessed data
# from datasets.cornell_corpus import data
# import data_utils

In [4]:
tf.__version__

'1.0.0'

In [5]:
## Read movie lines

lines = open('movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')

dialogue_lines = {}  # Dictionary --> Key= Line Number , Value = Dialogues

for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        dialogue_lines[_line[0]] = _line[4]



In [6]:
len(dialogue_lines)

304713

In [7]:
# get conversation blocks

conversation_lines = open('movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')
conversations = [ ]
for line in conversation_lines[:-1]:
    _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
    conversations.append(_line.split(','))

print(conversations[:10])

[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199'], ['L200', 'L201', 'L202', 'L203'], ['L204', 'L205', 'L206'], ['L207', 'L208'], ['L271', 'L272', 'L273', 'L274', 'L275'], ['L276', 'L277'], ['L280', 'L281'], ['L363', 'L364'], ['L365', 'L366']]


In [8]:
conversations_len=np.shape(conversations)[0]
conversations_len

83097

In [9]:
## Print line Ids and the dialogues

for i in range(10):
    for line in conversations[i]:
        if line in dialogue_lines:
            print(line, dialogue_lines[line])

L194 Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
L195 Well, I thought we'd start with pronunciation, if that's okay with you.
L196 Not the hacking and gagging and spitting part.  Please.
L197 Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
L198 You're asking me out.  That's so cute. What's your name again?
L199 Forget it.
L200 No, no, it's my fault -- we didn't have a proper introduction ---
L201 Cameron.
L202 The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.
L203 Seems like she could get a date easy enough...
L204 Why?
L205 Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.
L206 That's a shame.
L207 Gosh, if only we could find Kat a boyfriend...
L208 Let me see what I can do.
L271 C'esc ma tete. This is my head
L27

In [10]:
## extract conversations // extract dialogues for the conversations
counter = 0
for conv in conversations[:10]:
    for line_id in conv:
        print(dialogue_lines[line_id])
    counter += 1


Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well, I thought we'd start with pronunciation, if that's okay with you.
Not the hacking and gagging and spitting part.  Please.
Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
You're asking me out.  That's so cute. What's your name again?
Forget it.
No, no, it's my fault -- we didn't have a proper introduction ---
Cameron.
The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.
Seems like she could get a date easy enough...
Why?
Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.
That's a shame.
Gosh, if only we could find Kat a boyfriend...
Let me see what I can do.
C'esc ma tete. This is my head
Right.  See?  You're ready for the quiz.
I don't want to know how to say that thoug

In [11]:
## gather dataset

initiation = []; response = []

for conv in conversations:
    if len(conv) %2 != 0:
        conv = conv[:-1]
    for i in range(len(conv)):
        if i%2 == 0:
            initiation.append(dialogue_lines[conv[i]])
        else:
            response.append(dialogue_lines[conv[i]])


In [13]:
# change to lower case 
initiation = [ line.lower() for line in initiation ]
response = [ line.lower() for line in response ]


In [16]:
for i in range(2):
    print("<initiation> ", initiation[i], "\n","<response>", response[i], "\n")

<initiation>  can we make this quick?  roxanne korrine and andrew barrett are having an incredibly horrendous public break- up on the quad.  again. 
 <response> well, i thought we'd start with pronunciation, if that's okay with you. 

<initiation>  not the hacking and gagging and spitting part.  please. 
 <response> okay... then how 'bout we try out some french cuisine.  saturday?  night? 



In [17]:
# Filter out short and long sentences


from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

dialogue_limits = {'initiation_max': 25,
                   'initiation_min': 2,
                   'response_max': 25,
                   'response_min': 2
                  }


len_filtered_initiation = []
len_filtered_response = []

dataset_length = len(initiation)

# check to have responses to every initiation
assert len(initiation) == len(response)

for i in range(dataset_length):
    init_len, resp_len = len(tknzr.tokenize(initiation[i])), len(tknzr.tokenize(response[i]))
    if init_len >= dialogue_limits['initiation_min'] and init_len <= dialogue_limits['initiation_max']:
        if resp_len >= dialogue_limits['response_min'] and resp_len <= dialogue_limits['response_max']:
            len_filtered_initiation.append(initiation[i])
            len_filtered_response.append(response[i])

# print the filtered data value
filtered_data_len = len(len_filtered_initiation)
filtered = int((dataset_length - filtered_data_len)*100/dataset_length)
print(str(filtered) + '% filtered from original data')

print("fileterd q \n" ,len_filtered_initiation[:10], " \n \nfiltered a", len_filtered_response[:10])


21% filtered from original data
fileterd q 
 ['not the hacking and gagging and spitting part.  please.', "you're asking me out.  that's so cute. what's your name again?", "no, no, it's my fault -- we didn't have a proper introduction ---", 'gosh, if only we could find kat a boyfriend...', "c'esc ma tete. this is my head", 'how is our little find the wench a date plan progressing?', 'there.', 'you got something on your mind?', 'you have my word.  as a gentleman', 'how do you get your hair to look like that?']  
 
filtered a ["okay... then how 'bout we try out some french cuisine.  saturday?  night?", 'forget it.', 'cameron.', 'let me see what i can do.', "right.  see?  you're ready for the quiz.", "well, there's someone i think might be --", 'where?', "i counted on you to help my cause. you and that thug are obviously failing. aren't we ever going on our date?", "you're sweet.", "eber's deep conditioner every two days. and i never, ever use a blowdryer without the diffuser attachment."]

In [16]:
assert len(len_filtered_initiation) == len(len_filtered_response)

In [18]:
tokenized_initiation = []
tokenized_response = []

for init in len_filtered_initiation[:]:
    tokenized_initiation.append(tknzr.tokenize(init))

for resp in len_filtered_response[:]:
    tokenized_response.append(tknzr.tokenize(resp))

In [20]:
## Tokenize every initiation and the response

for init,resp in zip(tokenized_initiation[100:105], tokenized_response[100:105]):
    print('q : [{0}]; \na : [{1}]\n'.format(init,resp))

q : [['gigglepuss', 'is', 'playing', 'there', 'tomorrow', 'night', '.']]; 
a : [["don't", 'make', 'me', 'do', 'it', ',', 'man']]

q : [['cameron', ',', "i'm", 'a', 'little', 'busy']]; 
a : [["it's", 'off', '.', 'the', 'whole', 'thing', '.']]

q : [['what', "'", 're', 'you', 'talking', 'about', '?']]; 
a : [["she's", 'partial', 'to', 'joey', ',', 'not', 'me']]

q : [["what'd", 'you', 'do', 'to', 'her', '?']]; 
a : [['i', 'don', "'", 't', 'know', '.', 'i', 'decided', 'not', 'to', 'nail', 'her', 'when', 'she', 'was', 'too', 'drunk', 'to', 'remember', 'it', '.']]

q : [['she', 'hates', 'you', 'with', 'the', 'fire', 'of', 'a', 'thousand', 'suns', '.', "that's", 'a', 'direct', 'quote']]; 
a : [['she', 'just', 'needs', 'time', 'to', 'cool', 'off', "i'll", 'give', 'it', 'a', 'day', '.']]



## Index 2 word and word to index


In [21]:
import nltk
import itertools
from collections import defaultdict

import numpy as np
UNK = 'unk'
vocab_size = 8000

# import pickle

tokenized_sentences = tokenized_initiation + tokenized_response
freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
# get vocabulary of 'vocab_size' most used words
vocabulary = freq_dist.most_common(vocab_size)
# index2word
index2word = ['_'] + [UNK] + [ vocab[0] for vocab in vocabulary ]
# word2index
word2index = dict([(w,i) for i,w in enumerate(index2word)] )


In [22]:
## Filter out data based on the number of unknown tokens 
data_len = len(tokenized_initiation)

unk_filtered_inititation = []
unk_filtered_response = []


for inits, resps in zip(tokenized_initiation, tokenized_response):
    unk_count_init = len([ w for w in inits if w not in word2index ])
    unk_count_resp = len([ w for w in resps if w not in word2index ])
    if unk_count_resp <= 2:
        if unk_count_init > 0:
            if unk_count_init/len(inits) > 0.2:
                pass
        unk_filtered_inititation.append(inits)
        unk_filtered_response.append(resps)

# print the fraction of the original data, filtered
filt_data_len = len(unk_filtered_inititation)
filtered = int((data_len - filt_data_len)*100/data_len)
print(str(filtered) + '% filtered from original data')
print(data_len - filt_data_len)
print(len(unk_filtered_inititation))
print(data_len)
# return filtered_q, filtered_a

1% filtered from original data
1744
107043
108787


In [23]:
print('\n Final dataset len : ' + str(len(unk_filtered_inititation)))


 Final dataset len : 107043


In [26]:
def padding_w2v(seq, lookup, maxlen):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            indices.append(lookup[UNK])
    return indices + [0]*(maxlen - len(seq))

In [29]:
## Padding zeros at the end of sentences and making them of equal length for the input in LSTM

dataset_len = len(tokenized_initiation)


final_initiations = np.zeros([dataset_len, dialogue_limits['initiation_max']], dtype=np.int32) 
final_responses = np.zeros([dataset_len, dialogue_limits['response_max']], dtype=np.int32)

for i in range(dataset_len):
    init_indices = padding_w2v(tokenized_initiation[i], word2index, dialogue_limits['initiation_max'])
    resp_indices = padding_w2v(tokenized_response[i], word2index, dialogue_limits['response_max'])
    final_initiations[i] = np.array(init_indices)
    final_responses[i] = np.array(resp_indices)


In [30]:
len(final_initiations) == len(final_responses)

True

In [31]:
final_initiations[100:105]

array([[   1,   19,  653,   66,  294,  175,    2,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [5014,    3,   24,    9,  127,  691,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [  14,   50, 3034,    5,  223,   43,    4,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [ 586,    5,   22,    8,   74,    4,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [  75, 1747,    5,   40,    7,  534,   17,    9,  466,    1,    2,
          55,    9, 2081, 4754,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]])

In [32]:
final_responses[100:105]

array([[  21,  129,   16,   22,   12,    3,  111,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [  34,  134,    2,    7,  360,  143,    2,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [ 149, 6076,    8, 1513,    3,   31,   16,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   6, 1302,   50, 1294,   26,    2,    6, 1069,   31,    8, 2495,
          74,  103,   75,   37,  112,  875,    8,  225,   12,    2,    0,
           0,    0,    0],
       [  75,   39,  617,   90,    8,  552,  134,   80,  151,   12,    9,
         231,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]])

In [33]:
## Print the unknown count and the total dataset count
unk_count = (final_initiations == 1).sum() + (final_responses == 1).sum()
word_count = (final_initiations > 1).sum() + (final_responses > 1).sum()

print('% unknown : {0}'.format(100 * (unk_count/word_count)))
print('Dataset count : ' + str(final_initiations.shape[0]))

% unknown : 3.5067504233579667
Dataset count : 108787


In [34]:
# save the arrays to disk for future imports
np.save('idx_init.npy', final_initiations)
np.save('idx_resp.npy', final_responses)

In [35]:
## Prepare data for testing and training

ratio = [0.70, 0.15, 0.15]
data_len = len(final_initiations)

train_ratio = int(0.70 * data_len)
test_ratio = int(0.15 * data_len)
valid_ratio = int(0.15 * data_len)

lens = [ int(data_len*item) for item in ratio ]

trainX = final_initiations[:train_ratio]
trainY = final_responses[:train_ratio]

testX = final_initiations[train_ratio : (train_ratio+test_ratio)]
testY = final_responses[train_ratio : (train_ratio+test_ratio)]

validX = final_initiations[:-valid_ratio]
validY = final_responses[:-valid_ratio]


In [36]:
init_seq_length = trainX.shape[-1]
resp_seq_length = trainY.shape[-1]
batch_size = 16
init_vocab_size = len(index2word)  
resp_vocab_size = init_vocab_size
emb_dim = 1024

In [37]:
## Creating batches to feed into LSTM

import numpy as np
from random import sample

def batch(x, y, batch_size):
    while True:
        sample_idx = sample(list(np.arange(len(x))), batch_size)
        yield x[sample_idx].T, y[sample_idx].T

validation_batch = batch(validX, validY, 32)
test_batch = batch(testX, testY, 256)
train_batch = batch(trainX, trainY, batch_size)

In [76]:
## Building Tensorflow graph

import seq2seq_wrapper

model = seq2seq_wrapper.Seq2Seq(xseq_len=init_seq_length,
                               yseq_len=resp_seq_length,
                               xvocab_size=init_vocab_size,
                               yvocab_size=resp_vocab_size,
                               ckpt_path='ckpt/cornell_corpus/',
                               emb_dim=emb_dim,
                               num_layers=3
                               )



<log> Building Graph </log>

In [80]:
sess = model.train(train_batch, validation_batch)


<log> Training started </log>
Interrupted by user at iteration 41


In [81]:
input_ = test_batch.__next__()[0]
output = model.predict(sess, input_)
print(output.shape)

(256, 25)


In [None]:
def decode(sequence, lookup, separator=''):
    return separator.join([ lookup[element] for element in sequence if element ])

In [85]:
replies = []
for m, n in zip(input_.T, output):
    init = data_utils.decode(sequence=m, lookup=index2word, separator=' ')
    decoded = data_utils.decode(sequence=n, lookup=index2word, separator=' ').split(' ')
    if decoded.count('unk') == 0:
        if decoded not in replies:
            print('inti : [{0}]; resp : [{1}]'.format(init, ' '.join(decoded)))
            replies.append(decoded)

init : [bill id like you to meet jack torrance]; resp : [how much do you think]
init : [and who are you to talk you were nothing you couldnt even sing i must have been out of my mind]; resp : [i dont know what youre talking about]
init : [by breaking up a companys assets]; resp : [what are you talking about]
init : [what is it]; resp : [i dont know]
init : [ill see you there]; resp : [ill get out]
init : [okay ill be talking to you]; resp : [youre not going to get out]
init : [i must be outta my mind buddy inituit it]; resp : [okay for a minute]
init : [when are you going to let the police know]; resp : [you dont know what youre talking about]
init : [you can do it]; resp : [yeah i think so]
init : [like hell you know if you fellows stuck together stead of letting them walk all over you they might not try it]; resp : [if you werent talking about me i dont know what youre talking about]
init : [wait are you saying that i dont appreciate]; resp : [i know you know what i mean]
init : [no 

In [None]:
#####   <<<<<<<< End of CODE >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>.