In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense, Input, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from collections import Counter
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
np.random.seed(2018)

In [2]:
# set default parameters
BATCH_SIZE = 128
NUM_EPOCHS = 100
HIDDEN_UNITS = 256
MAX_INPUT_SEQ_LENGTH = 20
MAX_TARGET_SEQ_LENGTH = 20
MAX_VOCAB_SIZE = 100
DATA_PATH = 'movie_lines.txt'
WEIGHT_FILE_PATH = 'word-weights.h5'

In [8]:
input_counter = Counter()
target_counter = Counter()

In [10]:
# read the data
with open(DATA_PATH, 'r', encoding="latin-1") as f:
    df = f.read()
df[:200]

'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\nL1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\nL985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\nL984 +++$++'

In [11]:
rows = df.split('\n')
lines = [row.split(' +++$+++ ')[-1] for row in rows]
input_texts = []
target_texts = []

In [12]:
lines[:10]

['They do not!',
 'They do to!',
 'I hope so.',
 'She okay?',
 "Let's go.",
 'Wow',
 "Okay -- you're gonna need to learn how to lie.",
 'No',
 'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'Like my fear of wearing pastels?']

In [13]:
import re
import string 

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = text.split()
    text = " ".join(text)
    return text
    
for i in range(len(lines)):
    clean_text = clean_text_round1(lines[i])
    lines[i] = clean_text

In [14]:
prev_words = []
for line in lines:
    next_words = [w.lower() for w in nltk.word_tokenize(line)]
    if len(next_words) > MAX_TARGET_SEQ_LENGTH:
        next_words = next_words[0:MAX_TARGET_SEQ_LENGTH]

    if len(prev_words) > 0:
        input_texts.append(prev_words)
        for w in prev_words:
            input_counter[w] += 1
        target_words = next_words[:]
        target_words.insert(0, 'START')
        target_words.append('END')
        for w in target_words:
            target_counter[w] += 1
        target_texts.append(target_words)
    prev_words = next_words

In [15]:
input_texts

[['they', 'do', 'not'],
 ['they', 'do', 'to'],
 ['i', 'hope', 'so'],
 ['she', 'okay'],
 ['lets', 'go'],
 ['wow'],
 ['okay', 'youre', 'gon', 'na', 'need', 'to', 'learn', 'how', 'to', 'lie'],
 ['no'],
 ['im',
  'kidding',
  'you',
  'know',
  'how',
  'sometimes',
  'you',
  'just',
  'become',
  'this',
  'persona',
  'and',
  'you',
  'dont',
  'know',
  'how',
  'to',
  'quit'],
 ['like', 'my', 'fear', 'of', 'wearing', 'pastels'],
 ['the', 'real', 'you'],
 ['what', 'good', 'stuff'],
 ['i', 'figured', 'youd', 'get', 'to', 'the', 'good', 'stuff', 'eventually'],
 ['thank',
  'god',
  'if',
  'i',
  'had',
  'to',
  'hear',
  'one',
  'more',
  'story',
  'about',
  'your',
  'coiffure'],
 ['me',
  'this',
  'endless',
  'blonde',
  'babble',
  'im',
  'like',
  'boring',
  'myself'],
 ['what', 'crap'],
 ['do', 'you', 'listen', 'to', 'this', 'crap'],
 ['no'],
 ['then',
  'guillermo',
  'says',
  'if',
  'you',
  'go',
  'any',
  'lighter',
  'youre',
  'gon',
  'na',
  'look',
  'like',
 

In [16]:
target_texts

[['START', 'they', 'do', 'to', 'END'],
 ['START', 'i', 'hope', 'so', 'END'],
 ['START', 'she', 'okay', 'END'],
 ['START', 'lets', 'go', 'END'],
 ['START', 'wow', 'END'],
 ['START',
  'okay',
  'youre',
  'gon',
  'na',
  'need',
  'to',
  'learn',
  'how',
  'to',
  'lie',
  'END'],
 ['START', 'no', 'END'],
 ['START',
  'im',
  'kidding',
  'you',
  'know',
  'how',
  'sometimes',
  'you',
  'just',
  'become',
  'this',
  'persona',
  'and',
  'you',
  'dont',
  'know',
  'how',
  'to',
  'quit',
  'END'],
 ['START', 'like', 'my', 'fear', 'of', 'wearing', 'pastels', 'END'],
 ['START', 'the', 'real', 'you', 'END'],
 ['START', 'what', 'good', 'stuff', 'END'],
 ['START',
  'i',
  'figured',
  'youd',
  'get',
  'to',
  'the',
  'good',
  'stuff',
  'eventually',
  'END'],
 ['START',
  'thank',
  'god',
  'if',
  'i',
  'had',
  'to',
  'hear',
  'one',
  'more',
  'story',
  'about',
  'your',
  'coiffure',
  'END'],
 ['START',
  'me',
  'this',
  'endless',
  'blonde',
  'babble',
  'im

In [17]:
# encode the data
input_word2idx = dict()
target_word2idx = dict()
for idx, word in enumerate(input_counter.most_common(MAX_VOCAB_SIZE)):
    input_word2idx[word[0]] = idx + 2
for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)):
    target_word2idx[word[0]] = idx + 1

In [18]:
input_word2idx['PAD'] = 0
input_word2idx['UNK'] = 1
target_word2idx['UNK'] = 0

In [19]:
input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()])
target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()])

num_encoder_tokens = len(input_idx2word)
num_decoder_tokens = len(target_idx2word)

np.save('model/word-input-word2idx.npy', input_word2idx)
np.save('model/word-input-idx2word.npy', input_idx2word)
np.save('model/word-target-word2idx.npy', target_word2idx)
np.save('model/word-target-idx2word.npy', target_idx2word)

In [20]:
num_encoder_tokens

102

In [21]:
encoder_input_data = []

encoder_max_seq_length = 0
decoder_max_seq_length = 0

for input_words, target_words in zip(input_texts, target_texts):
    encoder_input_wids = []
    for w in input_words:
        w2idx = 1
        if w in input_word2idx:
            w2idx = input_word2idx[w]
        encoder_input_wids.append(w2idx)

    encoder_input_data.append(encoder_input_wids)
    encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length)
    decoder_max_seq_length = max(len(target_words), decoder_max_seq_length)

In [22]:
context = dict()
context['num_encoder_tokens'] = num_encoder_tokens
context['num_decoder_tokens'] = num_decoder_tokens
context['encoder_max_seq_length'] = encoder_max_seq_length
context['decoder_max_seq_length'] = decoder_max_seq_length

np.save('model/word-context.npy', context)

In [63]:
# custom function to generate batches

def generate_batch(input_data, output_text_data):
    num_batches = len(input_data) // BATCH_SIZE
    while True:
        for batchIdx in range(0, num_batches):
            start = batchIdx * BATCH_SIZE
            end = (batchIdx + 1) * BATCH_SIZE
            encoder_input_data_batch = pad_sequences(input_data[start:end], encoder_max_seq_length)
            decoder_target_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            decoder_input_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            for lineIdx, target_words in enumerate(output_text_data[start:end]):
                for idx, w in enumerate(target_words):
                    w2idx = 0
                    if w in target_word2idx:
                        w2idx = target_word2idx[w]
                    decoder_input_data_batch[lineIdx, idx, w2idx] = 1
                    if idx > 0:
                        decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1
            yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch

In [64]:
# Compiling and training

encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=HIDDEN_UNITS,
                              input_length=encoder_max_seq_length, name='encoder_embedding')
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [encoder_state_h, encoder_state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                 initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 256)    26112       encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, None, 101)]  0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, 256), (None, 525312      encoder_embedding[0][0]          
____________________________________________________________________________________________

In [65]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

json = model.to_json()
open('model/word-architecture.json', 'w').write(json)

3697

In [66]:
X_train, X_test, y_train, y_test = train_test_split(encoder_input_data, target_texts, test_size=0.2, random_state=42)

train_gen = generate_batch(X_train, y_train)
test_gen = generate_batch(X_test, y_test)

train_num_batches = len(X_train) // BATCH_SIZE
test_num_batches = len(X_test) // BATCH_SIZE

In [None]:
model.fit_generator(generator=train_gen,
                    steps_per_epoch=train_num_batches,
                    epochs=NUM_EPOCHS,
                    verbose=1,
                    validation_data=test_gen,
                    validation_steps=test_num_batches)

In [None]:
model.save_weights(WEIGHT_FILE_PATH)