In [1]:
import keras
import numpy as np
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Dense, LSTM, Dropout, Input
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [31]:
# loading the data into file

data_dir = 'data/moes_tavern_lines.txt'

with open(data_dir) as f:
    data = f.read()
    
data = data[81:].lower()

In [32]:
data[:447]

"moe_szyslak: (into phone) moe's tavern. where the elite meet to drink.\nbart_simpson: eh, yeah, hello, is mike there? last name, rotch.\nmoe_szyslak: (into phone) hold on, i'll check. (to barflies) mike rotch. mike rotch. hey, has anybody seen mike rotch, lately?\nmoe_szyslak: (into phone) listen you little puke. one of these days i'm gonna catch you, and i'm gonna carve my name on your back with an ice pick.\nmoe_szyslak: what's the matter homer?"

In [33]:
data[:240].split('\n')

["moe_szyslak: (into phone) moe's tavern. where the elite meet to drink.",
 'bart_simpson: eh, yeah, hello, is mike there? last name, rotch.',
 "moe_szyslak: (into phone) hold on, i'll check. (to barflies) mike rotch. mike rotch. hey, has anybody see"]

In [34]:
# seperate the punchuations from the words

punch = ['.', '[', ']', '(', ')', ';', ':', "'", '/', '"', ',', '?', '*', '!', '-', '$', '%', '&', '\n']

for i in punch:    
    data = data.replace(i, ' ' + i + ' ')
    
data = data.replace('\n', '<NEWLINE>')    

In [35]:
data[:400]

"moe_szyslak :   ( into phone )  moe ' s tavern .  where the elite meet to drink .  <NEWLINE> bart_simpson :  eh ,  yeah ,  hello ,  is mike there ?  last name ,  rotch .  <NEWLINE> moe_szyslak :   ( into phone )  hold on ,  i ' ll check .   ( to barflies )  mike rotch .  mike rotch .  hey ,  has anybody seen mike rotch ,  lately ?  <NEWLINE> moe_szyslak :   ( into phone )  listen you little puke ."

In [6]:
def get_vocab(text):
    
    vocab_to_int = dict()
    int_to_vocab = dict()
    
    vocab = Counter()
    for word in text.split():
        vocab[word] += 1
        
    index = 0    
    for word in vocab:
        vocab_to_int[word] = index
        int_to_vocab[index] = word
        index += 1
        
    return vocab, vocab_to_int, int_to_vocab

In [7]:
vocab, vocab_to_int, int_to_vocab = get_vocab(data)

In [8]:
print("vocab size:", len(vocab))

vocab size: 6363


In [9]:
# converting text into int

text_int = []

for word in data.split():
    text_int.append(vocab_to_int[word])
    
text_int = np.array(text_int)    

In [100]:
vocab_size = len(vocab)
seq_len = 100
embedding = 300
lstm_size = 128

In [101]:
def get_training_data(data, seq_len):
    
    x_train = []
    y_train = []
    
    for i in range(0, len(data)-seq_len):
        
        x = data[i:i+seq_len]
        y = data[i+1:i+seq_len+1]
        
        x_train.append(np.array(x))
        y_train.append(np.array(y))
        
    return x_train, y_train

In [102]:
x, y = get_training_data(text_int, seq_len)

x = np.array(x)
y = np.array(y)
y = y.reshape(y.shape[0], y.shape[1], 1)

In [103]:
x.shape

(79630, 100)

In [111]:
inp = Input((None,))

embed = Embedding(input_dim=vocab_size, output_dim=embedding)
lstm1 = LSTM(lstm_size, return_sequences=True, return_state=True)
lstm2 = LSTM(lstm_size, return_sequences=True, return_state=True)
lstm3 = LSTM(lstm_size, return_sequences=True, return_state=True)
dense = Dense(vocab_size)

net = embed(inp)
net, h1, c1 = lstm1(net)
net, h2, c2 = lstm2(net)
net, h3, c3 = lstm3(net)
out = dense(net)

model = Model(inp, out)

init_states = [Input((lstm_size,)) for i in range(6)]

inference = embed(inp)
inference, h1, c1 = lstm1(inference, initial_state=init_states[:2])
inference, h2, c2 = lstm2(inference, initial_state=init_states[2:4])
inference, h3, c3 = lstm3(inference, initial_state=init_states[4:6])
inf_out = dense(inference)

states = [h1, c1, h2, c2, h3, c3]
inf_model = Model([inp]+init_states, [inf_out]+states)

In [112]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [113]:
model.optimizer.lr = 0.01
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [125]:
print(model.fit(x, y, batch_size=128, epochs=1, shuffle=True))
model.save('model.model')

Epoch 1/1
<keras.callbacks.History object at 0x7f13e5576d30>


In [97]:
def extract_text(length, start):
    
    states = [np.zeros((1, lstm_size)) for i in range(6)]

    token = np.zeros((1,1))
    token[0,0] = start
    text = int_to_vocab[start] + ' '
    
    for i in range(length):
        
        out = inf_model.predict([token]+states)
        word = np.argmax(out[0][0,0,:])
        text += int_to_vocab[word] + ' '
        states = out[1:7]
        token[0][0] = word
        
    return text    

In [126]:
generated_text = extract_text(1000, 0)

In [151]:
def post_process_text(text):
    
    punch1 = ['.', ':', '!', ';', ')', ']', '?', ',', '%']
    for i in punch1:
        text = text.replace(' '+i, i)
    punch2 = ['[', '(', '$']    
    for i in punch2:
        text = text.replace(i+' ', i)
    punch3 = ["'", '-']    
    for i in punch3:
        text = text.replace(' '+i+' ', i)
        
    text = text.split('<NEWLINE>')  
    for line in text:
        if len(line)
    return text    

In [162]:
post_process_text(generated_text)

[' moe_szyslak: (late) homer are carve at make your new yuh. it. ',
 ' homer_simpson: you get something something to love your creepy man? ',
 ' moe_szyslak: well to get from the man. ',
 " moe_szyslak: oh, who's the bar? ",
 " moe_szyslak: oh, no, who's this bar? ",
 " moe_szyslak: (cowboys) then all right, from who's moe. ",
 ' moe_szyslak: oh, how get a man? ',
 " moe_szyslak: (to talkin '? ",
 ' moe_szyslak: (counterfeit runt! ',
 ' moe_szyslak: what am i? am? ',
 ' moe_szyslak: oh, oh, i am a man. ',
 " moe_szyslak: (to talkin'then.. ",
 " moe_szyslak: (to runt, hey, you'm gonna points with the new died. ",
 " moe_szyslak: (tummies? hello, i'm gonna candles, life, and i'm the menace, here, my droning'here. i d an creepy to you? "]