In [4]:

import keras
import numpy as np
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Dense, LSTM, Dropout, Input
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [5]:
data_dir = 'House.txt'

with open(data_dir) as f:
    data = f.read()
    
data = data[81:].lower()

# seperate the punchuations from the words
punch = ['.', '[', ']', '(', ')', ';', ':', "'", '/', '"', ',', '?', '*', '!', '-', '$', '%', '&', '\n']

for i in punch:    
    data = data.replace(i, ' ' + i + ' ')
    
data = data.replace('\n', '<NEWLINE>')

In [6]:
def get_vocab(text):
    
    vocab_to_int = dict()
    int_to_vocab = dict()
    
    vocab = Counter()
    for word in text.split():
        vocab[word] += 1
        
    index = 0    
    for word in vocab:
        vocab_to_int[word] = index
        int_to_vocab[index] = word
        index += 1
        
    return vocab, vocab_to_int, int_to_vocab

vocab, vocab_to_int, int_to_vocab = get_vocab(data)

# converting text into int
text_int = []

for word in data.split():
    text_int.append(vocab_to_int[word])
    
text_int = np.array(text_int)  

In [8]:
data[:300]

'alking on the telephone without a shirt on .  ]  <NEWLINE>  <NEWLINE> brandon :  i didn’t sleep well last night ,  and i woke up with a scratchy throat .  i just don’t feel so good .   [ pause ]  uh ,  cough ,   [ clears his throat ]  yeah ,  i’m ,  i’m ,  a bit of an upset stomach too ,  and i thin'

In [9]:
data[:240].split('\n')

['alking on the telephone without a shirt on .  ]  <NEWLINE>  <NEWLINE> brandon :  i didn’t sleep well last night ,  and i woke up with a scratchy throat .  i just don’t feel so good .   [ pause ]  uh ,  cough ,   [ clears his throat ]  yeah ']

In [11]:
print(text_int[:100])


[ 0  1  2  3  4  5  6  1  7  8  9  9 10 11 12 13 14 15 16 17 18 19 12 20
 21 22  5 23 24  7 12 25 26 27 28 29  7 30 31  8 32 18 33 18 30 34 35 24
  8 36 18 37 18 37 18  5 38 39 40 41 42 43 18 19 12 44 37 45  5 46 18 37
 25 47 12 48 49 50  7 30 51  8 52 18 53 18 36 37 54 55 49 56 57 58 59  7
 30 31  8 60]


In [12]:
seq_len = 200

def get_training_data(data, seq_len):
    
    x_train = []
    y_train = []
    
    for i in range(0, len(data)-seq_len):
        
        x = data[i:i+seq_len]
        y = data[i+1:i+seq_len+1]
        
        x_train.append(np.array(x))
        y_train.append(np.array(y))
        
    return x_train, y_train
  
x, y = get_training_data(text_int, seq_len)

x = np.array(x)
y = np.array(y)
y = y.reshape(y.shape[0], y.shape[1], 1)

print(x.shape)

(9691, 200)


In [28]:
embedding = 300
lstm_size = 128
vocab_size = len(vocab)

inp = Input((None,))

embed = Embedding(input_dim=vocab_size, output_dim=embedding)
lstm1 = LSTM(lstm_size, return_sequences=True, return_state=True)
lstm2 = LSTM(lstm_size, return_sequences=True, return_state=True)
lstm3 = LSTM(lstm_size, return_sequences=True, return_state=True)
dense = Dense(vocab_size)

net = embed(inp)
net, h1, c1 = lstm1(net)
net, h2, c2 = lstm2(net)
net, h3, c3 = lstm3(net)
out = dense(net)

model = Model(inp, out)
print(model.summary())

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        (None, None)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 300)         437700    
_________________________________________________________________
lstm_4 (LSTM)                [(None, None, 128), (None 219648    
_________________________________________________________________
lstm_5 (LSTM)                [(None, None, 128), (None 131584    
_________________________________________________________________
lstm_6 (LSTM)                [(None, None, 128), (None 131584    
_________________________________________________________________
dense_2 (Dense)              (None, None, 1459)        188211    
Total params: 1,108,727
Trainable params: 1,108,727
Non-trainable params: 0
_________________________________________________

In [16]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.optimizer = 0.01
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

print(model.fit(x, y, batch_size=128, epochs=4, shuffle=True))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
<keras.callbacks.callbacks.History object at 0x1a4156af28>


In [17]:
init_states = [Input((lstm_size,)) for i in range(6)]

inference = embed(inp)
inference, h1, c1 = lstm1(inference, initial_state=init_states[:2])
inference, h2, c2 = lstm2(inference, initial_state=init_states[2:4])
inference, h3, c3 = lstm3(inference, initial_state=init_states[4:6])
inf_out = dense(inference)

states = [h1, c1, h2, c2, h3, c3]
inf_model = Model([inp]+init_states, [inf_out]+states)

In [18]:
def extract_text(length, start):
    
    states = [np.zeros((1, lstm_size)) for i in range(6)]

    token = np.zeros((1,1))
    token[0,0] = start
    text = int_to_vocab[start] + ' '
    
    for i in range(length):
        
        out = inf_model.predict([token]+states)
        word = np.argmax(out[0][0,0,:])
        text += int_to_vocab[word] + ' '
        states = out[1:7]
        token[0][0] = word
        
    return text    

In [26]:
def post_process_text(text):
    
    punch1 = ['.', ':', '!', ';', ')', ']', '?', ',', '%']
    for i in punch1:
        text = text.replace(' '+i, i)
        
    punch2 = ['[', '(', '$']    
    for i in punch2:
        text = text.replace(i+' ', i)
        
    punch3 = ["'", '-']    
    for i in punch3:
        text = text.replace(' '+i+' ', i)
        
    text = text.split('<NEWLINE>')  
    for line in text:
        if len(line):
        
            return text    

In [27]:
generated_text = extract_text(200, 0)
generated_text = post_process_text(generated_text)
print(generated_text)

['alking [the the the,,,,,,, [[has to to,,,,,, [[off off to]]. ', ' ', '::: you we’re we’re on on on on on on on]] ', ' ', ' did:: are!! start start on on on on on,,,, so – – –:: [the the the the the the do do do do do do see see []]],, – – –::,,, we’re [[the the the the the]] do do do do do to see you?? [', ' ', '::,,,,. [[[to the]]]] hey hey hey hey hey we’re ', ' house walks walks.. ', ' ', ' ', '::: has has,,,,,, [[[[the]]]] to hey hey hey hey hey ', ' ', ':: house walks walks.. ', ' ', ' ', '::: has,, on ']
