In [1]:
import keras
import numpy as np
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Dense, LSTM, Dropout, Input
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw corpus from disk into memory.
# NOTE: the file has to be in the working directory already; on Colab, run the
# `files.upload()` cell first, otherwise `open` raises FileNotFoundError.

data_dir = 'shakespeare_input.txt'  # path of the corpus file (a file, despite the name)

# Explicit encoding so the decoded text does not depend on the platform default.
with open(data_dir, encoding='utf-8') as f:
    data = f.read()

# Drop the first 81 characters (presumably a header -- confirm against the file)
# and lower-case everything so that e.g. "The" and "the" become the same token.
data = data[81:].lower()

In [2]:
from google.colab import files

# Open the Colab upload widget; the result is keyed by uploaded file name.
uploaded = files.upload()

# Report the name and size of every file that was uploaded.
for fn, content in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content)))

Saving shakespeare_input.txt to shakespeare_input.txt
User uploaded file "shakespeare_input.txt" with length 4573338 bytes


In [4]:
# Preview the first 447 characters of the lower-cased corpus.
data[:447]

"\nfirst citizen:\nyou are all resolved rather to die than to famish?\n\nall:\nresolved. resolved.\n\nfirst citizen:\nfirst, you know caius marcius is chief enemy to the people.\n\nall:\nwe know't, we know't.\n\nfirst citizen:\nlet us kill him, and we'll have corn at our own price.\nis't a verdict?\n\nall:\nno more talking on't; let it be done: away, away!\n\nsecond citizen:\none word, good citizens.\n\nfirst citizen:\nwe are accounted poor citizens, the patricians go"

In [5]:
# Same preview, split on line breaks so each line of the text is a separate list item.
data[:240].split('\n')

['',
 'first citizen:',
 'you are all resolved rather to die than to famish?',
 '',
 'all:',
 'resolved. resolved.',
 '',
 'first citizen:',
 'first, you know caius marcius is chief enemy to the people.',
 '',
 'all:',
 "we know't, we know't.",
 '',
 'first citizen:',
 "let us kill him, and we'll "]

In [6]:
# Separate the punctuation marks from the words so each mark becomes its own token

punch = ['.', '[', ']', '(', ')', ';', ':', "'", '/', '"', ',', '?', '*', '!', '-', '$', '%', '&', '\n']

# Pad every mark with a space on both sides in a single pass over the text.
# The line break is mapped straight to a padded <NEWLINE> token.
spaced = {mark: ' ' + mark + ' ' for mark in punch}
spaced['\n'] = ' <NEWLINE> '

data = data.translate(str.maketrans(spaced))

In [7]:
# Check the result: punctuation is now space-delimited and line breaks show up as <NEWLINE>.
data[:400]

" <NEWLINE> first citizen :  <NEWLINE> you are all resolved rather to die than to famish ?  <NEWLINE>  <NEWLINE> all :  <NEWLINE> resolved .  resolved .  <NEWLINE>  <NEWLINE> first citizen :  <NEWLINE> first ,  you know caius marcius is chief enemy to the people .  <NEWLINE>  <NEWLINE> all :  <NEWLINE> we know ' t ,  we know ' t .  <NEWLINE>  <NEWLINE> first citizen :  <NEWLINE> let us kill him ,  "

In [8]:
def get_vocab(text):
    """Build the vocabulary of a whitespace-tokenised text.

    Args:
        text: string whose tokens are separated by whitespace.

    Returns:
        A tuple ``(vocab, vocab_to_int, int_to_vocab)`` where ``vocab`` is a
        ``Counter`` of token frequencies, ``vocab_to_int`` maps each token to
        an integer id and ``int_to_vocab`` is the inverse mapping. Ids are
        assigned from 0 in order of first appearance in ``text``.
    """
    vocab = Counter(text.split())

    # A Counter iterates in first-seen order, so ids follow the text order.
    vocab_to_int = {token: idx for idx, token in enumerate(vocab)}
    int_to_vocab = {idx: token for token, idx in vocab_to_int.items()}

    return vocab, vocab_to_int, int_to_vocab

In [9]:
# Token counts plus the token<->id lookup tables for the whole corpus.
vocab, vocab_to_int, int_to_vocab = get_vocab(data)

In [10]:
# Number of distinct tokens (words, punctuation marks and <NEWLINE>).
print("vocab size:", len(vocab))

vocab size: 22595


In [11]:
# Encode the corpus as a sequence of integer token ids

text_int = np.array([vocab_to_int[token] for token in data.split()])

In [12]:
# Number of distinct tokens; sizes the embedding table and the output layer.
vocab_size = len(vocab)
# Number of tokens in each training window.
seq_len = 100
# Width of each token embedding vector.
embedding = 300
# Number of units in each LSTM layer.
lstm_size = 128

In [13]:
def get_training_data(data, seq_len):
    """Cut a token sequence into overlapping (input, target) windows.

    Window ``i`` uses ``data[i:i+seq_len]`` as input and the same span shifted
    one position to the right as target, so every input token is paired with
    the token that follows it.

    Args:
        data: sequence of token ids (anything that supports ``len`` and slicing).
        seq_len: number of tokens per window.

    Returns:
        ``(x_train, y_train)``: two lists of NumPy arrays with
        ``len(data) - seq_len`` windows each (empty lists when ``data`` is
        not longer than ``seq_len``).
    """
    n_windows = len(data) - seq_len

    x_train = [np.array(data[start:start + seq_len])
               for start in range(n_windows)]
    y_train = [np.array(data[start + 1:start + seq_len + 1])
               for start in range(n_windows)]

    return x_train, y_train

In [14]:
# Build the (input, target) windows over the encoded corpus.
x, y = get_training_data(text_int, seq_len)

# Stack the per-window arrays into single NumPy arrays.
x = np.array(x)
y = np.array(y)
# Add a trailing axis of size 1 to the targets
# (presumably the layout the sparse cross-entropy loss expects -- confirm for the Keras version in use).
y = y.reshape(y.shape[0], y.shape[1], 1)

In [15]:
# (number of windows, seq_len)
x.shape

(1247462, 100)

In [16]:
# Token ids in; the time dimension is left as None so the same layers can be fed
# full training windows as well as one token at a time during generation.
inp = Input((None,))

# The layers are created once and shared by the training and inference graphs,
# so the weights learned through `model` are the ones `inf_model` uses.
embed = Embedding(input_dim=vocab_size, output_dim=embedding)
lstm1 = LSTM(lstm_size, return_sequences=True, return_state=True)
lstm2 = LSTM(lstm_size, return_sequences=True, return_state=True)
lstm3 = LSTM(lstm_size, return_sequences=True, return_state=True)
# softmax turns the scores into a probability distribution over the vocabulary.
# The 'sparse_categorical_crossentropy' loss passed to compile() as a string
# expects probabilities, not raw logits, so a plain linear Dense layer here
# would make the loss (and therefore training) incorrect.
dense = Dense(vocab_size, activation='softmax')

# --- Training graph: no explicit initial state, returned states are unused ---
net = embed(inp)
net, h1, c1 = lstm1(net)
net, h2, c2 = lstm2(net)
net, h3, c3 = lstm3(net)
out = dense(net)

model = Model(inp, out)

# --- Inference graph: the (h, c) state of each of the three LSTMs is an input
# and an output, so generation can carry the state from one step to the next ---
init_states = [Input((lstm_size,)) for i in range(6)]

inference = embed(inp)
inference, h1, c1 = lstm1(inference, initial_state=init_states[:2])
inference, h2, c2 = lstm2(inference, initial_state=init_states[2:4])
inference, h3, c3 = lstm3(inference, initial_state=init_states[4:6])
inf_out = dense(inference)

# Output order: predictions first, then h/c of layer 1, layer 2, layer 3.
states = [h1, c1, h2, c2, h3, c3]
inf_model = Model([inp]+init_states, [inf_out]+states)

In [17]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy on the
# integer token-id targets, accuracy reported as an extra metric.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [18]:
# Train with RMSprop at a learning rate of 0.01.
# The rate must be set on the optimizer object handed to compile(): passing the
# string 'rmsprop' makes compile() build a fresh, default-configured optimizer,
# so a value assigned to `model.optimizer.lr` beforehand would be thrown away.
model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=0.01),
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Keep the History object returned by fit(): `history.history` holds the
# per-epoch loss and accuracy. Printing the return value would only show the
# object's repr, and the progress bar is already displayed by fit() itself.
history = model.fit(x, y, batch_size=128, epochs=2, shuffle=True)

Epoch 1/2
 118/9746 [..............................] - ETA: 27:11:19 - loss: 7.5212 - accuracy: 0.1334

In [None]:
# Persist the trained model (the training graph) under the name 'model.model'.
# NOTE(review): the token<->id lookup tables are not saved by this call, and some
# Keras releases only accept '.keras' / '.h5' file names -- confirm for the installed version.
model.save('model.model')

In [None]:
# Disabled code: this string literal repeats the inference-graph construction
# that already runs in the model-definition cell. Being a string, it builds
# nothing; `inf_model` comes from the earlier cell.
'''
init_states = [Input((lstm_size,)) for i in range(6)]

inference = embed(inp)
inference, h1, c1 = lstm1(inference, initial_state=init_states[:2])
inference, h2, c2 = lstm2(inference, initial_state=init_states[2:4])
inference, h3, c3 = lstm3(inference, initial_state=init_states[4:6])
inf_out = dense(inference)

states = [h1, c1, h2, c2, h3, c3]
inf_model = Model([inp]+init_states, [inf_out]+states)'''

In [None]:
#Generating the script using the inference model
def extract_text(length, start):
    """Generate text one token at a time with the inference model.

    Decoding is greedy: at every step the highest-scoring token is chosen and
    fed back as the next input, together with the LSTM states returned by the
    previous step.

    Args:
        length: number of tokens to generate after the start token.
        start: integer id of the first token.

    Returns:
        The start token followed by the generated tokens, each followed by a
        single space (so the string ends with a space).
    """
    # One zero (h, c) pair per LSTM layer: six state arrays in total.
    lstm_states = [np.zeros((1, lstm_size)) for _ in range(6)]

    # A batch of one sequence holding one token.
    current = np.zeros((1, 1))
    current[0, 0] = start
    pieces = [int_to_vocab[start]]

    for _ in range(length):
        outputs = inf_model.predict([current] + lstm_states)
        # outputs[0] are the predictions; outputs[1:7] are the updated states.
        next_id = np.argmax(outputs[0][0, 0, :])
        pieces.append(int_to_vocab[next_id])
        lstm_states = outputs[1:7]
        current[0][0] = next_id

    return ''.join(piece + ' ' for piece in pieces)

In [None]:
#generated_text = extract_text(1000, 0)

In [None]:
#post processing the script
def post_process_text(text):
    """Turn space-separated generated tokens back into readable text.

    Removes the spaces that tokenisation put around punctuation marks and
    converts ``<NEWLINE>`` tokens into real line breaks.

    Args:
        text: generated string whose tokens are separated by single spaces.

    Returns:
        The cleaned-up text as one string. Each line has its surrounding
        whitespace stripped and lines are joined with ``'\\n'``; blank lines
        are kept because they separate the speeches.
    """
    # Closing marks attach to the word before them.
    for mark in ['.', ':', '!', ';', ')', ']', '?', ',', '%']:
        text = text.replace(' ' + mark, mark)
    # Opening marks attach to the word after them.
    for mark in ['[', '(', '$']:
        text = text.replace(mark + ' ', mark)
    # Apostrophes and hyphens sit inside a word: drop the space on both sides.
    for mark in ["'", '-']:
        text = text.replace(' ' + mark + ' ', mark)

    # Each <NEWLINE> token becomes a line break; strip the padding spaces
    # left on either side of it.
    lines = [line.strip() for line in text.split('<NEWLINE>')]
    return '\n'.join(lines)

In [None]:
# Generate 200 tokens starting from token id 0, tidy up the spacing and print.
generated_text = extract_text(200, 0)
generated_text = post_process_text(generated_text)
print(generated_text)

How to further improve the model
We can generate a much more realistic script in several ways: training on the full dataset (ours is only a small subset of the original), tuning the hyperparameters, experimenting with different network architectures such as bidirectional or encoder-decoder models, applying text augmentation, etc.