## Importing Libraries and Data Set

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, GRU, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import string
import requests
from sklearn.metrics import accuracy_score

In [2]:
# Download the complete works of Shakespeare as plain text.
# A timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# tokenizing an error page as if it were the corpus.
data_link = requests.get(
    'https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt',
    timeout = 60,
)
data_link.raise_for_status()

## Data Preprocessing

In [3]:
# Split the downloaded text into a list of lines.
data = data_link.text.split('\n')

In [4]:
# Drop the first 253 lines. Presumably this is the file's front matter: the first
# tokens kept afterwards are "from fairest creatures ..." — verify the offset
# against the source file.
# NOTE(review): this overwrites `data`, so re-running the cell drops another 253 lines.
data = data[253:]

In [5]:
# Re-join the remaining lines into one space-separated string.
# NOTE(review): `data` changes from a list to a str here; re-running this cell
# would then put a space between every single character.
data = ' '.join(data)

In [6]:
def text(doc):
    """Turn a raw string into a list of clean, lowercase word tokens.

    The string is split on whitespace, every character in
    ``string.punctuation`` is deleted from each piece, and pieces that are
    not purely alphabetic after that (numbers, empty leftovers, mixed
    alphanumerics) are discarded. Survivors are lowercased.

    Args:
        doc: The text to tokenize.

    Returns:
        A list of lowercase alphabetic strings, in their original order.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        # Punctuation is removed rather than split on, so "it's" becomes "its".
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned

In [7]:
# Clean and tokenize the whole corpus into a flat list of lowercase words.
tokens = text(data)

In [8]:
len(set(tokens))

27956

In [9]:
# Size, in tokens, of each sliding window that is joined into an entry of `lines`.
# Written as 10 + 1 — presumably 10 context words plus 1 target word; confirm.
length = 10 + 1

In [10]:
# Holds the space-joined token windows, one string per window.
lines = []

In [11]:
# Build overlapping windows of `length` consecutive tokens, each joined into one
# space-separated string. Window k covers tokens[k : k + length].
#
# `lines` is rebuilt from scratch here so that re-running the cell yields the same
# list instead of appending a second copy of every window.
#
# Only the start of the corpus is used: the last window ends at token index
# max_window_end + 1 (the first end index greater than the cap), or at
# len(tokens) - 1 for a shorter corpus.
max_window_end = 250000
last_end = min(len(tokens) - 1, max_window_end + 1)
lines = [' '.join(tokens[end - length:end]) for end in range(length, last_end + 1)]

In [12]:
# Fit a Keras Tokenizer on the window strings, then encode every window as a
# list of integer word ids.
# NOTE(review): neither `tokenizer` nor `word_seq` is referenced by any later cell
# shown in this notebook — the model below is fed from a separately built
# vocabulary. Confirm whether this cell is still needed.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
word_seq = tokenizer.texts_to_sequences(lines)

In [13]:
len(lines)

249991

In [14]:
lines[0]

'from fairest creatures we desire increase that thereby beautys rose might'

In [15]:
tokens[0]

'from'

In [16]:
len(tokens)

898199

In [17]:
# Training split: the first 400,000 tokens of the corpus.
train_tokens = tokens[: 400000]

In [18]:
# Test split: the next 400,000 tokens. Tokens from index 800000 onward are not
# assigned to either split here.
test_tokens = tokens[400000 : 800000]

In [19]:
# Sorted list of the distinct words in each split; a word's position in the list
# is later used as its integer id.
# NOTE(review): the two vocabularies are built independently, so the same word can
# sit at different positions in `words_train` and `words_test`, and some test words
# do not occur in `words_train` at all.
words_train = sorted(set(train_tokens))
words_test = sorted(set(test_tokens))

In [20]:
len(words_train), len(words_test)

(19050, 18519)

In [21]:
# Lookup tables for each split: word -> id (dict) and id -> word (NumPy array).
# A word's id is its position in the corresponding sorted vocabulary list.
word_train_index = dict(zip(words_train, range(len(words_train))))
index_words_train = np.asarray(words_train)
word_test_index = dict(zip(words_test, range(len(words_test))))
index_words_test = np.asarray(words_test)

In [22]:
# Encode both splits as integer id arrays using the TRAIN vocabulary.
#
# The model's Embedding and output Dense layers are sized by len(words_train), so
# every id it sees must come from `word_train_index`. Encoding the test split with
# its own `word_test_index` would give the same id different meanings in the two
# splits and make any evaluation on the test data meaningless.
#
# Test words that never occur in the training split have no id (the vocabulary has
# no spare "unknown" slot), so they are skipped.
train_text_to_int = np.array([word_train_index[c] for c in train_tokens])
test_text_to_int = np.array(
    [word_train_index[c] for c in test_tokens if c in word_train_index]
)

In [23]:
# Number of tokens in each model input sequence. The datasets below are cut into
# chunks of seq_len + 1 tokens, which are then split into input and target.
seq_len = 10

In [24]:
# Wrap each id array in a tf.data.Dataset that yields one word id per element.
train_word_data = tf.data.Dataset.from_tensor_slices(train_text_to_int)
test_word_data = tf.data.Dataset.from_tensor_slices(test_text_to_int)

In [25]:
# Group consecutive ids into chunks of seq_len + 1 tokens. The chunks do not
# overlap, and drop_remainder discards a final chunk that would be shorter.
train_data_seq = train_word_data.batch(seq_len + 1, drop_remainder = True)
test_data_seq = test_word_data.batch(seq_len + 1, drop_remainder = True)

In [26]:
def x_y_split(portion):
    """Split one chunk into an (input, target) pair shifted by one position.

    Args:
        portion: A sliceable sequence (here, a chunk of word ids).

    Returns:
        A tuple ``(x, y)`` where ``x`` is everything except the last element
        and ``y`` is everything except the first, so ``y[i]`` is the element
        that follows ``x[i]`` in ``portion``.
    """
    inputs, targets = portion[:-1], portion[1:]
    return inputs, targets

In [27]:
# Turn every chunk into an (input, target) pair via x_y_split.
# NOTE(review): despite the `_df` suffix these are tf.data Datasets, not DataFrames.
train_df = train_data_seq.map(x_y_split)
test_df = test_data_seq.map(x_y_split)

In [28]:
# Number of sequences per batch; the model below is built with this fixed batch size.
batch_size = 64
# Buffer size passed to Dataset.shuffle() when the batches are built.
buffer = 10000

In [29]:
# Shuffle the (input, target) pairs and group them into batches of `batch_size`.
# drop_remainder discards a final partial batch, so every batch has exactly
# `batch_size` rows — the model is built with that fixed batch dimension.
# NOTE(review): this reassigns train_df / test_df, so re-running the cell would
# batch the already-batched datasets a second time.
train_df = train_df.shuffle(buffer).batch(batch_size, drop_remainder = True)
test_df = test_df.shuffle(buffer).batch(batch_size, drop_remainder = True)

## Model Architecture

In [30]:
# Next-word model: embedding -> three stacked GRU layers (each followed by
# dropout) -> a Dense layer with one output per training-vocabulary word.
# The Dense layer has no activation, so the model outputs raw scores (logits).
#
# NOTE(review): the batch dimension is fixed to `batch_size` and the GRUs are
# stateful, while the training batches are shuffled — confirm that carrying
# state between unrelated batches is intended.
# NOTE(review): the commented-out GRU/Dropout pairs are disabled alternative layer
# sizes; consider deleting them once the architecture is settled.
model_rnn = Sequential([
    Embedding(len(words_train), 256, batch_input_shape = [batch_size, None]),
    #GRU(512, return_sequences = True, stateful = True, recurrent_initializer = 'glorot_uniform'),
    #Dropout(0.25),
    GRU(1024, return_sequences = True, stateful = True, recurrent_initializer = 'glorot_uniform'),
    Dropout(0.25),
    #GRU(2048, return_sequences = True, stateful = True, recurrent_initializer = 'glorot_uniform'),
    #Dropout(0.25),
    GRU(4096, return_sequences = True, stateful = True, recurrent_initializer = 'glorot_uniform'),
    Dropout(0.25),
    #GRU(8192, return_sequences = True, stateful = True, recurrent_initializer = 'glorot_uniform'),
    #Dropout(0.25),
    # NOTE(review): 16382 breaks the 1024 -> 4096 progression (16384 would be the
    # next x4 step) — possibly a typo. This layer alone accounts for roughly one
    # billion parameters in the model summary; confirm the size is intended.
    GRU(16382, return_sequences = True, stateful = True, recurrent_initializer = 'glorot_uniform'),
    Dropout(0.25),
    Dense(len(words_train))
])

In [31]:
model_rnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           4876800   
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dropout (Dropout)            (64, None, 1024)          0         
_________________________________________________________________
gru_1 (GRU)                  (64, None, 4096)          62939136  
_________________________________________________________________
dropout_1 (Dropout)          (64, None, 4096)          0         
_________________________________________________________________
gru_2 (GRU)                  (64, None, 16382)         1006510080
_________________________________________________________________
dropout_2 (Dropout)          (64, None, 16382)         0

In [32]:
# The final Dense layer has no activation, so the model emits raw logits rather
# than probabilities. The string alias 'sparse_categorical_crossentropy' assumes
# probabilities (from_logits=False) and would compute the wrong loss, so the loss
# object is created explicitly with from_logits=True.
model_rnn.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
    optimizer = 'adam',
    metrics = ['accuracy'],
)

In [None]:
# Train for 50 passes over the shuffled training batches; the value returned by
# fit() is kept in rnn_fit.
# NOTE(review): the saved output stops at "Epoch 1/50", so this cell does not
# appear to have run to completion in this copy of the notebook.
rnn_fit = model_rnn.fit(train_df, epochs = 50)

Epoch 1/50


In [None]:
# Write the model to 'text_rnn_2.0_.h5' in the current working directory.
model_rnn.save('text_rnn_2.0_.h5')