### Text Generation

In [1]:
import tensorflow

In [3]:
import glob
import zipfile

from sklearn.utils import shuffle
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend

In [6]:
#zip_ref = zipfile.ZipFile(Tago)
filelist = glob.glob("Tagore/data/*.txt")
len(filelist)

20

In [13]:
text_data = []
for file in filelist:
    with open(file, 'rb') as file:
        file_content = file.read().decode('UTF-8')
        text_data.append(file_content)
        #print(len(file_content.split(' ')))

In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
word_idx = tokenizer.word_index
idx_word = tokenizer.index_word

In [21]:
word_counts = tokenizer.word_counts
num_words = len(word_counts)
num_words

29566

In [23]:
text_data[0][:200]

'The Project Gutenberg EBook of Chitra, by Rabindranath Tagore\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r'

In [26]:
word_idx['project']

57

In [27]:
sequences = tokenizer.texts_to_sequences(text_data)


In [28]:
features = []
labels = []

training_length = 50

for seq in sequences:
    for i in range(training_length, training_length+300):
        extract = seq[i-training_length: i - training_length+20]
        
        features.append(extract[:-1])
        labels.append(extract[-1])

In [32]:
print(features[0]), print(labels[0])
print(features[1]), print(labels[1])

[2, 57, 43, 256, 3, 2068, 37, 544, 729, 1, 17, 256, 9, 16, 2, 169, 3, 752, 1218]
32
[57, 43, 256, 3, 2068, 37, 544, 729, 1, 17, 256, 9, 16, 2, 169, 3, 752, 1218, 32]
44


(None, None)

In [36]:
len(features)

6000

In [44]:
from sklearn.utils import shuffle
import numpy as np

features, labels = shuffle(features, labels, random_state=1)

train_end = int(0.75 * len(labels))

train_features = np.array(features[:train_end])
valid_features = np.array(features[train_end:])

train_labels = labels[:train_end]
valid_labels = labels[train_end:]

X_train, X_valid = np.array(train_features), np.array(valid_features)

y_train = np.zeros((len(train_labels), num_words), dtype=np.int8)
y_valid = np.zeros((len(valid_labels), num_words), dtype=np.int8)

for example_index, word_index in enumerate(train_labels):
    y_train[example_index, word_index] = 1

for example_index, word_index in enumerate(valid_labels):
    y_valid[example_index, word_index] = 1

In [54]:
print('Input sequence \n')
print([idx_word[i] for i in X_train[0]])
print('Label \n')
print(idx_word[train_labels[0]])


Input sequence 

['terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included\r', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www', 'gutenberg', 'org\r', '\r', '\r', 'title']
Label 

glimpses


In [55]:
model = Sequential()
model.add(Embedding( 
    input_dim=num_words,
    output_dim=100,
    weights=None,
    trainable=True))
model.add(LSTM( 
    64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1, activation='tanh'))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_words, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         2956600   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                42240     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 29566)             1921790   
Total params: 4,924,790
Trainable params: 4,924,790
Non-trainable params: 0
_________________________________________________________________


In [81]:
history = model.fit(X_train, y_train, epochs=200, batch_size=64, verbose=0)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [59]:
print(model.evaluate(X_train, y_train, batch_size=32))
print(model.evaluate(X_valid, y_valid, batch_size=32))

[4.784709930419922, 0.19022221863269806]
[6.74057674407959, 0.1666666716337204]


## Generate Text

In [64]:
X_valid.shape

(1500, 19)

In [84]:

predict_label = np.argmax(model.predict(X_valid[40].reshape(1, 19))[0])

In [77]:
model.predict(X_valid[0].reshape(1, 19))[0].shape

(29566,)

In [89]:
temp = X_valid[0].reshape(1, 19)

labels = []
for i in range(50):
    pred = model.predict(temp)[0]
    label_id = np.argmax(pred)
    labels.append(label_id)
    temp = list(temp[0])
    temp.append(label_id)
    temp = np.array([temp[1:]])

In [94]:
' '.join([idx_word[i] for i in X_valid[0]])

"\r first chapter\r \r \r bimala's story\r \r i \r \r mother today i see your red one again"

In [91]:
[idx_word[i] for i in labels]

['of',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the']