# Text Generator



In [4]:
import numpy as np 
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [1]:
#Download the sonnects file from this drive location or this repo. 
#sonnets = 'https://drive.google.com/uc?id=108jAePKK4R3BVYBbYJZ32JWUwxeMg20K'
SONNETS_FILE = 'C:/Users/RTiwari1/sonnets.txt'

In [2]:
# Read the data
with open('./sonnets.txt') as f:
    data = f.read()

# Convert to lower case and save as a list
corpus = data.lower().split("\n")

print(f"There are {len(corpus)} lines of sonnets\n")
print(f"The first 5 lines look like this:\n")
for i in range(5):
    print(corpus[i])

There are 2159 lines of sonnets

The first 5 lines look like this:

from fairest creatures we desire increase,
that thereby beauty's rose might never die,
but as the riper should by time decease,
his tender heir might bear his memory:
but thou, contracted to thine own bright eyes,


In [5]:
#tokenizer fit on texts generates token number for all words in corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

#text to sequences uses token numbers to create sequence of each line
tokenizer.texts_to_sequences([corpus[0]])



[[34, 417, 877, 166, 213, 517]]

In [8]:
# This function breaks every line by removing one word every time.
# The model will be trained to predict the next word everytime.
def n_gram_seqs(corpus, tokenizer):
    input_sequences = []
    
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    return input_sequences

In [17]:
# Testing function with one example.
first_example_sequence = n_gram_seqs([corpus[0]], tokenizer)

print("n_gram sequences for first example look like this:\n")
first_example_sequence

# Apply the n_gram_seqs transformation to the whole corpus
input_sequences = n_gram_seqs(corpus, tokenizer)

# Save max length 
max_sequence_len = max([len(x) for x in input_sequences])

# Pad the corpus by adding 0 to start of shorter sequences.
padded_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding = 'pre'))
print(f"padded corpus has shape: {padded_sequences.shape}")

n_gram sequences for first example look like this:

padded corpus has shape: (15462, 11)


In [18]:
print(padded_sequences[0])

[  0   0   0   0   0   0   0   0   0  34 417]


In [11]:
#Divide sequences into features and labels by taking the last token as label and entire sequence except the last token as features.
#Then one hot encode the labels using total words as classes. Only the correct word is one rest all are zero.
def features_and_labels(input_sequences, total_words):

    features = input_sequences[:,:-1]
    labels = input_sequences[:,-1]

    one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes = total_words)
    return features, one_hot_labels

In [22]:
# Split the whole corpus
features, labels = features_and_labels(padded_sequences, total_words)

print(f"features have shape: {features.shape}")
print(f"labels have shape: {labels.shape}")

features have shape: (15462, 10)
labels have shape: (15462, 3211)


In [26]:
# create_model
def create_model(total_words, max_sequence_len):
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=None))
    model.add(Bidirectional(LSTM(150)))
    model.add(Dense(total_words, activation = 'softmax'))


    # Compile the model
    adam = Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    return model

In [27]:
# Get the untrained model
model = create_model(total_words, max_sequence_len)

# Train the model
history = model.fit(features, labels, epochs=50, verbose=1)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [24]:
# Analyzing the training curve of the model.

acc = history.history['accuracy']
loss = history.history['loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.title('Training accuracy')

plt.figure()

plt.plot(epochs, loss, 'b', label='Training Loss')
plt.title('Training loss')
plt.legend()

plt.show()

NameError: name 'history' is not defined

In [39]:
def download_history():
    import pickle
        from google.colab import files

    with open('history.pkl', 'wb') as f:
    
        pickle.dump(history.history, f)

        files.download('history.pkl')

    download_history()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [40]:
#Feed an input sequence and generate the next hundred words. 
seed_text = "Help me Obi Wan Kenobi, you're my only hope"
next_words = 100
  
for _ in range(next_words):

    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted = np.argmax(predicted, axis=-1).item()
    output_word = tokenizer.index_word[predicted]
    seed_text += " " + output_word

print(seed_text)

Help me Obi Wan Kenobi, you're my only hope my argument all hate to thee all me see all other words thee all me or me more or less thee for love one thine doth mine pen truth again thee new mine eyes the write of heart that you right you had you dearer of another's account ' say so one of such you call so you die me do thee thy fair part thee so true thy 'will ' die in thee so best to thee i thine not thee thy joy can thee that die me still the heart i see the 'will ' die ' die
