# Project: Text generation with LSTM

*Adapted from: https://keras.io/examples/generative/lstm_character_level_text_generation/*

# 1. Dataset Preparation

### 1. Dataset selection

In [3]:
import keras
from keras import layers

import numpy as np
import random
import io

# Dataset used: Picture of Dorian Gray from Project Gutenburg

path = keras.utils.get_file(
    "dorian.txt",
    origin="https://www.gutenberg.org/cache/epub/174/pg174.txt",
)

Downloading data from https://www.gutenberg.org/cache/epub/174/pg174.txt
[1m465857/465857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2us/step



### 2. Preprocess text

In [4]:
path = keras.utils.get_file(
    "dorian.txt",
    origin="https://www.gutenberg.org/cache/epub/174/pg174.txt",
)
with io.open(path, encoding="utf-8") as f:
    text = f.read().lower()
text = text.replace("\n", " ")
print("Corpus length:", len(text))

chars = sorted(list(set(text)))
print("Total chars:", len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    next_chars.append(text[i + maxlen])
print("Number of sequences:", len(sentences))

x = np.zeros((len(sentences), maxlen, len(chars)), dtype="bool")
y = np.zeros((len(sentences), len(chars)), dtype="bool")
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


Corpus length: 448622
Total chars: 69
Number of sequences: 149528


# 2. Building the LSTM

### 1/2. Choose framework and define architecture

In [5]:
model = keras.Sequential(
    [
        keras.Input(shape=(maxlen, len(chars))),
        layers.LSTM(128),
        layers.Dense(len(chars), activation="softmax"),
    ]
)
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

In [6]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


# 3. Training LSTM / Text Generation

In [9]:
epochs = 30
batch_size = 128

for epoch in range(epochs + 1):
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()

    if epoch % 5 == 0:

      print("Generating text after epoch: %d" % epoch)

      start_index = random.randint(0, len(text) - maxlen - 1)
      for diversity in [0.2, 1.2]: # Experiment with high and low temp
          print("...Diversity:", diversity)

          generated = ""
          sentence = text[start_index : start_index + maxlen]
          print('...Generating with seed: "' + sentence + '"')

          for i in range(400):
              x_pred = np.zeros((1, maxlen, len(chars)))
              for t, char in enumerate(sentence):
                  x_pred[0, t, char_indices[char]] = 1.0
              preds = model.predict(x_pred, verbose=0)[0]
              next_index = sample(preds, diversity)
              next_char = indices_char[next_index]
              sentence = sentence[1:] + next_char
              generated += next_char

          print("...Generated: ", generated)
          print("-")

[1m1169/1169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 1.1784

Generating text after epoch: 0
...Diversity: 0.2
...Generating with seed: "d looked at dorian.  “are you better, my"
...Generated:   dear from the senses of the right of the sat on the same start that he had been the senses that she had been the soure, and in the senses and the senses of the senses of the senses that is a standing on the table.  “yes, i am side the face of the senses and the senses of the senses of the senses and the senses and wonderful things and the senses of the senses of the senses and white seems of the 
-
...Diversity: 1.2
...Generating with seed: "d looked at dorian.  “are you better, my"
...Generated:   dear beauty home lifes in bend whose always i lord.’llw. i wins8and only is the’r with them one of any bod?  cwe there is viltly, that who all. how feept doarm, love.”  “it is in it. she contrive casen sent upers. some free-fbustent uncontal thragies, elecity of live. l

In [10]:
# Final epoch at end of training

print("Generating text after epoch: %d" % epoch)

start_index = random.randint(0, len(text) - maxlen - 1)
for diversity in [0.2, 1.2]: # Experiment with high and low temp
    print("...Diversity:", diversity)

    generated = ""
    sentence = text[start_index : start_index + maxlen]
    print('...Generating with seed: "' + sentence + '"')

    for i in range(400):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.0
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        sentence = sentence[1:] + next_char
        generated += next_char

    print("...Generated: ", generated)
    print("-")

Generating text after epoch: 29
...Diversity: 0.2
...Generating with seed: "yet stirred him by its suggestion of a s"
...Generated:  tanding the senses of the strange for a strangerctic bor married and then a spection with the senses of the chair of the senses of the strange which start the strange from the strangery of the thing and looked at him and then a mode of the senses of the colour of the coloured painted with his life, and the man and great part of the colour of the striking and great common could not all the more in 
-
...Diversity: 1.2
...Generating with seed: "yet stirred him by its suggestion of a s"
...Generated:  cholec, door, butgind it look of eyes topin soir.fol, do yet a rather an actirous. no?” isreroone more laugher. there was no get underred. unwaye we will be kepted itsques.”  “jumb this yorryinged, and consolled spoiled in the gravel’s  fire all attless. laid he sure onqubrury yes, he away, not of life and the greats their life. they always romed, asautryn met,

# 4. Evaluating the Model

1. *Evaluating generated text*: Low temperature produces much more coherent sentences, while high temperature hardly resembles English. In terms of the low temperature responses, the words used are mostly coherent and not repetitive, however, the model has yet to produce sentences with proper grammatical construction or sense, mostly generating stream-of-consciousness style responses.

2. *Refining model*: With more time and resources, increasing the number of training epochs would likely significantly increase the quality of the final output.