# Text generation

In this notebook text will be generated with multiple different methods, as well as multiple different kinds of text (poetry, news artciles, stories). The generated text will be evaluated using the BERTscore.

In [1]:
import numpy as np
import tensorflow as tf
import os
from os.path import join, isfile
import time

In [2]:
import sys
print(sys.version)
print(tf.__version__)

3.7.7 (default, May  6 2020, 11:45:54) [MSC v.1916 64 bit (AMD64)]
2.0.0


### Load data

In [3]:
if not isfile('data/harrypotter.txt'):
    files= ['1SorcerersStone.txt', '2ChamberofSecrets.txt', '3ThePrisonerOfAzkaban.txt', '5OrderofthePhoenix.txt', '6TheHalfBloodPrince.txt', '7DeathlyHollows.txt']
    with open('data/harrypotter.txt', 'w') as outfile:
        for file in files:
            filename = join('data', file)
            with open(filename) as infile:
                text = infile.read()
                outfile.write(text)

In [4]:
text = open('data/harrypotter.txt').read()

### 1. LSTM
From https://medium.com/towards-artificial-intelligence/create-your-own-harry-potter-short-story-using-rnn-and-tensorflow-853b3ed1b8f3 and https://www.tensorflow.org/tutorials/text/text_generation

In [5]:
print(text[:300])

Harry Potter and the Sorcerer's Stone 

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they 


In [6]:
vocab = sorted(set(text))
print(vocab)

char2index = {char:index for index, char in enumerate(vocab)}
index2char = np.array(vocab)

text_as_int = np.array([char2index[char] for char in text])

['\t', '\n', '\x1f', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '~', '¢', '¦', '©', '«', '»', 'Â', 'Ã', 'â', 'œ', '˜', '’', '“', '€', '™']


In [7]:
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1) # rounds to nearest number
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

def split_input_target(data):
    input_text = data[:-1]
    target_text = data[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [8]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(rnn_units[0], return_sequences=True, stateful=True,recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units[1], return_sequences=True, stateful=True,recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
        ])
    return model

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [9]:
embedding_dim = 256

# Number of RNN units 
rnn_units = [512, 256]

model = build_model(vocab_size=len(vocab), embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE)

model.compile(optimizer='adam', loss=loss)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           27136     
_________________________________________________________________
gru (GRU)                    (64, None, 512)           1182720   
_________________________________________________________________
gru_1 (GRU)                  (64, None, 256)           591360    
_________________________________________________________________
dense (Dense)                (64, None, 106)           27242     
Total params: 1,828,458
Trainable params: 1,828,458
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Directory where the checkpoints will be saved
checkpoint_dir = 'training_checkpoints'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [None]:
EPOCHS= 10
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
latest_check = tf.train.latest_checkpoint(checkpoint_dir)

Epoch 1/10
    772/Unknown - 3112s 4s/step - loss: 1.8104

In [None]:
def generate_text(model, start_string):
    
    num_generate = 1000  #amount of letters to be generated
    input_eval = [char2index[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []
    scaling = 0.5 #kept at a lower value here
    # Here batch size == 1
    model.reset_states()
    
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / scaling
        predicted_id = tf.random.categorical(predictions, num_samples=1)[1,0].numpy()
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])
        
return (start_string + ''.join(text_generated))

In [None]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(latest_check)
model.build(tf.TensorShape([1, None]))
model.summary()

start_string = 'Severus Snape'
generated_text = generate_text(model, start_string)
print(generated_text)

### 2. RelGANs

### 3. LSTM