# Automatic Essay Generator

This notebook is an attempt to automatically generate a high-scoring essay. For simplicity, the text basis is limited to the highest-scoring essays from topic number 1 ("Computers").

In [3]:
# Load LSTM network and generate text
import numpy as np
import spacy

# spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load('en') # not needed for character based generation.

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import pandas as pd

ModuleNotFoundError: No module named 'keras'

In [None]:
# Read the pickled training data from the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# training_corr.pkl that comes from a trusted source.
training_set = pd.read_pickle('training_corr.pkl')


# For clarity, swap the numeric essay topic ids (1-8) for one-word summaries.
# The outer 'topic' key limits the replacement to that column.
topic_names = ['computers', 'censorship', 'cyclist', 'hibiscus',
               'mood', 'dirigibles', 'patience', 'laughter']
topic_dict = {'topic': dict(enumerate(topic_names, start=1))}

training_set.replace(topic_dict, inplace=True)

In [None]:
# Select the text basis: essays on the 'computers' topic whose target_score
# is greater than 11.
# Use only the language corrected version of each essay ('corrected' column).
high_scoring_computers = (
    (training_set.topic == 'computers')
    & (training_set.target_score > 11)
)
essays = training_set.loc[high_scoring_computers, 'corrected']

print(len(essays), 'essays used.')

The first step is to prepare a list of units on which the sequence will be based. The units could be essays, sentences, words or characters. The smaller the unit, the smaller the vocabulary and the more efficient the training.

Generally, a smart tokenizer such as spaCy will return better tokens. For example, "don't" does not contain whitespace but should be split into two tokens, "do" and "n't", while "U.K." should always remain one token. For character-based generation, using spaCy doesn't add any value.

In [None]:
# Tokenize every essay with spaCy (parser and NER switched off) and keep the
# lowercased text of each token; `texts` holds one token list per essay.
texts = [
    [token.text.lower() for token in nlp(raw_essay, disable=['parser', 'ner'])]
    for raw_essay in essays
]

# words/tokens: all essays flattened into a single sequence
tokens = [tok for essay_tokens in texts for tok in essay_tokens]

# characters: the same sequence broken down into single characters
char_list = [ch for tok in tokens for ch in tok]

In [None]:
# Vocabulary: every distinct token in sorted order, with a lookup table in
# each direction (token -> integer id, integer id -> token).
# For character-based generation build these from char_list instead.
chars = sorted(set(tokens))
char_to_int = {unit: idx for idx, unit in enumerate(chars)}
int_to_char = dict(enumerate(chars))

# summarize the loaded data
n_chars = len(tokens)  # or len(char_list)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

In [None]:
# Build the training pairs: each input is a window of seq_length consecutive
# token ids and its target is the id of the token that follows the window.
seq_length = 40
n_windows = n_chars - seq_length

# Encode the sequence once, then slide the window over the integer ids.
encoded = [char_to_int[unit] for unit in tokens]  # or char_list
dataX = [encoded[pos:pos + seq_length] for pos in range(n_windows)]
dataY = [encoded[pos + seq_length] for pos in range(n_windows)]

n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

In [None]:
# Arrange the integer windows as [samples, time steps, features] and divide
# by the vocabulary size, which scales every id into the range [0, 1).
X = np.asarray(dataX).reshape(n_patterns, seq_length, 1) / float(n_vocab)

# One-hot encode the target ids for the softmax output layer.
y = np_utils.to_categorical(dataY)

In [None]:
# Define the LSTM model: one recurrent layer, dropout, and a softmax layer
# with one output per column of the one-hot encoded targets.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2, seed=42),
    Dense(y.shape[1], activation='softmax'),
])

# Checkpoint the weights whenever the training loss reaches a new minimum;
# epoch number and loss are written into the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

adam = Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    decay=0.0,
    amsgrad=False,
)
optimizer = adam

model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [None]:
# Train for 40 epochs with a batch size of 128. fit() itself is silent
# (verbose=0); the checkpoint callback is passed in via callbacks_list.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=40,
    verbose=0,
    callbacks=callbacks_list,
)

In [None]:
# Restore the weights of a saved checkpoint and compile the model again.
# Word-based model checkpoint; for the character-based model use
# "weights-character-base.hdf5" instead.
filename = "weights-improvement-40-3.9929.hdf5"
model.load_weights(filename)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
)

In [None]:
# Pick a random training window to seed the text generation.
# np.random.randint excludes its upper bound, so passing len(dataX) makes
# every pattern -- including the last one -- eligible.
start = np.random.randint(0, len(dataX))
# Work on a copy: the generation step extends the seed window, and that must
# never modify the training patterns stored in dataX.
pattern = list(dataX[start])
print("Seed:")
print("\"", ' '.join([int_to_char[value] for value in pattern]), "\"")

In [None]:
def sample_prediction(prediction):
    """Draw one index at random, weighted by the predicted probabilities.

    Parameters
    ----------
    prediction : sequence holding one sequence of floats
        Only ``prediction[0]`` is read. Its entries are used as the sampling
        weights, so they have to form a probability distribution (sum of
        approximately 1).

    Returns
    -------
    integer
        A random index into ``prediction[0]``, drawn with
        ``np.random.choice``.

    Notes
    -----
    Sampling from the distribution helps to solve the problem of repeated
    outputs.

    Expected sizes: ``len(prediction) == 1`` and ``len(prediction[0]) >> 1``.
    """
    probabilities = prediction[0]
    n_classes = len(probabilities)
    return np.random.choice(n_classes, p=probabilities)

In [None]:
# Generate 400 tokens: predict from the current window, sample the next token
# id, then slide the window forward by one position.
generated_tokens = []
for i in range(400):
    # Same shaping and scaling as the training input X.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = sample_prediction(prediction)
    generated_tokens.append(int_to_char[index])
    # Build a NEW window list instead of appending in place: `pattern` may be
    # the very list object stored in dataX, and an in-place append would
    # leave a 41-element row in the training data.
    pattern = pattern[1:] + [index]

# Every token is followed by one space, including the last.
generated = ''.join(token + ' ' for token in generated_tokens)
print('\nDone.')
print(generated)

### Character based essay generation

Note that the space character was removed during tokenization, so the generated text contains no word boundaries. Even allowing for that, it is difficult to see where the text could be split into meaningful words.

`"Done.
2nrarliutistrteveoppuneedtes,lhonnvao@onlatfzuhniesioeeplmstrrowiteeoieaninetooeoofeirhanpegy.nhaentlgruuaa,onsoencyeakss@edhabophdtrslailirueu.ehttmhneedsoeoadwmamftaecfts.ohoacetcsetenkypersecpmvpthcbnoyuees.ttecoiputeidvedyots'eveoaohwereesjranaptaegileeteeaedysyobcyiencposabtoeb-niheiatanol.oavaeecgfclnalbyaakts1snoaslhfabnheseailn1tenscmricptdsefnrnofoaboipiteilno?itp2nraoenrtirianppntetcsaue"`