In [1]:
# Lab 7 - Text generation with LSTM
#
# Step 1 (not assessed): build and train a model to generate text in the style of a corpus.
#
# Based on the Keras text generation example (https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py)
#
# Step 2: build a model to distinguish genuine from fake sentences.

In [2]:
# Import essential modules
import pickle
import random
import sys
import time

import numpy as np
from sklearn.model_selection import train_test_split

from keras.layers import Input, LSTM, GRU, Dense, Activation
from keras.optimizers import RMSprop
from keras.models import Model, Sequential
from keras.models import save_model
from keras.utils.data_utils import get_file

Using TensorFlow backend.


In [3]:
# Helper function to sample an index from an array of predictions.
#
# The input array 'preds' should be the output of a text generation model.
# The elements contain the values of the units in the final layer.
# Each unit corresponds to a character in the text alphabet.
# The final layer should have SoftMax activation, and thus the
# value corresponds to the 'strength of prediction' of that character
# as the next output value---so the maximum value indicates which character
# is most strongly predicted (considered most likely) as the next one.
#
def sample(preds, temperature=1.0):
    """Sample one symbol index from a temperature-adjusted prediction vector.

    Every prediction value is raised to the power 1/T (T = temperature) and the
    result is renormalised to sum to 1; a single index is then drawn at random
    from that adjusted distribution.

    Raising to the power 1/T is a value-dependent scaling: for T < 1.0
    (1/T > 1.0), small values are made smaller (proportionally) than large
    values, unlike a linear scaling such as multiplication by 0.9, which
    scales all values the same.

    Example: with a two-symbol alphabet and probabilities [0.2, 0.8]:

    * T = 1.0 means 'do not adjust the probabilities at all': there is a 20%
      chance of returning symbol 0 and an 80% chance of returning symbol 1,
      so symbol 1 is 4x more likely than symbol 0.
    * T = 0.5 raises the probabilities to the power 1/0.5 = 2, giving
      [0.04, 0.64] before normalisation: symbol 1 is now 16x (the square of
      4x) more likely than symbol 0.
    * T = 2 raises the probabilities to the power 0.5 (square root), giving
      [0.4472, 0.8944] before normalisation: symbol 1 is only 2x (sqrt of 4x)
      more likely than symbol 0.

    So low temperatures make the distribution peakier, exaggerating the
    difference between values, which makes the sample more conservative
    (more likely to pick the highest-probability symbol).  High temperatures
    flatten the distribution, so the sample is less conservative (more likely
    to pick some lower-likelihood symbol).

    Args:
        preds: 1-D array-like of non-negative prediction strengths, one per
            symbol (e.g. one row of the generator's softmax output).
        temperature: Sampling temperature.  A value <= 0 is treated as the
            limiting case of an infinitely peaky distribution: the index of
            the largest prediction is returned without any random draw.

    Returns:
        The index (into preds) of the chosen symbol.
    """
    # Convert to a high-precision datatype (we are going to be manipulating
    # some very small values in this function).
    preds = np.asarray(preds).astype('float64')

    # T <= 0 would divide by zero below and hand NaNs to multinomial();
    # handle it as a deterministic greedy pick instead.
    if temperature <= 0:
        return np.argmax(preds)

    # preds ** (1/T), computed through logs to improve numerical precision.
    # A prediction of exactly 0 gives log(0) = -inf, which exp() maps back to
    # 0 as intended, so the divide-by-zero warning is only noise.
    with np.errstate(divide='ignore'):
        preds = np.exp(np.log(preds) / temperature)

    preds = preds / np.sum(preds)  # ensure that probs sum to 1
    probas = np.random.multinomial(1, preds, 1)  # take 1 sample from the distribution
    return np.argmax(probas)

In [4]:
# Decide how much data to use for training.
# You might want to reduce this to ~100k for faster experimentation, and then bring it back
# to 600k when you're happy with your network architecture.
# IMPORTANT: make sure you end up with a 57-symbol alphabet after reducing the corpus size!
# If the number of symbols (shown in the next cell) gets smaller than it was with the full
# corpus, bring your sample size back up.  This is necessary because the encoding used for
# training must match that used for assessment.
desired_num_chars = 600*1000  # Max: 600893

random.seed(43)  # Fix random seed for repeatable results.

# Slurp down all of Nietzsche from Amazon.
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Decode explicitly as UTF-8 (the corpus contains non-ASCII letters, so relying on the
# platform default encoding is fragile) and close the file as soon as it has been read.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('original corpus length:', len(text))

# Keep one random contiguous window of desired_num_chars characters.  The upper bound is
# clamped at 0 so that asking for the whole corpus (or more) keeps everything instead of
# making randint() raise on an empty range.
start_index = random.randint(0, max(0, len(text) - desired_num_chars - 1))
text = text[start_index:start_index + desired_num_chars]
print('length for training:', len(text))

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
original corpus length: 600893
length for training: 600000


In [5]:
# Let's have a quick look at a random excerpt.
#
# Caution: Nietzsche might drive you mad: dare you behold more than 1000 of his terrible chars..?
sample_length = 1000

# Seeding with None re-seeds `random` from the current time, so re-evaluating this
# cell shows a new sample.
random.seed(None)

# randrange(k) draws from 0..k-1, i.e. the same range as randint(0, k - 1).
start_index = random.randrange(len(text) - sample_length)
excerpt = text[start_index:start_index + sample_length]
print(excerpt)

l man, is duly styled "good". (at first other
and more important kinds of utilitarian qualities stand in the
foreground.) bad is "not habitual" (unusual), to do things not in
accordance with usage, to oppose the traditional, however rational or
the reverse the traditional may be. to do injury to one's social group
or community (and to one's neighbor as thus understood) is looked upon,
through all the variations of moral laws, in different ages, as the
peculiarly "immoral" act, so that to-day we associate the word "bad"
with deliberate injury to one's neighbor or community. "egoistic" and
"non-egoistic" do not constitute the fundamental opposites that have
brought mankind to make a distinction between moral and immoral, good
and bad; but adherence to traditional custom, and emancipation from it.
how the traditional had its origin is quite immaterial; in any event it
had no reference to good and bad or any categorical imperative but to
the all important end of maintaining and sustaining 

In [6]:
# Establish the alphabet: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print('total chars:', len(chars))
print(chars)

# Lookup tables between a character and its position in the alphabet.
char_indices = {symbol: position for position, symbol in enumerate(chars)}  # char -> index
indices_char = {position: symbol for position, symbol in enumerate(chars)}  # index -> char

total chars: 57
['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'æ', 'é', 'ë']


In [7]:
# Cut the corpus into semi-redundant (i.e. overlapping) windows of maxlen characters,
# starting a new window every `step` characters.
maxlen = 40
step = 3

window_starts = range(0, len(text) - maxlen, step)
# Not syntactic sentences, but just sequences of maxlen chars pulled from the corpus.
sentences = [text[start:start + maxlen] for start in window_starts]
# next_chars[n] is the character which followed sentences[n] in the corpus.
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 199987


In [8]:
# Convert the data to one-hot encoding.
# 'x' will contain the one-hot encoding of the training 'sentences'.
# 'y' will contain the one-hot encoding of the 'next char' for each sentence.
#
# 
# Let's consider that we have N sentences of length L:
#
# The 'native' encoding is an NxL matrix where element [n][l]
# is the symbol index for character at index (l) of sentence (n)
# (e.g., say, 5, corresponding to 'e').
#
# The one-hot encoding is an NxLxS matrix, where S is the 
# number of symbols in the alphabet, such that element [n][l][s]
# is 1 if the character at index (l) in sentence (n) has the
# symbol index (s), and 0 otherwise.
def onehot_encode(sentence, maxlen):
    """One-hot encode a string against the module-level alphabet.

    Args:
        sentence: String to encode.  Every character must be a key of
            `char_indices`, and it must not be longer than `maxlen`.
        maxlen: Number of rows in the result.

    Returns:
        Boolean array of shape (maxlen, len(chars)).  Row t has a single True
        in the column given by char_indices[sentence[t]]; rows beyond the end
        of `sentence` stay all False.

    Raises:
        KeyError: If a character is not in `char_indices`.
        IndexError: If `sentence` is longer than `maxlen`.
    """
    # Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    x = np.zeros((maxlen, len(chars)), dtype=bool)
    for t, char in enumerate(sentence):
        x[t, char_indices[char]] = 1
    return x

# Allocate the one-hot training tensors.  The builtin `bool` is used as dtype because
# the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    x[i, :, :] = onehot_encode(sentence, maxlen)
    # Encoding a single character yields a (1, S) array; its only row is the target.
    y[i, :] = onehot_encode(next_chars[i], 1)[0]

print(x.shape)
print(y.shape)

(199987, 40, 57)
(199987, 57)


In [9]:
# Build the generator model: a single GRU layer with 128 cells, followed by a
# softmax over the alphabet (one output unit per character).
generator_model = Sequential()
generator_model.add(GRU(128, input_shape=(maxlen, len(chars))))
generator_model.add(Dense(len(chars)))
generator_model.add(Activation('softmax'))

# Optimizer used for training.  Previously an RMSprop(lr=0.01) instance was created
# here but 'NAdam' was passed to compile(), so the RMSprop object was silently ignored.
# The choice now lives in one place; to experiment with RMSprop instead, set
# `optimizer = RMSprop(lr=0.01)`.
optimizer = 'NAdam'
generator_model.compile(loss='categorical_crossentropy', optimizer=optimizer)
trained_epochs = 0

In [10]:
def generate_sentence_list(seed_list, length=400, temperature=0.25):
    """Extend every seed string by `length` characters sampled from the generator.

    All seeds are advanced in lock-step: at each step the current context of
    every seed is one-hot encoded into one batch, `generator_model.predict` is
    called once on that batch, and one character per seed is drawn with
    `sample`.  Progress is written to stdout as a text progress bar.

    Args:
        seed_list: Sequence of seed strings.  Every character must be a key of
            `char_indices`.  It is not modified.
        length: Number of characters to generate per seed.
        temperature: Sampling temperature forwarded to `sample`.

    Returns:
        A new list with one string per seed: the seed followed by its `length`
        generated characters.
    """
    n = len(seed_list)
    # Strings are immutable, so a shallow copy of the list is all that is needed
    # to keep the caller's seeds untouched.
    generated_list = list(seed_list)
    # Sliding context fed to the model.  Only the last maxlen characters fit the
    # model input, so longer seeds are trimmed here (they used to overflow the
    # input tensor and raise IndexError); seeds of up to maxlen chars are unchanged.
    sentence_list = [seed[-maxlen:] for seed in seed_list]

    for i in range(length):

        workdone = (i+1)*1.0 / length
        sys.stdout.write("\rgenerating sentences: [{0:20s}] {1:.1f}%".format('#' * int(workdone * 20), workdone*100))
        sys.stdout.flush()

        # One-hot encode the current context of every seed; positions past the
        # end of a short context stay all-zero.
        x_pred_list = np.zeros((n, maxlen, len(chars)))
        for j, sentence in enumerate(sentence_list):
            for t, char in enumerate(sentence):
                x_pred_list[j, t, char_indices[char]] = 1.

        pred_list = generator_model.predict(x_pred_list, verbose=0)

        for j in range(n):
            next_index = sample(pred_list[j,:], temperature)
            next_char = indices_char[next_index]
            generated_list[j] += next_char
            # Slide the context window: drop the oldest char, append the new one.
            sentence_list[j] = sentence_list[j][1:] + next_char

    sys.stdout.write(' - done\n')
    sys.stdout.flush()

    return generated_list

def print_sentences(seeds, sentences):
    """Print each sentence with its seed prefix in green and the rest in blue.

    Args:
        seeds: Sequence of seed strings.
        sentences: Sequence of strings, paired positionally with `seeds`; the
            first len(seed) characters of each are shown as the seed part.

    Output uses ANSI colour escape codes and is preceded by a '-----' line
    for every sentence.
    """
    for seed, sentence in zip(seeds, sentences):
        print('-'*5)
        seed_part = sentence[0:len(seed)]
        # Everything after the seed.  The slice previously ended at -1, which
        # silently dropped the last generated character.
        generated_part = sentence[len(seed):]
        sys.stdout.write('\x1b[32m' + seed_part)       # green: seed
        sys.stdout.write('\x1b[34m' + generated_part)  # blue: generated text
        sys.stdout.write('\x1b[m' + '\n')              # reset colours
        sys.stdout.flush()
        
def pick_sentences(n, maxlen):
    """Return n substrings of `maxlen` characters taken at random offsets of `text`.

    Offsets are drawn with np.random.randint from 0 (inclusive) up to
    len(text) - maxlen - 1 (exclusive).
    """
    upper_bound = len(text) - maxlen - 1
    offsets = np.random.randint(upper_bound, size=(1, n)).ravel().tolist()
    return [text[offset:offset + maxlen] for offset in offsets]

In [11]:
# Pick 3 seeds once, up front, so the previews printed after each round of training
# all continue the same text and can be compared with one another.
preview_seeds = pick_sentences(3, maxlen=40)

# Alternate between a round of training and a preview of what the model generates.
for iteration in range(1, 10):
    print('\n' + '-' * 50)
    print('Iteration', iteration)
    generator_model.fit(
        x, y,
        batch_size=1024,
        epochs=4,
    )

    generated_sentences = generate_sentence_list(preview_seeds)
    print_sentences(preview_seeds, generated_sentences)


--------------------------------------------------
Iteration 1
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
generating sentences: [####################] 100.0% - done
-----
[32my in pain, in emotion especially (whence[34m and a more and the contion of the resintion of the revery and the revertion of the sore the sore to the resintion of the some his the sore the sore in the some the sore and and the sour the sounter and in the restrest of the resintion of the resertion of the some the regred and the sore the existion of the sour the concress of the sour for the revertion of the restrals to shile and and and in th[m
-----
[32mording to the
same principle; it may be [34ma tore and the wath of the artion of the intored and and and became the regere to the some to the sore the contions of the regines and the astion of the rogred the exprestion the sour the becour and and and the resertion of the regerse and reare the sure of the resertion of the will the souther and and and and and and as

-----
[32mording to the
same principle; it may be [34messential to an all the same the soul of the subjection of the same probably the same to be the problem of man in the present of the subjection of the experience of the experience of the spirit and the experience of the same the subjection of the same the state of the ears of the world of the experience of the same more and assumed to see in all the some of the supposed to an all the same to be t[m
-----
[32mm which we
ourselves are--or more correc[34mtion of the superstition of the state of the superstition of the spirit of the experience of the souls and sense of the same problem a sertion of the same problem of the supposed the same the world of the supposed to and the experience of the superstition of the exception of the belief in the same the same that it is the experience of the same person simple as the experiences of the subjection of[m

--------------------------------------------------
Iteration 6
Epoch 1/4
Epoch 2/

In [12]:
# For a more complete inspection, print out a load of sentences:
#
num_sentences = 100             # how many to generate
sentence_length = 400            # 100--400 is good
sample_temperature = 0.37     # see discussion of temperature up near the top

# Draw the seeds with the shared helper rather than repeating its body inline
# (it makes the same np.random.randint draw over the corpus).
preview_seeds = pick_sentences(num_sentences, maxlen)

generated_sentences = generate_sentence_list(preview_seeds, length=sentence_length, temperature=sample_temperature)
print_sentences(preview_seeds, generated_sentences)

generating sentences: [####################] 100.0% - done
-----
[32me canon of truth of eternal popular sens[34me and desire to the highest and soul and the great standard and he dive the thing and the conscience has all the realish of his equaling for the same probabily become strength which seems to be the god that is not a scientific men of
it has not the logical and his destrusting the distrous distrasted is a fold something that is almost and the spirit of the procociest for the day and consequently o[m
-----
[32mth your higher aims. you had to
grasp th[34mem as i way the great standard and in the same prove compared him even in the
stronger is a man and not a stronger and development of the respossible the same any
domain of the proudure in the same deserve to the most spiritual and his moral of the highest and distrest and sensually and so from the
same is a man from a strong as a great drave and the considered all the
compusionery of the spirit [m
-----
[32mer child wha

-----
[32m vanity wants him
only "to suffer with h[34mas all the truth in the fisster in the
spiritual and the distrest and sense of the screeth and does not dereverence and frantly and the conscience of the conscience has all the
state of the same person senticule to the emotions of the desire to god a concealed, his own
laste and present that we have religion to the conscience has always be the god in the require self-examile and strongly the worl[m
-----
[32me. the spectacle of the tartuffery of ol[34md and love of the suderntion to the lightral conscience that is a certain could and
process of the spirit that his every individuals and senses and interpretent and the superstitional is a both of the respect to the
most same people to the conscience has not to him that you has in the conscience is the conscience for the the conscience himself and destrusting that is always to any distrast to the[m
-----
[32mt depth and significance: in the
former [34mand strongest and characte

-----
[32m of
the living being as a primary organi[34mc to the considered and honoured the the
individual and his own with the rest of the conscience has all the far as it is all the state of the first the world of the
had his world and ready the most self-stron an action to his own freedom that it is really or an action in the rather in the delight in the possible the greatest in the does not to see and in the
suffering and not only the religious s[m
-----
[32mmpo of the
assimilation of its nutriment[34m this constituted to the greater had and however, and the self-der very the necessity, and as it
way the spirit that hat it is the bring and sense of all the considered and account is a serveted and sentumest in the conscience has been something
lever and consequently of the respected in the really who
would for his even to a still and not only the longers of the subtlers who is more readined and[m
-----
[32mation, and will-renunciation, both
sympt[34moms of a conscience that i

-----
[32mge but the welfare of human society is
p[34mleasure--and his deepest and not only an any his to and have a spectingly the conscience have
receributly and not only the happiness in the growa self man of
the conscience for the day with the hardle and sense of an action to any other for the
religion in the subject of the rest of the strength of the speak of an action and such a still
every present distrust and such a long him even the greates[m
-----
[32main and again made a sort of chamber
mus[34mt be present and consequently the soul of the subject works who has also a which a personal of the god and not say the desire which is in reality that not the fact to the contrary, the most soul and the logical intellect and the philosophy, and for the
conscience has been probably the power, of the truth in this motives itself which his power of the
dissinctions and states which has also the rest[m
-----
[32mh suitable auxiliaries who will shorten [34min the former and science 

-----
[32menerate and go
to ruin, to acquire quali[34mty--it is a profound himself and domain of
strongest and destruction of the pride of the desire the
delight in the fact that the light in the spectators and not a stronger and restentive man in the spectarn of the
does not nature is a man in the delight in the confection of the spection of the explesible the the
individual in the among gratitude. the strength of the respect the heart of the consc[m
-----
[32mn forces the growth of love too much, so[34m from any stards himself such as the universelves the realing standard of the
present that it is the most soul and sensious soul and the most are spirit to his destrusting that is the good and self-der
very fear and conscience has been something that it is always been such a soul and conscience that it is a served stonce an artisticape of all the conscience is the extent that he has the world, ha[m
-----
[32mloppe le corps."

143. our vanity would [34ma "great resplining in the

-----
[32ms
should now be of more consequence to u[34ms the greatest any of the greatest and distrest and such a self
sentumest one of the will in the worst and art of present that it is a man of the
dangerous sense, and "the development and sense of the subjection to the greater in the moral of the conscience has been the truth is a man in the spirit of the greatest in the fact that he will not a rilific can be a man of the distinction to the dange[m
-----
[32mis grateful in any mood,
it almost sets [34mand distrust of the conscience has a religious action to the hame of the rests conscience and hered any self-contrated
self-day looks with the world of a life his too to the good and states of the demose that is the and not and does not the conscience that it is a sympathe and his even in the
proses with the strength of the desire for the does not be an ancient still
uncertain crude and conscienc[m
-----
[32m
merrier and more comfortable time of it[34m were desire
feets in the 

In [13]:
# Checkpoint the generator to disk so the model can be downloaded and re-uploaded
# (or added to git) later.
generator_checkpoint_path = './generator_model.h5'
save_model(generator_model, generator_checkpoint_path)

In [14]:
# Generating the training fake sentences for the Discriminator network
#
# These are saved to the file 'fake.pkl' -- you could download this to your
# user drive and re-upload it in a subsequent session, to save regenerating
# it again (in which case you don't need to evaluate this cell).

training_seeds = pick_sentences(3000, maxlen=40)
training_generated_sentences = generate_sentence_list(training_seeds, length=40)
# Strip out the initial 40 chars (the seed sequence, which is genuine data from the corpus),
# keeping only the 40 generated characters that follow.
training_generated_sentences = [sentence[40:40+40] for sentence in training_generated_sentences]

# Two objects are pickled back to back: first the seeds, then the generated sentences.
# The `with` block closes the file even if one of the dumps fails.
with open('fake.pkl', 'wb') as output:
    pickle.dump(training_seeds, output)
    pickle.dump(training_generated_sentences, output)

generating sentences: [####################] 100.0% - done


In [15]:
# Load the training set from the file.  It holds two pickled objects, read back in
# the order they were written: the seeds, then the generated sentences.
# NOTE: unpickling can execute arbitrary code -- only load a fake.pkl that you
# produced yourself.
with open('fake.pkl', 'rb') as pkl_file:
    training_seeds = pickle.load(pkl_file)
    training_generated_sentences = pickle.load(pkl_file)

In [16]:
# Assemble a 50:50 dataset for the discriminator: the 'fake' (generated) sentences
# come first, followed by the same number of genuine sentences from the corpus.
num_generated = len(training_generated_sentences)
training_real_sentences = pick_sentences(num_generated, maxlen=40)
all_training_sentences = training_generated_sentences + training_real_sentences
n = len(all_training_sentences)

# NOTE: x and y are rebound here, replacing the generator's training arrays of the
# same names.
x = np.zeros((n, 40, len(chars)))
for row, sentence in enumerate(all_training_sentences):
    x[row] = onehot_encode(sentence, maxlen=40)

# Labels: 0 for generated sentences, 1 for real ones (every index from num_generated on).
y = np.zeros((n, 1))
y[num_generated:] = 1


In [122]:
from keras.layers import Dropout
# The unused `from keras.optimizers import adam` was dropped: nothing in this cell
# references it (the optimizer is selected by name in compile() below), and the
# lowercase alias does not exist in newer Keras releases.

print('Build model...')


# Discriminator: an LSTM reads the one-hot encoded sentence, then a small stack of
# dense layers narrows down to a single sigmoid unit (0 = generated, 1 = real,
# matching the labels built for training).
discriminator_model = Sequential()
discriminator_model.add(LSTM(256, input_shape=(maxlen, len(chars))))
discriminator_model.add(Activation ('relu'))
discriminator_model.add(Dropout(0.2))
discriminator_model.add(Dense(128, activation='relu'))
discriminator_model.add(Dense(64, activation='relu'))
discriminator_model.add(Dense(32, activation='relu'))
discriminator_model.add(Dropout(0.2))

discriminator_model.add(Dense(1, activation='sigmoid'))


# Setup the optimisation strategy: binary cross-entropy for the two-class target,
# reporting accuracy during training.
discriminator_model.compile(optimizer='NAdam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

print('compiled.')
discriminator_model.summary()


Build model...
compiled.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_48 (LSTM)               (None, 256)               321536    
_________________________________________________________________
activation_45 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_38 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_146 (Dense)            (None, 128)               32896     
_________________________________________________________________
dense_147 (Dense)            (None, 64)                8256      
_________________________________________________________________
dense_148 (Dense)            (None, 32)                2080      
_________________________________________________________________
dropout_39 (Dropout)         (None, 32)            

In [None]:

# Hold out 33% of the labelled sentences for validation; random_state pins the split
# so that it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.33,
    random_state=42,
)
discriminator_model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=20,
    batch_size=256,
)

Train on 4020 samples, validate on 1980 samples
Epoch 1/20
 256/4020 [>.............................] - ETA: 1:33 - loss: 0.6929 - acc: 0.5234

# Once you're happy with your discriminator model, evaluate this cell to write it
# to disk:

discriminator_checkpoint_path = './discriminator_model.h5'
save_model(discriminator_model, discriminator_checkpoint_path)


# Run these commands in the terminal to submit your model for assessment.
# git add lab-07/discriminator_model.h5
# git commit -m "Add/update discriminator model."
# git push
# submit-lab 7

In [108]:
# Writes the discriminator to the same path as the save cell above, overwriting it.
discriminator_checkpoint_path = './discriminator_model.h5'
save_model(discriminator_model, discriminator_checkpoint_path)
