In [0]:
# Mount Google Drive at /content/gdrive; the relative 'gdrive/My Drive/...' paths
# used by later cells point into this mount.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
!ls

gdrive	sample_data


In [0]:
import os
import sys
# Add the project's source directory to the module search path.
# The path is relative, so it is resolved against the current working directory.
source_root_path = 'gdrive/My Drive/Colab/GenerativeDeepLearning/source'
sys.path.append(source_root_path)

In [0]:
import numpy as np
import re
from IPython.display import clear_output

from keras.layers import Dense, LSTM, Input, Embedding, Dropout
from keras.utils import np_utils
from keras.models import Model, load_model
from keras.optimizers import Adam, RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback

Using TensorFlow backend.


In [0]:
# Project directories on the mounted Drive (relative to the working directory).
# NOTE(review): logs_root_path is not referenced in the visible cells — confirm it is still needed.
logs_root_path = 'gdrive/My Drive/Colab/GenerativeDeepLearning/logs'
# Root folder for input data; the Aesop corpus is read from "aesop/data.txt" beneath it.
data_dir = 'gdrive/My Drive/Colab/GenerativeDeepLearning/data'

In [0]:
# Run configuration read by later cells.
load_saved_model = False  # True: load a saved .h5 model instead of building a new one
train_model = True  # True: run model.fit in the training cell
token_type = 'word'  # 'word' -> word-level tokenizer; any other value -> character-level tokenizer

In [0]:
# Load the raw text and trim it down to the stories themselves.

# Context window length; also the number of '|' tokens used to pad the start of a story.
seq_length = 20

filename = os.path.join(data_dir, "aesop/data.txt")

# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if one is present.
with open(filename, encoding='utf-8-sig') as f:
    text = f.read()


# Remove the text before and after the main stories.
start_marker = "THE FOX AND THE GRAPES\n\n\n"
end_marker = "ILLUSTRATIONS\n\n\n["
start = text.find(start_marker)
end = text.find(end_marker)

# str.find returns -1 for a missing marker, which would silently slice the wrong
# span (e.g. text[-1:end]); fail loudly instead so a changed corpus is noticed.
if start == -1:
    raise ValueError('start marker {!r} not found in {}'.format(start_marker, filename))
if end == -1:
    raise ValueError('end marker {!r} not found in {}'.format(end_marker, filename))
if end < start:
    raise ValueError('end marker appears before start marker in {}'.format(filename))

text = text[start:end]
print(len(text))
print(text[0:200])

197444
THE FOX AND THE GRAPES


A hungry Fox saw some fine bunches of Grapes hanging from a vine that
was trained along a high trellis, and did his best to reach them by
jumping as high as he could into the 


In [0]:
# Normalise the raw text into a single stream of space-separated tokens.
# NOTE: this cell rewrites `text` in place, so it is not idempotent — re-run the
# loading cell first, otherwise the start-of-story padding is prepended again.

# A run of seq_length '|' tokens marks the start of a story, so the first real
# word of a story is always preceded by a full window of padding.
start_story = '| ' * seq_length

text = start_story + text
text = text.lower()
# Runs of five newlines (presumably the gap between stories) become the start marker.
text = text.replace('\n\n\n\n\n', start_story)
text = text.replace('\n', ' ')
# Any remaining run of two or more spaces is turned into a sentence break.
text = re.sub('  +', '. ', text).strip()
text = text.replace('..', '.')

# Put spaces around each punctuation character so it becomes its own token.
# Raw strings are used because '\]' and '\s' are invalid escape sequences in an
# ordinary string literal; the regex itself is unchanged. Inside the character
# class, ',-.' is a range that covers ',', '-' and '.'.
text = re.sub(r'([!"#$%&()*+,-./:;<=>?@[\]^_`{|}~])', r' \1 ', text)
# Collapse the extra whitespace introduced above.
text = re.sub(r'\s{2,}', ' ', text)
print(text[0:200])

 | | | | | | | | | | | | | | | | | | | | the fox and the grapes . a hungry fox saw some fine bunches of grapes hanging from a vine that was trained along a high trellis , and did his best to reach the


In [0]:
# Build the tokenizer for the configured granularity. filters='' keeps the
# punctuation tokens that the cleaning step separated out.
if token_type == 'word':
    tokenizer_options = dict(char_level = False, filters = '')
else:
    # Character mode additionally switches off lower-casing.
    tokenizer_options = dict(char_level = True, filters = '', lower = False)

tokenizer = Tokenizer(**tokenizer_options)
tokenizer.fit_on_texts([text])

# The whole corpus encoded as one long list of integer token ids.
token_list = tokenizer.texts_to_sequences([text])[0]

# Number of distinct tokens plus one; used as the embedding/output size.
total_words = len(tokenizer.word_index) + 1

In [0]:
# Show the vocabulary size (len(tokenizer.word_index) + 1).
total_words

4169

In [0]:
def generate_sequences(token_list, step):
    """Cut a token stream into fixed-length windows with next-token targets.

    Reads the module-level ``seq_length`` (window size) and ``total_words``
    (number of one-hot classes).

    Args:
        token_list: sequence of integer token ids.
        step: stride between the starts of consecutive windows.

    Returns:
        Tuple ``(X, y, num_seq)``: ``X`` is a list of ``seq_length``-long
        slices of ``token_list``; ``y`` is the one-hot encoding (from
        ``np_utils.to_categorical``) of the token following each window;
        ``num_seq`` is ``len(X)``.
    """
    # Every start position that still leaves one token after the window.
    window_starts = range(0, len(token_list) - seq_length, step)

    X = [token_list[begin: begin + seq_length] for begin in window_starts]
    next_tokens = [token_list[begin + seq_length] for begin in window_starts]

    y = np_utils.to_categorical(next_tokens, num_classes = total_words)

    num_seq = len(X)
    print('Number of sequences:', num_seq, "\n")

    return X, y, num_seq

# Stride of 1: a training window starts at every token position.
step = 1
# NOTE(review): seq_length is also set to 20 in the loading cell; the two values
# must stay equal because start_story was built from the earlier one.
seq_length = 20

X, y, num_seq = generate_sequences(token_list, step)

# X comes back as a list of windows; convert to a 2-D integer array for Keras.
X = np.array(X)
y = np.array(y)

print('X.shape: {}'.format(X.shape))
print('y.shape: {}'.format(y.shape))

Number of sequences: 50416 

X.shape: (50416, 20)
y.shape: (50416, 4169)


In [0]:
# Either reload a previously saved model or build and compile a fresh one.
if load_saved_model:
    # NOTE(review): this path is relative to the working directory, unlike the
    # gdrive-based paths defined earlier — confirm where saved models live.
    # model = load_model('./saved_models/lstm_aesop_1.h5')
    model = load_model('./saved_models/aesop_dropout_100.h5')

else:

    n_units = 256  # size of the LSTM layer
    embedding_size = 100  # length of each token's embedding vector

    # shape=(None,) leaves the sequence length unspecified.
    text_in = Input(shape = (None,))
    # One embedding_size-long vector per token id in [0, total_words).
    embedding = Embedding(total_words, embedding_size)
    x = embedding(text_in)
    # The summary below reports output shape (None, 256): one vector per sequence.
    x = LSTM(n_units)(x)
    # x = Dropout(0.2)(x)
    # Softmax over the vocabulary: a probability for every possible next token.
    text_out = Dense(total_words, activation = 'softmax')(x)

    model = Model(text_in, text_out)

    # NOTE(review): newer Keras releases name this argument `learning_rate` —
    # confirm against the installed version before changing it.
    opti = RMSprop(lr = 0.001)
    # categorical_crossentropy matches the one-hot targets built by generate_sequences.
    model.compile(loss='categorical_crossentropy', optimizer=opti)

In [0]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 100)         416900    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dense_1 (Dense)              (None, 4169)              1071433   
Total params: 1,853,901
Trainable params: 1,853,901
Non-trainable params: 0
_________________________________________________________________


In [0]:
 temperature=1.0
preds = np.array([0.1, 0.15, 0.1, 0.6])
preds = np.asarray(preds).astype('float64')
print(np.log(preds))
preds = np.log(preds) / temperature
print(preds)
exp_preds = np.exp(preds)
print(exp_preds)
preds = exp_preds / np.sum(exp_preds)
print(preds)
probas = np.random.multinomial(1, preds, 1)
print(probas)

[-2.30258509 -1.89711998 -2.30258509 -0.51082562]
[-2.30258509 -1.89711998 -2.30258509 -0.51082562]
[0.1  0.15 0.1  0.6 ]
[0.10526316 0.15789474 0.10526316 0.63157895]
[[0 1 0 0]]


In [0]:
def sample_with_temp(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The log-probabilities are divided by ``temperature``, exponentiated and
    renormalised, and one index is drawn from the resulting distribution.
    Temperatures below 1 concentrate the draw on the most likely entries;
    temperatures above 1 flatten the distribution.

    Args:
        preds: 1-D array-like of non-negative probabilities (zeros allowed).
        temperature: scaling factor. ``0`` is treated as the greedy limit and
            returns the index of the largest probability without sampling.

    Returns:
        The sampled index (NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Limit of temperature -> 0: all mass on the most probable entry.
        return np.argmax(preds)

    # log(0) is -inf, which correctly maps back to probability 0; silence the warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift so the largest term is exp(0) = 1. Without this, a small temperature
    # makes every exp() underflow to 0 and the normalisation becomes 0/0 = NaN.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)

    # A single multinomial trial yields a one-hot row; argmax recovers its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)



def generate_text(seed_text, next_words, model, max_sequence_len, temp):
    """Generate text from ``model`` one token at a time.

    Uses the module-level ``tokenizer``, ``start_story`` and ``token_type``.

    Args:
        seed_text: text to continue; it is prefixed with ``start_story``
            before being fed to the model.
        next_words: maximum number of tokens to generate.
        model: model whose ``predict`` returns next-token probabilities.
        max_sequence_len: number of most recent tokens used as context.
        temp: sampling temperature passed to ``sample_with_temp``.

    Returns:
        ``seed_text`` followed by the generated tokens. Generation stops early
        if the model produces the ``'|'`` token.
    """
    output_text = seed_text

    seed_text = start_story + seed_text

    # Word tokens are re-joined with spaces. Character tokens must be
    # concatenated directly: a space is itself a token in character mode, so
    # adding one after every character would corrupt the context.
    separator = ' ' if token_type == 'word' else ''

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = token_list[-max_sequence_len:]
        # (1, -1) gives a batch of one and, unlike a fixed (1, max_sequence_len)
        # shape, still works when fewer tokens than the window are available.
        token_list = np.reshape(token_list, (1, -1))

        probs = model.predict(token_list, verbose=0)[0]
        y_class = sample_with_temp(probs, temperature = temp)

        # Index 0 has no entry in tokenizer.index_word; treat it as an empty token.
        if y_class == 0:
            output_word = ''
        else:
            output_word = tokenizer.index_word[y_class]

        # '|' is the start-of-story marker, so it ends the current story.
        if output_word == "|":
            break

        output_text += output_word + separator
        seed_text += output_word + separator

    return output_text

In [0]:
def on_epoch_end(epoch, logs):
    """Print sample generations at several temperatures.

    Takes the ``(epoch, logs)`` arguments supplied to the ``on_epoch_end``
    hook; neither is used. Reads the module-level ``model`` and ``seq_length``.
    """
    seed_text = ""
    gen_words = 500

    # (heading, temperature) pairs, printed in this order.
    samples = (
        ('Temp 0.2', 0.2),
        ('Temp 0.33', 0.33),
        ('Temp 0.5', 0.5),
        ('Temp 1.0', 1),
    )
    for heading, temperature in samples:
        print(heading)
        print (generate_text(seed_text, gen_words, model, seq_length, temp = temperature))
    
    
# Train the model, printing generated samples after every epoch via the callback.
if train_model:
    epochs = 1000
    batch_size = 32
    # NOTE(review): num_batches is not used in the visible cells — confirm it is needed.
    num_batches = int(len(X) / batch_size)
    # Wrap on_epoch_end so Keras calls it at the end of each epoch.
    callback = LambdaCallback(on_epoch_end=on_epoch_end)
    model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks = [callback], shuffle = True)

W0817 06:38:26.107273 140069779732352 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0817 06:38:26.874135 140069779732352 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/1000
Temp 0.2
the lion and the . . a was was a lion . a was , and the , and to the of , and the , and the wolf , and was the of the lion , and the had , and was the was , and the himself , and was the man of the ; and the lion , and was the was , and was the lion , and he , and the lion , and the at , and he , and to the ass , and the was of the lion , and he , he was his ass , and the was , and he , and the ass and the eagle , and was to the . and the was of the lion , and was the was , and the lion , and the came , and was the a lion , and the fox , and the and , and the lion , and he , and the lion , and he , and the up , and , he was to the lion of the , and the lion , and at the lion of the ass , and he , and he , and the of , and he , and the was , and the had , and the lion , and , and said , " i , " you , " you me , " you you , " you you . " you , " you you , " you you you , " i you , " you you you , " you you , " you you , " you you you , " you you you , " i you you , 

KeyboardInterrupt: ignored

In [0]:
# Generate up to gen_words tokens that continue a hand-written seed.
# The seed ends with a space so the first generated token is not glued to it.
seed_text = "the frog and the snake . "
gen_words = 500
# A low temperature concentrates sampling on the most probable tokens.
temp = 0.1

print (generate_text(seed_text, gen_words, model, seq_length, temp))

the frog and the snake . a lion and a lion were on the fox , and , when he had , the ass the lion , and was a man at all . the lion , who was when it was a to be , and they had it out of them 


In [0]:
def generate_human_led_text(model, max_sequence_len, top_k=10):
    """Interactively build a text, showing the model's top suggestions each step.

    On every step the ``top_k`` most probable next tokens are printed with
    their probabilities, then the user types the token to append. Entering
    ``'|'`` ends the session. Uses the module-level ``tokenizer`` and
    ``start_story``.

    Args:
        model: model whose ``predict`` returns next-token probabilities.
        max_sequence_len: number of most recent tokens used as context.
        top_k: how many suggestions to list per step (default 10).

    Returns:
        None; the text built so far is printed after each step.
    """
    output_text = ''
    seed_text = start_story

    while True:
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = token_list[-max_sequence_len:]
        # (1, -1) gives a batch of one and, unlike a fixed (1, max_sequence_len)
        # shape, still works when fewer tokens than the window are available.
        token_list = np.reshape(token_list, (1, -1))

        probs = model.predict(token_list, verbose=0)[0]

        # Indices sorted by descending probability, truncated to the top_k best.
        top_idx = np.argsort(probs)[::-1][:top_k]
        top_probs = [probs[x] for x in top_idx]
        top_words = tokenizer.sequences_to_texts([[x] for x in top_idx])

        for prob, word in zip(top_probs, top_words):
            print('{:<6.1%} : {}'.format(prob, word))

        chosen_word = input()

        # '|' is the start-of-story marker; typing it ends the session.
        if chosen_word == '|':
            break

        seed_text += chosen_word + ' '
        output_text += chosen_word + ' '

        # Replace the previous suggestions with the text built so far.
        clear_output()

        print (output_text)

In [0]:
# Start the interactive session with a 20-token context window; enter '|' to stop.
generate_human_led_text(model, 20)

the wolf 
78.2%  : and
13.5%  : ,
2.4%   : in
0.7%   : .
0.4%   : had
0.4%   : with
0.3%   : -
0.3%   : for
0.2%   : :
0.2%   : as


KeyboardInterrupt: ignored