In [1]:
import pandas as pd
import numpy as np
import keras
import time
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
import numpy as np
import random
import os

Using TensorFlow backend.


In [2]:
# Settings
step_length = 1
epochs = 100
batch_size = 32
latent_dim = 64
dropout_rate = 0.2
model_path = os.path.realpath('./poke_gen_model.h5')
load_model = False
store_model = True
verbosity = 0
gen_amount = 10

In [3]:
input_path = os.path.realpath('./input/names.txt')

input_names = []
with open(input_path) as f:
    for name in f:
        input_names.append(name.rstrip())

concat_names = '\n'.join(input_names).lower()
chars = sorted(list(set(concat_names)))
num_chars = len(chars)
char2idx = dict((c, i) for i, c in enumerate(chars))
idx2char = dict((i, c) for i, c in enumerate(chars))
max_sequence_length = max([len(name) for name in input_names])

print('Total chars: {}'.format(num_chars))
print('Corpus length:', len(concat_names))
print('Number of names: ', len(input_names))
print('Longest name: ', max_sequence_length)

Total chars: 27
Corpus length: 6734
Number of names:  799
Longest name:  12


In [4]:
sequences = []
next_chars = []

for i in range(0, len(concat_names) - max_sequence_length, step_length):
    sequences.append(concat_names[i: i + max_sequence_length])
    next_chars.append(concat_names[i + max_sequence_length])

num_sequences = len(sequences)

print('Number of sequences:', num_sequences)
print('First 10 sequences and next chars:')
for i in range(10):
    print('[{}] => [{}]'.replace('\n', ' ').format(sequences[i], next_chars[i]).replace('\n', ' '))

Number of sequences: 6722
First 10 sequences and next chars:
[bulbasaur ch] => [i]
[ulbasaur chi] => [k]
[lbasaur chik] => [o]
[basaur chiko] => [r]
[asaur chikor] => [i]
[saur chikori] => [t]
[aur chikorit] => [a]
[ur chikorita] => [ ]
[r chikorita ] => [t]
[ chikorita t] => [r]


In [5]:
X = np.zeros((num_sequences, max_sequence_length, num_chars), dtype=np.bool)
Y = np.zeros((num_sequences, num_chars), dtype=np.bool)

for i, sequence in enumerate(sequences):
    for j, char in enumerate(sequence):
        X[i, j, char2idx[char]] = 1
    Y[i, char2idx[next_chars[i]]] = 1
    
print('X shape: {}'.format(X.shape))
print('Y shape: {}'.format(Y.shape))

X shape: (6722, 12, 27)
Y shape: (6722, 27)


In [6]:
model = Sequential()
model.add(LSTM(latent_dim, 
               input_shape=(max_sequence_length, num_chars),  
               recurrent_dropout=dropout_rate))
model.add(Dense(units=num_chars, activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer)

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 64)                23552     
_________________________________________________________________
dense_1 (Dense)              (None, 27)                1755      
Total params: 25,307
Trainable params: 25,307
Non-trainable params: 0
_________________________________________________________________


In [7]:
if load_model:
    model.load(model_path)
else:
    
    start = time.time()
    print('Start training for {}'.format(epochs))
    history = model.fit(X, Y, epochs=epochs, batch_size=batch_size, verbose=verbosity)
    end = time.time()
    print('Finished training - time elapsed:', (end - start)/60, 'min')
    
if store_model:
    print('Storing model at:', model_path)
    model.save(model_path)

Start training for 100
Instructions for updating:
Use tf.cast instead.
Finished training - time elapsed: 1.9454469521840414 min
Storing model at: C:\Projects\Github\pokemon-name-generator\poke_gen_model.h5


In [12]:
# Start sequence generation from end of the input sequence
sequence = concat_names[-(max_sequence_length - 1):] + '\n'

new_names = []

print('{} new names are being generated'.format(gen_amount))

while len(new_names) < gen_amount:
    
    # Vectorize sequence for prediction
    x = np.zeros((1, max_sequence_length, num_chars))
    for i, char in enumerate(sequence):
        x[0, i, char2idx[char]] = 1

    # Sample next char from predicted probabilities
    probs = model.predict(x, verbose=0)[0]
    probs /= probs.sum()
    next_idx = np.random.choice(len(probs), p=probs)   
    next_char = idx2char[next_idx]   
    sequence = sequence[1:] + next_char

    # New line means we have a new name
    if next_char == '\n':

        gen_name = [name for name in sequence.split('\n')][1]

        # Never start name with two identical chars
        if len(gen_name) > 2 and gen_name[0] == gen_name[1]:
            gen_name = gen_name[1:]

        # Discard all names that are too short
        if len(gen_name) > 2:
            # Only allow new and unique names
            if gen_name not in input_names + new_names:
                new_names.append(gen_name.capitalize())

        if 0 == (len(new_names) % (gen_amount/ 10)):
            print('Generated {} names'.format(len(new_names)))

10 new names are being generated
Generated 1 names
Generated 2 names
Generated 3 names
Generated 4 names
Generated 5 names
Generated 6 names
Generated 7 names
Generated 8 names
Generated 8 names
Generated 8 names
Generated 8 names
Generated 9 names
Generated 10 names


In [13]:
print_first_n = min(10, gen_amount)

print('First {} generated names:'.format(print_first_n))
for i, name in enumerate(new_names[:print_first_n]):
    print(name)

First 10 generated names:
Illileon
Piboe
Solreot
Deeltin
Mare
Caruix
Sanddale
Beavalle
Toulgole
Zangow


In [14]:
concat_output = '\n'.join(new_names)
output_path = os.path.realpath('./output/generated_names.txt')

with open(output_path, 'w') as f:
    f.write(concat_output)