## Author: Samuel Hickey
### Assignment 5 - Recurrent Neural Network

### <center>Problem Statement 1</center>

<center>Build a sequential model to classify names into gender. </center>
<center>Input to the model will be a name, i.e. a sequence of characters. </center>
<center>Use one hot representation of the characters. </center>
<center>Remove non-ascii characters, if there are any</center>

<center>Show the effect of the following on the accuracy:</center>
<center>RNN Cells: SimpleRNN, LSTM, and GRU</center>
<center>Dataset size: 25%, 50%, 75%, and 100% of the data (.8 to .2 split)</center>

In [1]:
from keras.layers import Dense, GRU, LSTM, SimpleRNN, Bidirectional

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings
warnings.simplefilter(action='ignore')

PATH = r"C:\Users\samue\Documents\Applied Data Science\INFO-H518 Deep Learning\Assignments\A5"

In [54]:
# Read in the data and shuffle it
data = pd.read_csv(PATH+r"\data\name_gender.csv").dropna().sample(frac=1.0)
max_len = data['name'].map(lambda x: len(x)).max()
EPOCHS = 15
VERBOSE = 1
optimizer = tf.keras.optimizers.Adam(clipvalue=.3)

data['gender'] = data.gender.map({'M': 0, 'F': 1})
data['M'] = data.gender.map({0:1, 1:0})
data['F'] = data.gender.map({0:0, 1:1})

# Remove non-ascii characters from names
names = data['name'].replace({r'[^\x00-\x7F]+':''}, regex=True)
name_chars = tf.strings.unicode_split(names.to_numpy(str), 'UTF-8')

# Create the model's vocabulary
letters = []
for i, name in names.iteritems():
    letters = list(set(list(set(letters)) + list(set(name))))
o_letters = letters

In [58]:
ids_from_chars = keras.layers.StringLookup(vocabulary=letters, mask_token=None)
ids_from_chars_o = ids_from_chars
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

# # Pad each name, new shape: (1 x max_len) 
# x, y = [], []
# for i, row in data.iterrows():
#     tmp = (max_len - len(row['name'])) * '0' + row['name']
#     x.append(tmp)
#     y.append([row['M']*row['probability'], row['F']*row['probability']])

# # One Hot Encode the padded names
# _x = np.zeros((len(x), max_len, len(letters)), dtype=bool)
# _y = np.zeros((len(x), 2), dtype=bool)
# for i, seq in enumerate(x):
#     for j, c in enumerate(seq):
#         if c != '0':
#             _x[i, j, ids_from_chars(c).numpy()-1] = 1
#     _y[i] = y[i]
#     if i % 1000 == 0: print(i)

# np.save(PATH+r'\data\x_names_w_single_class_shuffled', _x)
# np.save(PATH+r'\data\y_names_w_single_class_shuffled', _y)

train_x = np.load(PATH+r'\data\x_names_w_single_class_shuffled.npy')
train_y = np.load(PATH+r'\data\y_names_w_single_class_shuffled.npy')

In [4]:
def split(x, y, split):
    index = round(x.shape[0]*split)

    x_, y_ = x[:index].astype('float32'), y[:index].astype('float32')

    index = round(x_.shape[0]*.8)

    train_x_, train_y_ = x_[:index], y_[:index]
    test_x_, test_y_ = x_[index:], y_[index:]
    return (train_x_, train_y_, test_x_, test_y_)

### Dataset Size: 25%

In [5]:
train_x_, train_y_, test_x_, test_y_ = split(train_x, train_y, .25)
print(train_x_.shape, train_y_.shape)

(19005, 15, 52) (19005, 2)


In [6]:
# s_25 = keras.models.Sequential([
#     Bidirectional(SimpleRNN(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# s_25.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# s_25.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# s_25.save(PATH+r'\models\srnn\twenty_five')

In [7]:
s_25 = keras.models.load_model(PATH+r'\models\srnn\twenty_five', custom_objects={'optimizer':optimizer})
s_25.evaluate(test_x_, test_y_)



[0.3454729914665222, 0.8615028262138367]

In [8]:
# lstm_25 = keras.models.Sequential([
#     Bidirectional(LSTM(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# lstm_25.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# lstm_25.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# lstm_25.save(PATH+r'\models\lstm\twenty_five')

In [9]:
lstm_25 = keras.models.load_model(PATH+r'\models\lstm\twenty_five', custom_objects={'optimizer':optimizer})
lstm_25.evaluate(test_x_, test_y_)



[0.32392430305480957, 0.8695011734962463]

In [10]:
# gru_25 = keras.models.Sequential([
#     Bidirectional(GRU(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# gru_25.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# gru_25.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# gru_25.save(PATH+r'\models\gru\twenty_five')

In [11]:
gru_25 = keras.models.load_model(PATH+r'\models\gru\twenty_five', custom_objects={'optimizer':optimizer})
gru_25.evaluate(test_x_, test_y_)



[0.3261739909648895, 0.8724479079246521]

### Dataset Size: 50%

In [12]:
train_x_, train_y_, test_x_, test_y_ = split(train_x, train_y, .5)
train_x_.shape

(38010, 15, 52)

In [13]:
# s_50 = keras.models.Sequential([
#     Bidirectional(SimpleRNN(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# s_50.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# s_50.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# s_50.save(PATH+r'\models\srnn\fifty')

In [14]:
s_50 = keras.models.load_model(PATH+r'\models\srnn\fifty', custom_objects={'optimizer':optimizer})
s_50.evaluate(test_x_, test_y_)



[0.3296985924243927, 0.8639233708381653]

In [15]:
# lstm_50 = keras.models.Sequential([
#     Bidirectional(LSTM(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# lstm_50.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# lstm_50.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# lstm_50.save(PATH+r'\models\lstm\fifty')

In [16]:
lstm_50 = keras.models.load_model(PATH+r'\models\lstm\fifty', custom_objects={'optimizer':optimizer})
lstm_50.evaluate(test_x_, test_y_)



[0.2992900311946869, 0.8801304697990417]

In [17]:
# gru_50 = keras.models.Sequential([
#     Bidirectional(GRU(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# gru_50.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# gru_50.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# gru_50.save(PATH+r'\models\gru\fifty')

In [18]:
gru_50 = keras.models.load_model(PATH+r'\models\gru\fifty', custom_objects={'optimizer':optimizer})
gru_50.evaluate(test_x_, test_y_)



[0.30372515320777893, 0.8831824660301208]

### Dataset Size: 75%

In [19]:
train_x_, train_y_, test_x_, test_y_ = split(train_x, train_y, .75)
train_x_.shape

(57015, 15, 52)

In [20]:
# s_75 = keras.models.Sequential([
#     Bidirectional(SimpleRNN(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# s_75.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# s_75.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# s_75.save(PATH+r'\models\srnn\seventy_five')

In [21]:
s_75 = keras.models.load_model(PATH+r'\models\srnn\seventy_five', custom_objects={'optimizer':optimizer})
s_75.evaluate(test_x_, test_y_)



[0.30558502674102783, 0.8774378895759583]

In [22]:
# lstm_75 = keras.models.Sequential([
#     Bidirectional(LSTM(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# lstm_75.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# lstm_75.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# lstm_75.save(PATH+r'\models\lstm\seventy_five')

In [23]:
lstm_75 = keras.models.load_model(PATH+r'\models\lstm\seventy_five', custom_objects={'optimizer':optimizer})
lstm_75.evaluate(test_x_, test_y_)



[0.27294453978538513, 0.8914690613746643]

In [24]:
# gru_75 = keras.models.Sequential([
#     Bidirectional(GRU(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# gru_75.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# gru_75.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# gru_75.save(PATH+r'\models\gru\seventy_five')

In [25]:
gru_75 = keras.models.load_model(PATH+r'\models\gru\seventy_five', custom_objects={'optimizer':optimizer})
gru_75.evaluate(test_x_, test_y_)



[0.28710517287254333, 0.8904868960380554]

### Dataset Size: 100%

In [26]:
train_x_, train_y_, test_x_, test_y_ = split(train_x, train_y, 1.0)
train_x_.shape

(76020, 15, 52)

In [27]:
# s_100 = keras.models.Sequential([
#     Bidirectional(SimpleRNN(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# s_100.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# s_100.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# s_100.save(PATH+r'\models\srnn\all')

In [28]:
s_100 = keras.models.load_model(PATH+r'\models\srnn\all', custom_objects={'optimizer':optimizer})
s_100.evaluate(test_x_, test_y_)



[0.28891894221305847, 0.8811365365982056]

In [29]:
# lstm_100 = keras.models.Sequential([
#     Bidirectional(LSTM(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# lstm_100.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# lstm_100.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# lstm_100.save(PATH+r'\models\lstm\all')

In [30]:
lstm_100 = keras.models.load_model(PATH+r'\models\lstm\all', custom_objects={'optimizer':optimizer})
lstm_100.evaluate(test_x_, test_y_)



[0.256156861782074, 0.8981320858001709]

In [31]:
# gru_100 = keras.models.Sequential([
#     Bidirectional(GRU(64, input_shape=(max_len, len(letters)))),
#     Dense(2, activation='softmax')
# ])
# gru_100.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# gru_100.fit(train_x_, train_y_, epochs=EPOCHS, validation_split=.2, verbose=VERBOSE)
# gru_100.save(PATH+r'\models\gru\all')

In [32]:
gru_100 = keras.models.load_model(PATH+r'\models\gru\all', custom_objects={'optimizer':optimizer})
gru_100.evaluate(test_x_, test_y_)



[0.26202625036239624, 0.8997105956077576]

# Problem 2

Train a language model using these names, generate 100 male and female names, compare the accuracies

In [59]:
# Create padded sequence tensors, each of the size (1 x max_len) for each name
# the associated labels represent the next character in the sequence
x, y = [], []
for word in names:
    tmp = (max_len - len(word)) * '0' + word + '\n'
    x.append(tmp[2:])
    y.append('')
    for i, j in enumerate(word):
        if (i >= len(word) - 1):
            break
        tmp = (max_len - len(word[:-1-i])) * '0' + word[:-1-i].lower()
        x.append(tmp)
        y.append(word[-1-i])

letters = list(set(list(set([i.lower() for i in letters])) + ['\n']))

In [92]:
ids_from_chars = keras.layers.StringLookup(vocabulary=letters, mask_token=None)
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

# # One Hot Encode the sequences
# _x = np.zeros((len(x), max_len, len(letters)), dtype=bool)
# _y = np.zeros((len(x), len(letters)), dtype=bool)
# for i, seq in enumerate(x):
#     for j, c in enumerate(seq):
#         if c != '0':
#             _x[i, j, ids_from_chars(c).numpy()-1] = 1
#     _y[i, ids_from_chars(y[i]).numpy()-1] = 1
#     if i % round(.1*_x.shape[0]) == 0: print()

# np.save(PATH+r'\data\x_names_shuffled_lower_stop', _x)
# np.save(PATH+r'\data\y_names_shuffled_lower_stop', _y)

train_x = np.load(PATH+r'\data\x_names_shuffled_lower_stop.npy')
train_y = np.load(PATH+r'\data\y_names_shuffled_lower_stop.npy')

In [93]:
# lang_mod = keras.models.Sequential([
#     Bidirectional(GRU(64, input_shape=(max_len, len(letters)))),
#     Dense(len(letters), activation='softmax')
# ])
# lang_mod.compile(optimizer='adam', loss='categorical_crossentropy')
# lang_mod.fit(train_x.astype('float32'), train_y.astype('float32'), epochs=15, validation_split=.2, verbose=1)
# lang_mod.save(PATH+r'\models\lang_mod')

In [62]:
lang_mod = keras.models.load_model(PATH+r'\models\lang_mod')

In [80]:
import string

base = [np.random.choice(list(string.ascii_uppercase)) for _ in range(100)]
print(base)
dist = [int(i) for i in np.random.normal(6, 2, 100)]
males, females = [], []
while len(males) < 100 or len(females) < 100:
    for i in base:
        seq = ('{0:0>' + str(max_len)+'}').format(i).lower()
        new_word = i
        name_len = np.random.choice(dist)
        while (len(new_word) < max_len):
            # Vectorize the input of the model.
            x_pred = np.zeros((1, max_len, len(letters)))
            for j, c in enumerate(seq):
                if c != '0' and j < x_pred.shape[1]:
                    x_pred[0, j, ids_from_chars(c)-1] = 1

            # Predict the probabilities of the next char.
            preds = lang_mod.predict(x_pred, verbose=0)[0]
            preds = np.exp(preds) / np.sum(np.exp(preds))

            # Randomly sample from the top ten percent of probabilities 
            top_ten_pct = []
            for _ in range(np.ceil(len(preds)*.05).astype(np.int16)):
                index = np.where(preds == max(preds))[0][0]
                top_ten_pct.append(index)
                preds[index] = 0.0
            next_char = chars_from_ids(np.random.choice(top_ten_pct)+1) \
                .numpy().decode('utf-8')

            if ((next_char == '\n'
                    or next_char in string.ascii_uppercase
                    or len(new_word) > name_len) and len(new_word) > 1):
                break
            else:
                # Append the character
                new_word += next_char.lower()
                # Add pre-padding of zeros to the sequence generated and continue.
                seq = ('{0:0>' + str(max_len) + '}').format(new_word).lower()
        tmp = (max_len - len(new_word)) * '0' + new_word
        new_vec = np.zeros((1, max_len, len(o_letters)))
        for k, c in enumerate(tmp):
            new_vec[0, k, ids_from_chars_o(c).numpy()-1] = 1
        idx = list(gru_100.predict(new_vec, verbose=0)[0])
        if idx.index(max(idx)) == 0 and len(males) < 100:
            males.append(new_word)
        elif idx.index(max(idx)) == 0 and len(females) < 100:
            females.append(new_word)
    print(f"Males: {len(males)} | Females: {len(females)}")

['N', 'M', 'X', 'B', 'G', 'J', 'G', 'T', 'I', 'J', 'V', 'K', 'A', 'V', 'Q', 'W', 'N', 'D', 'B', 'Y', 'M', 'D', 'F', 'R', 'A', 'J', 'L', 'O', 'E', 'E', 'I', 'Y', 'F', 'N', 'J', 'P', 'B', 'E', 'V', 'I', 'H', 'C', 'I', 'U', 'X', 'Q', 'L', 'B', 'C', 'V', 'M', 'T', 'N', 'E', 'K', 'A', 'S', 'X', 'E', 'M', 'L', 'A', 'X', 'V', 'M', 'C', 'K', 'J', 'W', 'S', 'X', 'L', 'W', 'C', 'R', 'J', 'Y', 'D', 'B', 'F', 'R', 'Q', 'P', 'P', 'H', 'F', 'E', 'M', 'N', 'W', 'C', 'J', 'X', 'H', 'E', 'Y', 'Y', 'W', 'B', 'G']
Males: 83 | Females: 0
Males: 100 | Females: 67
Males: 100 | Females: 100


In [82]:
print(f"Males: {males}")
print(f"Females: {females}")

Males: ['Malekai', 'Xilandrasia', 'Gabria', 'Gabria', 'Ilisa', 'Jame', 'Vanella', 'Karille', 'Alisan', 'Qaileney', 'Waldardand', 'Brayl', 'Yasi', 'Malan', 'Demerica', 'Falis', 'Raylee', 'Alexi', 'Latrenish', 'Orlineth', 'Elissiaha', 'Emi', 'Ilandra', 'Yarish', 'Freddic', 'Nathaleerah', 'Jerima', 'Pettya', 'Bertina', 'Ellie', 'Verlien', 'Ilisanneralded', 'Hellena', 'Chanteli', 'Ilis', 'Urellan', 'Xilar', 'Qaarisha', 'Carmelle', 'Vela', 'Margarielan', 'Taneisha', 'Nathaliner', 'Elissande', 'Karlee', 'Alian', 'Salimar', 'Xaivien', 'Ellis', 'Malekain', 'Lasha', 'Annes', 'Xila', 'Mikay', 'Kendrel', 'Jameichaella', 'Win', 'Shelli', 'Lassan', 'Wald', 'Channethean', 'Roselle', 'Jessi', 'Yarelis', 'Darlisah', 'Brannel', 'Freedicheekw', 'Ros', 'Qaatielia', 'Patreli', 'Paritta', 'Helliea', 'Elizethe', 'Nickol', 'Winteriou', 'Carlieg', 'Jessell', 'Xaivienet', 'Helianah', 'Eliza', 'Yosef', 'Wilmanig', 'Brendanne', 'Na', 'Mikeiahia', 'Xiara', 'Genevi', 'Tenishaw', 'Ili', 'Jerem', 'Vanil', 'Kendric',

In [85]:
# MALES
# Pad each name, new shape: (1 x max_len) 
males_x, males_y = [], []
for i, word in enumerate(males):
    tmp = (max_len - len(word)) * '0' + word
    males_x.append(tmp)
    males_y.append([1.0, 0.0])

# One Hot Encode the padded names
m_x = np.zeros((len(males_x), max_len, len(o_letters)), dtype=bool)
m_y = np.zeros((len(males_x), 2), dtype=bool)
for i, seq in enumerate(males_x):
    for j, c in enumerate(seq):
        if c != '0':
            m_x[i, j, ids_from_chars(c).numpy()-1] = 1
    m_y[i] = y[i]

# FEMALES
# Pad each name, new shape: (1 x max_len) 
females_x, females_y = [], []
for i, word in enumerate(females):
    tmp = (max_len - len(word)) * '0' + word
    females_x.append(tmp)
    females_y.append([0.0, 1.0])

# One Hot Encode the padded names
f_x = np.zeros((len(females_x), max_len, len(o_letters)), dtype=bool)
f_y = np.zeros((len(females_x), 2), dtype=bool)
for i, seq in enumerate(females_x):
    for j, c in enumerate(seq):
        if c != '0':
            f_x[i, j, ids_from_chars(c).numpy()-1] = 1
    f_y[i] = y[i]

In [90]:
m_score = lstm_100.evaluate(m_x, m_y)
f_score = lstm_100.evaluate(f_x, f_y)



# Problem #2.a
Create a model trained on only names starting with A, M, or Z, generate 50 names, determine the quality of the names using perplexity

In [96]:
# Create padded sequence tensors, each of the size (1 x max_len) for each name
# the associated labels represent the next character in the sequence
x, y = [], []
for word in names:
    if word[0] in 'AMZ':
        tmp = (max_len - len(word)) * '0' + word + '\n'
        x.append(tmp[2:])
        y.append('')
        for i, j in enumerate(word):
            if (i >= len(word) - 1):
                break
            tmp = (max_len - len(word[:-1-i])) * '0' + word[:-1-i].lower()
            x.append(tmp)
            y.append(word[-1-i])

letters = list(set(list(set([i.lower() for i in letters])) + ['\n']))

In [104]:
ids_from_chars = keras.layers.StringLookup(vocabulary=letters, mask_token=None)
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

# # One Hot Encode the sequences
# _x = np.zeros((len(x), max_len, len(letters)), dtype=bool)
# _y = np.zeros((len(x), len(letters)), dtype=bool)
# for i, seq in enumerate(x):
#     for j, c in enumerate(seq):
#         if c != '0':
#             _x[i, j, ids_from_chars(c).numpy()-1] = 1
#     _y[i, ids_from_chars(y[i]).numpy()-1] = 1
#     if i % round(.1*_x.shape[0]) == 0: print(i)

# np.save(PATH+r'\data\x_names_shuffled_lower_stop_amz', _x)
# np.save(PATH+r'\data\y_names_shuffled_lower_stop_amz', _y)

train_x = np.load(PATH+r'\data\x_names_shuffled_lower_stop_amz.npy')
train_y = np.load(PATH+r'\data\y_names_shuffled_lower_stop_amz.npy')

In [106]:
amz_lang_mod = keras.models.Sequential([
    Bidirectional(GRU(64, input_shape=(max_len, len(letters)))),
    Dense(len(letters), activation='softmax')
])
amz_lang_mod.compile(optimizer='adam', loss='categorical_crossentropy')
amz_lang_mod.fit(train_x.astype('float32'), train_y.astype('float32'), epochs=15, validation_split=.2, verbose=1)
amz_lang_mod.save(PATH+r'\models\amz_lang_mod')

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15




INFO:tensorflow:Assets written to: C:\Users\samue\Documents\Applied Data Science\INFO-H518 Deep Learning\Assignments\A5\models\amz_lang_mod\assets


INFO:tensorflow:Assets written to: C:\Users\samue\Documents\Applied Data Science\INFO-H518 Deep Learning\Assignments\A5\models\amz_lang_mod\assets


In [113]:
base = [np.random.choice(list('AMZ')) for _ in range(50)]
print(base)
dist = [int(i) for i in np.random.normal(6, 2, 100)]
results = []
for i in base:
    seq = ('{0:0>' + str(max_len)+'}').format(i).lower()
    new_word = i
    name_len = np.random.choice(dist)
    while (len(new_word) < max_len):
        # Vectorize the input of the model.
        x_pred = np.zeros((1, max_len, len(letters)))
        for j, c in enumerate(seq):
            if c != '0' and j < x_pred.shape[1]:
                x_pred[0, j, ids_from_chars(c)-1] = 1

        # Predict the probabilities of the next char.
        preds = lang_mod.predict(x_pred, verbose=0)[0]
        preds = np.exp(preds) / np.sum(np.exp(preds))

        # Randomly sample from the top ten percent of probabilities 
        top_ten_pct = []
        for _ in range(np.ceil(len(preds)*.05).astype(np.int16)):
            index = np.where(preds == max(preds))[0][0]
            top_ten_pct.append(index)
            preds[index] = 0.0
        next_char = chars_from_ids(np.random.choice(top_ten_pct)+1) \
            .numpy().decode('utf-8')

        if ((next_char == '\n'
                or next_char in string.ascii_uppercase
                or len(new_word) > name_len) and len(new_word) > 1):
            break
        else:
            # Append the character
            new_word += next_char.lower()
            # Add pre-padding of zeros to the sequence generated and continue.
            seq = ('{0:0>' + str(max_len) + '}').format(new_word).lower()
    results.append(new_word)

['A', 'Z', 'A', 'Z', 'M', 'M', 'M', 'A', 'Z', 'Z', 'A', 'A', 'Z', 'M', 'A', 'A', 'A', 'M', 'M', 'Z', 'A', 'Z', 'Z', 'Z', 'M', 'A', 'M', 'A', 'M', 'M', 'A', 'M', 'M', 'A', 'A', 'M', 'Z', 'M', 'Z', 'M', 'M', 'A', 'Z', 'Z', 'Z', 'Z', 'A', 'Z', 'A', 'Z']


In [129]:
amz_x, amz_y = [], []
for word in results:
    tmp = (max_len - len(word)) * '0' + word + '\n'
    amz_x.append(tmp[2:])
    amz_y.append('')
    for i, j in enumerate(word):
        if (i >= len(word) - 1):
            break
        tmp = (max_len - len(word[:-1-i])) * '0' + word[:-1-i].lower()
        amz_x.append(tmp)
        amz_y.append(word[-1-i])

# One Hot Encode the sequences
x_amz = np.zeros((len(amz_x), max_len, len(letters)), dtype=bool)
y_amz = np.zeros((len(amz_x), len(letters)), dtype=bool)
for i, seq in enumerate(amz_x):
    for j, c in enumerate(seq):
        if c != '0':
            x_amz[i, j, ids_from_chars(c).numpy()-1] = 1
    y_amz[i, ids_from_chars(amz_y[i]).numpy()-1] = 1
    if i % round(.1*x_amz.shape[0]) == 0: print(i)

loss = amz_lang_mod.evaluate(x_amz, y_amz)

0
33
66
99
132
165
198
231
264
297


In [132]:
tf.exp(loss).numpy()

4.21912

Since we are using cross_entropy as our loss, we will take tf.exp(loss) to get the perplexity of our model on the result set. 