# Name Generator

In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, TimeDistributed, Dense, Masking
from tensorflow.keras.models import load_model

In [2]:
# Directories used by the notebook, relative to the notebook's location.
DATA_PATH = '../data'
OUTPUT_PATH = '../output'

# Derived from DATA_PATH so that changing the data directory only needs one edit.
MALE_NAMES_FILEPATH = os.path.join(DATA_PATH, 'male_names.csv')

### Auxiliary Elements

Create mappings **map_char_to_int** and **map_int_to_char**, which map a character to its integer representation and vice versa.

In [3]:
# Vocabulary: the 26 lowercase ASCII letters followed by the extra characters
# (space, accented letters, 'ñ', 'ç', dot and apostrophe).
standard_chars = [chr(code) for code in range(ord('a'), ord('z') + 1)]
special_chars = [' ', 'à', 'á', 'è', 'é', 'í', 'ò', 'ó', 'ú', 'ñ', 'ç', '.', "'"]
chars = standard_chars + special_chars

# Integer codes start at 1; 0 is never assigned to a character.
seq = list(range(1, len(chars) + 1))
map_char_to_int = {char: code for char, code in zip(chars, seq)}
map_int_to_char = {code: char for char, code in zip(chars, seq)}

In [4]:
def one_hot_encoding(encoded_array, features):
    """One-hot encode a 1-D sequence of 1-based integer codes.

    Parameters
    ----------
    encoded_array : sequence of int
        Integer codes in the range ``1..features``.
    features : int
        Width of each one-hot row.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(encoded_array), features)`` and dtype ``int8``
        where row ``i`` has a single 1 at column ``encoded_array[i] - 1``.

    Raises
    ------
    IndexError
        If any code falls outside ``1..features``. Without this check a code
        of 0 would be turned into column index -1 and silently set the last
        feature.
    """
    codes = np.asarray(encoded_array, dtype = np.intp)
    if codes.size and (codes.min() < 1 or codes.max() > features):
        raise IndexError(f"codes must be in the range 1..{features}")

    n = len(codes)
    results = np.zeros((n, features), dtype = 'int8')
    # Codes are 1-based, columns are 0-based.
    results[np.arange(n), codes - 1] = 1
    return results

In [5]:
def one_hot_decoding(decoded_array):
    """Turn rows of one-hot vectors (or scores) back into 1-based integer codes.

    Parameters
    ----------
    decoded_array : array-like, shape (timesteps, n)
        One row per timestep; the position of the largest value in a row
        determines its code.

    Returns
    -------
    numpy.ndarray
        1-D ``int32`` array of length ``timesteps`` holding ``argmax + 1``
        for every row. Note that an all-zero row therefore decodes to 1,
        because argmax returns the first position on ties.
    """
    decoded_array = np.asarray(decoded_array)
    # Unpacking the shape also enforces that the input is 2-D.
    n_rows, _ = decoded_array.shape
    if n_rows == 0:
        return np.zeros(0, dtype = 'int32')
    return (np.argmax(decoded_array, axis = 1) + 1).astype('int32')

In [6]:
def encode_word_to_int(word, mapping):
    """Translate every character of ``word`` into its integer code.

    ``mapping`` is looked up with each character, so a character that is not
    a key of ``mapping`` raises ``KeyError``. The result is a 1-D ``int32``
    array with one entry per character.
    """
    codes = (mapping[char] for char in word)
    return np.fromiter(codes, dtype = 'int32', count = len(word))

In [7]:
def decode_int_to_word(word_int, mapping):
    """Build a string from a sequence of integer codes.

    Parameters
    ----------
    word_int : iterable of int
        Integer codes to translate.
    mapping : dict
        Maps an integer code to its character.

    Returns
    -------
    str
        The concatenated characters; every code that is not a key of
        ``mapping`` contributes the literal text ``'UNK'``.
    """
    # join avoids rebuilding the string on every iteration.
    return ''.join(mapping.get(code, 'UNK') for code in word_int)

In [8]:
def encode_list(array, mapping):
    """One-hot encode every word in ``array``.

    Each word is first converted to integer codes with ``encode_word_to_int``
    and then expanded by ``one_hot_encoding`` using ``len(mapping)`` features.
    Returns a list with one encoded array per word, in input order.
    """
    return [
        one_hot_encoding(encode_word_to_int(word, mapping), len(mapping))
        for word in array
    ]

In [9]:
def decode_list(array, mapping):
    """Decode every one-hot encoded word in ``array`` back into a string.

    Each element goes through ``one_hot_decoding`` and the resulting integer
    codes are translated with ``decode_int_to_word``. Returns a list of
    strings in input order.
    """
    return [
        decode_int_to_word(one_hot_decoding(word_encoded), mapping)
        for word_encoded in array
    ]

### Load Data

Load names and store them in **male_names_data**

In [10]:
# Read the ';'-separated CSV (',' as decimal mark) and extract the 'Nombre' column as a list.
male_names_raw = pd.read_csv(
    MALE_NAMES_FILEPATH,
    sep = ';',
    decimal = ',',
)
male_names_data = male_names_raw.loc[:, 'Nombre'].tolist()

Store the parameters of the model:  
`m:` number of samples  
`n:` number of features  
`timesteps:` length of the input vector. Since names have different lengths, we will have to pad them.

In [11]:
# m: number of samples, n: number of features (one per character in the vocabulary).
m, n = len(male_names_data), len(map_char_to_int)
# Length of the longest name; the labels drop the first character and gain a dot,
# so they have the same length as the inputs.
timesteps = max(len(name) for name in male_names_data)

Transform everything to **lowercase**.

In [12]:
# The vocabulary only contains lowercase characters, so normalise every name.
male_names_data = [name.lower() for name in male_names_data]

Create matrices $X$ and $y$

In [13]:
# Inputs are the names themselves; each label is the name without its first
# character followed by a '.' that marks the end of the name.
X_male = male_names_data
y_male = [f"{name[1:]}." for name in X_male]

Each letter will be encoded as an integer, which in turn, will be one-hot encoded. For example:  

$$ carlos \rightarrow [2, 0, 17, 11, 14, 18] \rightarrow [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [...], ..., [...]]$$



In [14]:
# For every name keep both representations:
#   X_male_int     -> list of integer codes (plain Python lists)
#   X_male_encoded -> 2D one-hot array built from those codes
X_male_int = []
X_male_encoded = []
for name in X_male:
    char_codes = [map_char_to_int[char] for char in name]
    X_male_int.append(char_codes)
    X_male_encoded.append(one_hot_encoding(char_codes, n))

In [15]:
# Display both representations of the first sample
first_name_int = X_male_int[0]
first_name_encoded = X_male_encoded[0]
print(f"Name Antonio mapped to integer:\n{first_name_int}")
print(f"\nPrevious integers one-hot encoded:\n{first_name_encoded}")

Name Antonio mapped to integer:
[1, 14, 20, 15, 14, 9, 15]

Previous integers one-hot encoded:
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]]


As for the **labels**, they are the same as the inputs shifted one character forward, with a final end-of-name character (`.`) appended.

In [16]:
# Labels: drop the first character of every input and append the stop character '.',
# both for the integer lists and for the one-hot arrays.
stop_code = map_char_to_int['.']
dot_encoded = one_hot_encoding([stop_code], n)
y_male_int = [codes[1:] + [stop_code] for codes in X_male_int]
y_male_encoded = [
    np.concatenate((encoded[1:], dot_encoded), axis = 0)
    for encoded in X_male_encoded
]

Let's add some **padding**

In [17]:
# NOTE(review): `padding` is not referenced by any other cell visible in this notebook.
padding = np.zeros(n, dtype = 'int')

# Bring every input to exactly `timesteps` rows: shorter names get all-zero rows
# appended, longer ones would be cut at the end.
X_male_padded = tf.keras.preprocessing.sequence.pad_sequences(
    X_male_encoded,
    maxlen = timesteps,
    padding = 'post',
    truncating = 'post',
    value = 0,
)

In [18]:
# Same post-padding / post-truncation to `timesteps` rows, applied to the labels.
y_male_padded = tf.keras.preprocessing.sequence.pad_sequences(
    y_male_encoded,
    maxlen = timesteps,
    padding = 'post',
    truncating = 'post',
    value = 0,
)

In [19]:
# The padded arrays are fed to the model as they are.
X_male_input, y_male_input = X_male_padded, y_male_padded

### Model

In [20]:
# Masking skips timesteps whose features are all 0 (the padding rows), the recurrent
# layer returns its output at every timestep, and the time-distributed softmax
# produces one distribution over the n characters per timestep.
model = tf.keras.Sequential([
    Masking(input_shape = (None, n), mask_value = 0),
    SimpleRNN(units = 100, return_sequences = True, activation = 'tanh'),
    TimeDistributed(Dense(units = n, activation = 'softmax')),
])

In [21]:
# Configure training (Adam, categorical cross-entropy, accuracy metric) and show the layers.
model.compile(
    optimizer = 'Adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (None, None, 39)          0         
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, None, 100)         14000     
_________________________________________________________________
time_distributed (TimeDistri (None, None, 39)          3939      
Total params: 17,939
Trainable params: 17,939
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Train for 60 epochs on the padded inputs and labels.
# To reuse a previously saved model instead of training, replace this cell's body with:
#     model = load_model(os.path.join(OUTPUT_PATH, 'model.h5'))
history = model.fit(
    x = X_male_input,
    y = y_male_input,
    epochs = 60,
)

Train on 5000 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [23]:
# Make sure the output directory exists first; saving fails if it is missing.
os.makedirs(OUTPUT_PATH, exist_ok = True)
model.save(os.path.join(OUTPUT_PATH, 'model.h5'))

In [24]:
# Loss and accuracy on the data the model was trained on (there is no held-out set here).
model.evaluate(x = X_male_input, y = y_male_input)



[0.42711880807876584, 0.7236106]

### Performance

In [25]:
# Per-timestep character distributions for every name, turned back into strings
# by taking the most likely character at each position.
preds = model.predict(X_male_input)
preds_as_words = decode_list(preds, map_int_to_char)
preds_decoded = np.array(preds_as_words)

In [26]:
# Side-by-side table: decoded prediction next to the expected label string.
comparison_rows = list(zip(preds_decoded, y_male))
wrong_correct_df = pd.DataFrame(comparison_rows, columns = ["Prediction", "Original"])
wrong_correct_df

Unnamed: 0,Prediction,Original
0,ntonio,ntonio.
1,ose,ose.
2,aruel,anuel.
3,rancisco,rancisco.
4,avid,avid.
...,...,...
4995,rlerico jarlos.......,ederico carlos.
4996,rlnando alresto......,ernando augusto.
4997,urardo jrancisco.....,erardo francisco.
4998,ertai................,ossam.


In [27]:
# One boolean per label character (padding positions are never visited):
#   mask_correct_w_stop  -> prediction matches the label at that position
#   mask_correct_wo_stop -> same, except that every '.' in the label counts as correct
mask_correct_w_stop = []
mask_correct_wo_stop = []
for idx in range(m):
    pred, target = preds_decoded[idx], y_male[idx]
    for pos, char in enumerate(target):
        is_hit = pred[pos] == char
        mask_correct_w_stop.append(is_hit)
        mask_correct_wo_stop.append(True if char == '.' else is_hit)

In [28]:
# Get words predicted correctly as a whole

mask_correct_words = np.zeros(m, dtype = bool)
for i in range(m):
    pred = preds_decoded[i]
    orig = y_male[i][:-1]  # label without its trailing stop character
    # Predictions are aligned position by position with the labels, so a word is
    # correct only when the prediction starts with it. A substring test (`in`) would
    # also accept matches at other offsets, and always accepts an empty `orig`.
    mask_correct_words[i] = pred.startswith(orig)

correct_preds = np.array(X_male)[mask_correct_words]
correct_preds

array(['antonio', 'jose', 'francisco', 'david', 'carlos', 'rafael',
       'pablo', 'luis', 'oscar', 'santiago', 'eduardo', 'victor',
       'marcos', 'guillermo', 'marc', 'tomas', 'hector', 'xavier',
       'isaac', 'bernardo', 'marco', 'antoni', 'kevin', 'eduard', 'anton',
       'xavi', 'guillermo jose', 'fran', 'eduardo antonio', 'santi',
       'bernardo jose', 'jose alexis', 'rafael alejandro', 'carlo',
       'rafa', 'tomas antonio', 'santiago antonio', 'david alexander',
       'francis', 'hector antonio', 'franc', 'kevin alejandro',
       'carlos alexis', 'francisco alexis', 'isaac jose', 'bernard',
       'quirino', 'antonio alexis', 'tom', 'luis alexis', 'edu',
       'victor alexander'], dtype='<U21')

In [29]:
# Accuracy: share of label characters (stop character included) predicted correctly
n_correct_chars = sum(mask_correct_w_stop)
n_correct_chars / len(mask_correct_w_stop)

0.7236106002826226

### Prediction
Randomly predict names

In [119]:
# Text generation: sample one character at a time from the model's output
# distribution and feed it back in, until the stop character '.' is drawn.
max_generated_chars = 100  # safety cap so the loop ends even if '.' is never sampled

# Seed input: a single all-zero timestep (the value the Masking layer is set to skip).
x = np.zeros((1, 1, n))
word = ''
c = '-'  # any value other than '.', so the loop body runs at least once
while c != '.' and len(word) < max_generated_chars:
    y = model.predict(x)
    # Distribution for the next character = output at the last timestep.
    # np.random.choice validates that p sums to 1 in float64, which the model's
    # lower-precision output may miss by rounding, so cast and renormalise.
    y_n = y[0, -1, :].astype('float64')
    y_n /= y_n.sum()
    y_n_hat = np.random.choice(range(1, n + 1), p = y_n)  # Change this if n changes
    y_n_hat_encoded = np.reshape(one_hot_encoding([y_n_hat], n), (1, 1, -1))

    c = map_int_to_char[y_n_hat]
    word += c
    # Append the sampled character so the next prediction sees the whole prefix.
    x = np.concatenate((x, y_n_hat_encoded), axis = 1)
print(word)

òuritz.
