# Name Generator

In [34]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, TimeDistributed, Dense, Masking
from tensorflow.keras.models import load_model

In [2]:
# Directories are relative to the notebook's working directory.
DATA_PATH = '../data'
OUTPUT_PATH = '../output'

# Derived from DATA_PATH so that relocating the data directory only needs one edit.
MALE_NAMES_FILEPATH = os.path.join(DATA_PATH, 'male_names.csv')

### Auxiliary Elements

Create the mappings **map_char_to_int** and **map_int_to_char**, which map a character to its integer representation and vice versa.

In [3]:
# Supported alphabet: plain lowercase letters followed by the accented letters,
# space and punctuation that may appear in a name.
standard_chars = list('abcdefghijklmnopqrstuvwxyz')
special_chars = [' ', 'à', 'á', 'è', 'é', 'í', 'ò', 'ó', 'ú', 'ñ', 'ç', '.', "'"]
chars = standard_chars + special_chars

# Integer codes start at 1, so code 0 is never assigned to a character.
seq = list(range(1, len(chars) + 1))
map_char_to_int = {char: code for code, char in enumerate(chars, start=1)}
map_int_to_char = {code: char for char, code in map_char_to_int.items()}

In [4]:
def one_hot_encoding(encoded_array, features):
    """One-hot encode a sequence of integer codes.

    Parameters
    ----------
    encoded_array : sequence of int
        Integer codes, one per row of the output.
    features : int
        Width of each one-hot vector; every code must be a valid column index.

    Returns
    -------
    numpy.ndarray
        ``int8`` array of shape ``(len(encoded_array), features)`` with a single
        1 per row, placed at the column given by that row's code.
    """
    one_hot = np.zeros((len(encoded_array), features), dtype='int8')
    for row, hot_column in enumerate(encoded_array):
        one_hot[row, hot_column] = 1
    return one_hot

In [5]:
def one_hot_decoding(decoded_array):
    """Collapse a 2D array of per-step scores into integer codes.

    Parameters
    ----------
    decoded_array : numpy.ndarray
        2D array of shape ``(timesteps, n)``.

    Returns
    -------
    numpy.ndarray
        ``int32`` vector of length ``timesteps`` holding, for every row, the
        index of its largest entry.
    """
    # Unpacking also enforces that the input is two-dimensional.
    step_count, _ = decoded_array.shape
    return np.fromiter(
        (np.argmax(step) for step in decoded_array),
        dtype='int32',
        count=step_count,
    )

In [6]:
def encode_word_to_int(word, mapping):
    """Translate a word into its integer codes.

    Parameters
    ----------
    word : str
        Word to encode; every character must be a key of ``mapping``.
    mapping : dict
        Character -> integer code.

    Returns
    -------
    numpy.ndarray
        ``int32`` vector with one code per character of ``word``.

    Raises
    ------
    KeyError
        If a character of ``word`` is missing from ``mapping``.
    """
    return np.array([mapping[char] for char in word], dtype='int32')

In [7]:
def decode_int_to_word(word_int, mapping):
    """Translate integer codes back into a string.

    Parameters
    ----------
    word_int : iterable of int
        Integer codes to decode. Any iterable works; it does not need a length.
    mapping : dict
        Integer code -> character.

    Returns
    -------
    str
        Concatenation of the mapped characters. A code that is not a key of
        ``mapping`` contributes the literal text ``'UNK'``.
    """
    # join builds the string in one pass instead of repeated concatenation.
    return ''.join(mapping.get(code, 'UNK') for code in word_int)

In [8]:
def encode_list(array, mapping):
    """One-hot encode every word of a list.

    Parameters
    ----------
    array : iterable of str
        Words to encode.
    mapping : dict
        Character -> integer code.

    Returns
    -------
    list of numpy.ndarray
        One 2D one-hot array per word, each of shape ``(len(word), features)``
        where ``features`` is the largest code in ``mapping`` plus one.
    """
    # The one-hot width must cover the largest code. With 1-based codes,
    # len(mapping) is one column short and the highest code would index out of
    # bounds; for 0-based contiguous codes both expressions give the same width.
    features = max(mapping.values(), default=-1) + 1
    return [
        one_hot_encoding(encode_word_to_int(word, mapping), features)
        for word in array
    ]

In [9]:
def decode_list(array, mapping):
    """Decode a collection of one-hot (or score) matrices into words.

    Parameters
    ----------
    array : iterable of numpy.ndarray
        One 2D array per word, as accepted by ``one_hot_decoding``.
    mapping : dict
        Integer code -> character.

    Returns
    -------
    list of str
        One decoded string per element of ``array``.
    """
    return [
        decode_int_to_word(one_hot_decoding(encoded_word), mapping)
        for encoded_word in array
    ]

### Load Data

Load names and store them in **male_names_data**

In [10]:
# The file is semicolon-separated and uses a comma as decimal mark.
male_names_raw = pd.read_csv(MALE_NAMES_FILEPATH, sep=';', decimal=',')
# Only the 'Nombre' column is used from here on.
male_names_data = male_names_raw.loc[:, 'Nombre'].tolist()

Store the parameters of the model:  
`m:` number of samples  
`n:` number of features  
`timesteps:` length of the input vector. Since names have different lengths, we will have to pad them.

In [11]:
# Number of samples.
m = len(male_names_data)
# Character codes start at 1, so the one-hot width is one more than the alphabet size.
n = 1 + len(map_char_to_int)
# Length of the longest name. Targets drop the first character and append a dot,
# so they are exactly as long as their inputs.
timesteps = max(len(name) for name in male_names_data)

Transform everything to **lowercase**.

In [12]:
# The character mapping only contains lowercase letters, so normalise the case first.
male_names_data = [raw_name.lower() for raw_name in male_names_data]

Create matrices $X$ and $y$

In [13]:
# Inputs are the names themselves.
X_male = male_names_data
# Targets are the same names with the first character dropped and a closing dot
# appended, i.e. at every position the target is the next character of the input.
y_male = [full_name[1:] + '.' for full_name in X_male]

Each letter will be encoded as an integer, which in turn, will be one-hot encoded. For example:  

$$ carlos \rightarrow [2, 0, 17, 11, 14, 18] \rightarrow [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [...], ..., [...]]$$



In [14]:
# Every name as a list of integer codes, one code per character.
X_male_int = [
    [map_char_to_int[char] for char in X_male[sample]]
    for sample in range(m)
]

# The same names as 2D one-hot arrays of shape (len(name), n).
X_male_encoded = [one_hot_encoding(name_codes, n) for name_codes in X_male_int]

In [15]:
# Show some examples
print(f"Name Antonio mapped to integer:\n{X_male_int[0]}")
print(f"\nPrevious integers one-hot encoded:\n{X_male_encoded[0]}")

Name Antonio mapped to integer:
[1, 14, 20, 15, 14, 9, 15]

Previous integers one-hot encoded:
[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]]


As for the **labels**, they are the inputs shifted one character forward, with a final **EOF** character (`.`) appended.

In [16]:
# One-hot row for the closing dot, shape (1, n).
dot_encoded = one_hot_encoding([map_char_to_int['.']], n)

# Targets as integer codes: drop the first code, append the code of the dot.
y_male_int = [[*name_codes[1:], map_char_to_int['.']] for name_codes in X_male_int]

# Targets as one-hot matrices: drop the first row, stack the dot row at the end.
y_male_encoded = [
    np.vstack((name_one_hot[1:], dot_encoded))
    for name_one_hot in X_male_encoded
]

Let's add some **padding**

In [17]:
padding = np.zeros(n, dtype=int)

# Pad (or cut) every input at the end so that all samples span `timesteps`
# steps; the added steps are filled with 0.
X_male_padded = tf.keras.preprocessing.sequence.pad_sequences(
    X_male_encoded,
    maxlen=timesteps,
    padding='post',
    truncating='post',
    value=0,
)

In [18]:
# Same end-padding as the inputs, so targets stay aligned step by step.
y_male_padded = tf.keras.preprocessing.sequence.pad_sequences(
    y_male_encoded,
    maxlen=timesteps,
    padding='post',
    truncating='post',
    value=0,
)

In [19]:
# The padded one-hot tensors are fed to the network as they are.
X_male_input, y_male_input = X_male_padded, y_male_padded

### Model

In [26]:
model = tf.keras.Sequential()

# Timesteps whose features are all 0 (the padding) are skipped by the layers below.
model.add(Masking(mask_value=0,
                  input_shape=(timesteps, n)))

# The recurrent layer returns its output at every timestep, not only the last
# one, because a next-character prediction is needed at each position.
model.add(SimpleRNN(units=100,
                    activation='tanh',
                    return_sequences=True))

# One softmax over the character classes per timestep. The width is tied to
# `n` so the output always matches the one-hot encoding of the targets.
model.add(TimeDistributed(Dense(units=n,
                                activation='softmax')))

In [27]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (None, 21, 40)            0         
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 21, 100)           14100     
_________________________________________________________________
time_distributed (TimeDistri (None, 21, 40)            4040      
Total params: 18,140
Trainable params: 18,140
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Reuse a previously saved model when one exists; otherwise train from scratch.
# `load_model` is the module-level function imported from
# tensorflow.keras.models, not a method of the model object.
model_filepath = os.path.join(OUTPUT_PATH, 'model.h5')
history = None  # stays None when the model is loaded instead of trained
if os.path.exists(model_filepath):
    model = load_model(model_filepath)
else:
    history = model.fit(X_male_input, y_male_input, epochs=20)

Train on 5000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [33]:
# Make sure the output directory exists before writing the model file.
os.makedirs(OUTPUT_PATH, exist_ok=True)
model.save(os.path.join(OUTPUT_PATH, 'model.h5'))

In [35]:
# Loss and accuracy on the same data the model was fitted on.
model.evaluate(x=X_male_input, y=y_male_input)



[0.4048293816566467, 0.7343928]

### Performance

In [37]:
# Per-timestep class scores for every sample.
preds = model.predict(X_male_input)
# Most likely character at every timestep, joined into one string per sample.
decoded_words = decode_list(preds, map_int_to_char)
preds_decoded = np.array(decoded_words)

In [95]:
# Predicted strings side by side with their targets.
wrong_correct_df = pd.DataFrame(
    list(zip(preds_decoded, y_male)),
    columns=["Prediction", "Original"],
)
wrong_correct_df

Unnamed: 0,Prediction,Original
0,ntonio,ntonio.
1,ose,ose.
2,anuel,anuel.
3,rancisco,rancisco.
4,anid,avid.
...,...,...
4995,rrerico jrrlos.......,ederico carlos.
4996,rrnando angesto......,ernando augusto.
4997,urardo jrancisco.....,erardo francisco.
4998,emsaiaaaaaaaaaaaaaaaa,ossam.


In [72]:
# Per-character hit/miss flags over all target characters.
#   mask_correct_w_stop : plain comparison at every target position.
#   mask_correct_wo_stop: same, except positions whose target is '.' always count as hits.
mask_correct_w_stop = []
mask_correct_wo_stop = []
for sample in range(m):
    predicted, target = preds_decoded[sample], y_male[sample]
    for position, target_char in enumerate(target):
        is_hit = predicted[position] == target_char
        mask_correct_w_stop.append(is_hit)
        mask_correct_wo_stop.append(True if target_char == '.' else is_hit)

In [96]:
# Get words predicted correctly as a whole.
#
# Predictions are aligned with the targets position by position, so a word is
# correct only when the prediction starts with the target (closing dot
# excluded). A substring test would also accept a match that is shifted to some
# other position in the prediction string.

mask_correct_words = np.zeros(m, dtype=bool)
for i in range(m):
    pred = preds_decoded[i]
    orig = y_male[i][:-1]  # target without its closing dot
    mask_correct_words[i] = pred.startswith(orig)

correct_preds = np.array(X_male)[mask_correct_words]
correct_preds

array(['antonio', 'jose', 'manuel', 'francisco', 'daniel', 'carlos',
       'rafael', 'pedro', 'luis', 'oscar', 'vicente', 'santiago',
       'guillermo', 'tomas', 'hector', 'isaac', 'benito', 'antoni',
       'elias', 'kevin', 'anton', 'vicent', 'xavi', 'yassin',
       'francisco angel', 'vicente antonio', 'oscar jose',
       'manuel andres', 'dani', 'fran', 'quintin', 'pedro andres',
       'nathan', 'dan', 'santi', 'hector jose', 'guillermo antonio',
       'carlo', 'rafa', 'benito jose', 'daniel andres', 'kevin andres',
       'manu', 'rafael andres', 'jose alexander', 'francis', 'franc',
       'luis alexander', 'santiago angel', 'carlos alexander', 'zayd',
       'tom', 'willian', 'isaac antonio', 'eli', 'tomas andres',
       'elias jesus'], dtype='<U21')

In [77]:
# Per-character accuracy: fraction of target characters (closing dot included)
# that were predicted correctly.
mask_correct_w_stop.count(True) / len(mask_correct_w_stop)

0.7343928453899611