# Generating Brazilian Names with N-gram Models

In [1]:
from typing import Iterable, Any

import torch
import torch.nn.functional as F
from tqdm.auto import tqdm

from ibge import load_ibge_name_data

### Data Loading

In [2]:
# Load the name records via the project-local `ibge` helper. The preview in
# the next cell shows each record as a dict with a 'name' string and a 'freq'
# count (presumably the number of people carrying that name — TODO confirm).
data = load_ibge_name_data()

# Number of records loaded.
len(data)

130356

In [3]:
# Peek at the first five records to see the record layout.
data[:5]

[{'name': 'maria', 'freq': 11734129},
 {'name': 'jose', 'freq': 5754529},
 {'name': 'ana', 'freq': 3089858},
 {'name': 'joao', 'freq': 2984119},
 {'name': 'antonio', 'freq': 2576348}]

### N-grams

In [4]:
# Alphabet: every distinct character that occurs in any name, in sorted order
chars = sorted({ch for record in data for ch in record["name"]})
n_chars = len(chars)

# Lookup tables between integer indices and characters.
# Index 0 is reserved for "." (the padding / end-of-name marker).
i_to_c = {idx: ch for idx, ch in enumerate([".", *chars])}
c_to_i = {ch: idx for idx, ch in i_to_c.items()}

In [5]:
def get_ngrams(name: str, n: int) -> Iterable[tuple]:
    """Yield every character n-gram of ``name``, including boundary markers.

    The name is left-padded with ``n - 1`` dots and terminated with a single
    dot, so the first n-gram ends on the first character of the name and the
    last n-gram ends on the terminating dot.

    Args:
        name: The string to split into n-grams.
        n: Number of characters per n-gram.

    Returns:
        A lazy iterator of ``n``-tuples of single-character strings.
    """
    padded = "." * (n - 1) + name + "."

    # Zipping the padded string against itself shifted by 0..n-1 positions
    # lines up the characters of each sliding window.
    shifted_views = (padded[offset:] for offset in range(n))
    return zip(*shifted_views)

list(get_ngrams("rafael", n=4))

[('.', '.', '.', 'r'),
 ('.', '.', 'r', 'a'),
 ('.', 'r', 'a', 'f'),
 ('r', 'a', 'f', 'a'),
 ('a', 'f', 'a', 'e'),
 ('f', 'a', 'e', 'l'),
 ('a', 'e', 'l', '.')]

In [6]:
def encode_data_into_ngram_tensors(
    data: list[dict[str, Any]],
    n: int
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Turn name records into one-hot n-gram training tensors.

    Every name is split into n-grams with ``get_ngrams``. The first ``n - 1``
    characters of each n-gram form the input context and the last character
    is the prediction target. Characters are mapped to indices through the
    module-level ``c_to_i`` table; the one-hot width is ``n_chars + 1``
    (alphabet plus the "." boundary marker).

    Args:
        data: Records with a ``"name"`` string and a ``"freq"`` number.
        n: N-gram size; must be greater than 1.

    Returns:
        A tuple ``(X, y, sample_weights)`` where, for ``N`` n-grams in total:
        ``X`` is float32 of shape ``(N, (n - 1) * (n_chars + 1))`` holding the
        concatenated one-hot context characters, ``y`` is float32 of shape
        ``(N, n_chars + 1)`` holding the one-hot target, and
        ``sample_weights`` is float32 of shape ``(N,)`` holding the frequency
        of the name each n-gram came from, normalised to sum to 1.
        Empty ``data`` yields correctly shaped tensors with ``N == 0``.

    Raises:
        AssertionError: If ``n <= 1``.
        KeyError: If a name contains a character missing from ``c_to_i``.
    """
    assert n > 1, "n must be at least 2 (n - 1 context characters plus one target)"

    num_classes = n_chars + 1

    input_idxs = []
    target_idxs = []
    freqs = []

    for x in tqdm(data):
        name = x["name"]
        freq = x["freq"]

        # Named `ngram` rather than `chars` so the module-level alphabet list
        # is not shadowed inside this function.
        for ngram in get_ngrams(name, n):
            input_idxs.append([c_to_i[c] for c in ngram[:-1]])
            target_idxs.append(c_to_i[ngram[-1]])
            freqs.append(freq)

    n_samples = len(target_idxs)

    # Explicit dtype and shape: for an empty list torch.tensor() would default
    # to a 1-D float32 tensor, which one_hot rejects and which has no second
    # dimension to reshape on.
    context_idxs = torch.tensor(input_idxs, dtype=torch.long).reshape(n_samples, n - 1)
    X = F.one_hot(context_idxs, num_classes=num_classes).float()
    # Flatten (N, n - 1, num_classes) into one feature vector per n-gram.
    X = X.reshape(n_samples, (n - 1) * num_classes)

    y = F.one_hot(
        torch.tensor(target_idxs, dtype=torch.long), num_classes=num_classes
    ).float()

    # Each n-gram carries the frequency of its source name; normalising makes
    # the weighted loss a weighted average rather than a raw sum.
    sample_weights = torch.tensor(freqs, dtype=torch.float32)
    sample_weights /= sample_weights.sum()

    return X, y, sample_weights

In [7]:
# N-gram size: each sample uses the previous n - 1 characters as context
# to predict the n-th character.
n = 6
X, y, sample_weights = encode_data_into_ngram_tensors(data, n)

# Sanity-check the shapes of the encoded tensors.
X.shape, y.shape, sample_weights.shape

  0%|          | 0/130356 [00:00<?, ?it/s]

(torch.Size([1051060, 135]), torch.Size([1051060, 27]), torch.Size([1051060]))

In [8]:
# Hyperparameters of the two-layer network (no bias terms).
HIDDEN_SIZE = 256
LEARNING_RATE = 10
N_EPOCHS = 50
SEED = 42

n_inputs = (n_chars + 1)*(n - 1)   # concatenated one-hot context characters
n_classes = n_chars + 1            # alphabet plus the "." boundary marker

# A dedicated seeded generator makes the initial weights reproducible across
# kernel restarts without touching the global RNG state.
# NOTE(review): unscaled N(0, 1) initialisation gives a very large initial
# loss (the recorded run shows 99.03, versus ln(27) ~= 3.30 for a uniform
# guess); consider scaling by 1/sqrt(fan_in).
init_generator = torch.Generator().manual_seed(SEED)
W1 = torch.randn((n_inputs, HIDDEN_SIZE), generator=init_generator, requires_grad=True)
W2 = torch.randn((HIDDEN_SIZE, n_classes), generator=init_generator, requires_grad=True)

# Full-batch gradient descent on the frequency-weighted cross-entropy.
for epoch in range(N_EPOCHS + 1):
    # Forward pass: linear -> ReLU -> linear.
    H1 = F.relu(X @ W1)
    logits = H1 @ W2

    # Per-sample loss against the one-hot targets, then a weighted sum
    # (sample_weights sums to 1, so this is a weighted average).
    nll = F.cross_entropy(logits, y, reduction="none")
    loss = (nll * sample_weights).sum()

    if epoch % 5 == 0:
        print(f"epoch={epoch} | loss={loss.item():.4f}")

    # Reset gradients so backward() does not accumulate across epochs.
    W1.grad = None
    W2.grad = None
    loss.backward()

    # Update in place under no_grad instead of going through `.data`, so the
    # step stays outside the autograd graph while remaining visible to
    # autograd's in-place version tracking.
    with torch.no_grad():
        W1 -= LEARNING_RATE * W1.grad
        W2 -= LEARNING_RATE * W2.grad


epoch=0 | loss=99.0252


KeyboardInterrupt: 

In [None]:
# Sample one name from the trained model, one character at a time.

# Safety cap: a poorly trained model may (almost) never emit the end marker,
# which would otherwise make this loop run indefinitely.
MAX_NAME_LEN = 50

# Kept separate from the module-level `chars` alphabet so that running this
# cell does not clobber it.
sampled_chars = []
# Start from an all-padding context (index 0 is the "." marker).
input_idxs = [0]*(n - 1)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    while len(sampled_chars) < MAX_NAME_LEN:
        # Encode the context exactly like the training inputs: one-hot per
        # character, flattened into a single row vector.
        x_enc = F.one_hot(torch.tensor(input_idxs), num_classes=n_chars + 1).float()
        x_enc = x_enc.reshape(1, x_enc.shape[0]*x_enc.shape[1])

        H1 = F.relu(x_enc @ W1)
        logits = H1 @ W2
        probs = F.softmax(logits, dim=1)
        next_idx = torch.multinomial(probs, num_samples=1).item()

        # Index 0 is the end-of-name marker.
        if next_idx == 0:
            break

        sampled_chars.append(i_to_c[next_idx])
        # Slide the context window forward by one character.
        input_idxs = input_idxs[1:] + [next_idx]

name = ''.join(sampled_chars)
name