# Generating Brazilian Names with N-gram Models

In [1]:
from typing import Iterable, Any

import torch
import torch.nn.functional as F
import numpy as np
from tqdm.auto import tqdm

from ibge import load_ibge_name_data

### Data Loading

In [2]:
# Load the name records through the project-local `ibge` helper.
data = load_ibge_name_data()

# Number of records loaded.
len(data)

130356

In [3]:
# Peek at the first five records to see their structure.
data[:5]

[{'name': 'maria', 'freq': 11734129},
 {'name': 'jose', 'freq': 5754529},
 {'name': 'ana', 'freq': 3089858},
 {'name': 'joao', 'freq': 2984119},
 {'name': 'antonio', 'freq': 2576348}]

In [4]:
# Average number of characters per name, counting every record once
# (i.e. not weighted by the record's frequency).
np.mean([len(record["name"]) for record in data])

np.float64(7.062996716683544)

### N-grams

In [5]:
# Build the character vocabulary from every name in the dataset.
chars = sorted({c for x in data for c in x["name"]})
n_chars = len(chars)

# "." is used as the name-boundary (padding / end) marker and is reserved for
# index 0. If it also occurred inside a name, the inverted mapping below would
# silently point "." at the wrong index, so fail loudly instead.
assert "." not in chars, "'.' is reserved as the boundary marker but occurs in the data"

# Index <-> character lookup tables; index 0 is the "." boundary marker.
i_to_c = dict(enumerate(["."] + chars))
c_to_i = {c: i for i, c in i_to_c.items()}

In [6]:
def get_ngrams(name: str, n: int) -> Iterable[tuple]:
    """Return an iterator over the character n-grams of ``name``, in order.

    The name is padded with ``n - 1`` leading "." characters and a single
    trailing "." so that the first n-gram ends on the first character of the
    name and the last n-gram ends on the "." boundary marker.
    """
    padded = "." * (n - 1) + name + "."

    # The k-th shifted copy supplies the k-th element of every n-gram; zip
    # stops at the shortest copy, which drops the incomplete trailing windows.
    shifted_copies = [padded[offset:] for offset in range(n)]
    return zip(*shifted_copies)

list(get_ngrams("rafael", n=4))

[('.', '.', '.', 'r'),
 ('.', '.', 'r', 'a'),
 ('.', 'r', 'a', 'f'),
 ('r', 'a', 'f', 'a'),
 ('a', 'f', 'a', 'e'),
 ('f', 'a', 'e', 'l'),
 ('a', 'e', 'l', '.')]

In [7]:
def encode_data_into_ngram_tensors(
    data: list[dict[str, Any]],
    n: int
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Turn name records into one-hot n-gram training tensors.

    Every n-gram of every name becomes one sample: its first ``n - 1``
    characters are the input context and its last character is the target.

    Args:
        data: Records with a ``"name"`` string and a ``"freq"`` number.
        n: N-gram order; must be greater than 1.

    Returns:
        A tuple ``(X, y, sample_weights)`` where
        ``X`` holds the concatenated one-hot contexts, one row per sample with
        ``(n - 1) * (n_chars + 1)`` columns; ``y`` holds the one-hot targets
        with ``n_chars + 1`` columns; and ``sample_weights`` holds the
        frequency of the name each sample came from, normalised to sum to 1.
    """
    assert n > 1

    context_rows = []
    target_rows = []
    weights = []

    for record in tqdm(data):
        name, freq = record["name"], record["freq"]

        for ngram in get_ngrams(name, n):
            *context, target = ngram
            context_rows.append([c_to_i[c] for c in context])
            target_rows.append(c_to_i[target])
            # Every n-gram inherits the frequency of the name it came from.
            weights.append(freq)

    vocab_size = n_chars + 1

    # One-hot encode each context position, then lay the positions side by
    # side so that each sample is a single flat row.
    X = F.one_hot(torch.tensor(context_rows), num_classes=vocab_size).float()
    X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])

    y = F.one_hot(torch.tensor(target_rows), num_classes=vocab_size).float()

    sample_weights = torch.tensor(weights, dtype=torch.float32)
    sample_weights = sample_weights / sample_weights.sum()

    return X, y, sample_weights

In [8]:
# N-gram order: each sample uses the previous n - 1 characters as context to
# predict the next character.
n = 10

# Encode the whole dataset and move all three tensors to the GPU.
X, y, sample_weights = (
    tensor.cuda() for tensor in encode_data_into_ngram_tensors(data, n)
)

X.shape, y.shape, sample_weights.shape

  0%|          | 0/130356 [00:00<?, ?it/s]

(torch.Size([1051060, 243]), torch.Size([1051060, 27]), torch.Size([1051060]))

In [9]:
# Disabled baseline, kept for reference only: a single linear layer (softmax
# regression) on the flattened one-hot context, trained by gradient descent
# using every sample in X at each step and a fixed step size of 1.
#
# W = torch.randn((n_chars + 1)*(n-1), (n_chars + 1), requires_grad=True, device="cuda")

# for i in range(2000 + 1):
#     logits = X @ W
#     nll = F.cross_entropy(logits, y, reduction="none")
#     loss = (nll * sample_weights).sum()

#     if i % 100 == 0:
#         print(f"epoch={i} | loss={loss.item():.4f}")

#     W.grad = None
#     loss.backward()

#     W.data += -1*W.grad


In [10]:
# Disabled sampler for the single-matrix model `W`, kept for reference only:
# starts from an all-padding context (index 0 repeated n - 1 times) and draws
# one character at a time until index 0 (the "." boundary marker) is sampled.
#
# chars = []
# input_idxs = [0]*(n - 1)

# while True:
#     x_enc = F.one_hot(torch.tensor(input_idxs), num_classes=n_chars + 1).float().cuda()
#     x_enc = x_enc.reshape(1, x_enc.shape[0]*x_enc.shape[1])

#     logits = x_enc @ W
#     probs = F.softmax(logits, dim=1)
#     i = torch.multinomial(probs, num_samples=1).item()

#     if i == 0:
#         break
#     else:
#         chars.append(i_to_c[i])
#         input_idxs = input_idxs[1:] + [i]

# name = ''.join(chars)
# name

In [17]:
# Optimisation settings.
batch_size = 20_000
n_epochs = 10_000

# Random train/validation split over the rows of X.
# NOTE(review): rows are individual n-gram samples, so n-grams taken from the
# same name can end up on both sides of the split.
train_ratio = 0.8
train_size = int(X.shape[0] * train_ratio)

idxs = torch.randperm(X.shape[0], device="cuda")
train_idxs, val_idxs = idxs[:train_size], idxs[train_size:]

# Renormalise the sample weights so that they sum to 1 within each split.
X_train, y_train = X[train_idxs], y[train_idxs]
sw_train = sample_weights[train_idxs]
sw_train = sw_train / sw_train.sum()

X_val, y_val = X[val_idxs], y[val_idxs]
sw_val = sample_weights[val_idxs]
sw_val = sw_val / sw_val.sum()

# Two weight matrices (no biases): flattened one-hot context -> hidden layer
# -> one logit per vocabulary entry. Both start from unscaled standard-normal
# values.
hidden_size = 2**12
W1 = torch.randn((n_chars + 1) * (n - 1), hidden_size, requires_grad=True, device="cuda")
W2 = torch.randn(hidden_size, n_chars + 1, requires_grad=True, device="cuda")

def forward(X):
    """Compute output logits for a batch of flattened one-hot contexts.

    Uses the module-level weights: ``X @ W1`` passed through a ReLU, then
    multiplied by ``W2``. No bias terms are applied.
    """
    return F.relu(X @ W1) @ W2

def compute_weighted_nll(logits, y, sample_weights):
    """Return the weighted sum of the per-sample cross-entropy losses.

    ``y`` is handed to ``F.cross_entropy`` as the target. Each sample's loss is
    multiplied by its entry in ``sample_weights`` before summing, so weights
    that sum to 1 yield a weighted mean.
    """
    per_sample_nll = F.cross_entropy(logits, y, reduction="none")
    weighted_nll = sample_weights * per_sample_nll
    return weighted_nll.sum()

print(f"Model size: {W1.numel() + W2.numel()}\n")

# Step size of the plain gradient-descent update below.
learning_rate = 1.0

# Report the validation loss about 100 times per run. max() keeps the interval
# at least 1 so the modulo below cannot divide by zero when n_epochs < 100.
log_every = max(1, n_epochs // 100)

# NOTE: each "epoch" counted here is a single update on one random mini-batch,
# not a full pass over the training set.
for i in range(n_epochs + 1):
    # Draw a mini-batch of row indices uniformly, with replacement.
    sgd_idxs = torch.randint(0, X_train.shape[0], (batch_size,), device="cuda")

    X_sgd = X_train[sgd_idxs, :]
    y_sgd = y_train[sgd_idxs, :]
    sw_sgd = sw_train[sgd_idxs]
    # Renormalise so the weights inside this batch sum to 1.
    sw_sgd /= sw_sgd.sum()

    logits = forward(X_sgd)
    loss = compute_weighted_nll(logits, y_sgd, sw_sgd)

    if i % log_every == 0:
        # Evaluation never calls backward(), so skip building an autograd
        # graph over the whole validation set.
        with torch.no_grad():
            loss_val = compute_weighted_nll(forward(X_val), y_val, sw_val)
        print(f"epoch={i} | loss={loss_val.item():.4f}")

    W1.grad = None
    W2.grad = None
    loss.backward()

    # In-place parameter update. Done under no_grad() rather than through
    # `.data`, so the update is not recorded by autograd while still going
    # through the normal tensor API.
    with torch.no_grad():
        W1 -= learning_rate * W1.grad
        W2 -= learning_rate * W2.grad

Model size: 1105920

epoch=0 | loss=263.6970
epoch=100 | loss=15.6464
epoch=200 | loss=11.0752
epoch=300 | loss=8.9190
epoch=400 | loss=7.9287
epoch=500 | loss=7.7769
epoch=600 | loss=6.8230
epoch=700 | loss=6.6712
epoch=800 | loss=6.3653
epoch=900 | loss=5.8965
epoch=1000 | loss=5.5615
epoch=1100 | loss=5.3102
epoch=1200 | loss=5.2796
epoch=1300 | loss=4.9411
epoch=1400 | loss=4.9241
epoch=1500 | loss=4.6746
epoch=1600 | loss=4.5524
epoch=1700 | loss=4.5694
epoch=1800 | loss=4.4374
epoch=1900 | loss=4.3373
epoch=2000 | loss=4.2501
epoch=2100 | loss=3.9660
epoch=2200 | loss=3.9098
epoch=2300 | loss=3.7644
epoch=2400 | loss=3.8642
epoch=2500 | loss=3.8481
epoch=2600 | loss=3.6856
epoch=2700 | loss=3.9951
epoch=2800 | loss=3.6995
epoch=2900 | loss=3.6568
epoch=3000 | loss=3.7076
epoch=3100 | loss=3.5633
epoch=3200 | loss=3.4684
epoch=3300 | loss=3.5558
epoch=3400 | loss=3.3842
epoch=3500 | loss=3.2934
epoch=3600 | loss=3.2694
epoch=3700 | loss=3.2832
epoch=3800 | loss=3.3880
epoch=3900 |

In [233]:
def generate_name(prefix: str = "", max_len: int = 50) -> str:
    """Sample a name from the trained network, one character at a time.

    Args:
        prefix: Characters the name must start with. Each one must be a key of
            ``c_to_i``; otherwise a ``KeyError`` is raised.
        max_len: Upper bound on the length of the returned name, so that
            sampling always terminates even if the boundary marker is never
            drawn. A prefix already at or above this length is returned as is.

    Returns:
        The prefix followed by the sampled characters (boundary marker
        excluded).
    """
    context_size = n - 1
    generated = list(prefix)

    # The network input is always exactly `context_size` characters: keep only
    # the most recent prefix characters and left-pad with index 0 (the "."
    # boundary marker) when the prefix is shorter than that.
    prefix_idxs = [c_to_i[c] for c in generated][-context_size:]
    input_idxs = [0] * (context_size - len(prefix_idxs)) + prefix_idxs

    # Sampling needs no gradients.
    with torch.no_grad():
        while len(generated) < max_len:
            # Build the input on the same device as the weights.
            x_enc = F.one_hot(
                torch.tensor(input_idxs, device=W1.device),
                num_classes=n_chars + 1,
            ).float()
            x_enc = x_enc.reshape(1, -1)

            probs = F.softmax(forward(x_enc), dim=1)
            next_idx = torch.multinomial(probs, num_samples=1).item()

            # Index 0 is the boundary marker: the name is complete.
            if next_idx == 0:
                break

            generated.append(i_to_c[next_idx])
            # Slide the context window forward by one character.
            input_idxs = input_idxs[1:] + [next_idx]

    return "".join(generated)


name = generate_name("edv")
name

'edvaldo'