# Готовим данные

In [1]:
with open("../female-names.txt", "r") as f_in:
    female_names = f_in.readlines()

with open("../male-names.txt", "r") as f_in:
    male_names = f_in.readlines()
    

In [2]:
female_names = [x.lower().strip() for x in female_names]
male_names = [x.lower().strip() for x in male_names]

In [3]:
male_names.extend(female_names)

In [4]:
names = ["." + x + "." for x in male_names]


# Словарь и токенайзер

In [5]:
vocab = sorted(list(set("".join(names))), key=lambda v: "\t" if v == "." else v)

In [6]:
char_to_index = {char: index for index, char in enumerate(vocab)}
index_to_char = {index: char for char, index in char_to_index.items()}

def tokenize(char):
    return char_to_index.get(char, 0) 

def untokenize(index):
    return index_to_char.get(index, ".")

In [7]:
print(f"Токен для буквы а {tokenize("а")}")
print(f"Буква для токена 13 = {untokenize(13)}")

Токен для буквы а 3
Буква для токена 13 = к


# Вариант с обучением torch

## Короткий пример

In [9]:
import torch
import torch.nn.functional as F
loss_fn = torch.nn.CrossEntropyLoss()

In [10]:
xs = []
ys = []

for i in range(3):
    name = names[i]
    xs.extend([tokenize(l) for l in name[:-1]])
    ys.extend([tokenize(l) for l in name[1:]])
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [74]:
xenc = F.one_hot(xs, num_classes = len(vocab)).float()

In [329]:
W = torch.randn(len(vocab), len(vocab), requires_grad=True)

In [77]:
logits = xenc @ W

In [97]:
loss = loss_fn(logits, ys)
loss.backward()

In [99]:
W.data -= W.grad * 0.01

In [100]:
for _ in range(100):
    xenc = F.one_hot(xs, num_classes = len(vocab)).float()
    logits = xenc @ W
    loss = loss_fn(logits, ys)
    loss.backward()
    W.data -= W.grad * 0.01
    print(loss)

tensor(4.1132, grad_fn=<NllLossBackward0>)
tensor(4.1111, grad_fn=<NllLossBackward0>)
tensor(4.1080, grad_fn=<NllLossBackward0>)
tensor(4.1038, grad_fn=<NllLossBackward0>)
tensor(4.0985, grad_fn=<NllLossBackward0>)
tensor(4.0923, grad_fn=<NllLossBackward0>)
tensor(4.0849, grad_fn=<NllLossBackward0>)
tensor(4.0765, grad_fn=<NllLossBackward0>)
tensor(4.0671, grad_fn=<NllLossBackward0>)
tensor(4.0567, grad_fn=<NllLossBackward0>)
tensor(4.0452, grad_fn=<NllLossBackward0>)
tensor(4.0327, grad_fn=<NllLossBackward0>)
tensor(4.0192, grad_fn=<NllLossBackward0>)
tensor(4.0046, grad_fn=<NllLossBackward0>)
tensor(3.9890, grad_fn=<NllLossBackward0>)
tensor(3.9724, grad_fn=<NllLossBackward0>)
tensor(3.9548, grad_fn=<NllLossBackward0>)
tensor(3.9362, grad_fn=<NllLossBackward0>)
tensor(3.9166, grad_fn=<NllLossBackward0>)
tensor(3.8960, grad_fn=<NllLossBackward0>)
tensor(3.8744, grad_fn=<NllLossBackward0>)
tensor(3.8519, grad_fn=<NllLossBackward0>)
tensor(3.8283, grad_fn=<NllLossBackward0>)
tensor(3.80

In [11]:
xs = []
ys = []

for i in range(len(names)):
    name = names[i]
    xs.extend([tokenize(l) for l in name[:-1]])
    ys.extend([tokenize(l) for l in name[1:]])
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [330]:
W

tensor([[-1.5021,  0.3104,  1.2142,  ..., -0.2673,  1.6497,  0.1213],
        [-0.4555, -0.2758, -1.1395,  ..., -0.7018,  0.4106,  0.6979],
        [ 0.8295, -0.2886,  1.5594,  ...,  1.3039,  0.6362,  0.4214],
        ...,
        [-0.3750, -1.1103,  0.0821,  ..., -0.6192,  0.5243,  0.4642],
        [-0.1613, -1.0055,  0.7677,  ..., -0.9891,  0.1229, -0.4871],
        [ 0.9497, -0.7326, -0.8079,  ..., -0.9810, -0.3153,  2.9468]],
       requires_grad=True)

In [406]:
xenc = F.one_hot(xs, num_classes = len(vocab)).float()
for _ in range(1000):
    logits = xenc @ W
    loss = loss_fn(logits, ys)
    W.grad = None
    loss.backward()
    W.data -= W.grad * 0.1
    print(loss)

tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.4180, grad_fn=<NllLossBackward0>)
tensor(2.41

In [332]:
from pprint import pprint

In [409]:
new_names = []
for _ in range(10):
    x = torch.tensor([0])
    seq = []
    while True:
        xenc = F.one_hot(x, num_classes = len(vocab)).float()
        logits = xenc @ W
        
        y = torch.multinomial(logits.exp(), 1)[0]
        x = y 
        seq += untokenize(y.item())
        if y == 0:
             break
    new_names.append("".join(seq))
pprint(new_names)


['ритрила.',
 'а.',
 'мехасельбрионасена.',
 'н.',
 'эдамадий.',
 'нда.',
 'вея.',
 'акалорьёь.',
 'ай.',
 'а.']


In [407]:
xenc = F.one_hot(xs, num_classes = len(vocab)).float()
logits = xenc @ W
probs = logits.exp() / logits.exp().sum(1, keepdims=True)


In [408]:
loss = -probs[torch.arange(probs.shape[0]), ys].log().mean()
loss

tensor(2.4161, grad_fn=<NegBackward0>)

# Сделаем тоже самое но с помощью модели и оптимайзера

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim

class NameGenerator(nn.Module):
    def __init__(self, vocab_size):
        super(NameGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, vocab_size)
        self.fc = nn.Linear(vocab_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        return self.fc(x)


In [15]:
# Создание модели, функции потерь и оптимизатора
model = NameGenerator(len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [16]:
num_epochs = 1000
for epoch in range(num_epochs):
    # Прямой проход
    outputs = model(xs)
    loss = criterion(outputs, ys)
    
    # Обратный проход и оптимизация
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 2.3664
Epoch [200/1000], Loss: 2.3547
Epoch [300/1000], Loss: 2.3514
Epoch [400/1000], Loss: 2.3502
Epoch [500/1000], Loss: 2.3496
Epoch [600/1000], Loss: 2.3493
Epoch [700/1000], Loss: 2.3492
Epoch [800/1000], Loss: 2.3491
Epoch [900/1000], Loss: 2.3490
Epoch [1000/1000], Loss: 2.3489


In [20]:
def generate_name(model, max_length=20):
    model.eval()
    with torch.no_grad():
        x = torch.tensor([0])  # Начинаем с токена начала последовательности
        name = []
        for _ in range(max_length):
            output = model(x)
            probs = torch.softmax(output, dim=1)
            next_char = torch.multinomial(probs, 1).item()
            if next_char == 0:  # Если встретили токен конца последовательности
                break
            name.append(untokenize(next_char))
            x = torch.tensor([next_char])
        return ''.join(name)


In [24]:
# Генерация 10 новых имен
new_names = [generate_name(model) for _ in range(10)]
print("Сгенерированные имена:")
for name in new_names:
    print(name)

Сгенерированные имена:
пин
вмисий
аха
лислиалибей
доладелетинт
пиладованна
са
ей
нелиррукэлломаран
дофртьз


# Другие варианты

In [44]:
class SimpleNameGenerator(nn.Module):
    def __init__(self, vocab_size):
        super(SimpleNameGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, x):
        return self.embedding(x)

# Подготовка данных (предполагается, что у вас уже есть xs, ys и vocab)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
vocab_size = len(vocab)

# Создание модели, функции потерь и оптимизатора
model = SimpleNameGenerator(vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Обучение модели
num_epochs = 1000
for epoch in range(num_epochs):
    # Прямой проход
    outputs = model(xs)
    loss = criterion(outputs, ys)
    
    # Обратный проход и оптимизация
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

  xs = torch.tensor(xs)
  ys = torch.tensor(ys)


Epoch [100/1000], Loss: 3.0231
Epoch [200/1000], Loss: 2.6012
Epoch [300/1000], Loss: 2.4521
Epoch [400/1000], Loss: 2.4004
Epoch [500/1000], Loss: 2.3804
Epoch [600/1000], Loss: 2.3707
Epoch [700/1000], Loss: 2.3652
Epoch [800/1000], Loss: 2.3617
Epoch [900/1000], Loss: 2.3593
Epoch [1000/1000], Loss: 2.3575


In [48]:
# Генерация 10 новых имен
new_names = [generate_name(model) for _ in range(10)]
print("Сгенерированные имена:")
for name in new_names:
    print(name)

Сгенерированные имена:
тедельеос
пра
мстиса
ена
ионота
пр
иролли
ирин
рон
гдия


In [41]:
class ImprovedNameGenerator(nn.Module):
    def __init__(self, vocab_size):
        super(ImprovedNameGenerator, self).__init__()
        self.W = nn.Parameter(torch.randn(vocab_size, vocab_size) / vocab_size**0.5)
        
    def forward(self, x):
        x_one_hot = torch.nn.functional.one_hot(x, num_classes=self.W.shape[0]).float()
        logits = x_one_hot @ self.W
        return logits

# Подготовка данных (предполагается, что у вас уже есть xs, ys и vocab)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
vocab_size = len(vocab)

# Создание модели, функции потерь и оптимизатора
model = ImprovedNameGenerator(vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Обучение модели
num_epochs = 1000
for epoch in range(num_epochs):
    # Прямой проход
    logits = model(xs)
    loss = criterion(logits, ys)
    
    # Обратный проход и оптимизация
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

  xs = torch.tensor(xs)
  ys = torch.tensor(ys)


Epoch [100/1000], Loss: 2.7393
Epoch [200/1000], Loss: 2.4833
Epoch [300/1000], Loss: 2.4111
Epoch [400/1000], Loss: 2.3858
Epoch [500/1000], Loss: 2.3740
Epoch [600/1000], Loss: 2.3674
Epoch [700/1000], Loss: 2.3632
Epoch [800/1000], Loss: 2.3604
Epoch [900/1000], Loss: 2.3584
Epoch [1000/1000], Loss: 2.3568
