<a href="https://colab.research.google.com/github/saparbayev-azizbek-12/bi-and-ai-talents-dl/blob/main/lesson-14/MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Homework

In [182]:
%%capture
!wget https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt

In [183]:
import torch
import torch.nn.functional as F

names = open('names.txt').read().splitlines()
vocab = sorted(set(''.join(names) + '.'))
stoi = {v: i for i, v in enumerate(vocab)}
itos = {i: v for v, i in stoi.items()}

def encode(name: str) -> list[int]:
    return [stoi[s] for s in name]

def decode(seq: list[int]) -> str:
    return ''.join([itos[i] for i in seq])

In [184]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [185]:
block_size = 3
X, Y = [], []

for name in names:
    context = [0] * block_size
    for ch in name + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]

X = torch.tensor(X, device=device)
Y = torch.tensor(Y, device=device)

In [186]:
X.size()

torch.Size([228146, 3])

In [187]:
train_percent = 0.7
val_percent = 0.15
total_samples = X.size()[0]
train_size = int(total_samples * train_percent)
val_size = int(total_samples * val_percent)

Xtr, Ytr = X[:train_size], Y[:train_size]
Xval, Yval = X[train_size : train_size + val_size], Y[train_size : train_size + val_size]
Xts, Yts = X[train_size + val_size :], Y[train_size + val_size :]

In [188]:
def accuracy(model, X, Y):
    model.eval()
    with torch.no_grad():
        logits = model.forward(X)
        pred = torch.argmax(logits, dim=1)
        acc = (pred == Y).float().mean().item()
    return acc

In [189]:
class MLP(torch.nn.Module):
    def __init__(self, vocab_size, n_embd=10, n_hidden=100, block_size=3, scale=0.1, device='cuda', lr=0.05, p=0.2):
        super().__init__()
        torch.manual_seed(42)
        self.block_size = block_size
        self.C = torch.nn.Parameter(torch.randn(vocab_size, n_embd) * scale)
        self.W1 = torch.nn.Parameter(torch.randn(block_size * n_embd, n_hidden) * scale)
        self.b1 = torch.nn.Parameter(torch.zeros(n_hidden))
        self.W2 = torch.nn.Parameter(torch.randn(n_hidden, vocab_size) * scale)
        self.b2 = torch.nn.Parameter(torch.zeros(vocab_size))
        self.W3 = torch.nn.Parameter(torch.randn(block_size * n_embd, vocab_size) * scale)
        self.dropout = torch.nn.Dropout(p=p)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.to(device)
        self.history = []

    def forward(self, X):
        xenc = self.C[X]
        x_emb = xenc.view(xenc.size(0), -1)
        h = torch.tanh(x_emb @ self.W1 + self.b1)
        h_drop = self.dropout(h)
        logits = h_drop @ self.W2 + self.b2 + x_emb @ self.W3
        return logits

    def loss(self, X, Y):
        logits = self.forward(X)
        return F.cross_entropy(logits, Y)

    def params(self):
      print(self.state_dict().keys())

    def fit(self, Xtr, Ytr, Xval, Yval, max_steps=201, patience=5):
      super().train()
      best_test_acc = 0
      patience_counter = 0
      self.history = []

      for step in range(max_steps):
          self.optimizer.zero_grad()
          loss = self.loss(Xtr, Ytr)
          loss.backward()
          self.optimizer.step()
          train_acc = accuracy(self, Xtr, Ytr)
          test_acc = accuracy(self, Xval, Yval)
          self.history.append({'step': step, 'loss': loss.item(), 'train_acc': train_acc, 'test_acc': test_acc}) # Store history
          if step % 20 == 0:
              print(f"Step {step:3d} | Loss: {loss.item():.4f} | Train Acc: {train_acc:.3f} | Test Acc: {test_acc:.3f}")

          if test_acc > best_test_acc:
              best_test_acc = test_acc
              patience_counter = 0
              best_state = {k: v.clone() for k, v in self.state_dict().items()}
          else:
              patience_counter += 1

          if patience_counter >= patience:
              print(f"\nEarly stopping. Best Test Accuracy: {best_test_acc:.3f}")
              self.load_state_dict(best_state)
              break

    def get_history(self):
        return self.history

    def generate(self, num_samples=5):
        super().eval()
        for _ in range(num_samples):
            out = []
            context = [0] * self.block_size
            while True:
                x = torch.tensor([context], device=device)
                logits = self.forward(x)
                probs = F.softmax(logits, dim=1)
                ix = torch.multinomial(probs, num_samples=1).item()
                if itos[ix] == '.':
                    break
                out.append(itos[ix])
                context = context[1:] + [ix]
            print(''.join(out))

In [190]:
vocab_size = len(vocab)
model = MLP(vocab_size)

In [191]:
model.generate()

v
itajokyvgrzppeopkgbr
eqlskmairiphqiojfibjxqsvf
btuupwpoqxgzmiirwxiybqoionyuqucicpmoph
i


In [192]:
model.fit(Xtr, Ytr, Xval, Yval, patience=20)

Step   0 | Loss: 3.3015 | Train Acc: 0.174 | Test Acc: 0.134
Step  20 | Loss: 2.3140 | Train Acc: 0.280 | Test Acc: 0.240
Step  40 | Loss: 2.2446 | Train Acc: 0.293 | Test Acc: 0.244
Step  60 | Loss: 2.2094 | Train Acc: 0.304 | Test Acc: 0.249
Step  80 | Loss: 2.1684 | Train Acc: 0.315 | Test Acc: 0.254
Step 100 | Loss: 2.1345 | Train Acc: 0.323 | Test Acc: 0.250
Step 120 | Loss: 2.0965 | Train Acc: 0.337 | Test Acc: 0.262
Step 140 | Loss: 2.0708 | Train Acc: 0.344 | Test Acc: 0.264
Step 160 | Loss: 2.0521 | Train Acc: 0.348 | Test Acc: 0.275
Step 180 | Loss: 2.0341 | Train Acc: 0.354 | Test Acc: 0.277
Step 200 | Loss: 2.0273 | Train Acc: 0.356 | Test Acc: 0.277


In [193]:
model.generate()

caeli
crelia
nakiyah
ariah
liley


In [194]:
model.fit(Xtr, Ytr, Xval, Yval, patience=20)

Step   0 | Loss: 2.6009 | Train Acc: 0.343 | Test Acc: 0.275
Step  20 | Loss: 2.0297 | Train Acc: 0.355 | Test Acc: 0.276
Step  40 | Loss: 2.0109 | Train Acc: 0.360 | Test Acc: 0.280
Step  60 | Loss: 2.0031 | Train Acc: 0.361 | Test Acc: 0.282
Step  80 | Loss: 1.9980 | Train Acc: 0.363 | Test Acc: 0.281
Step 100 | Loss: 2.0029 | Train Acc: 0.361 | Test Acc: 0.283

Early stopping. Best Test Accuracy: 0.287


In [195]:
model.generate()

pianne
iti
warli
shandreigh
jayla


In [196]:
model.fit(Xtr, Ytr, Xval, Yval, patience=20)

Step   0 | Loss: 2.6413 | Train Acc: 0.336 | Test Acc: 0.247
Step  20 | Loss: 2.0083 | Train Acc: 0.361 | Test Acc: 0.279

Early stopping. Best Test Accuracy: 0.284


In [197]:
model.generate()

maddisia
lorin
jenner
shneannooris
jermelan


In [198]:
torch.save(model.state_dict(), "model.pkl")