In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, fname)
    for folder, _subfolders, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Fetch the makemore names dataset into the current working directory.
# -nc (--no-clobber) skips the download when names.txt already exists, so
# re-running this cell does not leave names.txt.1, names.txt.2, ... behind.
! wget -nc "https://raw.githubusercontent.com/karpathy/makemore/master/names.txt"

--2024-06-06 16:23:14--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: 'names.txt'


2024-06-06 16:23:14 (17.9 MB/s) - 'names.txt' saved [228145/228145]



In [3]:
# Read the dataset, one name per line. The context manager closes the file
# handle as soon as the read completes instead of leaving it open.
with open('/kaggle/working/names.txt') as names_file:
    words = names_file.read().splitlines()

In [4]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [5]:
# Character vocabulary and the two lookup tables (char -> index, index -> char).
# Indices 0 and 1 are reserved for the special tokens '.' and '_'; the
# characters that occur in the dataset are numbered from 2 in sorted order.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(2, len(chars) + 2)))
stoi.update({'.': 0, '_': 1})
itos = dict((idx, ch) for ch, idx in stoi.items())
n_vocab = len(stoi)

# Length of the longest name plus one extra position for a marker token.
max_len = max(map(len, words)) + 1

In [6]:
def build_dataset(words):
    """Encode names as padded next-character prediction pairs.

    Every word is wrapped as ``'.' + word + '_'`` and mapped to integer
    indices through the module-level ``stoi`` table. The input row is that
    sequence without its last token and the target row is the same sequence
    without its first token. Rows shorter than the module-level ``max_len``
    are right-padded: inputs with 0, targets with -1.

    Args:
        words: iterable of strings whose characters are all keys of ``stoi``.

    Returns:
        Tuple ``(X, Y)`` of tensors built from the input rows and the target
        rows respectively.
    """
    inputs, targets = [], []
    for name in words:
        tokens = [stoi[c] for c in '.' + name + '_']
        # Inputs and targets have the same length, so they share one pad count.
        n_pad = max_len - (len(tokens) - 1)
        inputs.append(tokens[:-1] + [0] * n_pad)
        targets.append(tokens[1:] + [-1] * n_pad)
    return torch.tensor(inputs), torch.tensor(targets)


In [7]:
# Encode every name once and move both tensors to the selected device.
Xtr, Ytr = build_dataset(words)
Xtr = Xtr.to(device)
Ytr = Ytr.to(device)
# Kept as the last expression of the cell so the notebook displays the shapes.
Xtr.shape, Ytr.shape

In [8]:
@torch.no_grad()
def print_names(times, model, max_new_tokens=None):
    """Sample names from ``model`` one character at a time and print them.

    Each name starts from a single start token (index 0). The model is re-run
    on the whole growing prefix, the next token is drawn from the softmax of
    the logits at the last position, and sampling of that name stops once the
    end token (index 1) is drawn. The end token itself is not printed.

    Relies on the module-level ``device`` (where the prefix tensor is placed)
    and ``itos`` (index -> character table).

    Args:
        times: number of names to sample and print.
        model: module called as ``model(inp)`` with a ``(1, T)`` tensor of
            token indices and returning ``(logits, loss)``, where ``logits``
            is indexed as ``[batch, time, vocab]``.
        max_new_tokens: optional cap on the number of characters generated
            per name. ``None`` (the default) samples until the end token
            appears, however long that takes.

    Side effects:
        Switches ``model`` to eval mode and leaves it there.
    """
    model.eval()
    end_token = 1  # index of '_' in the vocabulary
    for _ in range(times):
        res = []
        inp = torch.tensor([[0]]).to(device)  # (1, 1) prefix holding the start token
        while max_new_tokens is None or len(res) < max_new_tokens:
            oi, _loss = model(inp)
            pred = oi[:, -1, :]  # logits for the position after the prefix
            p = torch.softmax(pred, 1).squeeze(0)
            out = torch.multinomial(p, 1, True)
            inp = torch.cat((inp, out[:, None]), 1)  # append the sample to the prefix
            token = out.item()
            if token == end_token:
                break
            res.append(itos[token])
        print(''.join(res))

# RNN

In [18]:
class RNN(nn.Module):
    """Single-layer tanh recurrent network over sequences of token indices.

    At every time step the hidden state is updated as
    ``h_t = tanh(linX(emb(x_t)) + linH(h_{t-1}))`` and projected to vocabulary
    logits with ``linO``. The state before the first step is the learnable
    parameter ``start``.

    Args:
        n_vocab: number of distinct token indices (embedding rows and logit width).
        n_embed: embedding dimension.
        n_hidden: hidden state dimension.
    """

    def __init__(self, n_vocab, n_embed, n_hidden):
        super().__init__()
        self.n_embed = n_embed
        self.n_vocab = n_vocab
        self.n_hidden = n_hidden
        self.emb = nn.Embedding(n_vocab, n_embed)  # token embedding
        self.linX = nn.Linear(n_embed, n_hidden)  # input -> hidden
        self.linH = nn.Linear(n_hidden, n_hidden)  # previous hidden -> hidden
        self.linO = nn.Linear(n_hidden, n_vocab)  # hidden -> vocabulary logits
        self.start = nn.Parameter(torch.zeros(1, n_hidden))  # learnable initial state

    def forward(self, x, y=None):
        """Run the recurrence over ``x`` and optionally compute the loss.

        Args:
            x: ``(bs, T)`` tensor of token indices.
            y: optional ``(bs, T)`` tensor of target indices. Positions equal
                to -1 are ignored by the loss.

        Returns:
            ``(logits, loss)`` where ``logits`` is ``(bs, T, n_vocab)`` and
            ``loss`` is the mean cross-entropy over non-ignored positions, or
            ``None`` when ``y`` is not given. The loss depends only on whether
            targets were supplied, not on train/eval mode.
        """
        bs, t = x.shape
        h_prev = self.start.expand((bs, -1))  # (bs, n_hidden)
        embs = self.emb(x)  # embed all steps at once: (bs, T, n_embed)

        hidden = []
        for i in range(t):  # step through time
            xi = embs[:, i, :]  # (bs, n_embed)
            hi = torch.tanh(self.linX(xi) + self.linH(h_prev))  # (bs, n_hidden)
            h_prev = hi  # current state feeds the next step
            hidden.append(hi)

        hidden = torch.stack(hidden, 1)  # (bs, T, n_hidden)
        logits = self.linO(hidden)  # (bs, T, n_vocab)

        loss = None
        if y is not None:
            # Flatten batch and time so every position is one classification example.
            loss = F.cross_entropy(
                logits.reshape(-1, self.n_vocab), y.reshape(-1), ignore_index=-1
            )

        return logits, loss


In [19]:
# Model sizes: embedding width and recurrent hidden-state width.
n_embed, n_hidden = 164, 256
# Despite its name, this value is handed to the DataLoader as the batch size.
block_size = 32

In [20]:
# Build the network on the selected device and attach an Adam optimizer to it.
model = RNN(n_vocab, n_embed, n_hidden).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [21]:
# Shuffled mini-batches of (input, target) rows; block_size is the batch size.
train_dataset = TensorDataset(Xtr, Ytr)
traindl = DataLoader(train_dataset, batch_size=block_size, shuffle=True)

In [22]:
epochs = 10
i = 0  # running count of optimizer steps across all epochs
for epoch in range(epochs):
    model.train()  # make sure the model is in training mode each epoch
    for xi, yi in traindl:
        # Clear old gradients, then forward pass, backward pass and update.
        optimizer.zero_grad()
        logits, loss = model(xi, yi)
        loss.backward()
        optimizer.step()

        # Report the current mini-batch loss every 1500 steps.
        if not i % 1500:
            print(f'{epoch}:{i} || loss: {loss.item():.4f}')
        i += 1

0:0 || loss: 3.4175
1:1500 || loss: 2.4336
2:3000 || loss: 2.1803
4:4500 || loss: 2.5107
5:6000 || loss: 2.2769
7:7500 || loss: 2.3809
8:9000 || loss: 2.2709


In [30]:
# Draw ten sample names from the trained model.
print_names(times=10, model=model)

rayden
leormen
raelon
ehriyah
lanim
siyah
lluwyn
edmell
rhmiree
kaizlee
