In [1]:
import math
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Model Building
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a lower-triangular (causal) mask.

    Args:
        cfg: Optional object exposing ``n_embd``, ``n_heads`` and
            ``block_size``. When omitted, the notebook-level ``config`` is
            used, so ``CausalSelfAttention()`` keeps working.

    Raises:
        ValueError: If ``n_embd`` is not divisible by ``n_heads``.
    """

    def __init__(self, cfg = None):
        super().__init__()
        # Accepting the config positionally lets both `CausalSelfAttention()`
        # and `CausalSelfAttention(config)` construct the layer.
        cfg = config if cfg is None else cfg
        if cfg.n_embd % cfg.n_heads != 0:
            raise ValueError(
                f"n_embd ({cfg.n_embd}) must be divisible by n_heads ({cfg.n_heads})"
            )
        self.n_embd = cfg.n_embd
        self.n_heads = cfg.n_heads
        # One projection produces q, k and v side by side along the last dim.
        self.c_attn = nn.Linear(cfg.n_embd, 3*cfg.n_embd, bias = False)
        self.c_proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias = False)
        # Lower-triangular ones: position t may only attend to positions <= t.
        causal_mask = torch.tril(torch.ones(cfg.block_size, cfg.block_size))
        self.register_buffer('bias', causal_mask.view(1, 1, cfg.block_size, cfg.block_size))

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor unpacked as ``(B, T, C)``; ``C`` is split into
                ``n_heads`` equal slices.

        Returns:
            Tensor of shape ``(B, T, C)``.
        """
        B, T, C = x.shape
        head_dim = C // self.n_heads
        q, k, v = self.c_attn(x).split(self.n_embd, dim = -1)
        # (B, T, C) -> (B, n_heads, T, head_dim)
        q = q.view(B, T, self.n_heads, head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_heads, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_heads, head_dim).transpose(1, 2)
        att_sc = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        # -inf scores become zero weight after the softmax.
        att_sc = att_sc.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        att_sc = F.softmax(att_sc, dim = -1)
        out = att_sc @ v
        # Merge the heads back: (B, n_heads, T, head_dim) -> (B, T, C)
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(out)


class MLP(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU (tanh approximation) -> Linear.

    The hidden width is ``4 * n_embd`` and neither linear layer has a bias.

    Args:
        cfg: Optional object exposing ``n_embd``. When omitted, the
            notebook-level ``config`` is used, so ``MLP()`` keeps working.
    """

    def __init__(self, cfg = None):
        super().__init__()
        # Accepting the config positionally lets both `MLP()` and
        # `MLP(config)` construct the layer.
        cfg = config if cfg is None else cfg
        self.c_fc = nn.Linear(cfg.n_embd, 4 * cfg.n_embd, bias = False)
        # The class is `nn.GELU`, and `approximate` only accepts the
        # lowercase strings 'none' or 'tanh'.
        self.gelu = nn.GELU(approximate = 'tanh')
        self.c_proj = nn.Linear(4 * cfg.n_embd, cfg.n_embd, bias = False)

    def forward(self, x):
        """Expand to the hidden width, apply GELU, and project back to ``n_embd``."""
        out = self.c_fc(x)
        out = self.gelu(out)
        out = self.c_proj(out)
        return out


class Block(nn.Module):
    """One pre-LayerNorm transformer block: attention then MLP, each with a residual.

    Layer sizes are read from the notebook-level ``config``.
    """

    def __init__(self):
        super().__init__()
        # Both sub-layers are built without arguments; they read their sizes
        # from the notebook-level `config`, the same way this class does.
        self.attn = CausalSelfAttention()
        self.mlp = MLP()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.ln_2 = nn.LayerNorm(config.n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        x = x + self.attn(self.ln_1(x))
        # The second norm is registered as `ln_2` (with underscore) in __init__.
        x = x + self.mlp(self.ln_2(x))
        return x


class GPT(nn.Module):
    """Decoder-only transformer language model.

    Built from token and learned position embeddings, ``config.n_layers``
    ``Block`` modules, a final LayerNorm and a bias-free linear head that
    maps to ``config.vocab_size`` logits. All sizes are read from the
    notebook-level ``config``.
    """

    def __init__(self):
        super().__init__()
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            pte = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block() for _ in range(config.n_layers)]),
            ln = nn.LayerNorm(config.n_embd)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias = False)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: Integer tensor of token ids, unpacked as ``(B, T)``.

        Returns:
            Tensor of shape ``(B, T, vocab_size)``.

        Raises:
            IndexError: If ``T`` exceeds the number of learned positions.
        """
        _, T = x.shape
        max_positions = self.transformer.pte.num_embeddings
        if T > max_positions:
            # Fail with a readable message instead of an out-of-range
            # embedding lookup.
            raise IndexError(
                f"sequence length {T} exceeds the {max_positions} learned positions"
            )
        # Build the position ids on the input's device so the lookup works
        # wherever the caller placed the batch.
        pos = torch.arange(0, T, dtype = torch.long, device = x.device)
        tok_emb = self.transformer.wte(x)
        pos_emb = self.transformer.pte(pos)
        # pos_emb is (T, n_embd) and broadcasts over the batch dimension.
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln(x)
        logits = self.lm_head(x)
        return logits

In [None]:
def generate(x, max_gen, top_k = 50):
    """Sample ``max_gen`` new tokens with top-k sampling and print the decoded result.

    Args:
        x: Integer tensor of token ids; new tokens are appended along the
            last dimension.
        max_gen: Number of tokens to append.
        top_k: Number of highest-probability tokens to sample from at each
            step. Clamped to the vocabulary size.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.

    NOTE(review): ``m`` (the model) and ``dec`` (the decoder) are not defined
    in the visible part of the notebook; they are used here as globals exactly
    as in the original. The context is not cropped, so the prompt length plus
    ``max_gen`` presumably has to fit within the model's context window.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")
    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(max_gen):
            logits = m(x)
            # Only the prediction for the last position is needed.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim = -1)
            k = min(top_k, probs.size(-1))
            topk_probs, topk_indices = torch.topk(probs, k, dim = -1)
            # Sample within the top-k table so `ix` is a valid column of
            # `topk_indices`; multinomial renormalises the rows itself.
            ix = torch.multinomial(topk_probs, num_samples = 1)
            xcol = torch.gather(topk_indices, -1, ix)
            x = torch.cat((x, xcol), dim = -1)
    print(dec(x))
max_gen = 10
# NOTE(review): the original call was `torch.tensor(, ...)` with the prompt
# token ids missing, which is a syntax error. `[[0]]` (one sequence holding
# the single token id 0) is a placeholder so the cell parses; replace it with
# the intended prompt ids, keeping the 2-D (batch, time) layout.
x = torch.tensor([[0]], dtype = torch.long, device = config.device)
generate(x, max_gen)

In [None]:
# EDA
# Read the file once and close it deterministically; the raw text feeds the
# vocabulary and its lines feed the list of names.
with open("Dataset/names.txt") as names_file:
    text = names_file.read()
# `text` is the raw file content, so newline characters in the file are
# counted as vocabulary entries too.
vocab = sorted(set(text))
vocab_size = len(vocab) + 1     # 1 refers to pad token
print("-"*80)
print("EDA")
print("-"*80)
print(f"Vocabulary : \n{vocab}\n\nVocab size : {vocab_size}\n")

data = text.splitlines()
print(f"First ten samples before shuffling : \n{data[:10]}\n")
# Fixed seed so the shuffle is the same on every run.
random.seed(13377)
random.shuffle(data)
print(f"First ten samples after shuffling : \n{data[:10]}\n")
# Index of the first longest name; `default` keeps max_ix at 0 for an empty list.
max_ix = max(range(len(data)), key = lambda i: len(data[i]), default = 0)
print(f"Longest input : {data[max_ix]}\t\tLength : {len(data[max_ix])}\n")
print("-"*80)


# Data preprocessing
def encode(data):
    """Replace every string in ``data`` with a ``torch.long`` tensor of character ids.

    The list is modified in place and nothing is returned. Ids come from the
    global ``stoi`` lookup, which is not defined in the visible part of the
    notebook.
    """
    for pos in range(len(data)):
        char_ids = [stoi[ch] for ch in data[pos]]
        data[pos] = torch.tensor(char_ids, dtype = torch.long)

def decode(ix):
    """Join the ``itos`` entry of every index in ``ix`` into a single string.

    ``itos`` is a global lookup that is not defined in the visible part of the
    notebook.
    """
    return ''.join(itos[i] for i in ix)

def pad_sequences(data, pad_token, max_length):
    """Right-pad every tensor in ``data`` with ``pad_token`` up to ``max_length``.

    The list is modified in place and nothing is returned. Entries whose
    length already equals ``max_length`` are left untouched.
    """
    for pos, seq in enumerate(data):
        if len(seq) == max_length:
            continue
        filler = torch.full((max_length - len(seq),), pad_token)
        data[pos] = torch.cat((seq, filler))

def split(data):
    """Build stacked input/target tensors and split them 90/10 into train and validation.

    Inputs are the first ``block_size`` items of each sequence (``block_size``
    is a global not defined in the visible part of the notebook); targets are
    each sequence without its first item.

    Returns:
        Tuple ``(xtr, ytr, xval, yval)``.
    """
    cut = int(0.9*len(data))
    inputs, targets = [], []
    for seq in data:
        inputs.append(seq[:block_size])
        targets.append(seq[1:])
    train_x = torch.stack(inputs[:cut])
    train_y = torch.stack(targets[:cut])
    val_x = torch.stack(inputs[cut:])
    val_y = torch.stack(targets[cut:])
    return train_x, train_y, val_x, val_y

def get_batch(mode):
    """Draw a random batch of ``batch_size`` rows and move it to ``device``.

    Args:
        mode: ``"train"`` samples from ``xtr``/``ytr``; any other value
            samples from ``xval``/``yval``.

    Returns:
        Tuple ``(xb, yb)`` holding the same randomly chosen rows of the
        inputs and the targets.
    """
    inputs, targets = (xtr, ytr) if mode == "train" else (xval, yval)
    # Rows are drawn with replacement; the same indices select x and y.
    rows = torch.randint(len(inputs), (batch_size,))
    batch_x = inputs[rows]
    batch_y = targets[rows]
    return batch_x.to(device), batch_y.to(device)

# Preprocessing pipeline. `encode` and `pad_sequences` both rewrite `data` in
# place, so the order of these calls matters and `data` no longer holds
# strings once this cell has run.
encode(data)
# NOTE(review): `pad_token` is not defined in the visible part of the notebook,
# and 24 is hard-coded rather than derived from the longest name — confirm both.
pad_sequences(data, pad_token, max_length = 24)
# The four tensors are module-level globals that `get_batch` reads.
xtr, ytr, xval, yval = split(data)
print(f"Train data size : {len(xtr)}\n")
print(f"Val data size : {len(xval)}\n")
print("-"*80)

In [5]:
import tiktoken

In [8]:
# Look up tiktoken's 'o200k_base' encoding and keep it in `enc`.
# NOTE(review): `enc` is not used anywhere in the visible cells — confirm it is still needed.
enc = tiktoken.get_encoding('o200k_base')