In [None]:
from IPython.display import HTML, display
import sys
import time

def set_css(*_event_args):
  """Emit a ``<style>`` block that makes ``<pre>`` output wrap long lines.

  The function is meant to be registered as an IPython ``pre_run_cell``
  callback. IPython documents that event as passing an execution-info object
  to its callbacks, so any positional arguments are accepted and ignored;
  calling ``set_css()`` with no arguments still works.
  """
  wrap_rule = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_rule))
# Run set_css before every cell execution by hooking IPython's 'pre_run_cell' event.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tokenizers import ByteLevelBPETokenizer, Tokenizer

In [None]:
# Load the serialized tokenizer from a hard-coded path; this presumably relies
# on Google Drive already being mounted at /content/drive — TODO confirm.
tokenizer = Tokenizer.from_file("/content/drive/MyDrive/bpe.tokenizer.json")

In [None]:
# --- Model shape -------------------------------------------------------------
vocab_size = 1000   # rows of the token embedding and width of the output logits
block_size = 320    # longest context window (also the size of the position table)
n_embd = 512        # embedding width used throughout the network
ma_head = 16        # attention heads per Block (passed to Block as n_head)
n_blocks = 3        # number of stacked Blocks
dropout = 0.25      # probability shared by every nn.Dropout layer

# --- Optimisation / evaluation settings --------------------------------------
# Not referenced by the visible cells; presumably training-time settings.
batch_size = 64
learning_rate = 1e-3
max_iters = 5001
eval_interval = 500
eval_iters = 100

# --- Runtime -----------------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Seed PyTorch's global RNG before the model is built; the same RNG also drives
# the torch.multinomial sampling used by generate().
torch.manual_seed(9)


class Head(nn.Module):
    """A single causal self-attention head.

    The input is projected to keys, queries and values of width ``head_size``.
    Each position attends only to itself and earlier positions, and the
    attention-weighted values are returned.
    """

    def __init__(self, head_size):
        """Create the three bias-free projections, the causal mask and dropout.

        Args:
            head_size: output width of the key/query/value projections.
        """
        super().__init__()
        # Attribute names (and the persistent 'tril' buffer) define the
        # state_dict keys, so they must stay exactly as they are.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked attention to ``x`` of shape (B, T, C).

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, seq_len, channels = x.shape
        keys = self.key(x)
        queries = self.query(x)
        values = self.value(x)

        # NOTE(review): the scale uses the input width C rather than head_size.
        # It is left untouched because changing it would alter the output of
        # weights that were already trained with this scaling.
        scale = channels**-0.5
        scores = queries @ keys.transpose(-2, -1) * scale

        # Positions above the diagonal are "future" tokens: give them -inf so
        # softmax assigns them zero weight.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))

        attention = self.dropout(F.softmax(scores, dim=-1))
        return attention @ values



class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, followed by a projection.

    Every head sees the same input; their outputs are concatenated along the
    last dimension, projected back to ``n_embd`` and passed through dropout.
    """

    def __init__(self, num_heads, head_size):
        """Build ``num_heads`` heads of width ``head_size`` plus the projection.

        Args:
            num_heads: number of parallel ``Head`` modules.
            head_size: output width of each head.
        """
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size. That equals n_embd whenever n_embd divides
        # evenly by the head count (same parameter shape as before), and it
        # keeps forward() working when the division leaves a remainder.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all head outputs for ``x``."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out



class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4x, apply ReLU, narrow back, then dropout."""

    def __init__(self, n_embd):
        """Assemble the layer stack for inputs whose last dimension is ``n_embd``."""
        super().__init__()
        hidden_width = 4 * n_embd
        # The position of each layer in the Sequential fixes its state_dict
        # key (net.0.*, net.2.*), so the order must not change.
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the output has the same shape as ``x``."""
        transformed = self.net(x)
        return transformed


class Block(nn.Module):
    """Pre-norm residual block: self-attention, then a feed-forward MLP."""

    def __init__(self, n_embd, n_head):
        """Create the attention and MLP sub-layers and their LayerNorms.

        Args:
            n_embd: embedding width of the block's input and output.
            n_head: number of attention heads; each gets ``n_embd // n_head``.
        """
        super().__init__()
        per_head_width = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply both residual sub-layers; the output has the same shape as ``x``."""
        # Each sub-layer sees a normalised copy of the stream and its result is
        # added back onto the un-normalised stream.
        attended = self.sa(self.ln1(x))
        x = x + attended
        mlp_out = self.ffwd(self.ln2(x))
        return x + mlp_out



class BigramLanguageModel(nn.Module):
    """Token-level language model built from a stack of ``Block`` modules.

    Token and learned position embeddings are summed, passed through
    ``n_blocks`` blocks and a final LayerNorm, then mapped to vocabulary
    logits by a linear head.
    """

    def __init__(self, vocab_size):
        """Create the embeddings, the block stack and the output head.

        Args:
            vocab_size: number of distinct token ids (embedding rows and logit width).
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=ma_head) for _ in range(n_blocks)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits, and the cross-entropy loss when ``targets`` is given.

        Args:
            idx: integer tensor of token ids, shape (B, T). T must not exceed
                ``block_size`` because positions index the position table.
            targets: optional integer tensor of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            (B, T, vocab_size) and loss is ``None``. With targets, logits are
            flattened to (B*T, vocab_size) and loss is a scalar tensor.
        """
        B, T = idx.shape
        tok_embd = self.token_embedding_table(idx)
        # Build the position indices on the same device as the input rather
        # than on a notebook-level global, so the lookup can never mix devices.
        positions = torch.arange(T, device=idx.device)
        pos_embd = self.position_embedding_table(positions)
        x = tok_embd + pos_embd
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # cross_entropy expects (N, C) logits against (N,) class indices.
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @staticmethod
    def _stream_text(text, delay):
        """Write ``text`` to stdout one character at a time, pausing ``delay`` seconds."""
        for char in text:
            sys.stdout.write(char)
            sys.stdout.flush()
            if delay > 0:
                time.sleep(delay)

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, stream=True, delay=0.05):
        """Sample ``max_new_tokens`` tokens after the prompt ``idx``.

        Sampling runs without gradient tracking. When ``stream`` is true the
        decoded prompt and every newly sampled token of the first batch row
        are written to stdout as they are produced, using the notebook-level
        ``tokenizer``.

        Args:
            idx: integer tensor of prompt token ids, shape (B, T).
            max_new_tokens: number of tokens to sample.
            stream: whether to print the text progressively (default ``True``).
            delay: pause in seconds after each printed character; ``0``
                disables the pause.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens) holding the prompt
            followed by the sampled tokens, so callers can decode it.
        """
        # Put the prompt wherever the weights live so the embedding lookups work.
        idx = idx.to(self.lm_head.weight.device)
        if stream:
            self._stream_text(tokenizer.decode(idx[0].tolist()), delay)

        for _step in range(max_new_tokens):
            # The position table only covers block_size positions: keep the tail.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            last_logits = logits[:, -1, :]
            probs = F.softmax(last_logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
            if stream:
                self._stream_text(tokenizer.decode(idx_next[0].tolist()), delay)

        return idx


# Build the network with the configured vocabulary size; its freshly
# initialised weights are overwritten by the checkpoint loaded on the next line.
model = BigramLanguageModel(vocab_size)
# map_location decides where the checkpoint tensors are placed when read.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
# NOTE(review): the module itself is not moved with .to(device) in the visible
# cells, so its parameters presumably stay on the CPU — confirm this is intended.
model.load_state_dict(torch.load("/content/drive/MyDrive/llmodel2.pt", map_location=torch.device(device)))
# Switch to evaluation mode (dropout becomes a no-op). As the final expression
# of the cell, the returned module is what the notebook echoes.
model.eval()

BigramLanguageModel(
  (token_embedding_table): Embedding(1000, 512)
  (position_embedding_table): Embedding(320, 512)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-15): 16 x Head(
            (key): Linear(in_features=512, out_features=32, bias=False)
            (query): Linear(in_features=512, out_features=32, bias=False)
            (value): Linear(in_features=512, out_features=32, bias=False)
            (dropout): Dropout(p=0.25, inplace=False)
          )
        )
        (proj): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.25, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
          (3): Dropout(p=0.25, inplace=False)
        )
      )
      (ln1): LayerNorm((512,), eps=1e-05, elementw

In [None]:
# Prompt text that the model is asked to continue.
my_input = "Things get better when"
# Encode the prompt into a (1, T) batch of token ids, created on whichever
# device the model's weights live on so the two can never disagree.
context = torch.tensor(
    tokenizer.encode(my_input).ids,
    dtype=torch.long,
    device=next(model.parameters()).device,
).view(1, -1)
# generate() writes the text to stdout as it samples; the trailing ';' keeps the
# notebook from echoing whatever the call returns.
model.generate(context, max_new_tokens=200);

Things get better when a general population of 40 per cent of the majority . The majority of the Family  was 15 per cent in the Great Buddhist connected MDOT horsepower ( including outside Wiki ) , and in DDOT modified by New South Wales . 
 The buddhist contribution lightly  to the  and western cost of the U.S. authorities 4  ,  8005 defining at least turnpoints about 1  ,  500 by 1  ,  500  ,  900 pm in flight , and as of April 1997 authorities are located in North Wales . From through renovations of April 2010 ,

In [None]:
import sys
import time

def print_progressively_dynamic(text, delay=0.05):
    """Write ``text`` to stdout one character at a time.

    Args:
        text: the characters to emit, in order.
        delay: seconds to sleep after each character (default 0.05).
    """
    for ch in iter(text):
        stream = sys.stdout  # looked up for every character
        stream.write(ch)
        stream.flush()       # push the character out immediately
        time.sleep(delay)

def update_text_and_print(new_text):
    """Stream ``new_text`` to stdout character by character, then end the line."""
    print_progressively_dynamic(new_text)
    # Bare print() emits the trailing newline after the streamed text.
    print()

# generate() writes the decoded text to stdout itself while it samples, so it is
# called for that effect only; its result is neither indexed nor decoded again
# (doing so failed when generate() returned None).
model.generate(context, max_new_tokens=100)
print()  # terminate the streamed line

why are we here? . Among these investments must import themselves in the heart of the show , we have many was lives from mostly rowing community . These anthems are only weight , that studied rowing , and the attacks are outtrowing . The couple at
