In [None]:
import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Hyper-parameters read (as module-level globals) by the model classes below.
block_size = 1024   # maximum sequence length the model accepts
vocab_size = 50257  # GPT-2 vocabulary size; NOT padded, so pretrained tensors match in shape

n_layer = 12        # number of transformer blocks
n_pos = 1024        # default table length of FixedPositionalEmbedding
n_head = 12         # attention heads per block
n_embd = 768        # embedding width
dropout = 0.1
# True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster.
# NOTE(review): no model class visible in this notebook reads this flag.
bias = True

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch of embeddings.

    Even feature columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``. The table is stored as the buffer ``pe`` with
    shape ``(1, max_len, d_model)``.

    Args:
        d_model: Width of the last dimension of the inputs.
        max_len: Number of positions to precompute.
    """

    # The constructor must be spelled ``__init__`` (double underscores);
    # with ``_init_`` Python never calls it and ``pe`` is never registered.
    def __init__(self, d_model, max_len=512):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so trim the cosine block to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)].detach()


class Conv1D(nn.Module):
    """Linear layer whose weight is stored as ``(in_features, out_features)``.

    Computes ``x @ weight + bias`` over the last dimension of ``x``. The
    weight layout is the transpose of ``nn.Linear``'s.

    Args:
        nf: Number of output features.
        nx: Number of input features.
    """

    def __init__(self, nf, nx):
        super().__init__()
        self.nf = nf
        self.weight = nn.Parameter(torch.empty(nx, nf))
        self.bias = nn.Parameter(torch.zeros(nf))
        nn.init.normal_(self.weight, std=0.02)

    def forward(self, x):
        """Project the last dimension of ``x`` from ``nx`` to ``nf`` features."""
        size_out = x.size()[:-1] + (self.nf,)
        # reshape (not view): view raises on non-contiguous inputs such as the
        # result of a transpose, reshape copies only when it has to.
        x = torch.addmm(self.bias, x.reshape(-1, x.size(-1)), self.weight)
        return x.view(size_out)


class NewGELUActivation(nn.Module):
    """Tanh-based approximation of the GELU activation."""

    def forward(self, input):
        """Apply ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))`` element-wise."""
        inner = input + 0.044715 * torch.pow(input, 3.0)
        gate = torch.tanh(math.sqrt(2.0 / math.pi) * inner)
        return 0.5 * input * (1.0 + gate)



In [None]:
class GPT2Attention(nn.Module):
    """Causal multi-head self-attention.

    Sizes come from the notebook-level globals ``n_embd``, ``n_head``,
    ``dropout`` and ``block_size`` as they are when the instance is created.
    """

    def __init__(self):
        super().__init__()
        assert n_embd % n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = Conv1D(3 * n_embd, n_embd)
        # output projection
        self.c_proj = Conv1D(n_embd, n_embd)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        # Lower-triangular causal mask read by forward() as ``self.bias``.
        # Without it forward() raises AttributeError. Registered as
        # non-persistent so it follows .to(device) but adds no state_dict key.
        causal = torch.tril(torch.ones(block_size, block_size, dtype=torch.bool))
        self.register_buffer("bias", causal.view(1, 1, block_size, block_size), persistent=False)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a tensor of the same shape."""
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # positions in the future (mask == 0) get -inf so softmax gives them zero weight
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class GPT2MLP(nn.Module):
    """Position-wise feed-forward block: expand to 4*n_embd, GELU, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden = 4 * n_embd
        self.c_fc = Conv1D(hidden, n_embd)    # n_embd -> 4*n_embd
        self.c_proj = Conv1D(n_embd, hidden)  # 4*n_embd -> n_embd
        self.act = NewGELUActivation()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the feed-forward block to the last dimension of ``x``."""
        return self.dropout(self.c_proj(self.act(self.c_fc(x))))


In [None]:
class GPT2Block(nn.Module):
    """Pre-LayerNorm transformer block: attention and MLP, each with a residual connection."""

    def __init__(self):
        super().__init__()
        self.ln_1 = nn.LayerNorm((n_embd,), eps=1e-05, elementwise_affine=True)
        self.attn = GPT2Attention()
        self.ln_2 = nn.LayerNorm((n_embd,), eps=1e-05, elementwise_affine=True)
        self.mlp = GPT2MLP()

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

class GPT(nn.Module):
    """GPT-2 style decoder-only language model.

    Built from the notebook-level globals ``vocab_size``, ``n_embd``,
    ``block_size``, ``dropout`` and ``n_layer`` as they are when the instance
    is created.
    """

    def __init__(self):
        super().__init__()

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd),
            wpe = nn.Embedding(block_size, n_embd),
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([GPT2Block() for _ in range(n_layer)]),
            ln_f = nn.LayerNorm((n_embd,), eps=1e-05, elementwise_affine=True),
        ))
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Run the model on token ids ``idx`` of shape (b, t).

        Returns:
            ``(logits, loss)``. With ``targets`` the logits cover every
            position and ``loss`` is the cross-entropy (targets equal to -1
            are ignored). Without ``targets`` only the last position's logits
            are computed, shape (b, 1, vocab), and ``loss`` is ``None``.
        """
        device = idx.device
        b, t = idx.size()
        # Use the context length this instance was built with rather than the
        # global ``block_size``, which the notebook rebinds later on.
        max_len = self.transformer.wpe.num_embeddings
        assert t <= max_len, f"Cannot forward sequence of length {t}, block size is only {max_len}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        # 'block', not 'GPT2Block': do not shadow the class name inside this method
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Args:
            idx: Token ids of shape (b, t) used as the prompt.
            max_new_tokens: Number of tokens to sample.
            temperature: Divisor applied to the logits before softmax.
            top_k: If given, only the ``top_k`` highest logits stay eligible.

        Returns:
            Token ids of shape (b, t + max_new_tokens).
        """
        max_len = self.transformer.wpe.num_embeddings
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it to the model's context length
            idx_cond = idx if idx.size(1) <= max_len else idx[:, -max_len:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


In [None]:
# Get the state dictionary of your GPT2Small model
model = GPT()
# model=model.to(device)
model_state_dict = model.state_dict()


In [None]:
from transformers import GPT2Config, GPT2LMHeadModel

pretrained_model = GPT2LMHeadModel.from_pretrained("gpt2")
pretrained_state_dict = pretrained_model.state_dict()

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Copy every pretrained tensor whose name and shape both match into our model.
# state_dict() tensors share storage with the parameters, so the in-place
# copy_ updates the model itself. Non-matching entries are skipped silently.
for name, param in pretrained_state_dict.items():
    if name not in model_state_dict:
        continue
    if param.size() != model_state_dict[name].size():
        continue
    model_state_dict[name].copy_(param)



In [None]:
# Confirm, tensor by tensor, that our model now holds the pretrained values.
for name, param in model_state_dict.items():
    print(name, torch.equal(param, pretrained_state_dict[name]))

transformer.wte.weight True
transformer.wpe.weight True
transformer.h.0.ln_1.weight True
transformer.h.0.ln_1.bias True
transformer.h.0.attn.c_attn.weight True
transformer.h.0.attn.c_attn.bias True
transformer.h.0.attn.c_proj.weight True
transformer.h.0.attn.c_proj.bias True
transformer.h.0.ln_2.weight True
transformer.h.0.ln_2.bias True
transformer.h.0.mlp.c_fc.weight True
transformer.h.0.mlp.c_fc.bias True
transformer.h.0.mlp.c_proj.weight True
transformer.h.0.mlp.c_proj.bias True
transformer.h.1.ln_1.weight True
transformer.h.1.ln_1.bias True
transformer.h.1.attn.c_attn.weight True
transformer.h.1.attn.c_attn.bias True
transformer.h.1.attn.c_proj.weight True
transformer.h.1.attn.c_proj.bias True
transformer.h.1.ln_2.weight True
transformer.h.1.ln_2.bias True
transformer.h.1.mlp.c_fc.weight True
transformer.h.1.mlp.c_fc.bias True
transformer.h.1.mlp.c_proj.weight True
transformer.h.1.mlp.c_proj.bias True
transformer.h.2.ln_1.weight True
transformer.h.2.ln_1.bias True
transformer.h.2.

In [None]:
import torch
import math
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'  # or 'gpt2-medium', 'gpt2-large', depending on the size you want
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

print("hii")

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

hii


In [None]:
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
model.eval()
# Sample input text
input_text = "Once upon a time"

model=model.to(device)
# Tokenize the input text
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Generate predictions
with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=50)

# Decode the generated output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:", generated_text)

Generated Text: Once upon a time, any consigned to copious medical terms would have so far been accursed and refused medical practices.

But no less emphatic under the legal code, physicians who had previously been found guilty were punished. They were closed prisons, and life


In [None]:
# Generate predictions with the reference Hugging Face model
pretrained_model = pretrained_model.to(device)
pretrained_model.eval()
with torch.no_grad():
    output_ = pretrained_model.generate(
        input_ids,
        # input_ids is a single unpadded prompt, so every position is a real token.
        # Passing the mask and pad id explicitly avoids the
        # "attention mask and the pad token id were not set" warning.
        attention_mask=torch.ones_like(input_ids),
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=50,
    )

generated_text_ = tokenizer.decode(output_[0], skip_special_tokens=True)
print("Generated Text by GPT:", generated_text_)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text by GPT: Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger


Loading Shakespeare Data

In [None]:
# read it in to inspect it
with open('/content/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [None]:
# let's look at the first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [None]:
# Character-level vocabulary: every distinct character in the text, sorted.
# Note: this rebinds the global vocab_size (previously the GPT-2 value).
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
# Lookup tables between characters and their integer ids (index in `chars`).
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join([itos[i] for i in l])


print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [None]:
import torch

In [None]:
# Encode the whole corpus once as a 1-D tensor of int64 character ids.
data_encoded = torch.tensor(encode(text), dtype=torch.long)
print(data_encoded.shape, data_encoded.dtype)
# The 1000 characters we looked at earlier look like this to the GPT.
print(data_encoded[:1000])

In [None]:
# Split the encoded corpus: the first 90% is training data, the rest validation.
n = int(0.9*len(data_encoded))
train_data_encoded, val_data_encoded = data_encoded[:n], data_encoded[n:]

In [None]:
# --- data / optimisation ---
batch_size = 64       # how many independent sequences will we process in parallel?
block_size = 256      # what is the maximum context length for predictions?
max_iters = 1000      # number of optimizer steps in the training loops
eval_interval = 50    # evaluate the loss every this many steps
eval_iters = 200      # batches averaged per evaluation
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- model size (rebinds the GPT-2 values set earlier) ---
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [None]:
torch.manual_seed(1337)
# batch_size = 4 # how many independent sequences will we process in parallel?
# block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: ``'train'`` selects ``train_data_encoded``; any other value
            selects ``val_data_encoded``.

    Returns:
        ``(x, y)``, each of shape (batch_size, block_size). ``y`` is ``x``
        shifted one position to the right (the next-token targets).
    """
    data = train_data_encoded if split == 'train' else val_data_encoded
    # Sample from the selected split only. Indexing the full ``data_encoded``
    # here would mix training and validation text, so the validation loss
    # would no longer measure generalisation.
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

inputs:
torch.Size([64, 256])
tensor([[59, 58,  1,  ..., 39, 57,  6],
        [43, 56,  1,  ..., 37, 43, 57],
        [58, 46, 53,  ..., 17, 26, 14],
        ...,
        [46, 43, 39,  ..., 42, 43, 57],
        [18, 18, 10,  ..., 53, 56,  1],
        [45, 46, 39,  ...,  1, 58, 46]])
targets:
torch.Size([64, 256])
tensor([[58,  1, 15,  ..., 57,  6,  1],
        [56,  1, 24,  ..., 43, 57,  6],
        [46, 53, 59,  ..., 26, 14, 33],
        ...,
        [43, 39, 56,  ..., 43, 57, 54],
        [18, 10,  0,  ..., 56,  1, 40],
        [46, 39, 51,  ..., 58, 46, 53]])
----


# Model..

In [None]:
import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:

vocab_size = 65  # number of distinct characters printed by the vocabulary cell
n_pos = 1024     # default table length of FixedPositionalEmbedding
# True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster.
# NOTE(review): no model class visible in this notebook reads this flag.
bias = True

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Training our base model on dataset..

In [None]:
# Build the base model and its optimizer.
model = GPT()
model = model.to(device)
# Use the configured learning_rate instead of repeating the literal 1e-3,
# so changing the hyper-parameter cell actually changes training.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(65, 384)
    (wpe): Embedding(256, 384)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=384, out_features=65, bias=False)
)

In [None]:
eval_iters = 100  # overrides the value set in the hyper-parameter cell


@torch.no_grad()
def estimate_loss(model):
    """Average the loss over ``eval_iters`` random batches for each split.

    Switches ``model`` to eval mode for the measurement and to train mode
    afterwards, regardless of the mode it was in before the call.

    Returns:
        Dict with keys ``'train'`` and ``'val'`` mapping to 0-d tensors.
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))



?qfzxbDkRZkNdc'wj,ZT,OLFL,eHtK
bHiPjCkMBbeAu3:XaSvgO-3qjMBF?gLUauhX:LVULFQ&eNuwqcOMxv.t?Vr dXlrDZaoeNFw3XHPpvWk,fDE$nYzxzqjNmX
Yo3&$FMtofViEIiB!!&VmOW;Kd!lKx,Ke3 ixYeYERnXciK;lxW;HFGidroG EsSXUB;qWk p.YGD3.lYWjbm!pelJlLnFAmVQF.C-hx;3qcncwvbN:?Uuv;MaiT'X3Uwty;MJlvBPUHI.yBm&pjY-lgvIEjMk:DGyqwJdqGMtSkklmoyW-SQA&QhdGC
Iib3qM'yS!-&fM$HZLETxgGGhx&$FsgC-LB3:Ae-xT3H
hAxkMMmnvbrufWqA s
;;3;QDLWTZ:fvt,Cdy.vlMUE$,w,fMFMPRD?CqYLSoB.UrHK-NLbk!ar,$yb&i&:
:rdsabWG$!JEgDLHYBvuihJKNuk?Dyr?:nyHRrxutM-I&fy&VE?!NMJ


In [None]:
# Put the model in training mode explicitly rather than relying on the
# side effect of estimate_loss(), which ends with model.train().
model.train()

# 'step' rather than 'iter': a module-level 'iter' shadows the builtin for
# every later cell in the kernel.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss(model)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')
    xb, yb = xb.to(device), yb.to(device)

    # evaluate the loss and take one optimizer step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



step 0: train loss 4.3321, val loss 4.3311
step 50: train loss 2.4802, val loss 2.4817
step 100: train loss 2.4320, val loss 2.4320
step 150: train loss 2.3295, val loss 2.3314
step 200: train loss 2.1803, val loss 2.1808
step 250: train loss 2.0407, val loss 2.0401
step 300: train loss 1.9079, val loss 1.9108
step 350: train loss 1.8039, val loss 1.8058
step 400: train loss 1.7218, val loss 1.7216
step 450: train loss 1.6575, val loss 1.6572
step 500: train loss 1.6021, val loss 1.6028
step 550: train loss 1.5627, val loss 1.5632
step 600: train loss 1.5198, val loss 1.5246
step 650: train loss 1.4880, val loss 1.4887
step 700: train loss 1.4615, val loss 1.4634
step 750: train loss 1.4320, val loss 1.4287
step 800: train loss 1.4096, val loss 1.4108
step 850: train loss 1.3866, val loss 1.3932
step 900: train loss 1.3692, val loss 1.3704
step 950: train loss 1.3527, val loss 1.3531
step 999: train loss 1.3373, val loss 1.3403


In [None]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))



Commen he knock!

LEONTES:
Sir, thus prithee?

FER:
Yet you fieur bed; for safter you.

PETRUCHIO:
A darest memchanns. Thou love discoce;
Indoled live make good so bise;
This what I have-rogs no; become yet bear mocked
And 'twas spick'd; and I am again.

GRUMIO:
The con I stavest
Shorter the forbired of your lajest inderath!

GRUMIO:
Now nay; brave! Ka
hang been, the nurse I will before
What wicke, I am them have bold and I head,
And what yet to grant carge years;
And thou thee were you child, y


## Rotary Position Encoding

In [None]:
class FixedPositionalEmbedding(nn.Module):
    """Precomputed sinusoid table; the rotary attention layers use it as their position signal.

    The buffer ``emb`` has one row per position: the sines of
    ``position * inv_freq`` followed by the cosines of the same angles.

    Args:
        dim: Feature size the table is built for (the attention layers pass the head size).
        max_seq_len: Number of positions to precompute.
    """

    def __init__(self, dim, max_seq_len=n_pos):
        super().__init__()
        exponents = torch.arange(0, dim, 2).float() / dim
        inv_freq = 1. / (10000 ** exponents)
        position = torch.arange(0, max_seq_len, dtype=torch.float)
        # outer product: angle[i, j] = position[i] * inv_freq[j]
        sinusoid_inp = position[:, None] * inv_freq[None, :]
        self.register_buffer('emb', torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1))

    def forward(self, x):
        """Return the first ``x.shape[1]`` rows with a leading axis of size 1, cast to ``x``'s dtype/device."""
        seq_len = x.shape[1]
        return self.emb[:seq_len].unsqueeze(0).to(x)

In [None]:
import locale
print(locale.getpreferredencoding())

ANSI_X3.4-1968


In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    """Replacement for ``locale.getpreferredencoding`` that always reports UTF-8.

    ``do_setlocale`` is accepted for signature compatibility and ignored.
    """
    return "UTF-8"


# Monkey-patch the locale module; the previous cell shows the runtime
# reporting ANSI_X3.4-1968 instead.
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
!pip install einops

Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0


In [None]:
from einops import rearrange, repeat


In [None]:
def rotate_every_two(x):
    """Map each consecutive pair ``(a, b)`` along the last axis to ``(-b, a)``.

    The last dimension of ``x`` must be even; the output has the same shape.
    """
    pairs = rearrange(x, '... (d j) -> ... d j', j = 2)
    first, second = pairs[..., 0], pairs[..., 1]
    rotated = torch.stack((-second, first), dim = -1)
    return rearrange(rotated, '... d j -> ... (d j)')

def apply_rotary_pos_emb(q, k, sinu_pos):
    """Apply the rotary position embedding to queries and keys.

    Args:
        q, k: Tensors whose last two axes are (sequence, features).
        sinu_pos: Table with a leading axis of size 1 whose last axis holds
            the sines followed by the cosines (FixedPositionalEmbedding layout).

    Returns:
        ``(q_rot, k_rot)`` with the same shapes as ``q`` and ``k``.
    """
    sinu_pos = rearrange(sinu_pos, '() n (j d) -> n j d', j = 2)
    sin, cos = sinu_pos[:, 0], sinu_pos[:, 1]
    # duplicate every frequency so it covers both members of a feature pair
    sin = repeat(sin, 'b n -> b (n j)', j = 2)
    cos = repeat(cos, 'b n -> b (n j)', j = 2)

    def _rotate(t):
        return (t * cos) + (rotate_every_two(t) * sin)

    return _rotate(q), _rotate(k)

In [None]:
class GPT2Attention_rotary(nn.Module):
    """Causal multi-head self-attention with rotary position embedding on q and k.

    Sizes come from the notebook-level globals ``n_embd``, ``n_head``,
    ``dropout`` and ``block_size`` as they are when the instance is created.
    """

    def __init__(self):
        super().__init__()
        assert n_embd % n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = Conv1D(3 * n_embd, n_embd)
        # output projection
        self.c_proj = Conv1D(n_embd, n_embd)

        # sinusoid table sized for one head
        self.pos = FixedPositionalEmbedding(n_embd // n_head)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        # Lower-triangular causal mask read by forward() as ``self.bias``.
        # Without it forward() raises AttributeError. Registered as
        # non-persistent so it follows .to(device) but adds no state_dict key.
        causal = torch.tril(torch.ones(block_size, block_size, dtype=torch.bool))
        self.register_buffer("bias", causal.view(1, 1, block_size, block_size), persistent=False)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a tensor of the same shape."""
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        x = self.c_attn(x)

        # only the sequence length, dtype and device of x are used by self.pos
        pos = self.pos(x)
        q, k, v = x.split(self.n_embd, dim=2)

        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # rotary embedding: rotate q and k by their position (v is left untouched)
        q, k = apply_rotary_pos_emb(q, k, pos)

        # manual implementation of attention
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # positions in the future (mask == 0) get -inf so softmax gives them zero weight
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y




In [None]:
class GPT2Block_rotary(nn.Module):
    """Pre-LayerNorm transformer block whose attention layer is GPT2Attention_rotary."""

    def __init__(self):
        super().__init__()
        self.ln_1 = nn.LayerNorm((n_embd,), eps=1e-05, elementwise_affine=True)
        self.attn = GPT2Attention_rotary()
        self.ln_2 = nn.LayerNorm((n_embd,), eps=1e-05, elementwise_affine=True)
        self.mlp = GPT2MLP()

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

class GPT_rotary(nn.Module):
    """GPT-style language model whose blocks use rotary attention.

    Built from the notebook-level globals ``vocab_size``, ``n_embd``,
    ``block_size``, ``dropout`` and ``n_layer`` as they are when the instance
    is created. The learned absolute position embedding ``wpe`` is still
    added to the token embeddings, in addition to the rotary embedding
    applied inside each attention layer.
    """

    def __init__(self):
        super().__init__()

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd),
            wpe = nn.Embedding(block_size, n_embd),
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([GPT2Block_rotary() for _ in range(n_layer)]),
            ln_f = nn.LayerNorm((n_embd,), eps=1e-05, elementwise_affine=True),
        ))
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Run the model on token ids ``idx`` of shape (b, t).

        Returns:
            ``(logits, loss)``. With ``targets`` the logits cover every
            position and ``loss`` is the cross-entropy (targets equal to -1
            are ignored). Without ``targets`` only the last position's logits
            are computed, shape (b, 1, vocab), and ``loss`` is ``None``.
        """
        device = idx.device
        b, t = idx.size()
        # Use the context length this instance was built with rather than the
        # global ``block_size``, which can be rebound after construction.
        max_len = self.transformer.wpe.num_embeddings
        assert t <= max_len, f"Cannot forward sequence of length {t}, block size is only {max_len}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        # 'block', not 'GPT2Block_rotary': do not shadow the class name inside this method
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Args:
            idx: Token ids of shape (b, t) used as the prompt.
            max_new_tokens: Number of tokens to sample.
            temperature: Divisor applied to the logits before softmax.
            top_k: If given, only the ``top_k`` highest logits stay eligible.

        Returns:
            Token ids of shape (b, t + max_new_tokens).
        """
        max_len = self.transformer.wpe.num_embeddings
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it to the model's context length
            idx_cond = idx if idx.size(1) <= max_len else idx[:, -max_len:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


Now training our rotary embedding based model on this dataset...

In [None]:
model_rotary=GPT_rotary()
model_rotary=model_rotary.to(device)

In [None]:
# Create the optimizer for the rotary model. Use the configured learning_rate
# instead of repeating the literal 1e-3.
optimizer = torch.optim.AdamW(model_rotary.parameters(), lr=learning_rate)

In [None]:
eval_iters = 100


# Same behaviour as the estimate_loss defined for the base model; this cell
# re-defines it so the rotary section can be run on its own.
@torch.no_grad()
def estimate_loss(model):
    """Average the loss over ``eval_iters`` random batches for each split.

    Switches ``model`` to eval mode for the measurement and to train mode
    afterwards, regardless of the mode it was in before the call.

    Returns:
        Dict with keys ``'train'`` and ``'val'`` mapping to 0-d tensors.
    """
    def _mean_loss(split):
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            losses[k] = model(X, Y)[1].item()
        return losses.mean()

    model.eval()
    out = {split: _mean_loss(split) for split in ['train', 'val']}
    model.train()
    return out

In [None]:
# Put the model in training mode explicitly rather than relying on the
# side effect of estimate_loss(), which ends with model.train().
model_rotary.train()

# 'step' rather than 'iter': a module-level 'iter' shadows the builtin for
# every later cell in the kernel.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss(model_rotary)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')
    xb, yb = xb.to(device), yb.to(device)

    # evaluate the loss and take one optimizer step
    logits, loss = model_rotary(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



step 0: train loss 4.3535, val loss 4.3526
step 50: train loss 2.2412, val loss 2.2424
step 100: train loss 1.9550, val loss 1.9551
step 150: train loss 1.8149, val loss 1.8121
step 200: train loss 1.7073, val loss 1.7086
step 250: train loss 1.6343, val loss 1.6342
step 300: train loss 1.5841, val loss 1.5778
step 350: train loss 1.5349, val loss 1.5334
step 400: train loss 1.4982, val loss 1.4994
step 450: train loss 1.4665, val loss 1.4692
step 500: train loss 1.4434, val loss 1.4442
step 550: train loss 1.4155, val loss 1.4171
step 600: train loss 1.3955, val loss 1.3960
step 650: train loss 1.3791, val loss 1.3761
step 700: train loss 1.3580, val loss 1.3582
step 750: train loss 1.3473, val loss 1.3461
step 800: train loss 1.3250, val loss 1.3281
step 850: train loss 1.3212, val loss 1.3170
step 900: train loss 1.3052, val loss 1.3031
step 950: train loss 1.2961, val loss 1.2944
step 999: train loss 1.2831, val loss 1.2822


In [None]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model_rotary.generate(context, max_new_tokens=500)[0].tolist()))



shall be not mon vex'da, mark thy braile womph'd.

First Much Dick,
Met your compass to warm; and whilest here.

VOLUMNIA:
The wamons here, done about a hour, let night meat;
For of Warwick splent, any one-semplay.

FRIAR LAURENCE:
Thereto my sure well with their eye of all?

CAXLILF:
Nay! away!

TYRREL:
God love for Willanus fortune's peach'd.

RICHARD:
Ay, I sirraw hate?

ARCHISS SEBY:
Ay, this not would dangers despecting and teas:
Tell then I seed?

POMPEY:
O, Petruchurs, singulabol!--
Under


Using Grouped-Query Attention

In [None]:
class GPT2Attention_query(nn.Module):
    """Causal rotary self-attention with keys and values shared within head groups.

    The heads are partitioned into ``num_groups`` contiguous groups. Inside a
    group every head uses the same key and value tensors (the mean over the
    group's heads), while each head keeps its own query. The projection
    ``c_attn`` still produces a full set of per-head keys and values; the
    sharing is applied afterwards by mean-pooling.

    Sizes come from the notebook-level globals ``n_embd``, ``n_head``,
    ``dropout`` and ``block_size`` as they are when the instance is created.

    Args:
        num_groups: Number of key/value groups (capped at ``n_head``). When
            it does not divide ``n_head`` the groups differ in size by at
            most one head.
    """

    def __init__(self, num_groups=4):
        super().__init__()
        assert n_embd % n_head == 0
        assert num_groups >= 1

        # never ask for more groups than there are heads
        self.num_groups = min(num_groups, n_head)
        # key, query, value projections for all heads, but in a batch
        self.c_attn = Conv1D(3 * n_embd, n_embd)
        # output projection
        self.c_proj = Conv1D(n_embd, n_embd)

        # sinusoid table sized for one head
        self.pos = FixedPositionalEmbedding(n_embd // n_head)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        # Lower-triangular causal mask read by forward() as ``self.bias``.
        # Without it forward() raises AttributeError. Registered as
        # non-persistent so it follows .to(device) but adds no state_dict key.
        causal = torch.tril(torch.ones(block_size, block_size, dtype=torch.bool))
        self.register_buffer("bias", causal.view(1, 1, block_size, block_size), persistent=False)

    def _pool_heads_by_group(self, t):
        """Replace each head of ``t`` (B, nh, T, hs) by the mean of its group's heads."""
        groups = torch.tensor_split(t, self.num_groups, dim=1)
        # mean over the head axis only (keepdim), then broadcast back to the group
        pooled = [g.mean(dim=1, keepdim=True).expand_as(g) for g in groups]
        return torch.cat(pooled, dim=1)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a tensor of the same shape."""
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        x = self.c_attn(x)

        # only the sequence length, dtype and device of x are used by self.pos
        pos = self.pos(x)
        q, k, v = x.split(self.n_embd, dim=2)

        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # rotary embedding: rotate q and k by their position (v is left untouched)
        q, k = apply_rotary_pos_emb(q, k, pos)

        # Grouped-query attention: share keys and values inside each group.
        # Queries stay per-head; pooling them too would make the heads of a
        # group identical.
        k = self._pool_heads_by_group(k)
        v = self._pool_heads_by_group(v)

        # manual implementation of attention
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # positions in the future (mask == 0) get -inf so softmax gives them zero weight
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y



In [None]:
class GPT2Block_query(nn.Module):
    """Transformer block: LayerNorm -> attention and LayerNorm -> MLP, each added back residually."""

    def __init__(self):
        super().__init__()
        # Sub-modules are created in the same order as before so the
        # state-dict layout and the weight-initialisation order are unchanged.
        norm_options = dict(eps=1e-05, elementwise_affine=True)
        self.ln_1 = nn.LayerNorm((n_embd,), **norm_options)
        self.attn = GPT2Attention_query()
        self.ln_2 = nn.LayerNorm((n_embd,), **norm_options)
        self.mlp = GPT2MLP()

    def forward(self, x):
        """Return `x` after the attention and MLP residual branches."""
        attended = self.attn(self.ln_1(x))
        x = x + attended
        transformed = self.mlp(self.ln_2(x))
        return x + transformed


class GPT_query(nn.Module):
    """Language model built from a stack of `GPT2Block_query` blocks.

    The model consists of a token embedding (`wte`), a learned position
    embedding (`wpe`), dropout, `n_layer` blocks, a final LayerNorm and a
    bias-free linear head projecting to `vocab_size` logits.
    """

    def __init__(self):
        super().__init__()

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd),
            wpe = nn.Embedding(block_size, n_embd),
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([GPT2Block_query() for _ in range(n_layer)]),
            ln_f = nn.LayerNorm((n_embd,), eps=1e-05, elementwise_affine=True),
        ))

        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Run the model on token ids `idx` of shape (b, t).

        Returns:
            (logits, loss). With `targets`, logits cover every position and
            loss is the cross-entropy over all positions (target id -1 is
            ignored). Without `targets`, only the last position is projected
            (logits has a time dimension of 1) and loss is None.
        """
        _, t = idx.size()
        assert t <= block_size, f"Cannot forward sequence of length {t}, block size is only {block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=idx.device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        # `block` (not the class name) is used as the loop variable so the
        # GPT2Block_query class is not shadowed inside this method.
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Autoregressively append `max_new_tokens` sampled ids to `idx`.

        Args:
            idx: token ids of shape (b, t) used as the prompt.
            max_new_tokens: number of sampling steps.
            temperature: divisor applied to the last-step logits.
            top_k: if given, logits below the k-th largest are set to -inf.

        Returns:
            The input ids with the sampled ids concatenated along dim 1.

        The model is switched to evaluation mode while sampling so dropout
        does not perturb the predicted distribution; the previous
        train/eval mode is restored before returning.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # if the sequence context is growing too long we must crop it at block_size
                idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:]
                # forward the model to get the logits for the index in the sequence
                logits, _ = self(idx_cond)
                # pluck the logits at the final step and scale by desired temperature
                logits = logits[:, -1, :] / temperature
                # optionally crop the logits to only the top k options
                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float('Inf')
                # apply softmax to convert logits to (normalized) probabilities
                probs = F.softmax(logits, dim=-1)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)
                # append sampled index to the running sequence and continue
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)

        return idx


In [None]:
# Instantiate the grouped-query model with the weights produced by GPT_query.__init__.
model_query=GPT_query()
# Keep a handle on its parameter/buffer mapping (state_dict) for later inspection.
model_query_state_dict=model_query.state_dict()
# Bare last expression: the notebook renders the module tree as the cell output.
model_query

GPT_query(
  (transformer): ModuleDict(
    (wte): Embedding(65, 384)
    (wpe): Embedding(256, 384)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block_query(
        (ln_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention_modified(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (pos): FixedPositionalEmbedding()
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=384, out_features=65, bias=False)
)

In [None]:
# Total number of elements across all parameters of model_query, reported in millions.
print(sum(p.numel() for p in model_query.parameters())/1e6, 'M parameters')
# Disabled per-tensor comparison against a pretrained state dict (kept for reference):
#   print(name,torch.equal(model_query_state_dict[name],pretrained_state_dict[name]))

10.795776 M parameters


In [None]:
# create a PyTorch optimizer: AdamW over every parameter of model_query, learning rate 1e-3.
# The notebook-global name `optimizer` is the one the training loop below steps.
optimizer = torch.optim.AdamW(model_query.parameters(), lr=1e-3)

In [None]:
# Number of mini-batches averaged per split by estimate_loss.
eval_iters = 100
# Move the model to the selected device before any evaluation or training.
model_query = model_query.to(device)


@torch.no_grad()
def estimate_loss(model):
    """Average the loss of `model` over `eval_iters` batches of each split.

    The model is put in eval mode for the measurement and is always left in
    train mode afterwards. Returns a dict with keys 'train' and 'val' whose
    values are 0-dim tensors holding the mean loss.
    """
    model.eval()
    mean_loss = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            inputs, labels = get_batch(split)
            inputs = inputs.to(device)
            labels = labels.to(device)
            _, batch_loss = model(inputs, labels)
            batch_losses[i] = batch_loss.item()
        mean_loss[split] = batch_losses.mean()
    model.train()
    return mean_loss

In [None]:
# Optimise model_query for `max_iters` steps, printing the averaged train/val
# loss every `eval_interval` steps and on the final step.
# The counter is named `step` so the builtin `iter` is not shadowed in the
# notebook namespace.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss(model_query)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')
    xb, yb = xb.to(device), yb.to(device)

    # forward pass, then clear stale gradients before back-propagating
    logits, loss = model_query(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



step 0: train loss 4.3438, val loss 4.3442
step 50: train loss 2.2474, val loss 2.2465
step 100: train loss 1.9610, val loss 1.9589
step 150: train loss 1.8163, val loss 1.8139
step 200: train loss 1.7083, val loss 1.7071
step 250: train loss 1.6337, val loss 1.6366
step 300: train loss 1.5782, val loss 1.5792
step 350: train loss 1.5342, val loss 1.5361
step 400: train loss 1.4958, val loss 1.5023
step 450: train loss 1.4631, val loss 1.4607
step 500: train loss 1.4432, val loss 1.4451
step 550: train loss 1.4122, val loss 1.4146
step 600: train loss 1.3908, val loss 1.3897
step 650: train loss 1.3752, val loss 1.3749
step 700: train loss 1.3637, val loss 1.3659
step 750: train loss 1.3414, val loss 1.3400
step 800: train loss 1.3257, val loss 1.3268
step 850: train loss 1.3150, val loss 1.3125
step 900: train loss 1.3021, val loss 1.3019
step 950: train loss 1.2923, val loss 1.2891
step 999: train loss 1.2785, val loss 1.2804


In [None]:
# generate from the model
# The prompt is a single sequence containing only token id 0, created on `device`.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample 500 new tokens, take the first (only) sequence, and turn the ids back into
# text with `decode` (defined elsewhere in the notebook).
print(decode(model_query.generate(context, max_new_tokens=500)[0].tolist()))



The prite alift marriage her dischret,
May brave withbrand, heart can, man for them,
And before her authority away'd hear;
Or whom expromong with thy shumble speaks venesion.

Lord:
A is the passsions sat may had lift bid.

DUKE VINCENTIO:
O mustand the world, sir, to turned in execute's
muster armend it.

HENRY BOLINGBROKE:
Lathermen, haste inter life of firer looks.
Is you a good suspect your life, and as a noble at
well to Jove as an ear. How better upon and him,
it hath brang'd him crass whi


Sliding Window Attention

In [None]:
class SlidingWindowAttention(nn.Module):
    """Scaled dot-product attention restricted to a local window of keys.

    Args:
        window_size: number of neighbouring positions a query may attend to
            on each permitted side, in addition to its own position.
        causal: when True (default) a query at position i only sees keys at
            positions i - window_size .. i, so no information from later
            tokens leaks into the prediction of the next token. When False
            the window is symmetric (|i - j| <= window_size).
    """

    def __init__(self, window_size=64, causal=True):
        super().__init__()
        self.window_size = window_size
        self.causal = causal

    def forward(self, q, k, v):
        """Attend `q` over `k`/`v`, all shaped (..., T, hs), and return (..., T, hs)."""
        T = q.size(-2)

        # Scaled similarity between every query and every key.
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))

        # offset[i, j] = i - j: how far key j lies behind query i.
        # Built on q's device so the mask works on CPU and GPU alike.
        positions = torch.arange(T, device=q.device)
        offset = positions[:, None] - positions[None, :]
        if self.causal:
            visible = (offset >= 0) & (offset <= self.window_size)
        else:
            visible = offset.abs() <= self.window_size

        scores = scores.masked_fill(~visible, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        # The diagonal is always visible, so masking alone never yields an
        # all -inf row; this guard only zeroes NaNs coming from the inputs.
        weights = weights.masked_fill(torch.isnan(weights), 0.0)

        # Apply the attention to the values
        return weights @ v


In [None]:
class GPT2Attention_sliding(nn.Module):
    """Multi-head self-attention with rotary position embedding, grouped
    key/value sharing and a sliding-window attention kernel.

    Heads are partitioned into `num_groups` contiguous groups; within a
    group every head uses the group's mean key and mean value, while each
    head keeps its own query.
    """

    def __init__(self):
        super().__init__()
        assert n_embd % n_head == 0

        self.num_groups = 4
        # key, query, value projections for all heads, but in a batch
        self.c_attn = Conv1D(3*n_embd,n_embd)
        # output projection
        self.c_proj = Conv1D(n_embd, n_embd)

        self.pos=FixedPositionalEmbedding(n_embd// n_head)

        self.sliding_window_attention=SlidingWindowAttention()

        # regularization
        # attn_dropout is registered but not applied in forward: the softmax
        # weights are computed inside SlidingWindowAttention.
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout =dropout

    @staticmethod
    def _group_mean(t, num_groups):
        """Replace each head of `t` (B, nh, T, hs) by the mean of its group.

        Heads are split along dim 1 into `num_groups` contiguous groups (as
        evenly as possible; never more groups than heads). The result has
        the same shape as `t`.
        """
        groups = max(1, min(num_groups, t.size(1)))
        pooled = [
            chunk.mean(dim=1, keepdim=True).expand_as(chunk)
            for chunk in torch.tensor_split(t, groups, dim=1)
        ]
        return torch.cat(pooled, dim=1)

    def forward(self, x):
        """Apply attention to `x` of shape (B, T, C) and return (B, T, C)."""
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch
        qkv = self.c_attn(x)

        # NOTE(review): the positional module receives the fused QKV
        # projection, exactly as before; presumably it only needs the
        # sequence length — confirm against FixedPositionalEmbedding.
        pos = self.pos(qkv)
        q, k, v = qkv.split(self.n_embd, dim=2)

        # move the head dimension forward to be a batch dim
        head_dim = C // self.n_head
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2) # (B, nh, T, hs)

        # rotary embedding on queries and keys
        q, k = apply_rotary_pos_emb(q, k, pos)

        # Grouped-query attention: share the mean key/value inside each head
        # group. The mean is taken over the head axis only (not over the
        # whole tensor), every head of a group is covered, and queries stay
        # per-head.
        k = self._group_mean(k, self.num_groups)
        v = self._group_mean(v, self.num_groups)

        y = self.sliding_window_attention(q, k, v)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


In [None]:
class GPT2Block_sliding(nn.Module):
    """Transformer block using GPT2Attention_sliding: LayerNorm -> attention and LayerNorm -> MLP, both residual."""

    def __init__(self):
        super().__init__()
        # Creation order is kept (ln_1, attn, ln_2, mlp) so the state-dict
        # layout and the weight-initialisation order stay the same.
        norm_options = dict(eps=1e-05, elementwise_affine=True)
        self.ln_1 = nn.LayerNorm((n_embd,), **norm_options)
        self.attn = GPT2Attention_sliding()
        self.ln_2 = nn.LayerNorm((n_embd,), **norm_options)
        self.mlp = GPT2MLP()

    def forward(self, x):
        """Return `x` after the attention and MLP residual branches."""
        attended = self.attn(self.ln_1(x))
        x = x + attended
        transformed = self.mlp(self.ln_2(x))
        return x + transformed

class GPT_sliding(nn.Module):
    """Language model built from a stack of `GPT2Block_sliding` blocks.

    The model consists of a token embedding (`wte`), a learned position
    embedding (`wpe`), dropout, `n_layer` blocks, a final LayerNorm and a
    bias-free linear head projecting to `vocab_size` logits.
    """

    def __init__(self):
        super().__init__()

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd),
            wpe = nn.Embedding(block_size, n_embd),
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([GPT2Block_sliding() for _ in range(n_layer)]),
            ln_f = nn.LayerNorm((n_embd,), eps=1e-05, elementwise_affine=True),
        ))

        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Run the model on token ids `idx` of shape (b, t).

        Returns:
            (logits, loss). With `targets`, logits cover every position and
            loss is the cross-entropy over all positions (target id -1 is
            ignored). Without `targets`, only the last position is projected
            (logits has a time dimension of 1) and loss is None.
        """
        _, t = idx.size()
        assert t <= block_size, f"Cannot forward sequence of length {t}, block size is only {block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=idx.device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        # `block` (not the class name) is used as the loop variable so the
        # GPT2Block_sliding class is not shadowed inside this method.
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Autoregressively append `max_new_tokens` sampled ids to `idx`.

        Args:
            idx: token ids of shape (b, t) used as the prompt.
            max_new_tokens: number of sampling steps.
            temperature: divisor applied to the last-step logits.
            top_k: if given, logits below the k-th largest are set to -inf.

        Returns:
            The input ids with the sampled ids concatenated along dim 1.

        The model is switched to evaluation mode while sampling so dropout
        does not perturb the predicted distribution; the previous
        train/eval mode is restored before returning.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # if the sequence context is growing too long we must crop it at block_size
                idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:]
                # forward the model to get the logits for the index in the sequence
                logits, _ = self(idx_cond)
                # pluck the logits at the final step and scale by desired temperature
                logits = logits[:, -1, :] / temperature
                # optionally crop the logits to only the top k options
                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float('Inf')
                # apply softmax to convert logits to (normalized) probabilities
                probs = F.softmax(logits, dim=-1)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)
                # append sampled index to the running sequence and continue
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)

        return idx


In [None]:
# Instantiate the sliding-window model with the weights produced by GPT_sliding.__init__.
model_sliding=GPT_sliding()
# Keep a handle on its parameter/buffer mapping (state_dict) for later inspection.
model_sliding_state_dict=model_sliding.state_dict()
# Bare last expression: the notebook renders the module tree as the cell output.
model_sliding

GPT_sliding(
  (transformer): ModuleDict(
    (wte): Embedding(65, 384)
    (wpe): Embedding(256, 384)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block_sliding(
        (ln_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention_sliding(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (pos): FixedPositionalEmbedding()
          (sliding_window_attention): SlidingWindowAttention()
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=384, out_features=65, bias=False)
)

In [None]:
# Total number of elements across all parameters of model_sliding, reported in millions.
print(sum(p.numel() for p in model_sliding.parameters())/1e6, 'M parameters')
# Disabled per-tensor comparison against a pretrained state dict (kept for reference).
# NOTE(review): it still refers to model_query_state_dict, not model_sliding_state_dict.
#   print(name,torch.equal(model_query_state_dict[name],pretrained_state_dict[name]))

10.795776 M parameters


In [None]:
# create a PyTorch optimizer: AdamW over every parameter of model_sliding, learning rate 1e-3.
# This rebinds the notebook-global `optimizer`, which the training loop below steps.
optimizer = torch.optim.AdamW(model_sliding.parameters(), lr=1e-3)

In [None]:
# Number of mini-batches averaged per split by estimate_loss.
eval_iters = 100


@torch.no_grad()
def estimate_loss(model):
    """Average the loss of `model` over `eval_iters` batches of each split.

    The model is put in eval mode for the measurement and is always left in
    train mode afterwards. Returns a dict with keys 'train' and 'val' whose
    values are 0-dim tensors holding the mean loss.
    """
    model.eval()
    mean_loss = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            inputs, labels = get_batch(split)
            inputs = inputs.to(device)
            labels = labels.to(device)
            _, batch_loss = model(inputs, labels)
            batch_losses[i] = batch_loss.item()
        mean_loss[split] = batch_losses.mean()
    model.train()
    return mean_loss

In [None]:
# Put the model in evaluation mode.
# NOTE(review): estimate_loss() ends with model.train(), and the training loop calls it
# at step 0, so this eval() only holds until the first evaluation.
model_sliding.eval()
# Move the model's parameters and buffers to the selected device.
model_sliding=model_sliding.to(device)

In [None]:
# Optimise model_sliding for `max_iters` steps, printing the averaged train/val
# loss every `eval_interval` steps and on the final step.
# The counter is named `step` so the builtin `iter` is not shadowed in the
# notebook namespace.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss(model_sliding)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')
    xb, yb = xb.to(device), yb.to(device)

    # forward pass, then clear stale gradients before back-propagating
    logits, loss = model_sliding(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



step 0: train loss 4.3371, val loss 4.3359
step 50: train loss 2.2397, val loss 2.2402
step 100: train loss 1.9628, val loss 1.9595
step 150: train loss 1.8090, val loss 1.8096
step 200: train loss 1.7010, val loss 1.7063
step 250: train loss 1.6387, val loss 1.6329
step 300: train loss 1.5754, val loss 1.5727
step 350: train loss 1.5313, val loss 1.5309
step 400: train loss 1.4978, val loss 1.4969
step 450: train loss 1.4628, val loss 1.4650
step 500: train loss 1.4345, val loss 1.4396
step 550: train loss 1.4158, val loss 1.4159
step 600: train loss 1.3949, val loss 1.3950
step 650: train loss 1.3686, val loss 1.3691
step 700: train loss 1.3543, val loss 1.3538
step 750: train loss 1.3416, val loss 1.3394
step 800: train loss 1.3234, val loss 1.3265
step 850: train loss 1.3100, val loss 1.3100
step 900: train loss 1.3025, val loss 1.3035
step 950: train loss 1.2930, val loss 1.2954
step 999: train loss 1.2811, val loss 1.2810


In [None]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model_sliding.generate(context, max_new_tokens=500)[0].tolist()))



Thomager:
An home, you should knows, made me and just.

PETRUCHIO:
Hie not plays fury your sallent filling woman
Watch, and I sir, we sile you have so.

CATESBY:
My lord, thou letter. Hast he cousin, for Crince doth,
Till on the drum to the trouble: it is it not give
your heirs and a benedard
To go fortable and with yet, Sainna, strainly pain
Contented, and my wife, for the crowning behuted:
A promish, singled at my leban speak all.
Without Murder Juliet; let surmy.

LUCIO:

This down:
I'll faul


Training function for training on a single GPU.

In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
# Index of the currently selected CUDA device; the Trainer cell below uses it as its target device.
gpu_id = torch.cuda.current_device()
# Bare last expression: shown as the cell output.
gpu_id

0

In [None]:
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os




class Trainer:
    """Minimal single-device training driver.

    Args:
        model: module whose forward takes (source, targets) and returns
            (logits, loss); it is moved to `gpu_id`.
        train_data: either a single (source, targets) pair of tensors, which
            is reused as the one batch of every epoch, or an iterable (such
            as a DataLoader) yielding (source, targets) batches.
        optimizer: optimizer already bound to the model's parameters.
        gpu_id: device the model and batches are moved to; checkpoints are
            written only when it equals 0.
        save_every: checkpoint period, in epochs.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        optimizer: torch.optim.Optimizer,
        gpu_id: int,
        save_every: int,
    ) -> None:
        self.gpu_id = gpu_id
        self.model = model.to(gpu_id)
        self.train_data = train_data
        self.optimizer = optimizer
        self.save_every = save_every

    def _run_batch(self, source, targets):
        """Run one optimisation step on a single batch."""
        self.optimizer.zero_grad()
        # Use the model owned by this trainer, not a notebook-level global.
        logits, loss = self.model(source, targets)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        """Train on every batch of `train_data` once."""
        data = self.train_data
        # A bare (source, targets) pair of tensors counts as a single batch;
        # anything else is iterated batch by batch.
        if isinstance(data, (tuple, list)) and len(data) == 2 and torch.is_tensor(data[0]):
            batches = (data,)
        else:
            batches = data
        for source, targets in batches:
            source = source.to(self.gpu_id)
            targets = targets.to(self.gpu_id)
            self._run_batch(source, targets)

    def _save_checkpoint(self, epoch):
        """Write the model's state dict to 'checkpoint.pt' (overwritten each time)."""
        ckp = self.model.state_dict()
        PATH = "checkpoint.pt"
        torch.save(ckp, PATH)
        print(f"Epoch {epoch} | Training checkpoint saved at {PATH}")

    def train(self, max_epochs: int):
        """Run `max_epochs` epochs, checkpointing every `save_every` epochs on device 0."""
        for epoch in range(max_epochs):
            self._run_epoch(epoch)
            if self.gpu_id == 0 and epoch % self.save_every == 0:
                self._save_checkpoint(epoch)


def load_train_objs():
    """Build the objects a Trainer needs.

    Returns a 3-tuple: the result of get_batch('train'), a new GPT model,
    and an AdamW optimizer (lr=1e-3) over that model's parameters.
    """
    batch = get_batch('train')
    net = GPT()
    adamw = torch.optim.AdamW(net.parameters(), lr=1e-3)
    return batch, net, adamw


def main(device:int,total_epochs: int,save_every: int,batch_size:int):
    """Build the training objects and train for `total_epochs` epochs.

    Args:
        device: device index handed to the Trainer.
        total_epochs: number of epochs to run.
        save_every: checkpoint period, in epochs.
        batch_size: accepted for interface compatibility; not used here.
    """
    train_data, model, optimizer = load_train_objs()
    # Pass the `device` argument through instead of reading the notebook-level
    # `gpu_id`, so the caller actually controls where training runs.
    trainer = Trainer(model, train_data, optimizer, device, save_every)
    trainer.train(total_epochs)



# Entry point: configure and launch a single-GPU training run.
if __name__ == "__main__":
    # NOTE(review): `sys` is imported but not used in this cell.
    import sys
    total_epochs=100  # number of epochs passed to Trainer.train
    save_every=50  # checkpoint period, in epochs
    # NOTE(review): this rebinds the notebook-level `device` (set to 'cuda'/'cpu'
    # near the top of the notebook) to the integer GPU index.
    device=gpu_id
    batch_size = 32  # forwarded to main(), which does not use it
    main(device,total_epochs,save_every,batch_size)

Epoch 0 | Training checkpoint saved at checkpoint.pt
Epoch 50 | Training checkpoint saved at checkpoint.pt
