In [27]:
import torch
import torch.nn as nn
import json
import numpy as np
import torch.nn.functional as F
import os

In [28]:
deviceName = "cuda"

In [29]:
class Attention(nn.Module):
    """
        => In one note all the math is explained
    """
    def __init__(self):        
        super().__init__()
        # total 768 X 64 | 64 is query, key and value dim
        # 2304 / 64 = 36 | 3 q, k, v and total 12 head
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
        self.attention = nn.Parameter(torch.zeros((768, 2304), device=self.device)) # weight
        self.attentionBias = nn.Parameter(torch.zeros(2304, device=self.device)) # Bias
        self.attentionProj = nn.Parameter(torch.zeros((768, 768), device=self.device))
        self.attentionProjBias = nn.Parameter(torch.zeros(768, device=self.device))

        self.softmax = nn.Softmax(dim = -1)
        
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)

    def forward(self, x, mask = None):
        # x = b X m X n
        # mask = b X m X m
        # b = batch size | m = seq len | n = input dim (768)
        B, T, C = x.shape
        x = x @ self.attention + self.attentionBias # (result) x = b X m X 2304
        Q, K, V = x.split(768, -1) # Q, K, V = b X m X 768
        batch, seqLen, hiddeDim = Q.shape
        
        # Q, K, V = b X head number X m X hidden dim
        Q = Q.view((batch, seqLen, 12, 64)).permute(0, 2, 1, 3)
        K = K.view((batch, seqLen, 12, 64)).permute(0, 2, 1, 3)
        V = V.view((batch, seqLen, 12, 64)).permute(0, 2, 1, 3)
        
        score = (Q @ K.permute(0, 1, 3, 2)) / 8 # root(64) = 8
        if mask is not None:
            mask = mask.unsqueeze(1)
            score = score.masked_fill(mask == 0, float("-inf"))
        sf = self.softmax(score)
        x = (sf @ V).permute(0, 2, 1, 3).contiguous().view(B, T, C)
        return x @ self.attentionProj + self.attentionProjBias

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
        self.weight1 = nn.Parameter(torch.zeros((768, 3072), device=self.device))
        self.bias1 = nn.Parameter(torch.zeros(3072, device=self.device))
        self.weight2 = nn.Parameter(torch.zeros((3072, 768), device=self.device))
        self.bias2 = nn.Parameter(torch.zeros(768, device=self.device))
        self.gelu = nn.GELU(approximate="tanh")

    def forward(self, x):
        # x = b  X m X 768
        # b = batch size, m = sequence len
        return self.gelu(x @ self.weight1 + self.bias1) @ self.weight2 + self.bias2

class Transformer(nn.Module):
    def __init__(self):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
        self.attn = Attention().to(self.device)
        self.mlp = MLP().to(self.device)
        self.layerNorm1 = nn.LayerNorm(768).to(self.device)
        self.layerNorm2 = nn.LayerNorm(768).to(self.device)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)

    def forward(self, x, mask = None):
        # x = b  X m X 768
        # b = batch size, m = sequence len
        t = self.layerNorm1(x)
        t = self.attn(t, mask)
        t = self.dropout1(t)
        x = x + t
        # according to diagram the value is stored in x in below layerNorm2
        # but diagram is wrong, believe me I waste 2 days on this.
        t = self.layerNorm2(x)
        t = self.mlp(t)
        t = self.dropout2(t)
        x = x + t
        return x


class gpt2(nn.Module):
    def __init__(self):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
        self.embd = nn.Embedding(num_embeddings=50257, embedding_dim=768).to(self.device)
        self.positionEmbd = nn.Embedding(num_embeddings=1024, embedding_dim=768).to(self.device)
        self.layers = nn.ModuleList([
            Transformer() for _ in range(12)
        ]).to(self.device)

        self.layerNorm = nn.LayerNorm(768).to(self.device)

    def forward(self, x, mask = None):
        # b X m 
        # b = batch size, m = sequence len
        batchSize, seqLen = x.shape
        posInput = torch.arange(0, seqLen).expand(batchSize, seqLen).to(self.device)
        embdX = self.embd(x)        
        posEmbdX = self.positionEmbd(posInput)        

        x = embdX + posEmbdX
        for layer in self.layers:
            x = layer(x, mask)
        x = self.layerNorm(x)
        
        return x @ self.embd.weight.T


In [30]:
def CreateGPT2Model():
    layerName = ['transformer.h.k.ln_1.weight.npy',
            'transformer.h.k.ln_1.bias.npy',
            'transformer.h.k.attn.c_attn.weight.npy',
            'transformer.h.k.attn.c_attn.bias.npy',
            'transformer.h.k.attn.c_proj.weight.npy',
            'transformer.h.k.attn.c_proj.bias.npy',
            'transformer.h.k.ln_2.weight.npy',
            'transformer.h.k.ln_2.bias.npy',
            'transformer.h.k.mlp.c_fc.weight.npy',
            'transformer.h.k.mlp.c_fc.bias.npy',
            'transformer.h.k.mlp.c_proj.weight.npy',
            'transformer.h.k.mlp.c_proj.bias.npy']
    path = r"C:\Users\shiva\Desktop\IISC\code\NeuroCpp\Projects\The Dream\weigths"
    gptModel = gpt2()
    gptModel.embd.weight = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, 'transformer.wte.weight.npy'))).to(dtype=torch.float32).to(device=gptModel.device))
    gptModel.positionEmbd.weight = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, 'transformer.wpe.weight.npy'))).to(dtype=torch.float32).to(device=gptModel.device))
    for index, layer in enumerate(gptModel.layers):
        layer.layerNorm1.weight = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[0].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.layerNorm1.bias = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[1].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.attn.attention = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[2].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.attn.attentionBias = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[3].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.attn.attentionProj = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[4].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.attn.attentionProjBias = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[5].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.layerNorm2.weight = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[6].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.layerNorm2.bias = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[7].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.mlp.weight1 = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[8].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.mlp.bias1 = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[9].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.mlp.weight2 = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[10].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
        layer.mlp.bias2 = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, layerName[11].replace("k", str(index))))).to(dtype=torch.float32).to(device=gptModel.device))
    gptModel.layerNorm.bias = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, 'transformer.ln_f.bias.npy'))).to(dtype=torch.float32).to(device=gptModel.device))
    gptModel.layerNorm.weight = nn.Parameter(torch.from_numpy(np.load(os.path.join(path, 'transformer.ln_f.weight.npy'))).to(dtype=torch.float32).to(device=gptModel.device))
    return gptModel
    

In [31]:
class tokenizer:
    def __init__(self, path = r"C:\Users\shiva\Desktop\IISC\code\NeuroCpp\Projects\The Dream\embedding"):
        self.path = path
        with open(os.path.join(path, "vocab.txt"), "r") as f:
            self.vocab = json.load(f)
        self.merge = dict()
        with open(os.path.join(path, "merge.txt"), "r", encoding="utf-8") as f:
            a = f.readlines()[1:]
            for index, words in enumerate(a):
                words = words.replace("\n", "")
                self.merge[tuple(words.strip().split())] = index
            
        self.reverseVocab = dict()
        for i in self.vocab.keys():
            self.reverseVocab[self.vocab[i]] = i
    
    def GetSplitWord(self, txt):        
        txt = list(txt)
        while(True):            
            changeIndex = -1
            rank = -1
            for index in range(1, len(txt)):
                tupl = (txt[index - 1], txt[index])
                if(tupl in self.merge and (rank == -1 or (rank != -1 and self.merge[tupl] < rank))):
                    changeIndex = index
                    rank = self.merge[tupl]
            if(changeIndex == -1):
                break
            txt[changeIndex-1] += txt[changeIndex]
            txt.pop(changeIndex)
        return txt

    def encode(self, txt):
        txt = txt.replace(" ", "Ġ")
        li = self.GetSplitWord(txt)
        res = []
        for word in li:            
            res.append(self.vocab[word])
        return res
    
    def decode(self, li):
        txt = ""
        for i in li:
            txt += self.reverseVocab[i]
        return txt.replace("Ġ", " ")

In [32]:
def CreateMask(encd:list)->torch.tensor:
    n = len(encd)
    mask = torch.ones((n,n), device = torch.device(deviceName if torch.cuda.is_available() else "cpu"))
    mask = mask.tril()
    return mask.unsqueeze(0)

def SelectNextWord(prob):
    sf = nn.Softmax(dim=-1)
    prob = prob[-1] / 0.98
    prob = sf(prob)
    return torch.multinomial(prob, num_samples=1).item()

In [33]:
tkn = tokenizer()

In [34]:
tkn.encode("why me")

[22850, 502]

In [35]:
gptModel = CreateGPT2Model()
gptModel.eval()

gpt2(
  (embd): Embedding(50257, 768)
  (positionEmbd): Embedding(1024, 768)
  (layers): ModuleList(
    (0-11): 12 x Transformer(
      (attn): Attention(
        (softmax): Softmax(dim=-1)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (mlp): MLP(
        (gelu): GELU(approximate='tanh')
      )
      (layerNorm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layerNorm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (layerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [37]:
sent = "Here look"
encd = tkn.encode(sent)
itera = 1
while(True):
    x = torch.tensor(encd, device=deviceName)
    mask = CreateMask(encd)
    a = gptModel(torch.tensor(x).view(1,len(encd)), mask).squeeze(0)
    encd.append(SelectNextWord(a))
    itera += 1
    if(itera > 50):
        break
print(tkn.decode(encd))

  a = gptModel(torch.tensor(x).view(1,len(encd)), mask).squeeze(0)


Here look into it. You see, I told you after all's said and done, how you could go back to Sweet Tooth in a day and make an address in New York is unbelievable. I've never had a major disagreement about films or music before.
