In [16]:
import torch
import torch.nn as nn
import json
import numpy as np
import torch.nn.functional as F
import os
import pandas as pd
# Preferred torch device name. The model classes and CreateMask only use it when
# torch.cuda.is_available() is true and fall back to "cpu" otherwise.
deviceName = "cuda"

In [None]:
# Directory the model classes read their weights from, one .npy file per tensor
# (for example "transformer.wte.weight.npy").
# NOTE(review): machine-specific absolute path; "weigths" is presumably the folder's
# on-disk spelling - confirm before renaming.
weightPath = r"C:\Users\shiva\Desktop\IISC\code\NeuroCpp\Projects\The Dream\weigths"


class Attention(nn.Module):
    """
        Multi-head self-attention of transformer block `index`.

        The fused query/key/value projection and the output projection are loaded
        from the per-tensor .npy files stored under `weightPath`.
    """
    def __init__(self, index):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")

        def loadParam(fileName):
            # read one .npy tensor and wrap it as a float32 parameter on self.device
            array = np.load(os.path.join(weightPath, fileName))
            return nn.Parameter(torch.from_numpy(array).to(dtype=torch.float32).to(device=self.device))

        # fused q/k/v projection; forward() splits its output into 768-wide chunks
        self.attention = loadParam(f"transformer.h.{index}.attn.c_attn.weight.npy")
        self.attentionBias = loadParam(f"transformer.h.{index}.attn.c_attn.bias.npy")
        # output projection applied once the heads are merged back together
        self.attentionProj = loadParam(f"transformer.h.{index}.attn.c_proj.weight.npy")
        self.attentionProjBias = loadParam(f"transformer.h.{index}.attn.c_proj.bias.npy")

        self.softmax = nn.Softmax(dim = -1)

        # declared on the module but not applied anywhere in forward()
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)

    def forward(self, x, mask = None):
        """
            x    : b X m X n   (b = batch size | m = seq len | n = input dim, 768)
            mask : b X m X m   positions equal to 0 are hidden from attention; None = no masking
            returns b X m X n
        """
        B, T, C = x.shape
        qkv = x @ self.attention + self.attentionBias  # b X m X 2304
        Q, K, V = qkv.split(768, -1)  # three chunks of b X m X 768

        def toHeads(t):
            # b X m X 768 -> b X 12 heads X m X 64
            return t.view((B, T, 12, 64)).permute(0, 2, 1, 3)

        Q, K, V = toHeads(Q), toHeads(K), toHeads(V)

        # scaled dot product; 8 = sqrt(64), the per-head width
        score = (Q @ K.transpose(-2, -1)) / 8
        if mask is not None:
            # add a head axis so the same mask broadcasts over all 12 heads
            hidden = mask.unsqueeze(1) == 0
            score = score.masked_fill(hidden, float("-inf"))
        weights = self.softmax(score)

        # b X 12 X m X 64 -> b X m X 768
        merged = (weights @ V).permute(0, 2, 1, 3).contiguous().view(B, T, C)
        return merged @ self.attentionProj + self.attentionProjBias

class MLP(nn.Module):
    """
    Feed-forward sub-layer of transformer block `index`: linear -> tanh-approximated
    GELU -> linear, with all four tensors loaded from .npy files under `weightPath`.
    """
    def __init__(self, index):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")

        def loadParam(fileName):
            # read one .npy tensor and wrap it as a float32 parameter on self.device
            array = np.load(os.path.join(weightPath, fileName))
            return nn.Parameter(torch.from_numpy(array).to(dtype=torch.float32).to(device=self.device))

        self.weight1 = loadParam(f"transformer.h.{index}.mlp.c_fc.weight.npy")
        self.bias1 = loadParam(f"transformer.h.{index}.mlp.c_fc.bias.npy")
        self.weight2 = loadParam(f"transformer.h.{index}.mlp.c_proj.weight.npy")
        self.bias2 = loadParam(f"transformer.h.{index}.mlp.c_proj.bias.npy")
        self.gelu = nn.GELU(approximate="tanh")

    def forward(self, x):
        # x = b X m X 768 | b = batch size, m = sequence len
        hidden = self.gelu(x @ self.weight1 + self.bias1)
        return hidden @ self.weight2 + self.bias2

class Transformer(nn.Module):
    """
    One transformer block (`index` selects which weight files are loaded):
    LayerNorm -> attention -> dropout -> residual add, followed by
    LayerNorm -> MLP -> dropout -> residual add.
    """
    def __init__(self, index):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")

        def loadParam(fileName):
            # read one .npy tensor and wrap it as a float32 parameter on self.device
            array = np.load(os.path.join(weightPath, fileName))
            return nn.Parameter(torch.from_numpy(array).to(dtype=torch.float32).to(device=self.device))

        self.attn = Attention(index).to(self.device)
        self.mlp = MLP(index).to(self.device)

        # the freshly initialised LayerNorm parameters are replaced by the stored ones
        self.layerNorm1 = nn.LayerNorm(768).to(self.device)
        self.layerNorm2 = nn.LayerNorm(768).to(self.device)
        self.layerNorm1.weight = loadParam(f"transformer.h.{index}.ln_1.weight.npy")
        self.layerNorm1.bias = loadParam(f"transformer.h.{index}.ln_1.bias.npy")
        self.layerNorm2.weight = loadParam(f"transformer.h.{index}.ln_2.weight.npy")
        self.layerNorm2.bias = loadParam(f"transformer.h.{index}.ln_2.bias.npy")

        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)

    def forward(self, x, mask = None):
        # x = b X m X 768 | b = batch size, m = sequence len
        attnOut = self.dropout1(self.attn(self.layerNorm1(x), mask))
        x = x + attnOut

        # The residual stream keeps the un-normalised x: the LayerNorm output only
        # feeds the sub-layer and is never written back into x.
        mlpOut = self.dropout2(self.mlp(self.layerNorm2(x)))
        return x + mlpOut


class gpt2(nn.Module):
    """
    GPT-2 language model assembled from the .npy tensors under `weightPath`.

    Token and position embeddings are summed, passed through 12 `Transformer`
    blocks and a final LayerNorm, then projected back onto the vocabulary with
    the (shared) token-embedding matrix.
    """
    def __init__(self):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
        self.embd = nn.Embedding(num_embeddings=50257, embedding_dim=768).to(self.device)
        self.embd.weight = self._LoadParam('transformer.wte.weight.npy')  # copy embedding matrix
        self.positionEmbd = nn.Embedding(num_embeddings=1024, embedding_dim=768).to(self.device)
        self.positionEmbd.weight = self._LoadParam('transformer.wpe.weight.npy')  # copy embedding matrix
        self.layers = nn.ModuleList([
            Transformer(index) for index in range(12)
        ]).to(self.device)

        self.layerNorm = nn.LayerNorm(768).to(self.device)
        self.layerNorm.bias = self._LoadParam('transformer.ln_f.bias.npy')
        self.layerNorm.weight = self._LoadParam('transformer.ln_f.weight.npy')

    def _LoadParam(self, fileName):
        # read one .npy tensor and wrap it as a float32 parameter on self.device
        array = np.load(os.path.join(weightPath, fileName))
        return nn.Parameter(torch.from_numpy(array).to(dtype=torch.float32).to(device=self.device))

    def forward(self, x, mask = None):
        """
        x    : b X m integer token ids (b = batch size, m = sequence len)
        mask : optional attention mask handed unchanged to every block
        returns b X m X vocab-size logits

        Raises ValueError when m exceeds the number of learned positions.
        """
        batchSize, seqLen = x.shape

        # Fail early with a readable message: an out-of-range position id would
        # otherwise surface as an index error inside the embedding lookup.
        maxLen = self.positionEmbd.num_embeddings
        if seqLen > maxLen:
            raise ValueError(f"sequence length {seqLen} exceeds the {maxLen} positions the model has embeddings for")

        # position ids 0..m-1, allocated directly on the model's device
        posInput = torch.arange(seqLen, device=self.device).expand(batchSize, seqLen)

        x = self.embd(x) + self.positionEmbd(posInput)
        for layer in self.layers:
            x = layer(x, mask)
        x = self.layerNorm(x)

        # output head shares its weights with the token embedding
        return x @ self.embd.weight.T


In [None]:
class tokenizer:
    """
    Minimal GPT-2 style byte-level BPE tokenizer.

    `path` must contain:
      * "vocab.txt" - a JSON object mapping token string -> integer id
      * "merge.txt" - one merge pair per line ("left right"); the first line is
        skipped and a pair's line position is its rank (lower = merged earlier)

    Text is first turned into its UTF-8 bytes and every byte is replaced by the
    printable stand-in character used by GPT-2 vocabularies (space -> "Ġ",
    newline -> "Ċ", ...), so any input string can be encoded, not only text made
    of printable ASCII, spaces and newlines.
    """
    def __init__(self, path = r"C:\Users\shiva\Desktop\IISC\code\NeuroCpp\Projects\The Dream\embedding"):
        self.path = path
        # Both files are read as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(os.path.join(path, "vocab.txt"), "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        self.merge = dict()
        with open(os.path.join(path, "merge.txt"), "r", encoding="utf-8") as f:
            for index, words in enumerate(f.readlines()[1:]):
                pair = tuple(words.split())
                # blank or malformed lines can never match a pair of symbols
                if len(pair) == 2:
                    self.merge[pair] = index

        self.reverseVocab = {tokenId: token for token, tokenId in self.vocab.items()}

        self.byteToChar = self._ByteTable()
        self.charToByte = {char: byte for byte, char in self.byteToChar.items()}

    @staticmethod
    def _ByteTable():
        """Return the GPT-2 mapping from each byte value (0-255) to one printable character."""
        keep = set(range(ord("!"), ord("~") + 1))
        keep.update(range(0xA1, 0xAC + 1))
        keep.update(range(0xAE, 0xFF + 1))
        table = {}
        shifted = 0
        for byte in range(256):
            if byte in keep:
                # printable bytes stand for themselves
                table[byte] = chr(byte)
            else:
                # the remaining bytes are moved past U+00FF in ascending order
                table[byte] = chr(256 + shifted)
                shifted += 1
        return table

    def GetSplitWord(self, txt):
        """
        Apply the BPE merges to `txt` (a string of byte stand-in characters) and
        return the resulting list of symbols.

        Each pass merges the left-most adjacent pair with the lowest rank and
        stops once no adjacent pair appears in the merge table.
        """
        txt = list(txt)
        while len(txt) > 1:
            bestIndex = -1
            bestRank = None
            for index in range(1, len(txt)):
                rank = self.merge.get((txt[index - 1], txt[index]))
                if rank is not None and (bestRank is None or rank < bestRank):
                    bestIndex, bestRank = index, rank
            if bestIndex == -1:
                break
            txt[bestIndex - 1:bestIndex + 1] = [txt[bestIndex - 1] + txt[bestIndex]]
        return txt

    def encode(self, txt):
        """Return the list of token ids for `txt`; raises KeyError if a symbol is missing from the vocabulary."""
        mapped = "".join(self.byteToChar[byte] for byte in txt.encode("utf-8"))
        return [self.vocab[word] for word in self.GetSplitWord(mapped)]

    def decode(self, li):
        """Return the text for the token ids in `li`; raises KeyError for an unknown id."""
        raw = bytearray()
        for i in li:
            for char in self.reverseVocab[i]:
                byte = self.charToByte.get(char)
                if byte is None:
                    # not a byte stand-in: keep the character as it is
                    raw.extend(char.encode("utf-8"))
                else:
                    raw.append(byte)
        # a token sequence may stop in the middle of a multi-byte character
        return raw.decode("utf-8", errors="replace")

In [20]:
def CreateMask(encd:list, device = None)->torch.Tensor:
    """
    Build the causal attention mask for a token sequence.

    encd   : the token sequence; only its length is used
    device : where to allocate the mask; by default `deviceName` when CUDA is
             available, otherwise the CPU
    returns a float tensor of shape 1 X n X n holding 1 where row position i may
    attend to column position j (j <= i) and 0 elsewhere
    """
    if device is None:
        device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
    n = len(encd)
    # lower triangle = each position sees itself and everything before it
    return torch.tril(torch.ones((n, n), device=device)).unsqueeze(0)

def SelectNextWord(prob, temperature = 0.98):
    """
    Sample the next token id from the model output.

    prob        : logits with the sequence positions on the first axis; only the
                  last position is used
    temperature : divisor applied to the logits before the softmax (values
                  below 1 sharpen the distribution); must be positive
    returns the sampled token id as a Python int
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    logits = prob[-1] / temperature
    weights = torch.softmax(logits, dim=-1)
    return torch.multinomial(weights, num_samples=1).item()

In [None]:
class Train:
    """
    Fine-tunes `gpt2` on a CSV of poems: the tokenised "Tags" column is the
    prompt and the tokenised "Poem" column is the text to predict after it.
    """
    def __init__(self):
        dataSet = pd.read_csv(r"C:\Users\shiva\Desktop\IISC\code\NeuroCpp\Projects\The Dream\DataSet\PeomWithTagDataSet.csv")
        # collapse every run of whitespace (including newlines) into one space
        self.poem = [' '.join(i.split()) for i in list(dataSet.Poem)]
        self.tags = [' '.join(i.split()) for i in list(dataSet.Tags)]
        self.model = gpt2()
        self.tkn = tokenizer()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")

        # training
        self.batchSize = 32
        self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=1e-3)
        self.loss_fn = nn.CrossEntropyLoss()

    def CreateData(self, index):
        """Return (poem token ids, tag token ids) for dataset row `index`."""
        encode = self.tkn.encode(self.poem[index])
        tags = self.tkn.encode(self.tags[index])
        return encode, tags

    def Train(self, epoch = 10, maxSteps = 1):
        """
        Run the optimisation loop.

        epoch    : number of passes over the dataset
        maxSteps : stop after this many optimizer steps; the default of 1 keeps
                   the previous behaviour of leaving the loop after the first
                   update. Pass None to train for every epoch.

        Each step uses the single poem at index i, and i advances by
        self.batchSize, so only every batchSize-th poem is visited.
        """
        self.model.train()
        # longest input the model has position embeddings for
        maxLen = self.model.positionEmbd.num_embeddings
        stepsDone = 0
        for epc in range(epoch):
            for i in range(0, len(self.poem), self.batchSize):
                if maxSteps is not None and stepsDone >= maxSteps:
                    return
                data, tags = self.CreateData(i)

                # The model input is tags + data[:-1]; keep it within maxLen.
                data = data[:max(maxLen + 1 - len(tags), 0)]
                if not tags or not data:
                    # nothing to condition on, or nothing left to predict
                    continue

                # Teacher forcing: because of the causal mask, the logits at
                # position p depend only on tokens <= p, so one forward pass
                # gives the same predictions as re-running the model after
                # every appended target token.
                tokens = tags + data
                inputs = torch.tensor(tokens[:-1], device=self.device).view(1, len(tokens) - 1)
                target = torch.tensor(data, device=self.device)
                mask = CreateMask(tokens[:-1])
                logit = self.model(inputs, mask)

                # position len(tags) - 1 + k predicts data[k]
                meanLoss = self.loss_fn(logit[0, len(tags) - 1:, :], target)

                # the printed value is the loss summed over the poem's tokens
                print(f"Epoch: {epc + 1}/{epoch} | Poem No.: {i + 1}/{len(self.poem)} | Loss: {meanLoss.item() * len(data)}")
                self.optimizer.zero_grad()
                meanLoss.backward()
                self.optimizer.step()
                stepsDone += 1


In [22]:
# Build the trainer: Train.__init__ reads the poem CSV and creates the gpt2 model,
# the tokenizer and the Adam optimizer.
tr = Train()

In [None]:
# Run the training loop with its default arguments; progress is printed per step.
tr.Train()

Epoch: 1/10 | Poem No.: 1/13854 | Loss: 2448.75048828125


In [None]:
# Sample a 50-token continuation of `sent`.
sent = "Here look"
tkn = tokenizer()

# The model has to be an instance: calling the gpt2 class with the input tensors
# would try to construct a new model from them.
# NOTE(review): this loads the stored weights again; use `tr.model` here instead
# if the intent is to sample from the model trained above - confirm.
gptModel = gpt2()
gptModel.eval()  # switch dropout off while sampling

encd = tkn.encode(sent)
with torch.no_grad():  # no gradients are needed for generation
    for itera in range(50):
        # build the 1 X m input on the model's own device (falls back to CPU)
        x = torch.tensor(encd, device=gptModel.device).view(1, len(encd))
        mask = CreateMask(encd)
        a = gptModel(x, mask).squeeze(0)  # m X vocab logits
        encd.append(SelectNextWord(a))
print(tkn.decode(encd))

  a = gptModel(torch.tensor(x).view(1,len(encd)), mask).squeeze(0)


Here lookhogLabourleanspick Mastery Breaker myster Yard Labourgive Album Breakertle romanticWomenwomenInterestoustPack LabourWomenwomenleans intimate TraderShip MasteryroundLabourLady Yardem Trader Contest Gentle tonShip YardDist weightsleansentleUMEem Warehouseem TraderWomenTrust myster
