In [1]:
import torch
import torch.nn as nn
import os
import numpy as np
import json
import pandas as pd
import math

In [2]:
deviceName = "cuda"

In [3]:
class tokenizer:
    def __init__(self, path = r"C:\Users\shiva\Desktop\IISC\code\NeuroCpp\Projects\The Dream\embedding"):
        self.path = path
        with open(os.path.join(path, "vocab.txt"), "r") as f:
            self.vocab = json.load(f)
        self.merge = dict()
        with open(os.path.join(path, "merge.txt"), "r", encoding="utf-8") as f:
            a = f.readlines()[1:]
            for index, words in enumerate(a):
                words = words.replace("\n", "")
                self.merge[tuple(words.strip().split())] = index
            
        self.reverseVocab = dict()
        for i in self.vocab.keys():
            self.reverseVocab[self.vocab[i]] = i
    
    def GetSplitWord(self, txt):        
        txt = list(txt)
        while(True):            
            changeIndex = -1
            rank = -1
            for index in range(1, len(txt)):
                tupl = (txt[index - 1], txt[index])
                if(tupl in self.merge and (rank == -1 or (rank != -1 and self.merge[tupl] < rank))):
                    changeIndex = index
                    rank = self.merge[tupl]
            if(changeIndex == -1):
                break
            txt[changeIndex-1] += txt[changeIndex]
            txt.pop(changeIndex)
        return txt

    def encode(self, txt):
        txt = txt.replace(" ", "Ġ").replace("\n", "Ċ")
        li = self.GetSplitWord(txt)
        res = []
        for word in li:
            if word in self.vocab:     
                res.append(self.vocab[word])
        return res
    
    def decode(self, li):
        txt = ""
        for i in li:
            txt += self.reverseVocab[i]
        return txt.replace("Ġ", " ").replace("Ċ", "\n")

In [4]:
weightPath = r"C:\Users\shiva\Desktop\IISC\code\NeuroCpp\Projects\The Dream\weigths"

class Attention(nn.Module):
    def __init__(self, inputDim, hiddenDimQuery, hiddenDimValue, head, dropout_prob=0.1):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
        self.inputDim = inputDim
        self.numHead = head
        self.head_dim_query = hiddenDimQuery
        self.head_dim_value = hiddenDimValue 
        
        self.query_proj = nn.Linear(inputDim, self.numHead * hiddenDimQuery, bias=False).to(self.device)
        self.key_proj = nn.Linear(inputDim, self.numHead * hiddenDimQuery, bias=False).to(self.device)
        self.value_proj = nn.Linear(inputDim, self.numHead * hiddenDimValue, bias=False).to(self.device)

        self.output_proj = nn.Linear(self.numHead * hiddenDimValue, inputDim, bias=False).to(self.device)

        self.softmax = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x, mask=None):
        """
            (Input) X: s X n (sequence_length X input_dim)

            Output: s X n
        """
        N, S, D = x.shape
        queries = self.query_proj(x).view(N, S, self.numHead, self.head_dim_query).permute(0, 2, 1, 3)
        keys = self.key_proj(x).view(N, S, self.numHead, self.head_dim_query).permute(0, 2, 1, 3)
        values = self.value_proj(x).view(N, S, self.numHead, self.head_dim_value).permute(0, 2, 1, 3)

        energy = torch.matmul(queries, keys.permute(0, 1, 3, 2)) / math.sqrt(self.head_dim_query)

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e9)

        attention_scores = self.softmax(energy)
        attention_scores = self.dropout(attention_scores)
        out = torch.matmul(attention_scores, values)
        out = out.permute(0, 2, 1, 3).contiguous().view(N, S, self.numHead * self.head_dim_value)

        out = self.output_proj(out)
        return out

class FNN(nn.Module):
    """
        => Simple FNN
    """
    def __init__(self, inputDim, hiddenDim):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
        self.relu = nn.ReLU().to(self.device)
        self.linear1 = nn.Linear(inputDim, hiddenDim).to(self.device)
        self.linear2 = nn.Linear(hiddenDim, inputDim).to(self.device)

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))

class Transformer(nn.Module):
    def __init__(self, inputDim, hiddenDimQuery, hiddenDimValue, head, fnnHidden, dropoutProb=0.1):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
        self.atten = Attention(inputDim, hiddenDimQuery, hiddenDimValue, head, dropoutProb).to(self.device)
        self.layerNorm1 = nn.LayerNorm(inputDim).to(self.device)
        self.fnn = FNN(inputDim, fnnHidden).to(self.device)
        self.layerNorm2 = nn.LayerNorm(inputDim).to(self.device)
        self.dropout1 = nn.Dropout(dropoutProb)
        self.dropout2 = nn.Dropout(dropoutProb)

    def forward(self, x, mask=None):
        atten_output = self.atten(x, mask)
        x = self.dropout1(atten_output) + x
        x = self.layerNorm1(x)
        fnn_output = self.fnn(x)
        x = self.dropout2(fnn_output) + x
        x = self.layerNorm2(x)
        return x

class TextEmbedding(nn.Module):
    def __init__(self, inputDim = 768, hiddenDim = 256, fnnHidden = 2048, nLayer = 4, nHead = 4):
        # hiddenDim => value and Query/key dim
        super().__init__()
        self.nHead = nHead
        self.nLayer = nLayer
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")
        self.layer = nn.ModuleList([
            Transformer(inputDim, hiddenDim, hiddenDim, nHead, fnnHidden)
            for _ in range(nLayer)
        ])

    def forward(self, x, mask = None):
        for layer in self.layer:
            x = layer(x, mask)
        return x

class TagClassify(nn.Module):
    def __init__(self, numberOfTag = 176, inputDim = 768, hiddenDim = 768, transformereLayer = 4, numberOfHead = 4, transformerFnnHiddenDim = 2048):
        super().__init__()
        self.device = torch.device(deviceName if torch.cuda.is_available() else "cpu")

        self.textEmbd = TextEmbedding(inputDim, hiddenDim, transformerFnnHiddenDim, transformereLayer, numberOfHead)
        self.linear = nn.Linear(in_features=hiddenDim, out_features=numberOfTag, device=self.device)

        self.sigmoid = nn.Sigmoid().to(self.device)
        
        self.embd = nn.Embedding(num_embeddings=50257, embedding_dim=768).to(self.device)
        self.embd.weight = nn.Parameter(torch.from_numpy(np.load(os.path.join(weightPath, 'transformer.wte.weight.npy'))).to(dtype=torch.float32).to(device=self.device))
        for param in self.embd.parameters():
            param.requires_grad = False
            
    def forward(self, x, mask = None):
        return self.linear(self.textEmbd(self.embd(x), mask)[:,0,:]) # select first vector from sequence

In [5]:
class DataSet:
    def __init__(self, batchSize = 64):
        self.path = r"C:\Users\shiva\Desktop\IISC\code\NeuroCpp\Projects\The Dream\DataSet\multiTagedData.csv"
        data = pd.read_csv(self.path)
        self.poems = list(data.Poem)
        self.tags = list(data.Tags)
        self.batchSize = batchSize
        self.tagIndex = dict()
        self.encdr = tokenizer()
        self.numberOfTag = self.CreateTag()

    def Next(self, index):
        max = 0
        li = []
        batchTag = torch.zeros((self.batchSize, self.numberOfTag))
        for i in range(index, min(index + self.batchSize, len(self.poems))):
            if(len(self.poems[i].split()) > 300):
                continue
            encd = self.encdr.encode(self.poems[i])
            allTag = self.tags[i].split(",")
            for tag in allTag:
                batchTag[i-index][self.tagIndex[tag]] = 1
            if(len(encd) > max):
                max = len(encd)
            li.append(encd)        
        seqPoem = torch.full((self.batchSize, max), 50256)
        for i in range(len(li)):
            seqPoem[i,:len(li[i])] = torch.tensor(li[i])
        return seqPoem, batchTag

    def CreateTag(self):
        index = 0
        for tag in self.tags:
            allTag = tag.split(",")
            for eachTag in allTag:
                if(eachTag not in self.tagIndex):
                    self.tagIndex[eachTag] = index
                    index += 1
        return len(self.tagIndex)

In [6]:
class Train:
    def __init__(self, batchSize = 64):
        self.data = DataSet(batchSize)
        self.model = TagClassify(numberOfTag=self.data.numberOfTag, transformereLayer=4, numberOfHead=4)
        self.tkn = tokenizer()
        self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=1e-3)
        self.lossFn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(17))

    def Train(self, epoch = 10):
        for epc in range(epoch):            
            for index in range(0, len(self.data.poems), self.data.batchSize):
                data, batchTag = self.data.Next(index)
                data = data.to(device=self.model.device)
                batchTag = batchTag.to(device=self.model.device)
                pred = self.model(data)
                loss = self.lossFn(pred, batchTag)
                print(f"[*] Epoch: {epc + 1}/{epoch} | Loss : {loss.item()} | Poem No.: {index} - {index + self.data.batchSize}/{len(self.data.poems)}")

In [7]:
tr = Train(batchSize=16)
tr.Train()

  from .autonotebook import tqdm as notebook_tqdm


[*] Epoch: 1/10 | Loss : 0.919161319732666 | Poem No.: 0 - 16/12899
[*] Epoch: 1/10 | Loss : 0.8411513566970825 | Poem No.: 16 - 32/12899
[*] Epoch: 1/10 | Loss : 0.9235261082649231 | Poem No.: 32 - 48/12899
[*] Epoch: 1/10 | Loss : 0.9167590737342834 | Poem No.: 48 - 64/12899
[*] Epoch: 1/10 | Loss : 0.861738920211792 | Poem No.: 64 - 80/12899
[*] Epoch: 1/10 | Loss : 0.9022499918937683 | Poem No.: 80 - 96/12899
[*] Epoch: 1/10 | Loss : 0.9927285313606262 | Poem No.: 96 - 112/12899
[*] Epoch: 1/10 | Loss : 1.0066076517105103 | Poem No.: 112 - 128/12899
[*] Epoch: 1/10 | Loss : 0.8618478178977966 | Poem No.: 128 - 144/12899
[*] Epoch: 1/10 | Loss : 0.9614133238792419 | Poem No.: 144 - 160/12899
[*] Epoch: 1/10 | Loss : 1.3024410009384155 | Poem No.: 160 - 176/12899
[*] Epoch: 1/10 | Loss : 1.2218018770217896 | Poem No.: 176 - 192/12899
[*] Epoch: 1/10 | Loss : 1.0858728885650635 | Poem No.: 192 - 208/12899
[*] Epoch: 1/10 | Loss : 1.03807532787323 | Poem No.: 208 - 224/12899
[*] Epoch:

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
