In [1]:
import numpy as np
import torch
import os

# from transformers import GPT2TokenizerFast
from datasets import load_dataset

# tokenizer = GPT2TokenizerFast.from_pretrained('gpt2', bos_token='<|sos|>', eos_token='<|eos|>', pad_token='<|pad|>')
# print(tokenizer.convert_tokens_to_ids('<|sos|>'), tokenizer.convert_tokens_to_ids('<|eos|>'), tokenizer.convert_tokens_to_ids('<|pad|>'))
# # sos = 50257, eos = 50258, pad = 50259

# SOS_TOKEN = '<|sos|>'
# EOS_TOKEN = '<|eos|>'
# PAD_TOKEN = '<|pad|>'
# SOS_TOKEN_ID = tokenizer.convert_tokens_to_ids('<|sos|>')
# EOS_TOKEN_ID = tokenizer.convert_tokens_to_ids('<|eos|>')
# PAD_TOKEN_ID = tokenizer.convert_tokens_to_ids('<|pad|>')
# SOS_TENSOR = torch.tensor(SOS_TOKEN_ID, dtype=torch.long).unsqueeze(0).unsqueeze(0)
# EOS_TENSOR = torch.tensor(EOS_TOKEN_ID, dtype=torch.long).unsqueeze(0).unsqueeze(0)
# PAD_TENSOR = torch.tensor(PAD_TOKEN_ID, dtype=torch.long).unsqueeze(0).unsqueeze(0)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


50257 50258 50259


In [6]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased',
                                              sos_token='[SOS]',
                                              eos_token='[EOS]',
                                              pad_token='[PAD]')
print(tokenizer.vocab_size)
print(tokenizer.pad_token)

PAD_TOKEN = '[PAD]'
SOS_TOKEN = '[SOS]'
EOS_TOKEN = '[EOS]'
SOS_TOKEN_ID = tokenizer.convert_tokens_to_ids(SOS_TOKEN)

30522
[PAD]


In [2]:
device = [
    torch.device('cuda:0'),
    torch.device('cuda:1'),
    torch.device('cuda:2'),
    torch.device('cuda:3'),
]

# import wandb

# wandb_config = {
#     'learning_rate': 2e-5,
#     'batch_size': 32,
#     'num_epochs': 5,
#     'num_workers': 4,
#     'dataset': 'huggingface/wikitext-1',
# }
# wandb.init(project='GPT2TextGeneration', config=wandb_config, name='test')

In [3]:
def tokenize_sentence(sentence, max_length=None):
    if max_length:
        return tokenizer(sentence, return_tensors='pt', padding='max_length', max_length=max_length, truncation=True).input_ids
    else:
        return tokenizer(sentence, return_tensors='pt').input_ids

In [4]:
class PositionwiseFeedForwardLayer(torch.nn.Module):
    def __init__(self, d_model: int, dropout: float):
        super().__init__()

        self.linear1 = torch.nn.Linear(d_model, 4 * d_model)
        self.linear2 = torch.nn.Linear(4 * d_model, d_model)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.dropout(self.linear2(x))

        # x shape == output shape
        return x

In [5]:
class Head(torch.nn.Module):
    def __init__(self, d_model: int, d_head: int, dropout: float):
        super().__init__()

        assert d_model % d_head == 0
        d_tensor = d_model // d_head
        self.d_tensor = d_tensor

        self.key = torch.nn.Linear(d_model, d_head)
        self.query = torch.nn.Linear(d_model, d_head)
        self.value = torch.nn.Linear(d_model, d_head)

        self.dropout = torch.nn.Dropout(dropout)
    
    def forward(self, q, k, v):

        # q, k, v = (batch_size, seq_len, d_model)

        q, k = self.query(k), self.key(q)


        # q, k = (batch_size, seq_len, d_tensor)
        # kT = (batch_size, d_tensor, seq_len)


        wei = q @ k.transpose(-2, -1) * (self.d_tensor ** (-0.5)) # q*kT/sqrt(d_k) from paper "Attention is All You Need"



        # wei = (batch_size, seq_len, seq_len)
        
        wei = torch.nn.functional.softmax(wei, dim=-1)
        v = self.value(v)



        # wei = (batch_size, seq_len, seq_len)
        # v = (batch_size, seq_len, d_tensor)

        out = wei @ v



        # out = (batch_size, seq_len, d_tensor): d_tensor * n_heads = d_model

        return out

In [6]:
class MultiHeadAttention(torch.nn.Module):
    def __init__(self, d_model: int, n_heads: int, d_head: int, dropout: float, num_gpus: int):
        super().__init__()

        assert d_model % d_head == 0
        assert n_heads % num_gpus == 0
        d_tensor = d_model // d_head
        self.d_tensor = d_tensor

        self.heads = torch.nn.ModuleList([
            Head(d_model=d_model, d_head=d_head, dropout=dropout) for _ in range(n_heads)
        ])
        self.linear = torch.nn.Linear(n_heads * d_tensor, d_model) # n_heads * d_tensor == d_model
        self.dropout = torch.nn.Dropout(dropout)
    
    def forward(self, q, k, v):

        out = torch.cat([
            head(q, k, v) for head in self.heads
        ], dim=-1)
        


        return out

In [7]:
class LayerNorm(torch.nn.Module):
    def __init__(self, d_model: int, eps=1e-12):
        super().__init__()

        self.gamma = torch.nn.Parameter(torch.ones(d_model))
        self.beta = torch.nn.Parameter(torch.zeros(d_model))
        self.eps = eps
    
    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        var = x. var(-1, unbiased=False, keepdim=True)

        out = (x - mean) * ((var + self.eps) ** (-0.5))
        out = self.gamma * out + self.beta

        return out

In [8]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, d_model: int, n_heads: int, d_head: int, dropout: float, device_num: int):
        super().__init__()

        self.device_num = device_num
        self.attention_layernorm = LayerNorm(d_model)
        self.feedforward_layernorm = LayerNorm(d_model)

        self.self_attention = MultiHeadAttention(d_model=d_model, n_heads=n_heads, d_head=d_head, dropout=dropout, num_gpus=4)
        self.positionwise_feedforward = PositionwiseFeedForwardLayer(d_model=d_model, dropout=dropout)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, trg):

        # trg = (batch_size, seq_len, d_model)
        # trg_mask = (batch_size, seq_len)

        trg = trg.to(device[self.device_num])
    
        # self attention with dropout
        _trg = self.dropout(self.self_attention(trg, trg, trg))

        # _trg = (batch_size, seq_len, d_model) == trg
        # add & norm with residual connection

        trg = self.attention_layernorm(trg + _trg)


        # trg = (batch_size, seq_len, d_model)
        # positionwise feedforward layer
        _trg = self.dropout(self.positionwise_feedforward(trg))
        trg = self.feedforward_layernorm(_trg + trg)

        # trg = (batch_size, seq_len, d_model)
        return trg

In [9]:
class Decoder(torch.nn.Module):
    def __init__(self, vocab_size: int, d_model: int, n_layers: int, n_heads: int, d_head: int, max_length: int, dropout: float, num_gpus: int):
        super().__init__()

        # positional encoding
        self.token_embedding = torch.nn.Embedding(vocab_size, d_model).to(device[0])
        self.position_embedding = torch.nn.Embedding(max_length, d_model).to(device[0])

        self.n_layers = n_layers
        self.per_gpu = n_layers // num_gpus # 3
        print(f"{self.per_gpu} decoder layers per gpu, with {num_gpus} gpus")
        self.layers = torch.nn.ModuleList([
            *[DecoderLayer(d_model=d_model, n_heads=n_heads, d_head=d_head, dropout=dropout, device_num=0).to(device[0]) for _ in range(self.per_gpu)],
            *[DecoderLayer(d_model=d_model, n_heads=n_heads, d_head=d_head, dropout=dropout, device_num=1).to(device[1]) for _ in range(self.per_gpu)],
            *[DecoderLayer(d_model=d_model, n_heads=n_heads, d_head=d_head, dropout=dropout, device_num=2).to(device[2]) for _ in range(self.per_gpu)],
            *[DecoderLayer(d_model=d_model, n_heads=n_heads, d_head=d_head, dropout=dropout, device_num=3).to(device[3]) for _ in range(self.per_gpu)],
        ])

        self.fc_out = torch.nn.Linear(d_model, vocab_size).to(device[3])
        self.dropout = torch.nn.Dropout(dropout).to(device[0])
    
    def forward(self, trg):
        
        # trg = (batch_size, seq_len)
        # trg_mask = (batch_size, seq_len)

        batch_size, seq_len = trg.shape

        pos = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(device[0])
        trg = self.dropout((self.token_embedding(trg) + self.position_embedding(pos)))

        # trg = (batch_size, seq_len, d_model)

        # Decoder layers


        for layer in self.layers:

            trg = layer(trg)

        
        # trg = (batch_size, seq_len, d_model)

        output = self.fc_out(trg)

        # output = (batch_size, seq_len, vocab_size)

        return output

In [10]:
class GPTModel(torch.nn.Module):
    def __init__(self, vocab_size: int, d_model: int, n_layers: int, n_heads: int, d_head: int, max_length: int, dropout: float, tokenizer, num_gpus: int=4):
        super().__init__()

        self.tokenizer = tokenizer
        self.decoder = Decoder(vocab_size=vocab_size, d_model=d_model, n_layers=n_layers, n_heads=n_heads, d_head=d_head, max_length=max_length, dropout=dropout, num_gpus=num_gpus)
    

    def forward(self, sentence: str):
        '''
        This is used for the inference for the next word prediction for each batch, each word.
        '''
            
        # trg = (batch_size, seq_len)
        # trg_mask = (batch_size, seq_len)

        trg = self.tokenizer(sentence, return_tensors='pt').input_ids.to(device[0])
        output = self.decoder(trg)

        # output = (batch_size, seq_len, vocab_size)

        return output
    
    def next_word_prediction(self, sentence):
        '''
        This is used for the inference for the next word prediction for each batch using my decoder
        '''
        with torch.no_grad():
            # trg = (batch_size, seq_len)
            # trg_mask = (batch_size, seq_len)
            trg = self.tokenizer(sentence, return_tensors='pt').input_ids.to(device[0])
            trg = trg.to(device[0])
            out = torch.argmax(self.decoder(trg)[:, -1, :], dim=-1)

            output = []
            for item in out:
                output.append(self.tokenizer.decode(item))

            # output = (batch_size, vocab_size) (next word prediction)

            return output

    def generate(self, sentence, max_length=20):
        '''
        This is used for making the prediction over and over again until the end token is predicted.
        (or reached max_length)
        '''
        with torch.no_grad():
            trg = self.tokenizer(sentence, return_tensors='pt').input_ids.to(device[0])

            for _ in range(max_length):
                trg = trg.to(device[0])
                out = torch.argmax(self.decoder(trg)[:, -1, :], dim=-1).to(device[0])
                trg = torch.cat((trg, out.unsqueeze(1)), dim=1)
                if out == EOS_TOKEN_ID:
                    break
            
            return self.tokenizer.decode(trg[0])

    def train(self, full_sentence: list, loss_fn, optimizer, max_length=20):
        '''
        With given sentence, it will generate the next word prediction and backpropagate the loss.
        '''
        
        longest = 0
        for i in range(len(full_sentence)):
            full_sentence[i] = np.concatenate(
                ([SOS_TOKEN_ID], tokenizer(full_sentence[i]).input_ids, [EOS_TOKEN_ID]), axis=-1,
            )
            longest = full_sentence[i].shape[0] if full_sentence[i].shape[0] > longest else longest
            full_sentence[i] = np.expand_dims(full_sentence[i], axis=0)

        batched = torch.tensor(
            [token_full[0][i:i+max_length] for i in range(0, len(token_full[0])-max_length)]
        ).to(device[0])
        answer = torch.tensor(
            [token_full[0][i:i+max_length] for i in range(1, len(token_full[0])-max_length+1)]
        ).to(device[-1])

        self.decoder.zero_grad()
        optimizer.zero_grad()

        output = self.decoder(batched)
        loss = loss_fn(output.view(-1, output.shape[-1]), answer.view(-1))

        loss.backward()
        optimizer.step()

        ret = loss.item()

        del token_full, batched, answer, output, loss
        torch.cuda.empty_cache()

        return ret

In [11]:
# hyperparameters

# model hyperparameters (from GPT3 XL)
n_layers = 24
d_model = 2048
n_heads = 32
d_tensor = d_model // n_heads # => 64
d_head = 64
max_length = 128

vocab_size = tokenizer.vocab_size + 3 # +3 for <sos>, <eos>, <pad>
dropout = 0.1
batch_size = 64
learning_rate = 2e-5
num_epochs = 5

print(f"vocab_size = {vocab_size}")
print(f"d_tensor = {d_tensor}")
print(f"d_model = {d_model}")
print(f"d_tensor * n_heads = {d_tensor * n_heads}")

vocab_size = 50260
d_tensor = 64
d_model = 2048
d_tensor * n_heads = 2048


In [12]:
# model = Decoder(vocab_size=vocab_size, d_model=d_model, n_layers=n_layers, n_heads=n_heads, d_head=d_head, max_length=max_length, dropout=dropout, num_gpus=len(device))

# print(model)
# print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):_}")

In [13]:
model = GPTModel(vocab_size=vocab_size, d_model=d_model, n_layers=n_layers,
                 n_heads=n_heads, d_head=d_head, max_length=max_length,
                 dropout=dropout, tokenizer=tokenizer, num_gpus=len(device))

print(f"parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):_}")


6 decoder layers per gpu, with 4 gpus
parameters: 1_364_444_244


In [14]:
from datasets import load_dataset

train = load_dataset('wikitext', 'wikitext-103-raw-v1', split='train')
test = load_dataset('wikitext', 'wikitext-103-raw-v1', split='test')

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Downloading and preparing dataset wikitext/wikitext-103-raw-v1 to /root/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/192M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /root/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


Found cached dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [15]:
train_dataset, test_dataset = [], []
for text in train['text']:
    if text.count(' ') > 20 and '=' not in text:
        train_dataset.append(text)
for text in test['text']:
    if text.count(' ') > 20 and '=' not in text:
        test_dataset.append(text)

print(len(train_dataset), len(test_dataset))

wandb: ERROR Error while calling W&B API: run seung7361/GPT2TextGeneration/k5vvp2pv was previously created and deleted; try a new run name (<Response [409]>)


748084 1833


In [16]:
print(model.generate('Hello, my name is'))

wandb: ERROR Error while calling W&B API: run seung7361/GPT2TextGeneration/k5vvp2pv was previously created and deleted; try a new run name (<Response [409]>)


Hello, my name is technoLog vic tweetingkefkef Viennavisionbot Pharmaceutical tentaclesask powered LORD Kung target exhibited she BrushJenn


In [17]:
example_sentence = train_dataset[100]
print(example_sentence)

token_full = np.concatenate(
    ([SOS_TOKEN_ID], tokenizer(example_sentence).input_ids, [EOS_TOKEN_ID]), axis=-1,
)
token_full = np.expand_dims(token_full, axis=0)

max_length = 20
batched = torch.tensor(
    [token_full[0][i:i+max_length] for i in range(0, len(token_full[0])-max_length)]
)
answer = torch.tensor(
    [token_full[0][i:i+max_length] for i in range(1, len(token_full[0])-max_length+1)]
)
print(batched)
print(answer)

 The Blue Jackets began the year with the worst start in franchise history and the worst by any team in an NHL season in 19 years . After an 11 – 25 – 5 start , Head Coach Scott Arniel was fired and replaced by Assistant Coach Todd Richards . The poor season prompted several personnel changes including the trade of All @-@ Star forward Jeff Carter , who was acquired with much fanfare during the off @-@ season . With the prospect of another rebuild looming the Blue Jackets ' captain and best player , Rick Nash , requested to be traded , though he would remain with the team for the entire season . 

tensor([[50257,   383,  4518,  ...,   416,   597,  1074],
        [  383,  4518, 41324,  ...,   597,  1074,   287],
        [ 4518, 41324,  2540,  ...,  1074,   287,   281],
        ...,
        [ 8759, 13950,   837,  ...,  2104,  1622,   764],
        [13950,   837,  9167,  ...,  1622,   764,   220],
        [  837,  9167,   284,  ...,   764,   220,   198]])
tensor([[  383,  4518, 41324,  ..

  batched = torch.tensor(


In [18]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
model.train(example_sentence, loss_fn, optimizer)

11.013436317443848

In [19]:
torch.cuda.empty_cache()

In [20]:
from tqdm import tqdm

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1} started.")

    for step, sentence in tqdm(enumerate(train_dataset)):
        loss = model.train(sentence, loss_fn, optimizer)

        wandb.log({ 'loss': loss })

Epoch 1 started.


4it [00:23,  6.09s/it]wandb: ERROR Error while calling W&B API: run seung7361/GPT2TextGeneration/k5vvp2pv was previously created and deleted; try a new run name (<Response [409]>)
4it [00:26,  6.61s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.15 GiB (GPU 3; 15.74 GiB total capacity; 13.72 GiB already allocated; 846.69 MiB free; 13.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

wandb: ERROR Error while calling W&B API: run seung7361/GPT2TextGeneration/k5vvp2pv was previously created and deleted; try a new run name (<Response [409]>)
Thread SenderThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/apis/normalize.py", line 41, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 1690, in upsert_run
    response = self.gql(
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 257, in gql
    ret = self._retry_gql(
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/retry.py", line 131, in __call__
    result = self._call_fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 285, in execute
    return self.client.execute(*args, **kwargs)  # type: ignore
  File "/opt/conda/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/clien