In [1]:
import torch
import numpy as np
import os
import sys
from transformers import LlamaConfig, LlamaForCausalLM 
from torch.nn.utils.rnn import pad_sequence # https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html


2023-08-18 12:33:04.625961: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Special token ids. Ids 0, 1 and 2 are reserved and must not appear inside the data itself:
# PAD fills out shorter rows of a batch, and every example is framed as <START> ... <END>.
PAD_TOKEN, START_TOKEN, END_TOKEN = 0, 1, 2

In [3]:
# Size of the token vocabulary (ids run from 0 to vocab_size - 1).
vocab_size = 512

# Number of examples in each split.
train_N, test_N = 1000, 100

# Bounds used when drawing the raw length of each example
# (before the start/end tokens are added).
min_length, max_length = 10, 100

In [5]:
# Example dataset: every example is a run of consecutive token ids such as
# [10, 11, 12, 13, ...], with a random length and a random first token.
#
# Dedicated, seeded generators keep the data reproducible across kernel restarts
# without touching the global torch / numpy random state.
DATA_SEED = 0
torch_rng = torch.Generator().manual_seed(DATA_SEED)
numpy_rng = np.random.default_rng(DATA_SEED)

all_ds = {}
for split, N in [('train', train_N), ('test', test_N)]:
    # `high` is exclusive, so lengths fall in [min_length, max_length).
    sizes = torch.randint(low=min_length, high=max_length, size=(N,), generator=torch_rng)
    total_dataset = []
    # .tolist() yields plain Python ints, so numpy is never handed a 0-d tensor.
    for sz in sizes.tolist():
        # The first 3 token ids are reserved, so a run never starts below 3.
        # `high` is exclusive here too, so the last token of a run is at most vocab_size - 2.
        first_token = int(numpy_rng.integers(low=3, high=vocab_size - sz))
        total_dataset.append(torch.arange(sz) + first_token)
    all_ds[split] = total_dataset

In [6]:
class CustomDS(torch.utils.data.Dataset):
    """Map-style dataset that frames each raw example with start and end tokens.

    Subclasses ``torch.utils.data.Dataset`` as PyTorch recommends for map-style
    datasets; the two methods a map-style dataset needs are ``__len__`` and
    ``__getitem__``.

    Args:
        data: indexable collection of 1-D token-id tensors (one per example).
    """

    def __init__(self, data):
        self.data = data
        # Built once and reused for every item instead of re-created per access.
        self.start = torch.tensor([START_TOKEN])
        self.end = torch.tensor([END_TOKEN])

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return example ``idx`` as ``[START_TOKEN, *tokens, END_TOKEN]``."""
        inp = self.data[idx]
        return torch.cat([self.start, inp, self.end])  # add start and end tokens
# Wrap both splits so each example gets its start/end tokens when it is accessed.
all_datasets = {
    'train': CustomDS(all_ds['train']),
    'test': CustomDS(all_ds['test']),
}

In [17]:
# Collate function: receives a list of examples from the dataset and assembles one batch.
def collate_fn(examples):
    """Pad variable-length examples into a batch with attention mask and labels.

    Args:
        examples: list of token-id tensors of differing lengths.

    Returns:
        dict with
          'input_ids'      - examples right-padded with PAD_TOKEN, batch first,
          'attention_mask' - long tensor, 1 on real tokens and 0 on padding,
          'labels'         - copy of input_ids with padding positions set to -100.
    """
    input_ids = pad_sequence(examples, batch_first=True, padding_value=PAD_TOKEN)
    attention_mask = torch.zeros(input_ids.shape).long()
    labels = input_ids.clone()
    for row, example in enumerate(examples):
        n_real = len(example)
        attention_mask[row, :n_real] = 1  # real tokens
        labels[row, n_real:] = -100  # padding is ignored during training
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }

In [18]:
# The Hugging Face Trainer creates its own loaders; these two exist only for experimenting.
train_loader = torch.utils.data.DataLoader(
    all_datasets['train'],
    batch_size=10,
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = torch.utils.data.DataLoader(
    all_datasets['test'],
    batch_size=10,
    shuffle=True,
    collate_fn=collate_fn,
)

In [19]:
# Pull one batch from the test loader and show it.
b = next(iter(test_loader), None)
if b is not None:
    print(b)

{'input_ids': tensor([[  1, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433,
         434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444,   2,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0],
        [  1, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
         201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
         215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228,
         229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
         243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256,
         257, 258, 259,

In [23]:
# Small Llama-style causal LM built from a config (no pretrained checkpoint is loaded).
config = LlamaConfig(
    vocab_size=vocab_size,
    hidden_size=512,
    num_hidden_layers=5,
    num_attention_heads=1,
    # Spell out the special-token ids so the model config agrees with the
    # notebook's constants instead of depending on the library defaults.
    pad_token_id=PAD_TOKEN,
    bos_token_id=START_TOKEN,
    eos_token_id=END_TOKEN,
)
model = LlamaForCausalLM(config)
# Use the GPU when one is present; stay on CPU otherwise instead of crashing.
if torch.cuda.is_available():
    model = model.cuda()

In [24]:
from transformers import TrainingArguments, Trainer
# Hyperparameters for the Trainer, grouped by concern.
training_args = TrainingArguments(
    # where outputs go / external reporting
    output_dir="test",
    report_to="none",
    # optimisation
    num_train_epochs=10,
    per_device_train_batch_size=128,
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # evaluation and logging, both every 20 steps
    evaluation_strategy="steps",
    eval_steps=20,
    per_device_eval_batch_size=128,
    logging_steps=20,
)

# The Trainer batches the raw datasets itself, using collate_fn as the collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_datasets['train'],
    eval_dataset=all_datasets['test'],
    data_collator=collate_fn,
)

In [25]:
# Run the training loop; the returned TrainOutput is displayed as the cell's result.
trainer.train()



Step,Training Loss,Validation Loss
20,5.147,3.764648
40,2.8629,2.136579
60,1.7891,1.577046
80,1.4963,1.496569


TrainOutput(global_step=80, training_loss=2.8238290309906007, metrics={'train_runtime': 38.7678, 'train_samples_per_second': 257.946, 'train_steps_per_second': 2.064, 'total_flos': 542335506284544.0, 'train_loss': 2.8238290309906007, 'epoch': 10.0})

In [26]:
# Now that the model is trained, check that it continues a run of consecutive tokens.
model.eval()  # make sure any train-only behaviour is switched off for inference
with torch.no_grad():
    prompt = torch.cat([torch.tensor([START_TOKEN]), torch.arange(10) + 20])
    # Place the prompt on whatever device the model lives on rather than assuming CUDA.
    inp = prompt.unsqueeze(0).to(model.device)
    # The prompt contains no padding, so every position is attended to.
    out = model.generate(inp, attention_mask=torch.ones_like(inp), max_new_tokens=10)[0]
    print("Input:", inp[0])
    # generate() returns prompt + continuation; keep only the newly generated tokens.
    print("Output:", out[inp.shape[1]:])

Input: tensor([ 1, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], device='cuda:0')
Output: tensor([30, 31, 32, 33, 34, 35, 36, 37, 38, 39], device='cuda:0')
