A cute little demo showing the simplest usage of minGPT. Configured to run fine on Macbook Air in like a minute.

In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.utils import set_seed
set_seed(3407)

In [2]:
from translate.storage.tmx import tmxfile, tmxunit

In [3]:
src_sents = []
tgt_sents = []
tu: tmxunit
with open("./data/en-tr.tmx/en-tr.tmx", 'rb') as fin:
    tmx_file = tmxfile(fin, 'en', 'tr')
    # Extract the source and target sentences into lists
    for node in tmx_file.unit_iter():
        src_sents.append(node.source)
        tgt_sents.append(node.target)

In [4]:
len(tgt_sents)

205756

In [5]:
from typing import List


class TMXDataset(Dataset):
    def __init__(self, src_sents: List[str], tgt_sents: List[str]):
        self.src = src_sents
        self.tgt = tgt_sents

    def __len__(self):
        return len(self.src)

    # Token count
    def get_vocab_size(self):
        # return self.num_digits
        return 64
    
    # Sequence length
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        # return self.length * 2 - 1
        return 64 * 2 - 1


    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]

In [6]:
# print an example instance of the dataset
# train_dataset = SortDataset('train')
# test_dataset = SortDataset('test')
dataset = TMXDataset(src_sents, tgt_sents)
x, y = dataset[1]
print(x, y)
# for a, b in zip(x,y):
    # print(a, b)

Kosovo is taking a hard look at its privatisation process in light of recurring complaints. Kosova, tekrar eden şikayetler ışığında özelleştirme sürecini incelemeye alıyor.


In [7]:
# create a GPT instance
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-mini'
model_config.vocab_size = dataset.get_vocab_size()
model_config.block_size = dataset.get_block_size()
model = GPT(model_config)

number of parameters: 2.71M


In [8]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000
train_config.num_workers = 0
trainer = Trainer(train_config, model, dataset)

running on device cpu


In [9]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

[('Energy ministers from eight Southeast European countries - Albania, Bosnia and Herzegovina, Bulgaria, Croatia, Greece, Macedonia, Romania, and Yugoslavia - gathered in Athens, and are expected to sign a Memorandum of Understanding on creating a regional electricity market in Southeast Europe.', "Michael O'Reilly, the co-ordinator of Haradinaj's defence team, said he is disappointed the tribunal decided against provisional release.", '(Various sources - 15/11/02)', 'The Albanian government introduced a 10% flat income tax rate on Wednesday (August 1st).', "The movie won two awards at this year's Cannes Festival.", '"I encourage the Kosovo authorities to study both documents thoroughly and to see this as an opportunity to be seized for Kosovo\'s future, by taking timely actions as outlined for the benefit of everyone living in Kosovo," Feith added.', "Seeking to capture the ICTY's most wanted indictee, NATO and EUFOR troops have conducted numerous raids on the Karadzic family home in 

AttributeError: 'tuple' object has no attribute 'to'

In [2]:
import pickle

class SortDataset(Dataset):
    """ 
    Dataset for the Sort problem. E.g. for problem length 6:
    Input: 0 0 2 1 0 1 -> Output: 0 0 0 1 1 2
    Which will feed into the transformer concatenated as:
    input:  0 0 2 1 0 1 0 0 0 1 1
    output: I I I I I 0 0 0 1 1 2
    where I is "ignore", as the transformer is reading the input sequence
    """

    def __init__(self, split, length=8, num_digits=5):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        self.num_digits = num_digits
    
    def __len__(self):
        return 10000 # ...
    
    def get_vocab_size(self):
        return self.num_digits
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return self.length * 2 - 1

    def __getitem__(self, idx):
        
        # use rejection sampling to generate an input example from the desired split
        while True:
            # generate some random integers
            inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long)
            # half of the time let's try to boost the number of examples that 
            # have a large number of repeats, as this is what the model seems to struggle
            # with later in training, and they are kind of rate
            if torch.rand(1).item() < 0.5:
                if inp.unique().nelement() > self.length // 2:
                    # too many unqiue digits, re-sample
                    continue
            # figure out if this generated example is train or test based on its hash
            h = hash(pickle.dumps(inp.tolist()))
            inp_split = 'test' if h % 4 == 0 else 'train' # designate 25% of examples as test
            if inp_split == self.split:
                break # ok
        
        # solve the task: i.e. sort
        sol = torch.sort(inp)[0]

        # concatenate the problem specification and the solution
        cat = torch.cat((inp, sol), dim=0)

        # the inputs to the transformer will be the offset sequence
        x = cat[:-1].clone()
        y = cat[1:].clone()
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:self.length-1] = -1
        return x, y


In [3]:
# print an example instance of the dataset
train_dataset = SortDataset('train')
test_dataset = SortDataset('test')
x, y = train_dataset[0]
for a, b in zip(x,y):
    print(int(a),int(b))

4 -1
3 -1
1 -1
4 -1
1 -1
0 -1
3 -1
0 0
0 0
0 1
1 1
1 3
3 3
3 4
4 4


In [4]:
# create a GPT instance
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model = GPT(model_config)

number of parameters: 0.09M


In [5]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [6]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

[tensor([[2, 2, 0, 0, 3, 0, 3, 1, 0, 0, 0, 1, 2, 2, 3],
        [2, 0, 1, 4, 2, 4, 3, 2, 0, 1, 2, 2, 2, 3, 4],
        [3, 4, 4, 1, 1, 4, 1, 2, 1, 1, 1, 2, 3, 4, 4],
        [4, 3, 3, 0, 4, 3, 0, 4, 0, 0, 3, 3, 3, 4, 4],
        [4, 0, 3, 2, 2, 1, 2, 4, 0, 1, 2, 2, 2, 3, 4],
        [1, 4, 1, 2, 3, 0, 1, 0, 0, 0, 1, 1, 1, 2, 3],
        [2, 0, 4, 4, 2, 4, 2, 1, 0, 1, 2, 2, 2, 4, 4],
        [3, 0, 2, 4, 4, 4, 4, 1, 0, 1, 2, 3, 4, 4, 4],
        [2, 4, 4, 4, 0, 1, 0, 1, 0, 0, 1, 1, 2, 4, 4],
        [3, 4, 3, 0, 4, 4, 1, 2, 0, 1, 2, 3, 3, 4, 4],
        [2, 3, 3, 1, 1, 4, 0, 4, 0, 1, 1, 2, 3, 3, 4],
        [1, 1, 1, 1, 0, 0, 3, 4, 0, 0, 1, 1, 1, 1, 3],
        [3, 3, 0, 4, 3, 0, 0, 4, 0, 0, 0, 3, 3, 3, 4],
        [0, 4, 3, 1, 0, 4, 4, 0, 0, 0, 0, 1, 3, 4, 4],
        [3, 4, 3, 2, 2, 4, 0, 3, 0, 2, 2, 3, 3, 3, 4],
        [3, 1, 2, 3, 2, 4, 4, 1, 1, 1, 2, 2, 3, 3, 4],
        [3, 3, 2, 0, 2, 1, 4, 0, 0, 0, 1, 2, 2, 3, 3],
        [2, 4, 2, 2, 3, 4, 0, 0, 0, 0, 2, 2, 2, 3, 4],
        [

KeyboardInterrupt: 

In [20]:
# now let's perform some evaluation
model.eval();

In [21]:
def eval_split(trainer, split, max_batches):
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n = train_dataset.length # naugy direct access shrug
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        # isolate the input pattern alone
        inp = x[:, :n]
        sol = y[:, -n:]
        # let the model sample the rest of the sequence
        cat = model.generate(inp, n, do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, n:] # isolate the filled in sequence
        # compare the predicted sequence to the true sequence
        correct = (sol == sol_candidate).all(1).cpu() # Software 1.0 vs. Software 2.0 fight RIGHT on this line haha
        for i in range(x.size(0)):
            results.append(int(correct[i]))
            if not correct[i] and mistakes_printed_already < 3: # only print up to 5 mistakes to get a sense
                mistakes_printed_already += 1
                print("GPT claims that %s sorted is %s but gt is %s" % (inp[i].tolist(), sol_candidate[i].tolist(), sol[i].tolist()))
        if max_batches is not None and b+1 >= max_batches:
            break
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
with torch.no_grad():
    train_score = eval_split(trainer, 'train', max_batches=50)
    test_score  = eval_split(trainer, 'test',  max_batches=50)

GPT claims that [4, 0, 0, 2, 0, 0, 0, 1] sorted is [0, 0, 0, 0, 1, 1, 2, 4] but gt is [0, 0, 0, 0, 0, 1, 2, 4]
GPT claims that [0, 4, 0, 0, 0, 0, 0, 2] sorted is [0, 0, 0, 0, 0, 1, 2, 4] but gt is [0, 0, 0, 0, 0, 0, 2, 4]
GPT claims that [0, 3, 1, 0, 2, 0, 0, 0] sorted is [0, 0, 0, 0, 1, 1, 2, 3] but gt is [0, 0, 0, 0, 0, 1, 2, 3]
train final score: 4976/5000 = 99.52% correct
GPT claims that [0, 0, 0, 0, 2, 0, 3, 1] sorted is [0, 0, 0, 0, 1, 1, 2, 3] but gt is [0, 0, 0, 0, 0, 1, 2, 3]
GPT claims that [0, 0, 2, 0, 0, 2, 1, 0] sorted is [0, 0, 0, 0, 1, 1, 2, 2] but gt is [0, 0, 0, 0, 0, 1, 2, 2]
GPT claims that [0, 0, 1, 1, 0, 0, 0, 2] sorted is [0, 0, 0, 0, 1, 1, 1, 2] but gt is [0, 0, 0, 0, 0, 1, 1, 2]
test final score: 4980/5000 = 99.60% correct


In [23]:
train_dataset.length

8

In [24]:
# let's run a random given sequence through the model as well
n = train_dataset.length # naugy direct access shrug
inp = torch.tensor([[2, 0, 2, 1, 0, 1, 4, 3]], dtype=torch.long).to(trainer.device)
assert inp[0].nelement() == n
with torch.no_grad():
    cat = model.generate(inp, n, do_sample=False)
sol = torch.sort(inp[0])[0]
sol_candidate = cat[:, n:]
print('input sequence  :', inp.tolist())
print('predicted sorted:', sol_candidate.tolist())
print('gt sort         :', sol.tolist())
print('matches         :', bool((sol == sol_candidate).all()))

input sequence  : [[2, 0, 2, 1, 0, 1, 4, 3]]
predicted sorted: [[0, 0, 1, 1, 2, 2, 3, 4]]
gt sort         : [0, 0, 1, 1, 2, 2, 3, 4]
matches         : True
