In [2]:

# Configure the root logger once for the whole notebook: INFO and above,
# each record stamped with time, level and the emitting logger's name.
import logging

LOG_FORMAT = "%(asctime)s - %(levelname)s - %(name)s -    %(message)s"
LOG_DATE_FORMAT = "%m/%d/%Y %H:%M:%S"

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

In [3]:
# make deterministic
# NOTE(review): `set_seed` lives in the project-local `utils` module, which is
# not visible from this notebook; presumably it seeds the Python / NumPy /
# PyTorch random generators with the given value — confirm against utils.py.
from utils import set_seed
set_seed(42)
# numerical / deep-learning stack used by the cells below
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
import math
from torch.utils.data import Dataset


class CharDataset(Dataset):
    """Character-level language-modelling dataset over a single string.

    Every example is a window of ``block_size + 1`` consecutive characters
    taken from ``data``: the first ``block_size`` characters are the input and
    the same window shifted right by one character is the target.

    Attributes:
        stoi: maps each distinct character of ``data`` to an integer id
            (ids follow the sorted order of the characters).
        itos: inverse mapping of ``stoi``.
        block_size: number of characters in each input / target sequence.
        vocab_size: number of distinct characters in ``data``.
        data: the raw text the windows are sliced from.
    """

    def __init__(self, data, block_size):
        """Build the character vocabulary of ``data`` and remember the window size.

        Args:
            data: the full training text (any sliceable sequence of characters).
            block_size: length of each input / target sequence.
        """
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print(f"data has {data_size:d} characters, {vocab_size:d} unique.")

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Return the number of full ``block_size + 1`` windows in the data.

        Clamped at zero: a text shorter than ``block_size`` simply contains no
        examples, whereas a negative value would make ``len()`` raise.
        """
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair starting at character ``idx``.

        Args:
            idx: window start; negative values count from the end, as with lists.

        Returns:
            Two ``torch.long`` tensors of length ``block_size``; the target is
            the input shifted one character to the right.

        Raises:
            IndexError: if ``idx`` does not address a full window. Without this
                check the slice below would silently yield truncated or empty
                tensors, and plain iteration over the dataset would never stop.
        """
        count = len(self)
        start = idx + count if idx < 0 else idx
        if not 0 <= start < count:
            raise IndexError(
                f"index {idx} is out of range for CharDataset of length {count}"
            )

        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[start:start + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]

        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [5]:
block_size = 128  # spatial extent of the model for its context (characters per training sequence)

# Read the whole corpus into memory. The context manager closes the file handle
# once the text is loaded, and the explicit encoding keeps the character
# vocabulary independent of the platform's default codec.
# NOTE(review): assumes filename.txt is UTF-8 encoded — confirm for your corpus.
with open('filename.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# pass the variable rather than a second literal so the two cannot drift apart
train_dataset = CharDataset(text, block_size=block_size)

data has 43184 characters, 85 unique.


In [6]:
from model import GPT, GPTConfig


# Architecture hyperparameters for this run: depth, attention heads per layer
# and embedding width. Vocabulary size and context length come from the dataset
# so the model always matches the data it is trained on.
arch_kwargs = dict(n_layer=4, n_head=8, n_embd=512)
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size, **arch_kwargs)
model = GPT(mconf)

In [None]:
from trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
# Name the two values that the token budgets below are derived from, so that
# changing the epoch count or batch size cannot leave them out of sync.
max_epochs = 2
batch_size = 512

tconf = TrainerConfig(
    max_epochs=max_epochs,
    batch_size=batch_size,
    learning_rate=6e-4,
    lr_decay=True,
    # 20 batches' worth, expressed via batch_size instead of a repeated literal
    warmup_tokens=batch_size * 20,
    # every dataset item contributes block_size characters, once per epoch
    final_tokens=max_epochs * len(train_dataset) * train_dataset.block_size,
    num_workers=2,
)
# NOTE(review): the third positional argument is presumably the held-out/test
# dataset (none is used here) — confirm against Trainer's signature.
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

  0%|          | 0/85 [00:00<?, ?it/s]

In [None]:
from utils import sample

# Prompt the trained model and print the continuation it generates.
context = "Jean Valjean made his appearance"

# Encode the prompt with the training vocabulary (every character must already
# be a key of stoi), add a leading batch axis of size 1, and move the tensor to
# the device the trainer reports.
context_ids = [train_dataset.stoi[ch] for ch in context]
x = torch.tensor(context_ids, dtype=torch.long).unsqueeze(0).to(trainer.device)

# NOTE(review): 2000 is presumably the number of characters to generate, and
# top_k=10 presumably restricts sampling to the 10 most likely characters —
# confirm against utils.sample. Only the first row of the result is kept.
y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]

# decode the integer ids back into text
completion = ''.join(train_dataset.itos[int(tok)] for tok in y)
print(completion)