<h1>Character Level GPT on Text Data</h1>

### Edit the filename below to train the GPT model on the corpus. Select "Run" -> "Run All".

In [1]:
%env SCOREP_ENABLE_TRACING=1
%env SCOREP_ENABLE_PROFILING=0
%env SCOREP_TOTAL_MEMORY=3g

set user environment sucessfully: {'SCOREP_ENABLE_TRACING': '1', 'SCOREP_ENABLE_PROFILING': '0', 'SCOREP_TOTAL_MEMORY': '3g'}

In [2]:
%%scorep_python_binding_arguments
--noinstrumenter

use the following scorep python binding arguments: --noinstrumenter

In [3]:
filename = "fairytales.txt"

In [4]:
%%execute_with_scorep
import scorep
import logging

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%d/%m/%Y %H:%M:%S",
    level=logging.INFO)

from utils import set_seed
set_seed(42)

import numpy as numpy
import torch
import torch.nn as nn
from torch.nn import functional as F

import math
from torch.utils.data import Dataset

class CharDataset(Dataset):
    def __init__(self, data, block_size):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print("data has %d characters, %d unique." % (data_size, vocab_size))

        self.stoi = {ch:i for i, ch in enumerate(chars)}
        self.itos = {i:ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        chunk = self.data[idx : idx+self.block_size+1]
        dix = [self.stoi[s] for s in chunk]

        x = torch.tensor(dix[:-1], dtype = torch.long)
        y = torch.tensor(dix[1:], dtype = torch.long)
        return x, y

with scorep.instrumenter.enable():
    block_size = 32

    text = open("./{}".format(filename), "r").read()
    train_dataset = CharDataset(text, block_size)

    from model import GPT, GPTconfig
    mconf = GPTconfig(train_dataset.vocab_size, train_dataset.block_size,
                      n_layer=8, n_head=8, n_embd=512)
    model = GPT(mconf)

    from trainer import Trainer, TrainerConfig

    tconf = TrainerConfig(max_epochs=3, batch_size=512, learning_rate=6e-4,
                          lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,
                          num_workers=4)
    trainer = Trainer(model, train_dataset, None, tconf)

    torch.cuda.empty_cache()
    trainer.train()

    torch.save(model.state_dict(), "./saved_models/trained_gpt_model")

data has 49496 characters, 79 unique.


15/09/2021 15:35:08 - INFO - model - Number of parameters : 2.531738e+07
epoch 1 iter 96: train loss 2.31473. lr 0.00030152924503397155: 100%|██████████| 97/97 [02:27<00:00,  1.52s/it]
epoch 2 iter 96: train loss 2.05380. lr 5.9999999999999995e-05: 100%|██████████| 97/97 [02:27<00:00,  1.52s/it]
epoch 3 iter 96: train loss 1.88871. lr 0.0003015292450339715: 100%|██████████| 97/97 [02:27<00:00,  1.52s/it] 
