In [1]:
%%writefile /home/ubuntu/kevin.jung/colossal/callback.py
import psutil
import torch
import torch.distributed as dist
from pytorch_lightning.callbacks import Callback


def print_rank_0(*args, **kwargs):
    """Print from rank 0 only, then synchronise every rank.

    All positional and keyword arguments are forwarded unchanged to the
    built-in ``print``.

    When ``torch.distributed`` is unavailable, or no default process group has
    been initialised yet, there is no rank to query and nothing to synchronise
    with, so the message is printed directly and no barrier is issued.
    """
    if not (dist.is_available() and dist.is_initialized()):
        print(*args, **kwargs)
        return
    if dist.get_rank() == 0:
        print(*args, **kwargs)
    # Every rank (not only rank 0) waits here until all ranks have arrived.
    dist.barrier()


def get_cpu_mem():
    """Return the resident set size (``rss``) psutil reports for this process."""
    current_process = psutil.Process()
    memory = current_process.memory_info()
    return memory.rss


class MemoryMonitor(Callback):
    """Lightning callback that reports host and CUDA memory usage around ``fit``.

    The host-side peak is tracked by sampling ``get_cpu_mem()`` after every
    training batch; CUDA figures come from ``torch.cuda``'s allocator counters.
    All messages go through ``print_rank_0``.
    """

    def __init__(self) -> None:
        super().__init__()
        # Largest get_cpu_mem() value sampled so far (updated per training batch).
        self.max_cpu_mem = 0

    def on_fit_start(self, trainer, pl_module) -> None:
        """Print the memory baseline (host, current CUDA, peak CUDA) before training."""
        max_cuda_mem = torch.cuda.max_memory_allocated()
        cuda_mem = torch.cuda.memory_allocated()
        baseline_report = (
            f'CPU memory before training: {get_cpu_mem()/1024**2:.3f} MB',
            f'CUDA memory before training: {cuda_mem/1024**2:.3f} MB',
            f'Max CUDA memory before training: {max_cuda_mem/1024**2:.3f} MB',
        )
        for line in baseline_report:
            print_rank_0(line)

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:
        """Sample host memory and keep the running maximum."""
        sampled = get_cpu_mem()
        if sampled > self.max_cpu_mem:
            self.max_cpu_mem = sampled

    def on_fit_end(self, trainer, pl_module) -> None:
        """Print the peak host memory seen per batch and the peak CUDA allocation."""
        max_cuda_mem = torch.cuda.max_memory_allocated()
        final_report = (
            f'Max CPU memory: {self.max_cpu_mem/1024**2:.3f} MB',
            f'Max CUDA memory: {max_cuda_mem/1024**2:.3f} MB',
        )
        for line in final_report:
            print_rank_0(line)


Writing /home/ubuntu/kevin.jung/colossal/callback.py


In [2]:
%%writefile /home/ubuntu/kevin.jung/colossal/data.py
import torch

__all__ = ['RandomDataloader']


def get_data(batch_size, seq_len, vocab_size, device=None):
    """Create one random batch of token ids with an all-ones attention mask.

    Args:
        batch_size: Number of sequences in the batch.
        seq_len: Number of tokens per sequence.
        vocab_size: Exclusive upper bound for the sampled token ids.
        device: Device to allocate the tensors on. When ``None`` (the default)
            the current CUDA device is used if CUDA is available, otherwise
            the tensors are created on the CPU.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of two tensors with shape
        ``(batch_size, seq_len)``; ``attention_mask`` has the same dtype and
        device as ``input_ids`` and contains only ones.
    """
    if device is None:
        # Fall back to the CPU so the generator also works on hosts without a GPU.
        device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
    attention_mask = torch.ones_like(input_ids)
    return input_ids, attention_mask


class RandomDataloader:
    """Finite, re-iterable source of random batches.

    One pass yields ``n_steps`` batches, each produced by
    ``get_data(batch_size, seq_len, vocab_size)``. The object is its own
    iterator, so calling ``iter()`` on it rewinds the step counter.
    """

    def __init__(self, n_steps: int, batch_size: int, seq_len: int = 1024, vocab_size: int = 50257) -> None:
        self.n_steps, self.cur_step = n_steps, 0
        # Batch geometry forwarded to get_data on every step.
        self.batch_size, self.seq_len, self.vocab_size = batch_size, seq_len, vocab_size

    def __len__(self):
        """Number of batches produced per pass."""
        return self.n_steps

    def __iter__(self):
        """Rewind to the first step and return this object as the iterator."""
        self.cur_step = 0
        return self

    def __next__(self):
        """Return the next random batch, or stop once ``n_steps`` have been served."""
        if self.cur_step < self.n_steps:
            self.cur_step += 1
            return get_data(self.batch_size, self.seq_len, self.vocab_size)
        raise StopIteration
        

Writing /home/ubuntu/kevin.jung/colossal/data.py


In [3]:
%%writefile /home/ubuntu/kevin.jung/colossal/model.py
import torch.nn as nn
import pytorch_lightning as pl
from transformers import GPT2Config, GPT2LMHeadModel, GPT2PreTrainedModel
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import colo_set_process_memory_fraction
from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
from torch.optim import Adam, Optimizer
from functools import partial
from typing import Callable, Iterable
from contextlib import contextmanager
__all__ = ['GPTLitModule', 'get_optimizer']


@contextmanager
def no_init_weights():
    """Temporarily replace ``GPT2PreTrainedModel._init_weights`` with a no-op.

    While the context is active, the class attribute points at a stub that
    accepts any arguments and returns ``None``. The original attribute is put
    back on exit, including when the body raises.
    """
    def dummy_fn(*args, **kwargs):
        return

    # Capture the original before entering the try block, so the finally
    # clause always has a bound value to restore.
    old_init_weights = GPT2PreTrainedModel._init_weights
    GPT2PreTrainedModel._init_weights = dummy_fn
    try:
        yield
    finally:
        GPT2PreTrainedModel._init_weights = old_init_weights


class GPTLMModel(nn.Module):
    """Thin wrapper that builds a ``GPT2LMHeadModel`` from a handful of size knobs.

    The wrapped model is constructed inside ``no_init_weights()``. When
    ``checkpoint`` is true, gradient checkpointing is enabled on the wrapped
    model and ``use_cache`` is turned off in ``forward``.
    """

    def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=12, max_seq_len=1024, vocab_size=50257, checkpoint=False):
        super().__init__()
        self.checkpoint = checkpoint
        with no_init_weights():
            config = GPT2Config(
                n_embd=hidden_size,
                n_layer=num_layers,
                n_head=num_attention_heads,
                n_positions=max_seq_len,
                n_ctx=max_seq_len,
                vocab_size=vocab_size,
            )
            self.model = GPT2LMHeadModel(config)
        if checkpoint:
            self.model.gradient_checkpointing_enable()

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only the first output (the LM logits)."""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            use_cache=not self.checkpoint,
        )
        return outputs[0]


def gpt2_tiny(checkpoint=True):
    """Preset: hidden size 128, 4 layers, 4 attention heads."""
    return GPTLMModel(hidden_size=128, num_layers=4, num_attention_heads=4, checkpoint=checkpoint)


def gpt2_small(checkpoint=True):
    """Preset: hidden size 768, 12 layers, 12 attention heads."""
    return GPTLMModel(hidden_size=768, num_layers=12, num_attention_heads=12, checkpoint=checkpoint)


def gpt2_medium(checkpoint=True):
    """Preset: hidden size 1024, 24 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=1024, num_layers=24, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_large(checkpoint=True):
    """Preset: hidden size 1280, 36 layers, 20 attention heads."""
    return GPTLMModel(hidden_size=1280, num_layers=36, num_attention_heads=20, checkpoint=checkpoint)


def gpt2_xl(checkpoint=True):
    """Preset: hidden size 1600, 48 layers, 25 attention heads."""
    return GPTLMModel(hidden_size=1600, num_layers=48, num_attention_heads=25, checkpoint=checkpoint)


def gpt2_2B(checkpoint=True):
    """Preset: hidden size 2048, 40 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=2048, num_layers=40, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_3B(checkpoint=True):
    """Preset: hidden size 2304, 48 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=2304, num_layers=48, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_4B(checkpoint=True):
    """Preset: hidden size 2304, 64 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=2304, num_layers=64, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_6B(checkpoint=True):
    """Preset: hidden size 4096, 30 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=4096, num_layers=30, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_8B(checkpoint=True):
    """Preset: hidden size 3072, 72 layers, 24 attention heads."""
    return GPTLMModel(hidden_size=3072, num_layers=72, num_attention_heads=24, checkpoint=checkpoint)


def gpt2_12B(checkpoint=True):
    """Preset: hidden size 4096, 60 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=4096, num_layers=60, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_15B(checkpoint=True):
    """Preset: hidden size 4096, 78 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=4096, num_layers=78, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_18B(checkpoint=True):
    """Preset: hidden size 4096, 90 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=4096, num_layers=90, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_20B(checkpoint=True):
    """Preset: hidden size 8192, 25 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=8192, num_layers=25, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_24B(checkpoint=True):
    """Preset: hidden size 8192, 30 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=8192, num_layers=30, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_28B(checkpoint=True):
    """Preset: hidden size 8192, 35 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=8192, num_layers=35, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_32B(checkpoint=True):
    """Preset: hidden size 8192, 40 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=8192, num_layers=40, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_36B(checkpoint=True):
    """Preset: hidden size 8192, 45 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=8192, num_layers=45, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_40B(checkpoint=True):
    """Preset: hidden size 8192, 50 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=8192, num_layers=50, num_attention_heads=16, checkpoint=checkpoint)


def gpt2_45B(checkpoint=True):
    """Preset: hidden size 8192, 56 layers, 16 attention heads."""
    return GPTLMModel(hidden_size=8192, num_layers=56, num_attention_heads=16, checkpoint=checkpoint)


def gpt3(checkpoint=True):
    """Preset: max sequence length 2048, hidden size 12288, 96 layers, 96 attention heads."""
    return GPTLMModel(max_seq_len=2048, hidden_size=12288, num_layers=96, num_attention_heads=96, checkpoint=checkpoint)


def get_gpt_model(model_name: str, checkpoint: bool = True) -> nn.Module:
    """Build one of the preset models by name.

    Args:
        model_name: Key of the preset to build, e.g. ``'gpt2_xl'`` or ``'gpt3'``.
        checkpoint: Forwarded to the preset factory's ``checkpoint`` argument.

    Returns:
        The model returned by the selected preset factory.

    Raises:
        ValueError: If ``model_name`` is not a known preset. The message lists
            every accepted name.
    """
    model_map = {
        'gpt2_tiny': gpt2_tiny,
        'gpt2_small': gpt2_small,
        'gpt2_medium': gpt2_medium,
        'gpt2_large': gpt2_large,
        'gpt2_xl': gpt2_xl,
        'gpt2_2B': gpt2_2B,
        'gpt2_3B': gpt2_3B,
        'gpt2_4B': gpt2_4B,
        'gpt2_6B': gpt2_6B,
        'gpt2_8B': gpt2_8B,
        'gpt2_12B': gpt2_12B,
        'gpt2_15B': gpt2_15B,
        'gpt2_18B': gpt2_18B,
        'gpt2_20B': gpt2_20B,
        'gpt2_24B': gpt2_24B,
        'gpt2_28B': gpt2_28B,
        'gpt2_32B': gpt2_32B,
        'gpt2_36B': gpt2_36B,
        'gpt2_40B': gpt2_40B,
        'gpt2_45B': gpt2_45B,
        'gpt3': gpt3,
    }
    # Explicit check instead of `assert`: assertions are removed under `python -O`
    # and give the user no hint about which names are valid.
    if model_name not in model_map:
        valid_names = ', '.join(model_map)
        raise ValueError(f'Unknown model name {model_name!r}; expected one of: {valid_names}')
    return model_map[model_name](checkpoint)


class GPTLMLoss(nn.Module):
    """Next-token cross-entropy: position ``t`` of the logits is scored against label ``t + 1``."""

    def __init__(self):
        super().__init__()
        self.loss = nn.CrossEntropyLoss()

    def forward(self, logits, labels):
        """Drop the last logit step and the first label, flatten both, and apply the loss."""
        last_dim = logits.size(-1)
        # The final position has no following token to predict.
        predictions = logits[..., :-1, :].contiguous().view(-1, last_dim)
        # The first token is never a prediction target.
        targets = labels[..., 1:].contiguous().view(-1)
        return self.loss(predictions, targets)


def get_optimizer(strategy: str, **kwargs) -> Callable[[Iterable], Optimizer]:
    """Return a factory that builds the optimizer matching a training strategy.

    Args:
        strategy: One of ``'ddp'``, ``'deepspeed'`` or ``'colossal'``.
        **kwargs: Keyword arguments bound into the returned factory and passed
            to the optimizer constructor. The optional ``offload`` key
            (default ``False``) is consumed here and never forwarded; it only
            affects ``'deepspeed'``, where it selects ``DeepSpeedCPUAdam``
            instead of ``FusedAdam``.

    Returns:
        A ``functools.partial`` over the selected optimizer class; call it with
        the parameters to optimise.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    supported = ('ddp', 'deepspeed', 'colossal')
    if strategy not in supported:
        raise ValueError(f'Unknown strategy {strategy!r}; expected one of: {", ".join(supported)}')
    # `offload` picks the optimizer class; it is not an optimizer argument, so
    # strip it for every strategy and tolerate its absence.
    offload = kwargs.pop('offload', False)
    if strategy == 'ddp':
        opt_cls = Adam
    elif strategy == 'deepspeed':
        opt_cls = DeepSpeedCPUAdam if offload else FusedAdam
    else:
        opt_cls = HybridAdam
    return partial(opt_cls, **kwargs)


class GPTLitModule(pl.LightningModule):
    """LightningModule that trains a named GPT preset with a language-modelling loss.

    The network itself is created in ``configure_sharded_model`` rather than in
    ``__init__``; the optimizer comes from the factory passed by the caller.
    """

    def __init__(self, model_name: str, optimizer_init_fn: Callable[[Iterable], Optimizer],
                 checkpoint: bool = True, cuda_mem_fraction: float = 1.0) -> None:
        super().__init__()
        # Settings consumed later by the Lightning hooks below.
        self.model_name = model_name
        self.optimizer_init_fn = optimizer_init_fn
        self.checkpoint = checkpoint
        self.criterion = GPTLMLoss()
        self.cuda_mem_fraction = cuda_mem_fraction

    def configure_sharded_model(self) -> None:
        """Instantiate the preset selected by ``model_name``."""
        self.model = get_gpt_model(self.model_name, self.checkpoint)

    def on_load_checkpoint(self, checkpoint) -> None:
        """Make sure ``self.model`` exists when a checkpoint is loaded."""
        if hasattr(self, 'model'):
            return
        self.configure_sharded_model()

    def configure_optimizers(self):
        """Build the optimizer over the model's parameters via the injected factory."""
        params = self.model.parameters()
        return self.optimizer_init_fn(params)

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch; the input ids double as the labels."""
        tokens, mask = batch
        return self.criterion(self.model(tokens, mask), tokens)

    def on_fit_start(self) -> None:
        """Apply the process memory fraction when it is below 1.0."""
        fraction = self.cuda_mem_fraction
        if fraction < 1.0:
            colo_set_process_memory_fraction(fraction)
            

Writing /home/ubuntu/kevin.jung/colossal/model.py


In [13]:
%%writefile /home/ubuntu/kevin.jung/colossal/train.py
import pytorch_lightning as pl
import argparse
from data import RandomDataloader
from model import GPTLitModule, get_optimizer
from callback import MemoryMonitor
from pytorch_lightning.callbacks import TQDMProgressBar
from pytorch_lightning.strategies.ddp import DDPStrategy
from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy
from pytorch_lightning.strategies.colossalai import ColossalAIStrategy

if __name__ == '__main__':
    # Command-line options as (flag, add_argument keyword options), in declaration order.
    cli_options = [
        ('--tqdm_rate', dict(type=int, default=2000)),
        ('--epochs', dict(type=int, default=2)),
        ('--steps_per_epoch', dict(type=int, default=4)),
        ('--batch_size', dict(type=int, default=1)),
        ('--lr', dict(type=float, default=1e-3)),
        ('--model', dict(default='gpt2_xl')),
        ('--np', dict(type=int, default=1)),
        ('--no_activation_ckpt', dict(action='store_true', default=False)),
        ('--opt_nvme_offload_frac', dict(type=float, default=0.0)),
        ('--opt_nvme_offload_dir', dict(default='./offload')),
        ('--seq_len', dict(type=int, default=1024)),
        ('--placement_policy', dict(default='cuda')),
        ('--opt_gpu_margin_rat', dict(type=float, default=0.0)),
        ('--cuda_mem_frac', dict(type=float, default=1.0)),
        ('--strategy', dict(default='ddp', choices=['ddp', 'colossal', 'deepspeed'])),
        ('--offload', dict(action='store_true', default=False)),
    ]
    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    train_dataloader = RandomDataloader(args.steps_per_epoch, args.batch_size, args.seq_len)

    # Each strategy contributes its own Strategy object plus any extra optimizer options.
    optimizer_cfg = {'lr': args.lr}
    if args.strategy == 'ddp':
        strategy = DDPStrategy(static_graph=True)
    elif args.strategy == 'colossal':
        strategy = ColossalAIStrategy(
            placement_policy=args.placement_policy,
            gpu_margin_mem_ratio=args.opt_gpu_margin_rat,
            initial_scale=32,
            chunk_search_range=64 * 1024**2,
            chunk_search_n_grids=4096,
            min_chunk_size=32 * 1024**2,
        )
        optimizer_cfg.update(
            nvme_offload_dir=args.opt_nvme_offload_dir,
            nvme_offload_fraction=args.opt_nvme_offload_frac,
        )
    elif args.strategy == 'deepspeed':
        strategy = DeepSpeedStrategy(
            stage=3,
            offload_parameters=args.offload,
            offload_optimizer=args.offload,
            initial_scale_power=5,
        )
        optimizer_cfg['offload'] = args.offload
    # Accelerator and precision are the same for every strategy.
    trainer_cfg = {'accelerator': 'gpu', 'precision': 16, 'strategy': strategy}

    opt_init_fn = get_optimizer(args.strategy, **optimizer_cfg)
    model = GPTLitModule(
        args.model,
        opt_init_fn,
        checkpoint=not args.no_activation_ckpt,
        cuda_mem_fraction=args.cuda_mem_frac,
    )
    callbacks = [MemoryMonitor(), TQDMProgressBar(refresh_rate=args.tqdm_rate)]
    trainer = pl.Trainer(
        max_epochs=args.epochs,
        devices=args.np,
        enable_checkpointing=False,
        callbacks=callbacks,
        **trainer_cfg
    )
    trainer.fit(model, train_dataloader)

Overwriting /home/ubuntu/kevin.jung/colossal/train.py


In [16]:
%%writefile /home/ubuntu/kevin.jung/colossal/train_start
# export CUDA_LAUNCH_BLOCKING="1"
# export CUDA_VISIBLE_DEVICES="0,1,2,3"
export TOKENIZERS_PARALLELISM="0"


# Launch settings. "default" is the argparse default declared in train.py.
EXECUTEFILE="colossal/train.py"    # needs custom trainer path
EPOCHS=100                         # type=int       default=2
TQDM_RATE=2000                     # type=int       default=2000
LEARNING_RATE=5e-5                 # type=float     default=1e-3
STRATEGY="colossal"                # type=str       default='ddp'         choices=['ddp', 'colossal', 'deepspeed']
ACCELERATOR="gpu"                  # not passed to train.py, which hard-codes accelerator='gpu'
NP=-1                              # type=int       default=1             forwarded to Trainer(devices=...)
BATCHSIZE=1                        # type=int       default=1
MODEL_NAME='gpt2_2B'               # type=str       default='gpt2_xl'     choices=['gpt2_tiny'~'gpt2_xl'~'gpt3']
STEPS_PER_EPOCH=4                  # type=int       default=4
NAC=false                          # true => pass --no_activation_ckpt (disables activation checkpointing)
OFFLOAD=false                      # true => pass --offload (only read by the 'deepspeed' strategy)
OPT_NVME_OFFLOAD_FRAC=0.0          # type=float     default=0.0
OPT_NVME_OFFLOAD_DIR='/data/opt/'  # type=str       default='./offload'
SEQ_LEN=1024                       # type=int       default=1024
PLACEMENT_POLICY='cuda'            # type=str       default='cuda'
OPT_GPU_MARGIN_RAT=0.0             # type=float     default=0.0
CUDA_MEMORY_FRAC=1.0               # type=float     default=1.0

# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/ubuntu/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/home/ubuntu/anaconda3/etc/profile.d/conda.sh" ]; then
        . "/home/ubuntu/anaconda3/etc/profile.d/conda.sh"
    else
        export PATH="/home/ubuntu/anaconda3/bin:$PATH"
    fi
fi
unset __conda_setup
# <<< conda initialize <<<

conda activate nlp_env

# --no_activation_ckpt and --offload are store_true switches in train.py:
# their mere presence enables them, so they are added only when the matching
# toggle above is "true".
EXTRA_FLAGS=""
if [ "$NAC" = "true" ]; then
    EXTRA_FLAGS="$EXTRA_FLAGS --no_activation_ckpt"
fi
if [ "$OFFLOAD" = "true" ]; then
    EXTRA_FLAGS="$EXTRA_FLAGS --offload"
fi

# $EXTRA_FLAGS is left unquoted on purpose so it splits into separate arguments
# (and disappears entirely when empty).
python "$EXECUTEFILE" \
  --tqdm_rate "$TQDM_RATE" \
  --model "$MODEL_NAME" \
  --epochs "$EPOCHS" \
  --steps_per_epoch "$STEPS_PER_EPOCH" \
  --batch_size "$BATCHSIZE" \
  --seq_len "$SEQ_LEN" \
  --cuda_mem_frac "$CUDA_MEMORY_FRAC" \
  --np "$NP" \
  --strategy "$STRATEGY" \
  --placement_policy "$PLACEMENT_POLICY" \
  --lr "$LEARNING_RATE" \
  --opt_nvme_offload_frac "$OPT_NVME_OFFLOAD_FRAC" \
  --opt_nvme_offload_dir "$OPT_NVME_OFFLOAD_DIR" \
  --opt_gpu_margin_rat "$OPT_GPU_MARGIN_RAT" \
  $EXTRA_FLAGS
  

Overwriting /home/ubuntu/kevin.jung/colossal/train_start
