In [1]:
import os
# Set the flag before `ray` is imported in the next cell.
# NOTE(review): presumably a Ray development/feature switch -- confirm it is still required.
os.environ["RAY_ML_DEV"] = "1"

In [2]:
from typing import Any
import ray
from ray.tune.syncer import SyncConfig
import torch
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, load_metric
import numpy as np
import pandas as pd
from ray.data.preprocessors import Chain
import evaluate

# Hugging Face model id; used below to load both the tokenizer and the causal LM.
MODEL_NAME = "databricks/dolly-v2-3b"

# Load the "tiny_shakespeare" dataset; it is converted to Ray datasets further down.
current_dataset = load_dataset("tiny_shakespeare")

# NOTE(review): mid-cell import; consider moving it next to the other `ray.data.preprocessors` import.
from ray.data.preprocessors import BatchMapper


def split_text(batch: pd.DataFrame) -> pd.DataFrame:
    """Break a batch of raw text into one row per kept line.

    All values of the ``text`` column are concatenated without a separator,
    split on newline characters and stripped of surrounding whitespace.
    Blank lines and lines whose last character is ``:`` are discarded
    (presumably speaker headings in this dataset -- confirm).

    Args:
        batch: DataFrame with a ``text`` column of strings.

    Returns:
        A new DataFrame with a single ``text`` column, one kept line per row.
    """
    joined = "".join(batch["text"])
    kept_lines = []
    for raw_line in joined.split("\n"):
        line = raw_line.strip()
        # Skip empty lines and lines that end with a colon.
        if not line or line.endswith(":"):
            continue
        kept_lines.append(line)
    return pd.DataFrame(kept_lines, columns=["text"])


def tokenize(batch: pd.DataFrame) -> dict:
    """Tokenize a batch of text lines into fixed-length causal-LM inputs.

    Every value of the ``text`` column is tokenized with the ``MODEL_NAME``
    tokenizer, truncated to 256 tokens and left-padded to exactly 256 tokens.
    The EOS token doubles as the padding token.

    Args:
        batch: DataFrame with a ``text`` column of strings.

    Returns:
        A dict of NumPy arrays: the tokenizer's outputs (including
        ``input_ids`` and ``attention_mask``) plus ``labels``. ``labels``
        equals ``input_ids`` at attended positions and ``-100`` at padded
        positions.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
    # Reuse EOS as the padding token so that padding="max_length" can be applied.
    tokenizer.pad_token = tokenizer.eos_token
    ret = tokenizer(
        list(batch["text"]),
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="np",
    )
    # Mask padded positions with -100 (the index ignored by the language-modeling
    # loss). A plain copy of input_ids would make the model train on the padding
    # tokens, which dominate short lines padded to 256.
    ret["labels"] = np.where(ret["attention_mask"] == 1, ret["input_ids"], -100)
    return dict(ret)


# Wrap the two batch functions as Ray preprocessors that receive pandas batches.
splitter = BatchMapper(split_text, batch_format="pandas")
# NOTE(review): this `tokenizer` is a BatchMapper; a later cell rebinds the same
# name to a Hugging Face tokenizer. Consider a distinct name for one of them.
tokenizer = BatchMapper(tokenize, batch_format="pandas")
# Combined preprocessor handed to the trainer: line splitting, then tokenization.
preprocessor = Chain(splitter, tokenizer)

# Convert the Hugging Face dataset into Ray datasets (indexed by split name below).
ray_datasets = ray.data.from_huggingface(current_dataset)


# NOTE(review): despite the name, this is the number of rows (text lines) that
# `split_text` produces for the train split, not a number of batches; a later
# cell divides it by the global batch size.
total_train_batches = splitter.fit_transform(ray_datasets["train"]).count()

from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer

class DollyV2Model(pl.LightningModule):
    """LightningModule wrapping the ``MODEL_NAME`` causal language model for fine-tuning."""

    def __init__(self, lr=2e-5, eps=1e-8):
        """Store the optimizer settings and load the pretrained model.

        Args:
            lr: Learning rate handed to AdamW.
            eps: Epsilon handed to AdamW.
        """
        super().__init__()
        self.lr, self.eps = lr, eps
        self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

        # The attributes below are not read anywhere else in this class.
        self.metric = evaluate.load("accuracy")
        self.predictions, self.references = [], []

    def forward(self, batch):
        """Run the wrapped model on ``batch`` and return the loss.

        Args:
            batch: Mapping with ``labels``, ``input_ids`` and ``attention_mask``.

        Returns:
            The first element of the model output, treated as the loss.
        """
        targets = batch["labels"]
        token_ids = batch["input_ids"]
        mask = batch["attention_mask"]
        model_out = self.model(token_ids, attention_mask=mask, labels=targets)
        loss = model_out[0]
        # Only rank 0 prints, so the loss shows up once per step.
        on_rank_zero = self.global_rank == 0
        if on_rank_zero:
            print("loss = ", loss.item())
        return loss

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch, log it as ``train_loss`` and return it."""
        step_loss = self.forward(batch)
        self.log("train_loss", step_loss)
        return step_loss

    def configure_optimizers(self):
        """Build an AdamW optimizer over the parameters of ``self.trainer.model``."""
        params = self.trainer.model.parameters()
        return torch.optim.AdamW(params, lr=self.lr, eps=self.eps)



  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset tiny_shakespeare (/home/ray/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e)
100%|██████████| 3/3 [00:00<00:00, 1059.70it/s]
2023-04-28 17:58:03,655	INFO worker.py:1432 -- Connecting to existing Ray cluster at address: 10.0.121.41:6379...
2023-04-28 17:58:03,665	INFO worker.py:1607 -- Connected to Ray cluster. View the dashboard at https://console.anyscale-staging.com/api/v2/sessions/ses_m411tiqu8eluvt1k5ivfqj4q5r/services?redirect_to=dashboard 
2023-04-28 17:58:04,152	INFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_f11e5f881e80db22b07cf4de97c14e2a.zip' (165.59MiB) to Ray cluster...
2023-04-28 17:58:04,718	INFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_f11e5f881e80db22b07cf4de97c14e2a.zip'.
2023-04-28 17:58:08,146	INFO streaming_executor.py:87 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOper

In [None]:

from ray.train.lightning import LightningTrainer, LightningConfigBuilder
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
from pytorch_lightning.callbacks import TQDMProgressBar

from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
from torch.distributed.fsdp import ShardingStrategy, MixedPrecision, CPUOffload
from pytorch_lightning.callbacks.progress import TQDMProgressBar

import functools
# Auto-wrap policy passed to the FSDP strategy below: `transformer_auto_wrap_policy`
# with GPTNeoXLayer as the transformer layer class.
wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls = {GPTNeoXLayer}
)

# NOTE(review): not referenced by any active code in this notebook; the trainer
# config below sets precision="16-mixed" instead.
mixed_precision_policy = MixedPrecision(
    param_dtype=torch.float16,
    reduce_dtype=torch.float16,
    buffer_dtype=torch.float16,
)

# NOTE(review): only referenced from a commented-out strategy argument below.
cpu_offload = CPUOffload(
    offload_params=True
)

class DollyV2Progressbar(TQDMProgressBar):
    """TQDM progress bar whose training total is supplied by the caller."""

    def __init__(self, num_iters_per_epoch, *args, **kwargs):
        """Remember the per-epoch iteration count; remaining arguments go to the parent."""
        super().__init__(*args, **kwargs)
        self.num_iters_per_epoch = num_iters_per_epoch

    def on_train_epoch_start(self, trainer, *extra):
        """Run the parent hook, then reset the training bar to the stored total."""
        super().on_train_epoch_start(trainer, *extra)
        total = self.num_iters_per_epoch
        self.train_progress_bar.reset(total)
    
# Cluster sizing: number of training workers and the batch size each one consumes.
num_workers = 16
batch_size_per_worker = 8
# Rows in the train split divided by the global batch size
# (num_workers * batch_size_per_worker); used as the progress-bar total.
num_iters_per_epoch = total_train_batches // (num_workers * batch_size_per_worker)
progress_bar = DollyV2Progressbar(num_iters_per_epoch)

# Define the configs for LightningTrainer
lightning_config = (
    LightningConfigBuilder()
    .module(cls=DollyV2Model, lr=1e-5, eps=1e-8)
    .trainer(
        max_epochs=1, 
        accelerator="gpu", 
        log_every_n_steps=1,
        precision="16-mixed",
        # NOTE(review): caps the epoch at 5 batches while the progress bar total is
        # num_iters_per_epoch -- looks like a smoke-test setting; confirm before a real run.
        limit_train_batches=5,
        callbacks=[progress_bar],
        # plugins=[mixed_precision_plugin],
    )
    .checkpointing(monitor="train_loss", mode="min", save_top_k = 1, save_last=True)
    .strategy(
        name="fsdp",
        sharding_strategy=ShardingStrategy.FULL_SHARD,
        auto_wrap_policy=wrap_policy,
        # cpu_offload=cpu_offload
    )
    .build()
)

# NOTE(review): duplicate of an import in an earlier cell; SyncConfig is not used in this cell.
from ray.tune.syncer import SyncConfig

# Run configuration: experiment name, storage location and a default CheckpointConfig.
# NOTE(review): no validation set is configured in this notebook; the Lightning
# checkpointing above monitors `train_loss`.
run_config = RunConfig(
    name="ptl-finetune-dolly-v2",
    storage_path="s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint",
    checkpoint_config=CheckpointConfig(),
)

# Scale the FSDP training workload across `num_workers` workers, each with 1 GPU and 8 CPUs.
# You can change this config based on your compute resources.
scaling_config = ScalingConfig(
    num_workers=num_workers, use_gpu=True, resources_per_worker={"CPU": 8, "GPU": 1}
)


# The trainer receives the raw train split together with the preprocessor chain
# and the per-worker batch size.
trainer = LightningTrainer(
    lightning_config=lightning_config,
    run_config=run_config,
    scaling_config=scaling_config,
    datasets={"train": ray_datasets["train"]},
    datasets_iter_config={"batch_size": batch_size_per_worker},
    preprocessor=preprocessor,
)
result = trainer.fit()

# Display the training result as the cell output.
result


In [3]:
from ray.train.lightning import LightningCheckpoint

In [4]:
# NOTE(review): the URI hardcodes one specific trial/checkpoint directory under the
# storage path configured for the trainer; update it to match your own run.
ckpt = LightningCheckpoint.from_uri("s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint/ptl-finetune-dolly-v2/LightningTrainer_ede1d_00000_0_2023-04-28_17-29-40/checkpoint_000000/")

In [5]:
# Rebuild the DollyV2Model LightningModule from the loaded checkpoint.
model = ckpt.get_model(DollyV2Model)

In [3]:
# NOTE(review): rebinds `model` from a hardcoded absolute local path, replacing the
# object loaded from the S3 checkpoint in the previous cell; the execution counts
# are also out of order here. Keep a single loading path and make the location configurable.
model = DollyV2Model.load_from_checkpoint("/home/ray/s3/ckpt/model")

In [8]:
import torch
from instruct_pipeline import InstructionTextGenerationPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

In [9]:
# Tokenizer for generation, left-padded like the one used in `tokenize`.
# NOTE(review): the literal repeats the value of MODEL_NAME, and this rebinds
# `tokenizer`, which an earlier cell used for a BatchMapper.
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b", padding_side="left")


loading file vocab.json from cache at None
loading file merges.txt from cache at None
loading file tokenizer.json from cache at /home/ray/.cache/huggingface/hub/models--databricks--dolly-v2-3b/snapshots/e19a5252f69d79d94ac95045eb9b8a158775f701/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/ray/.cache/huggingface/hub/models--databricks--dolly-v2-3b/snapshots/e19a5252f69d79d94ac95045eb9b8a158775f701/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/ray/.cache/huggingface/hub/models--databricks--dolly-v2-3b/snapshots/e19a5252f69d79d94ac95045eb9b8a158775f701/tokenizer_config.json


In [15]:
# Take the wrapped causal LM out of the LightningModule and move it to the GPU.
dolly_model = model.model.cuda()

In [16]:
# Build the instruction-following generation pipeline from the model and tokenizer.
generate_text = InstructionTextGenerationPipeline(model=dolly_model, tokenizer=tokenizer)

In [17]:
# Run one sample instruction through the pipeline.
res = generate_text("Explain to me the difference between nuclear fission and fusion.")

Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 0,
  "transformers_version": "4.28.1"
}



In [14]:
# Show the generated text of the first result.
# NOTE(review): this cell's execution count (14) precedes the cell that defines `res` (17);
# re-run the notebook top to bottom before publishing.
res[0]["generated_text"]

'Nuclear fission is the process that occurs in a nuclear reactor when a nucleus splits in two parts.\nNuclear fusion is the process of two nuclei coming together to form one larger nucleus.\nThe release of energy in nuclear fusion is much greater than that in nuclear fission, which is why it is considered a more promising form of power generation.\nBesides producing more energy per unit of fuel than does fission, nuclear fusion could be used to create radiation free, endless energy supplies. However, no one has ever developed a fusion reactor capable of supplying commercial electricity because of the difficulty of catalyzing the reaction.\nBut since 2022 the development of magnetic confinement fusion (MCT) has made great strides. Researchers worldwide are hopeful that the development of this technology will enable energy-producing fusion reactors in our future.'