In [1]:
import os
os.environ["RAY_ML_DEV"] = "1"

In [2]:
from typing import Any
import ray
from ray.tune.syncer import SyncConfig
import torch
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, load_metric
import numpy as np
import pandas as pd
from ray.data.preprocessors import Chain
import evaluate

MODEL_NAME = "databricks/dolly-v2-3b"

current_dataset = load_dataset("tiny_shakespeare")

from ray.data.preprocessors import BatchMapper


def split_text(batch: pd.DataFrame) -> pd.DataFrame:
    text = list(batch["text"])
    flat_text = "".join(text)
    split_text = [
        x.strip()
        for x in flat_text.split("\n")
        if x.strip() and not x.strip()[-1] == ":"
    ]
    return pd.DataFrame(split_text, columns=["text"])


def tokenize(batch: pd.DataFrame) -> dict:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
    tokenizer.pad_token = tokenizer.eos_token
    ret = tokenizer(
        list(batch["text"]),
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="np",
    )
    ret["labels"] = ret["input_ids"].copy()
    return dict(ret)


splitter = BatchMapper(split_text, batch_format="pandas")
tokenizer = BatchMapper(tokenize, batch_format="pandas")
preprocessor = Chain(splitter, tokenizer)

ray_datasets = ray.data.from_huggingface(current_dataset)


total_train_batches = splitter.fit_transform(ray_datasets["train"]).count()

from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer

class DollyV2Model(pl.LightningModule):
    def __init__(self, lr=2e-5, eps=1e-8):
        super().__init__()
        self.lr = lr
        self.eps = eps
        self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

        self.metric = evaluate.load("accuracy")
        self.predictions = []
        self.references = []

    def forward(self, batch):
        outputs = self.model(
            batch["input_ids"], 
            attention_mask=batch["attention_mask"], 
            labels=batch["labels"]
        )
        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        loss = self.forward(batch)
        self.log("train_loss", loss)
        if self.global_rank == 0 and batch_idx % 10 == 0:
            print("loss = ", loss.item())
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.trainer.model.parameters(), lr=self.lr, eps=self.eps)

  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|██████████| 3.73k/3.73k [00:00<00:00, 3.91MB/s]
Downloading metadata: 100%|██████████| 1.90k/1.90k [00:00<00:00, 1.86MB/s]
Downloading readme: 100%|██████████| 6.10k/6.10k [00:00<00:00, 5.38MB/s]


Downloading and preparing dataset tiny_shakespeare/default to /home/ray/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e...


Downloading data: 1.12MB [00:00, 17.4MB/s]                  
                                                                         

Dataset tiny_shakespeare downloaded and prepared to /home/ray/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 1025.42it/s]
2023-04-30 17:56:59,527	INFO worker.py:1432 -- Connecting to existing Ray cluster at address: 10.0.17.186:6379...
2023-04-30 17:56:59,537	INFO worker.py:1607 -- Connected to Ray cluster. View the dashboard at https://console.anyscale-staging.com/api/v2/sessions/ses_m411tiqu8eluvt1k5ivfqj4q5r/services?redirect_to=dashboard 
2023-04-30 17:57:00,132	INFO packaging.py:520 -- Creating a file package for local directory '/tmp/ray_tmp_module/ray'.
2023-04-30 17:57:01,032	INFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_8bfad0fcb1030606.zip' (155.25MiB) to Ray cluster...
2023-04-30 17:57:01,573	INFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_8bfad0fcb1030606.zip'.
2023-04-30 17:57:02,081	INFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_e31442e26ed4f60c0dfec60af8ffa7f7.zip' (165.64MiB) to Ray cluster...
2023-04-30 17:57:02,675	INFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_

In [3]:

from ray.train.lightning import LightningTrainer, LightningConfigBuilder
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
from pytorch_lightning.callbacks import TQDMProgressBar

from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
from torch.distributed.fsdp import ShardingStrategy, MixedPrecision, CPUOffload, BackwardPrefetch
from pytorch_lightning.callbacks.progress import TQDMProgressBar

import functools
wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls = {GPTNeoXLayer}
)

mixed_precision_policy = MixedPrecision(
    param_dtype=torch.float16,
    reduce_dtype=torch.float16,
    buffer_dtype=torch.float16,
)

class DollyV2Progressbar(TQDMProgressBar):
    def __init__(self, num_iters_per_epoch, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_iters_per_epoch = num_iters_per_epoch
    
    def on_train_epoch_start(self, trainer, *_):
        super().on_train_epoch_start(trainer, *_)
        self.train_progress_bar.reset(self.num_iters_per_epoch)
    
num_workers = 16
batch_size_per_worker = 8
num_iters_per_epoch = total_train_batches // (num_workers * batch_size_per_worker)
progress_bar = DollyV2Progressbar(num_iters_per_epoch)

# Define the configs for LightningTrainer
lightning_config = (
    LightningConfigBuilder()
    .module(cls=DollyV2Model, lr=2e-5, eps=1e-8)
    .trainer(
        max_epochs=1, 
        accelerator="gpu", 
        log_every_n_steps=1,
        # accumulate_grad_batches=2,
        precision="16-mixed",
        callbacks=[progress_bar],
    )
    .checkpointing(save_last=True)
    .strategy(
        name="fsdp",
        sharding_strategy=ShardingStrategy.FULL_SHARD,
        backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
        forward_prefetch=True,
        auto_wrap_policy=wrap_policy,
    )
    .build()
)

from ray.tune.syncer import SyncConfig

# Save AIR checkpoints according to the performance on validation set
run_config = RunConfig(
    name="finetune-dolly-v2",
    storage_path="s3://yunxuanx-test/model-checkpoint",
    checkpoint_config=CheckpointConfig(),
)

# Scale the DDP training workload across 4 GPUs
# You can change this config based on your compute resources.
scaling_config = ScalingConfig(
    num_workers=num_workers, use_gpu=True, resources_per_worker={"CPU": 8, "GPU": 1}
)


trainer = LightningTrainer(
    lightning_config=lightning_config,
    run_config=run_config,
    scaling_config=scaling_config,
    datasets={"train": ray_datasets["train"]},
    datasets_iter_config={"batch_size": batch_size_per_worker},
    preprocessor=preprocessor,
)
result = trainer.fit()

result


0,1
Current time:,2023-04-30 18:23:35
Running for:,00:26:27.33
Memory:,8.3/124.4 GiB

Trial name,status,loc,iter,total time (s),train_loss,epoch,step
LightningTrainer_18e2e_00000,TERMINATED,10.0.17.186:7290,1,1456.8,0.175903,0,169


(LightningTrainer pid=7290) 2023-04-30 17:57:16,864	INFO backend_executor.py:128 -- Starting distributed worker processes: ['7494 (10.0.17.186)', '3206 (10.0.37.115)', '3174 (10.0.2.210)', '3191 (10.0.26.185)', '3190 (10.0.42.121)', '3266 (10.0.31.196)', '3178 (10.0.51.169)', '3211 (10.0.16.220)', '3194 (10.0.57.188)', '3160 (10.0.22.19)', '16501 (10.0.30.41)', '3213 (10.0.40.107)', '3258 (10.0.32.28)', '3235 (10.0.14.135)', '3233 (10.0.61.133)', '3178 (10.0.4.204)']
(RayTrainWorker pid=7494) 2023-04-30 17:57:19,146	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=16]

(pid=7290) Running: 0.0/272.0 CPU, 0.0/16.0 GPU, 0.0 MiB/73.21 GiB object_store_memory:   0%|          | 0/1 [00:00<?, ?it/s]
(pid=7290) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output:   0%|          | 0/1 [00:00<?, ?it/s]
(LightningTrainer pid=7290)                                                                                                   2023-04-30 17:57:19,758	

Epoch 0:   0%|          | 0/168 [00:00<?, ?it/s]


(RayTrainWorker pid=7494)   rank_zero_warn(


(RayTrainWorker pid=7494) loss =  11.7890625
Epoch 0:   1%|          | 1/168 [00:09<25:11,  9.05s/it, v_num=0]
Epoch 0:   1%|          | 2/168 [00:16<22:57,  8.30s/it, v_num=0]
Epoch 0:   2%|▏         | 3/168 [00:23<21:33,  7.84s/it, v_num=0]
Epoch 0:   2%|▏         | 4/168 [00:30<20:59,  7.68s/it, v_num=0]
Epoch 0:   3%|▎         | 5/168 [00:37<20:30,  7.55s/it, v_num=0]
Epoch 0:   4%|▎         | 6/168 [00:44<20:05,  7.44s/it, v_num=0]
Epoch 0:   4%|▍         | 7/168 [00:51<19:44,  7.36s/it, v_num=0]
Epoch 0:   5%|▍         | 8/168 [00:58<19:28,  7.30s/it, v_num=0]
Epoch 0:   5%|▌         | 9/168 [01:05<19:15,  7.27s/it, v_num=0]
Epoch 0:   6%|▌         | 10/168 [01:12<19:10,  7.28s/it, v_num=0]
(RayTrainWorker pid=7494) loss =  0.3720703125
Epoch 0:   7%|▋         | 11/168 [01:20<19:02,  7.28s/it, v_num=0]
Epoch 0:   7%|▋         | 12/168 [01:27<18:58,  7.30s/it, v_num=0]
Epoch 0:   8%|▊         | 13/168 [01:34<18:48,  7.28s/it, v_num=0]
Epoch 0:   8%|▊         | 14/168 [01:42<18:42,

Trial name,_report_on,date,done,epoch,experiment_tag,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,step,time_since_restore,time_this_iter_s,time_total_s,timestamp,train_loss,training_iteration,trial_id
LightningTrainer_18e2e_00000,train_epoch_end,2023-04-30_18-21-31,True,0,0,ip-10-0-17-186,1,10.0.17.186,7290,True,169,1456.8,1456.8,1456.8,1682904091,0.175903,1,18e2e_00000


Epoch 0: : 169it [22:47,  8.09s/it, v_num=0]
(RayTrainWorker pid=3211, ip=10.0.16.220) lightning_module_state_dict [repeated 15x across cluster]
(RayTrainWorker pid=3211, ip=10.0.16.220)  [repeated 60x across cluster]


(RayTrainWorker pid=7494) `Trainer.fit` stopped: `max_epochs=1` reached.
(RayTrainWorker pid=7494) RayFSDPStrategy: tearing down strategy...
2023-04-30 18:23:35,484	INFO tune.py:1010 -- Total run time: 1587.49 seconds (1546.53 seconds for the tuning loop).


Result(
  metrics={'_report_on': 'train_epoch_end', 'train_loss': 0.1759033203125, 'epoch': 0, 'step': 169, 'should_checkpoint': True, 'done': True, 'trial_id': '18e2e_00000', 'experiment_tag': '0'},
  path='s3://yunxuanx-test/model-checkpoint/finetune-dolly-v2/LightningTrainer_18e2e_00000_0_2023-04-30_17-57-08',
  checkpoint=LightningCheckpoint(uri=s3://yunxuanx-test/model-checkpoint/finetune-dolly-v2/LightningTrainer_18e2e_00000_0_2023-04-30_17-57-08/checkpoint_000000)
)

In [4]:
# from ray.train.lightning import LightningCheckpoint#
# ckpt = LightningCheckpoint.from_uri("s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint/ptl-finetune-dolly-v2/LightningTrainer_ede1d_00000_0_2023-04-28_17-29-40/checkpoint_000000/")

In [5]:
cmd = f"aws s3 sync {result.checkpoint.uri} /home/ray/s3/checkpoint"
os.system(cmd)

download: s3://yunxuanx-test/model-checkpoint/finetune-dolly-v2/LightningTrainer_18e2e_00000_0_2023-04-30_17-57-08/checkpoint_000000/_preprocessor to ../s3/checkpoint/_preprocessor
download: s3://yunxuanx-test/model-checkpoint/finetune-dolly-v2/LightningTrainer_18e2e_00000_0_2023-04-30_17-57-08/checkpoint_000000/.metadata.pkl to ../s3/checkpoint/.metadata.pkl
download: s3://yunxuanx-test/model-checkpoint/finetune-dolly-v2/LightningTrainer_18e2e_00000_0_2023-04-30_17-57-08/checkpoint_000000/.is_checkpoint to ../s3/checkpoint/.is_checkpoint
download: s3://yunxuanx-test/model-checkpoint/finetune-dolly-v2/LightningTrainer_18e2e_00000_0_2023-04-30_17-57-08/checkpoint_000000/.tune_metadata to ../s3/checkpoint/.tune_metadata
download: s3://yunxuanx-test/model-checkpoint/finetune-dolly-v2/LightningTrainer_18e2e_00000_0_2023-04-30_17-57-08/checkpoint_000000/model to ../s3/checkpoint/model


0

In [6]:
import torch
from transformers import AutoTokenizer, pipeline
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b", padding_side="right")
dolly = DollyV2Model.load_from_checkpoint("/home/ray/s3/checkpoint/model").cuda()
nlp_pipeline = pipeline(task="text-generation", model=dolly.model, tokenizer=tokenizer, device=0)

In [7]:
# nlp_pipeline("I pray the gods", max_new_tokens=30)
nlp_pipeline("Romeo and juliet", max_new_tokens=30)


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[{'generated_text': 'Romeo and juliet, Romeo and Juliet, Romeo and Juliet! Romeo, Romeo! Juliet, Juliet! Romeo, Romeo! Juliet, Juliet'}]

In [8]:
# from instruct_pipeline import InstructionTextGenerationPipeline
# generate_text = InstructionTextGenerationPipeline(model=dolly, tokenizer=tokenizer, device=0)

In [9]:
# res = generate_text("中国队勇夺世界杯")

In [10]:
# res[0]["generated_text"]