In [2]:
import os
# Set in its own cell so it is in the environment before the next cell imports ray.
# NOTE(review): RAY_ML_DEV looks like a Ray development flag; its exact effect is
# not visible in this notebook — confirm it is still required.
os.environ["RAY_ML_DEV"] = "1"

In [3]:
from typing import Any
import ray
from ray.tune.syncer import SyncConfig
import torch
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, load_metric
import numpy as np
import pandas as pd
from ray.data.preprocessors import Chain
import evaluate

# Hugging Face model id; used below for both the tokenizer and the causal LM.
MODEL_NAME = "databricks/dolly-v2-3b"

# Text corpus used for fine-tuning, loaded through Hugging Face `datasets`.
current_dataset = load_dataset("tiny_shakespeare")

from ray.data.preprocessors import BatchMapper


def split_text(batch: pd.DataFrame) -> pd.DataFrame:
    """Break the batch's text into one cleaned line per output row.

    The "text" values of all rows are concatenated (with no separator), the
    result is split on newlines, and every line is stripped of surrounding
    whitespace. Lines that are empty after stripping, or that end with a
    colon, are discarded.

    Args:
        batch: DataFrame with a "text" column of strings.

    Returns:
        A new DataFrame with a single "text" column, one kept line per row.
    """
    corpus = "".join(list(batch["text"]))

    kept_lines = []
    for raw_line in corpus.split("\n"):
        line = raw_line.strip()
        # Skip blank lines and lines whose last character is ":".
        if not line or line.endswith(":"):
            continue
        kept_lines.append(line)

    return pd.DataFrame(kept_lines, columns=["text"])


def tokenize(batch: pd.DataFrame) -> dict:
    """Tokenize a batch of text lines into fixed-length causal-LM training inputs.

    Args:
        batch: DataFrame with a "text" column holding one string per row.

    Returns:
        A dict of NumPy arrays containing the tokenizer outputs (including
        "input_ids" and "attention_mask"), left-padded / truncated to 256
        tokens, plus a "labels" array that equals "input_ids" at real-token
        positions and -100 at padding positions.
    """
    # The tokenizer is loaded by name inside the function on every call.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
    # Reuse the EOS token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    ret = tokenizer(
        list(batch["text"]),
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="np",
    )
    # Every sample is padded to max_length, so a plain copy of input_ids would
    # make the loss score the padding positions as well. -100 is the label
    # value the Hugging Face causal-LM loss ignores, so only real tokens
    # (attention_mask != 0) contribute to training.
    real_token = ret["attention_mask"].astype(bool)
    ret["labels"] = np.where(real_token, ret["input_ids"], -100)
    return dict(ret)


# Wrap the two batch functions as preprocessors that receive pandas batches.
splitter = BatchMapper(split_text, batch_format="pandas")
tokenizer = BatchMapper(tokenize, batch_format="pandas")
# Chain them in order: line splitting first, then tokenization.
preprocessor = Chain(splitter, tokenizer)

ray_datasets = ray.data.from_huggingface(current_dataset)


# NOTE(review): despite the name, this looks like the number of rows (text lines)
# produced by the splitter rather than a batch count; it is later divided by the
# global batch size to estimate iterations per epoch — confirm.
total_train_batches = splitter.fit_transform(ray_datasets["train"]).count()

from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer

class DollyV2Model(pl.LightningModule):
    """LightningModule wrapping the causal LM loaded from ``MODEL_NAME``.

    Args:
        lr: Learning rate handed to AdamW.
        eps: Epsilon handed to AdamW.
    """

    def __init__(self, lr=2e-5, eps=1e-8):
        super().__init__()
        # Optimizer hyperparameters, read back in configure_optimizers().
        self.lr = lr
        self.eps = eps
        self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

        # NOTE(review): the metric and the two lists below are not used by any
        # method of this class as shown — confirm whether they are still needed.
        self.metric = evaluate.load("accuracy")
        self.predictions = []
        self.references = []

    def forward(self, batch):
        """Run the wrapped model on ``batch`` and return its loss.

        ``batch`` must provide "labels", "input_ids" and "attention_mask".
        The first element of the model output is treated as the loss; on
        global rank 0 its scalar value is also printed.
        """
        labels = batch["labels"]
        outputs = self.model(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=labels,
        )
        loss = outputs[0]
        if self.global_rank == 0:
            print("loss = ", loss.item())
        return loss

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log it as "train_loss"."""
        step_loss = self.forward(batch)
        self.log("train_loss", step_loss)
        return step_loss

    def configure_optimizers(self):
        """Build an AdamW optimizer with the stored ``lr`` and ``eps``."""
        # Parameters come from the trainer's model rather than from self —
        # presumably so the optimizer sees the strategy-wrapped (FSDP)
        # parameters; confirm before changing.
        trainable = self.trainer.model.parameters()
        return torch.optim.AdamW(trainable, lr=self.lr, eps=self.eps)



  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset tiny_shakespeare (/home/ray/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e)
100%|██████████| 3/3 [00:00<00:00, 1043.88it/s]
2023-04-29 01:48:06,010	INFO worker.py:1432 -- Connecting to existing Ray cluster at address: 10.0.121.51:6379...
2023-04-29 01:48:06,019	INFO worker.py:1607 -- Connected to Ray cluster. View the dashboard at https://console.anyscale-staging.com/api/v2/sessions/ses_m411tiqu8eluvt1k5ivfqj4q5r/services?redirect_to=dashboard 
2023-04-29 01:48:06,615	INFO packaging.py:520 -- Creating a file package for local directory '/tmp/ray_tmp_module/ray'.
2023-04-29 01:48:07,526	INFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_06d4befa94ba66a2.zip' (155.57MiB) to Ray cluster...
2023-04-29 01:48:08,072	INFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_06d4befa94ba66a2.zip'.
2023-04-29 01:48:08,580	

In [None]:

from ray.train.lightning import LightningTrainer, LightningConfigBuilder
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
from pytorch_lightning.callbacks import TQDMProgressBar

from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
from torch.distributed.fsdp import ShardingStrategy, MixedPrecision, CPUOffload
from pytorch_lightning.callbacks.progress import TQDMProgressBar

import functools
# Auto-wrap policy handed to the FSDP strategy below: GPTNeoXLayer is the
# transformer layer class used for wrapping.
wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls = {GPTNeoXLayer}
)

# NOTE(review): defined but not passed to the trainer or strategy in the visible
# configuration — confirm whether it is still needed.
mixed_precision_policy = MixedPrecision(
    param_dtype=torch.float16,
    reduce_dtype=torch.float16,
    buffer_dtype=torch.float16,
)

# Only referenced from a commented-out strategy argument in the visible config.
cpu_offload = CPUOffload(
    offload_params=True
)

class DollyV2Progressbar(TQDMProgressBar):
    """TQDM progress bar whose training total is a caller-supplied iteration count.

    Args:
        num_iters_per_epoch: Total passed to the training bar's ``reset`` at the
            start of every training epoch.
        *args, **kwargs: Forwarded unchanged to ``TQDMProgressBar``.
    """

    def __init__(self, num_iters_per_epoch, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_iters_per_epoch = num_iters_per_epoch

    def on_train_epoch_start(self, trainer, *extra):
        """Run the inherited epoch-start hook, then reset the bar to the fixed total."""
        super().on_train_epoch_start(trainer, *extra)
        train_bar = self.train_progress_bar
        train_bar.reset(self.num_iters_per_epoch)
    
# Number of training workers and the batch size each worker consumes per step.
num_workers = 16
batch_size_per_worker = 8
# Estimated iterations per epoch: total_train_batches divided by the global
# batch size (workers x per-worker batch size).
num_iters_per_epoch = total_train_batches // (num_workers * batch_size_per_worker)
# The progress bar uses this estimate as its total.
progress_bar = DollyV2Progressbar(num_iters_per_epoch)

# Define the configs for LightningTrainer
lightning_config = (
    LightningConfigBuilder()
    # Module class plus the keyword arguments for its constructor.
    .module(cls=DollyV2Model, lr=1e-5, eps=1e-8)
    .trainer(
        max_epochs=1, 
        accelerator="gpu", 
        log_every_n_steps=1,
        precision="16-mixed",
        # NOTE(review): this appears to cap the run at 5 training batches (a quick
        # smoke test?), while the progress bar total is computed from the full
        # dataset — confirm which one is intended.
        limit_train_batches=5,
        callbacks=[progress_bar],
        # NOTE(review): `mixed_precision_plugin` is not defined anywhere in the
        # visible notebook; the line below would fail if re-enabled as-is.
        # plugins=[mixed_precision_plugin],
    )
    # "train_loss" is the metric name logged by DollyV2Model.training_step.
    .checkpointing(monitor="train_loss", mode="min", save_top_k = 1, save_last=True)
    .strategy(
        name="fsdp",
        sharding_strategy=ShardingStrategy.FULL_SHARD,
        auto_wrap_policy=wrap_policy,
        # cpu_offload=cpu_offload
    )
    .build()
)

from ray.tune.syncer import SyncConfig

# Name the run and store its results/checkpoints under `storage_path`.
# CheckpointConfig() is left at its defaults; no validation set is used here.
run_config = RunConfig(
    name="ptl-finetune-dolly-v2",
    storage_path="s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint",
    checkpoint_config=CheckpointConfig(),
)

# Scale the FSDP training workload across `num_workers` workers, each requesting
# 1 GPU and 8 CPUs.
# You can change this config based on your compute resources.
scaling_config = ScalingConfig(
    num_workers=num_workers, use_gpu=True, resources_per_worker={"CPU": 8, "GPU": 1}
)


# Only a "train" dataset is supplied; the preprocessor chain (split + tokenize)
# is attached to the trainer rather than applied to the dataset beforehand.
trainer = LightningTrainer(
    lightning_config=lightning_config,
    run_config=run_config,
    scaling_config=scaling_config,
    datasets={"train": ray_datasets["train"]},
    datasets_iter_config={"batch_size": batch_size_per_worker},
    preprocessor=preprocessor,
)
result = trainer.fit()

# Display the training result as the cell output.
result


In [4]:
from ray.train.lightning import LightningCheckpoint

In [4]:
# Load a checkpoint from S3. NOTE(review): the URI hard-codes one specific trial
# directory under the run's storage_path; a new training run will produce a
# different directory name.
ckpt = LightningCheckpoint.from_uri("s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint/ptl-finetune-dolly-v2/LightningTrainer_ede1d_00000_0_2023-04-28_17-29-40/checkpoint_000000/")

In [5]:
# Rebuild a DollyV2Model from the checkpoint object.
# NOTE(review): `model` is reassigned by a later cell that loads the same
# checkpoint from local disk, so one of the two loads looks redundant.
model = ckpt.get_model(DollyV2Model)

In [6]:
# Copy the checkpoint directory from S3 to /home/ray/s3/ckpt so it can be loaded from local disk.
!aws s3 sync s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint/ptl-finetune-dolly-v2/LightningTrainer_ede1d_00000_0_2023-04-28_17-29-40/checkpoint_000000/ /home/ray/s3/ckpt

download: s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint/ptl-finetune-dolly-v2/LightningTrainer_ede1d_00000_0_2023-04-28_17-29-40/checkpoint_000000/_preprocessor to ../s3/ckpt/_preprocessor
download: s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint/ptl-finetune-dolly-v2/LightningTrainer_ede1d_00000_0_2023-04-28_17-29-40/checkpoint_000000/.metadata.pkl to ../s3/ckpt/.metadata.pkl
download: s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint/ptl-finetune-dolly-v2/LightningTrainer_ede1d_00000_0_2023-04-28_17-29-40/checkpoint_000000/.tune_metadata to ../s3/ckpt/.tune_metadata
download: s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint/ptl-finetune-dolly-v2/LightningTrainer_ede1d_00000_0_2023-04-28_17-29-40/checkpoint_000000/.is_checkpoint to ../s3/ckpt/.is_checkpoint
download: s3://large-dl-models-mirror/models--dolly-v2-3b-fp16/model-checkpoint/ptl-finetune-dolly-v2/LightningTrainer_ede1d_00000_0_2023-04-28_

In [7]:
# Load the module from the locally synced checkpoint file; this replaces the
# `model` bound by the earlier ckpt.get_model(...) cell.
model = DollyV2Model.load_from_checkpoint("/home/ray/s3/ckpt/model")

Downloading (…)lve/main/config.json: 100%|██████████| 819/819 [00:00<00:00, 185kB/s]
Downloading pytorch_model.bin: 100%|██████████| 5.68G/5.68G [00:28<00:00, 201MB/s] 
Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 6.62MB/s]


In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# NOTE: this rebinds `tokenizer`, which earlier in the notebook named the
# BatchMapper preprocessor; from here on it is the Hugging Face tokenizer.
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b", padding_side="left")

Downloading (…)okenizer_config.json: 100%|██████████| 450/450 [00:00<00:00, 279kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 21.2MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 228/228 [00:00<00:00, 158kB/s]


In [12]:
from transformers import pipeline
# Take the wrapped causal LM out of the LightningModule and move it to the GPU.
dolly = model.model.cuda()
# Plain text-generation pipeline on device 0, using the tokenizer loaded above.
nlp_pipeline = pipeline(task="text-generation", model=dolly, tokenizer=tokenizer, device=0)

In [18]:

# Hand-written instruction-style prompt; generation is limited to 100 new tokens.
nlp_pipeline("Below is an instruction that describes a task. Write a response that appropriately completes the request.: Determine whether this is a positive or negative comment: The movie is super gooood! #Response:", max_new_tokens = 100)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[{'generated_text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.: Determine whether this is a positive or negative comment: The movie is super gooood! #Response: Positive.: Super gooood is a positive word.: Super gooood is a positive word.\n\nSuper gooood is a positive word.\n\nSuper gooood is a positive word.\n\nSuper gooood is a positive word.\n\nSuper gooood is a positive word.\n\nSuper gooood is a positive word.\n\nSuper gooood is a positive word.\n\nSuper gooood is a positive word.\n\nSuper'}]

In [20]:
from instruct_pipeline import InstructionTextGenerationPipeline
# Build the instruction pipeline from the same model and tokenizer.
# NOTE(review): `instruct_pipeline` is a module that is not shown in this
# notebook — confirm it is available on the import path.
generate_text = InstructionTextGenerationPipeline(model=dolly, tokenizer=tokenizer, device=0)

In [27]:
# The prompt is Chinese; roughly "The Chinese team bravely wins the World Cup".
res = generate_text("中国队勇夺世界杯")

In [28]:
# Show the generated text of the first returned result.
res[0]["generated_text"]

"While China's squad was not particularly strong going into the 2008 AFC Youth Championship, they completely turned the tournament around to win the tournament, becoming the first Chinese team to win the title. Coach Guochuan Lai made several shrewd substitutions to match his opponents' style of play, like replacing Cheng Tiantian with Chen Bo and Liu Tao with Yu Haixin. The Chinese also used a 4-3-3 formation as a formation of choice during the tournament, and would also switch back and forth between 4-2-4 and 4-3-3 formations during the tournament.\nIn the quarterfinals, China played Uzbekistan. The Chinese kept a man-oriented formation, and it seemed to pay off as they opened the scoring through Wang Yong in the 7th minute. However, Uzbekistan turned the game around in the 25th minute, when they capitalized on a Chinese error to score the first goal of the game. From that point onwards, China's offensive performance tailed off, and the Uzbekistan player's kept control of the game. U