In [1]:
! pip install --user -q "google-cloud-aiplatform[ray]>=1.56.0" \
                        "ray[data,train,tune,serve]>=2.33.0"

In [2]:
# @title Define constants
# Google Cloud project (ID and number), region, and the name of the Vertex AI
# persistent resource that hosts the Ray cluster.
PROJECT_ID = "ai-hangsik"
PROJECT_NBR = "721521243942"
REGION = "us-central1"
RAY_CLUSTER_NM = "ray33-cluster-20250216-192557"

In [3]:
from google.cloud import aiplatform

# Point the Vertex AI SDK at the project and region defined in the constants cell.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
)

In [4]:
import ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig
from ray.air import CheckpointConfig, ScalingConfig
from ray.util.joblib import register_ray

# Show the installed Ray version as the cell output.
ray.__version__

'2.33.0'

In [18]:
# Drop any Ray connection left over in this kernel — presumably so that the
# `ray.init(...)` call in the next cell starts from a clean state (TODO confirm intent).
ray.shutdown()

In [None]:

# Address of the Ray cluster, expressed as a Vertex AI persistent resource.
RAY_ADDRESS = (
    f"vertex_ray://projects/{PROJECT_NBR}/locations/{REGION}"
    f"/persistentResources/{RAY_CLUSTER_NM}"
)
print(f"RAY_ADDRESS:{RAY_ADDRESS}")

# Packages installed with pip inside the conda environment declared below.
pip_env = dict(
    pip=[
        "datasets==2.12.0",
        "evaluate==0.4.0",
        "accelerate==0.16.0",
        "transformers==4.26.0",
        "torch==1.13.0",
        "deepspeed==0.9.2",
        "ipython==8.14.0",
        # https://github.com/deepspeedai/DeepSpeed/issues/5671
        "numpy<2.0.0",
        "python-json-logger",
    ]
)

# pip install mpi4py won't work, use conda install instead; the pip package
# list is nested as the last entry of the conda dependencies.
conda_dependencies = ["mpi4py", "pip", pip_env]
conda_env = {"conda": {"dependencies": conda_dependencies}}

# Connect to the cluster and hand it the runtime environment description.
ray.init(address=RAY_ADDRESS, runtime_env=conda_env)

In [7]:
import ray.data
import ray
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
from transformers.utils.logging import enable_progress_bar
import torch

import transformers
from ray.train.huggingface.transformers import (
    prepare_trainer,
    RayTrainReportCallback,
)
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig

[2025-02-16 11:22:01,285] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


I0000 00:00:1739704921.236411 1118981 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1739704921.261917 1118981 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


In [8]:
# Hugging Face Hub ID of the base model; it is passed to `from_pretrained`
# for both the tokenizer and the causal LM further down.
model_name = "databricks/dolly-v2-3b"
# The settings below are disabled here: worker count and CPU settings are
# assigned in the cell that builds the TorchTrainer, and `use_gpu=True` is
# passed directly to its ScalingConfig.
# use_gpu = True
# num_workers = 2
# cpus_per_worker = 4

In [9]:
# Load the "yahma/alpaca-cleaned" dataset and display its structure.
current_dataset = load_dataset(path="yahma/alpaca-cleaned")
current_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 51760
    })
})

In [10]:
def generate_prompt(data_point):
    """Render one instruction record as a single training prompt.

    Args:
        data_point: Mapping with the string fields ``"instruction"``,
            ``"input"`` and ``"output"``.

    Returns:
        The prompt text. When ``data_point["input"]`` is non-empty the
        template with an ``### Input:`` section is used; otherwise the
        shorter instruction-only template is used.
    """
    # ref: https://github.com/tloen/alpaca-lora
    instruction = data_point["instruction"]
    context = data_point["input"]
    response = data_point["output"]

    # Branch on the optional "input" field. Testing "instruction" (as the
    # previous version did) sent records with an empty input through the
    # "paired with an input" template, leaving a blank "### Input:" section.
    #
    # The templates are assembled from explicit pieces so that the source
    # indentation of this function does not leak into the prompt text.
    if context:
        return (
            "Below is an instruction that describes a task, paired with an input "
            "that provides further context. Write a response that appropriately "
            "completes the request.\n\n"
            f"### Instruction:\n{instruction}\n\n"
            f"### Input:\n{context}\n\n"
            f"### Response:\n{response}"
        )

    return (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request.\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"### Response:\n{response}"
    )

In [11]:
# Every prompt is truncated / padded to exactly this many tokens.
CUTOFF_LEN = 128

# Left-padding tokenizer of the base model; token id 0 is used for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
tokenizer.pad_token_id = 0


def _tokenize_record(data_point):
    """Build the prompt for one record and tokenize it to a fixed length."""
    prompt = generate_prompt(data_point)
    return tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )


# Shuffle first, then add the tokenizer outputs to every record.
current_dataset = current_dataset.shuffle().map(_tokenize_record)



Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [12]:
# Hand the tokenized "train" split over to Ray Data and display its summary.
train_split = current_dataset["train"]
ray_datasets = ray.data.from_huggingface(train_split)
ray_datasets

MaterializedDataset(
   num_blocks=1,
   num_rows=51760,
   schema={
      output: string,
      input: string,
      instruction: string,
      input_ids: list<item: int32>,
      attention_mask: list<item: int8>
   }
)

In [13]:
def trainer_init_per_worker(config):
    """Training loop executed on each Ray Train worker.

    Builds a Hugging Face ``Trainer`` configured for DeepSpeed ZeRO stage 3
    (with CPU offload of optimizer state and parameters), feeds it this
    worker's shard of the ``"train"`` dataset, and runs training.

    Args:
        config: The ``train_loop_config`` dict. Recognised keys:

            * ``steps_per_epoch`` (required): number of per-worker batches
              that make up one pass over the data.
            * ``batch_size`` (default 1): per-device batch size.
            * ``epochs`` (default 1): number of passes over the data.
            * ``warmup_steps`` (default 0)
            * ``learning_rate`` (default 0.00002)
            * ``weight_decay`` (default 0.01)
            * ``gradient_accumulation_steps`` (default 16)

    Raises:
        ValueError: If ``config`` does not contain ``steps_per_epoch``.
    """
    batch_size = config.get("batch_size", 1)
    epochs = config.get("epochs", 1)
    warmup_steps = config.get("warmup_steps", 0)
    learning_rate = config.get("learning_rate", 0.00002)
    weight_decay = config.get("weight_decay", 0.01)
    gradient_accumulation_steps = config.get("gradient_accumulation_steps", 16)

    # Read the step budget from the config the caller passes in, instead of
    # silently depending on a notebook-level global of the same name.
    steps_per_epoch = config.get("steps_per_epoch")
    if steps_per_epoch is None:
        raise ValueError(
            "train_loop_config must contain 'steps_per_epoch' "
            "(number of per-worker batches in one epoch)"
        )

    # `steps_per_epoch` counts batches, while `max_steps` counts optimizer
    # updates, and one update consumes `gradient_accumulation_steps` batches.
    # Without this division the run would ask for that many times more
    # updates than `epochs` passes over the data require.
    max_steps = max(1, (steps_per_epoch * epochs) // gradient_accumulation_steps)

    # "auto" values are filled in by the Hugging Face DeepSpeed integration
    # from the TrainingArguments created below.
    deepspeed_config = {
        "fp16": {
            "enabled": "auto",
            "initial_scale_power": 8,
        },
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
            },
        },
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True,
            },
            "offload_param": {
                "device": "cpu",
                "pin_memory": True,
            },
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            "stage3_prefetch_bucket_size": "auto",
            "stage3_param_persistence_threshold": "auto",
            "gather_16bit_weights_on_model_save": True,
            "round_robin_gradients": True,
        },
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": "auto",
        "steps_per_print": 10,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "wall_clock_breakdown": False,
    }

    # This worker's shard of the dataset registered under "train".
    train_ds = ray.train.get_dataset_shard("train")
    train_ds_iterable = train_ds.iter_torch_batches(
        batch_size=batch_size,
    )

    print(f"batch_size: {batch_size}")
    print("Preparing training arguments")

    # TrainingArguments (which carries the ZeRO-3 config) is created BEFORE the
    # model is loaded: the Hugging Face DeepSpeed integration needs this order
    # to apply ZeRO-3 handling inside `from_pretrained`.
    training_args = TrainingArguments(
        output_dir="deepspeed-dolly",
        per_device_train_batch_size=batch_size,
        logging_steps=1,
        max_steps=max_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        # A positive `max_steps` takes precedence over `num_train_epochs`.
        num_train_epochs=epochs,
        push_to_hub=False,
        disable_tqdm=False,
        fp16=True,
        gradient_accumulation_steps=gradient_accumulation_steps,
        deepspeed=deepspeed_config,
    )

    # Load tokenizer and model exactly once. The previous version also loaded
    # both before TrainingArguments and then threw that copy away.
    print("Loading model")

    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
    # Same padding token id as the one used when the dataset was tokenized.
    tokenizer.pad_token_id = 0

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
    )

    print("Model loaded")

    enable_progress_bar()

    # NOTE(review): `prepare_trainer` below may substitute its own dataloader
    # for Ray-provided iterables; confirm that the batches reaching the model
    # still carry `labels` produced by this collator.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_iterable,
        tokenizer=tokenizer,
        data_collator=transformers.DataCollatorForLanguageModeling(
            tokenizer, mlm=False
        ),
    )

    # Add callback to report checkpoints to Ray Train
    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)

    print("Start training")
    trainer.train()

In [15]:
# Cluster shape and per-device batch size.
num_workers = 2
# NOTE(review): `cpus_per_worker` is not referenced in this cell — each worker
# requests "CPU": 1 below. Confirm which value is intended.
cpus_per_worker = 4
batch_size = 3

# Number of per-worker batches needed to go through every row once.
train_ds_size = ray_datasets.count()
steps_per_epoch = train_ds_size // (batch_size * num_workers)

trainer = TorchTrainer(
    train_loop_per_worker=trainer_init_per_worker,
    datasets=dict(train=ray_datasets),
    train_loop_config=dict(
        batch_size=batch_size,  # batch_size per device
        epochs=2,
        steps_per_epoch=steps_per_epoch,
    ),
    scaling_config=ScalingConfig(
        use_gpu=True,
        num_workers=num_workers,
        # NOTE: huggingface transformers only support 1 GPU per worker.
        resources_per_worker=dict(GPU=1, CPU=1),
    ),
    run_config=RunConfig(
        storage_path="gs://sllm_checkpoints/databrics-dolly-v2-3b/",
    ),
)

In [None]:
# Run the configured TorchTrainer and keep whatever `fit()` returns for inspection.
results = trainer.fit()

In [None]:
# The value returned by `trainer.fit()` is stored in `results` (plural);
# the previous `result.` references raised NameError.
# The fields are collected in one dict because a notebook cell only renders
# its last expression — four bare expressions would display just the error.
{
    "metrics": results.metrics,        # The metrics reported during training.
    "checkpoint": results.checkpoint,  # The latest checkpoint reported during training.
    "path": results.path,              # The path where logs are stored.
    "error": results.error,            # The exception that was raised, if training failed.
}