## Deepspeed on Vertex AI

In [1]:
! pip install --user -q google-cloud-aiplatform[ray]==1.56.0 \
                        ray[data,train,tune,serve]==2.9.3 \
                        datasets \
                        evaluate \
                        accelerate==0.18.0 \
                        transformers==4.26.0 \
                        torch>=1.12.0 \
                        deepspeed==0.12.3 \
                        huggingface_hub[cli]

In [2]:
# @title Define constants
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
BUCKET_URI = "gs://ray_deepspeed_gpt-j-6b"  # @param {type:"string"}


In [3]:
model_name = "EleutherAI/gpt-j-6b"

use_gpu = True
num_workers = 16
cpus_per_worker = 8

In [5]:
# @title Authenticate your Hugging Face account
from huggingface_hub import interpreter_login

interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



Enter your token (input will not be visible):  ········
Add token as git credential? (Y/n)  n


In [52]:
import ray


runtime_env={
        "pip": [
            "google-cloud-aiplatform[ray]==1.56.0",
            "ray[data,train,tune,serve]==2.9.3",
            "datasets",
            "evaluate",
            # The latest combination accelerate==0.25.0, transformers==4.36.0, deepspeed==0.12.4
            # has issues with DeepSpeed process group initialization,
            # and will result in a batch_size validation problem.
            # TODO(ml-team): get rid of the pins once the issue is fixed.
            "accelerate==0.18.0",
            "transformers==4.26.0",
            "torch>=1.12.0",
            "deepspeed==0.12.3",
        ],
    },

ray.shutdown()
ray.init(
    runtime_env={
        "pip": [
            "google-cloud-aiplatform[ray]==1.56.0",
            "ray[data,train,tune,serve]==2.9.3",
            "datasets",
            "evaluate",
            # The latest combination accelerate==0.25.0, transformers==4.36.0, deepspeed==0.12.4
            # has issues with DeepSpeed process group initialization,
            # and will result in a batch_size validation problem.
            # TODO(ml-team): get rid of the pins once the issue is fixed.
            "accelerate==0.18.0",
            "transformers==4.26.0",
            "torch>=1.12.0",
            "deepspeed==0.12.3",
        ],
    },
)

2025-02-15 10:41:51,779	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


0,1
Python version:,3.10.16
Ray version:,2.9.3
Dashboard:,http://127.0.0.1:8266


[36m(download_model pid=871609)[0m path1 11
[36m(download_model pid=871609)[0m path:/home/jupyter/.cache/huggingface/hub/gpt-j-6b


[36m(download_model pid=871609)[0m Omitting prefix "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/". (Did you mean to do cp -r?)
[36m(download_model pid=871609)[0m CommandException: No URLs matched. Do the files you're operating on exist?


In [53]:
from transformers.utils.hub import TRANSFORMERS_CACHE

TRANSFORMERS_CACHE

'/home/jupyter/.cache/huggingface/hub'

In [54]:
import numpy as np
import pandas as pd
import os

In [55]:
# THIS SHOULD BE HIDDEN IN DOCS AND ONLY RAN IN CI
# Download the model from our S3 mirror as it's faster

import ray
import subprocess
import ray.util.scheduling_strategies


def force_on_node(node_id: str, remote_func_or_actor_class):
    scheduling_strategy = ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
        node_id=node_id, soft=False
    )
    options = {"scheduling_strategy": scheduling_strategy}
    
    print(f"options : {options}")
    
    return remote_func_or_actor_class.options(**options)


def run_on_every_node(remote_func_or_actor_class, **remote_kwargs):
    refs = []
    for node in ray.nodes():
        print(f"node : {node}")
        
        if node["Alive"] and node["Resources"].get("GPU", None):
            
            print(f"node  1")
            
            refs.append(
                force_on_node(node["NodeID"], remote_func_or_actor_class).remote(
                    **remote_kwargs
                )
            )
    return ray.get(refs)


@ray.remote(num_gpus=1)
def download_model():
    
    from transformers.utils.hub import TRANSFORMERS_CACHE

    print(f"path1 11")

        
    path = os.path.expanduser(
        os.path.join(TRANSFORMERS_CACHE, "gpt-j-6b")
    )
    
    print(f"path:{path}")
    
    subprocess.run(["mkdir", "-p", os.path.join(path, "snapshots", "main")])
    subprocess.run(["mkdir", "-p", os.path.join(path, "refs")])
    
    if os.path.exists(os.path.join(path, "refs", "main")):
        return
    
    subprocess.run(
        [
            "gsutil",
            "-m",
            "cp",
            "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/",
            os.path.join(path, "snapshots", "main"),
        ]
    )
    
    
    
    # with open(os.path.join(path, "snapshots", "main", "hash"), "r") as f:
    #     f_hash = f.read().strip()
    # with open(os.path.join(path, "refs", "main"), "w") as f:
    #     f.write(f_hash)
    # os.rename(
    #     os.path.join(path, "snapshots", "main"), os.path.join(path, "snapshots", f_hash)
    # )


In [56]:
_ = run_on_every_node(download_model)

node : {'NodeID': 'd656747843370e5970370f077d44814884f2c44c19944c9b9faff537', 'Alive': True, 'NodeManagerAddress': '10.128.0.4', 'NodeManagerHostname': 'llm-training', 'NodeManagerPort': 45059, 'ObjectManagerPort': 43839, 'ObjectStoreSocketName': '/var/tmp/ray/session_2025-02-15_10-41-49_623472_360471/sockets/plasma_store', 'RayletSocketName': '/var/tmp/ray/session_2025-02-15_10-41-49_623472_360471/sockets/raylet', 'MetricsExportPort': 39560, 'NodeName': '10.128.0.4', 'RuntimeEnvAgentPort': 63917, 'alive': True, 'Resources': {'CPU': 48.0, 'memory': 125083953357.0, 'GPU': 4.0, 'node:10.128.0.4': 1.0, 'object_store_memory': 57893122867.0, 'accelerator_type:L4': 1.0, 'node:__internal_head__': 1.0}, 'Labels': {'ray.io/node_id': 'd656747843370e5970370f077d44814884f2c44c19944c9b9faff537'}}
node  1
options : {'scheduling_strategy': <ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy object at 0x7f5e0e19d930>}


Loading tiny_shakespeare dataset


The repository for tiny_shakespeare contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/tiny_shakespeare.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [16]:
import ray.data

ray_datasets = {
    "train": ray.data.from_huggingface(current_dataset["train"]),
    "validation": ray.data.from_huggingface(current_dataset["validation"]),
}

ray_datasets

2025-02-15 08:25:36,724	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


{'train': MaterializedDataset(num_blocks=1, num_rows=1, schema={text: string}),
 'validation': MaterializedDataset(num_blocks=1, num_rows=1, schema={text: string})}

[36m(download_model pid=508380)[0m Omitting prefix "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/". (Did you mean to do cp -r?)
[36m(download_model pid=508380)[0m CommandException: No URLs matched. Do the files you're operating on exist?
[36m(download_model pid=527250)[0m Omitting prefix "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/". (Did you mean to do cp -r?)
[36m(download_model pid=527250)[0m CommandException: No URLs matched. Do the files you're operating on exist?
[36m(download_model pid=794548)[0m Omitting prefix "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/". (Did you mean to do cp -r?)
[36m(download_model pid=794548)[0m CommandException: No URLs matched. Do the files you're operating on exist?
[36m(download_model pid=796093)[0m Omitting prefix "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/". (Did you mean to do cp -r?)
[36m(download_model pid=796093)[0m CommandException: No URLs matched. Do the files you're operating on exist?
[36m(download_model pid=798844)[0m Omittin

[36m(download_model pid=801159)[0m path:/home/jupyter/.cache/huggingface/hub/gpt-j-6b


[36m(download_model pid=801159)[0m Omitting prefix "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/". (Did you mean to do cp -r?)
[36m(download_model pid=801159)[0m CommandException: No URLs matched. Do the files you're operating on exist?


[36m(download_model pid=822083)[0m path:/home/jupyter/.cache/huggingface/hub/gpt-j-6b


[36m(download_model pid=822083)[0m Omitting prefix "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/". (Did you mean to do cp -r?)
[36m(download_model pid=822083)[0m CommandException: No URLs matched. Do the files you're operating on exist?


[36m(download_model pid=825335)[0m path:/home/jupyter/.cache/huggingface/hub/gpt-j-6b


[36m(download_model pid=825335)[0m Omitting prefix "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/". (Did you mean to do cp -r?)
[36m(download_model pid=825335)[0m CommandException: No URLs matched. Do the files you're operating on exist?


[36m(download_model pid=836973)[0m path1 11
[36m(download_model pid=836973)[0m path:/home/jupyter/.cache/huggingface/hub/gpt-j-6b


[36m(download_model pid=836973)[0m Omitting prefix "gs://sllm_checkpoints/EleutherAI/gpt-j-6b/". (Did you mean to do cp -r?)
[36m(download_model pid=836973)[0m CommandException: No URLs matched. Do the files you're operating on exist?


In [17]:
block_size = 512


In [19]:
from transformers import AutoTokenizer
import pandas as pd

def split_text(batch: pd.DataFrame) -> pd.DataFrame:
    text = list(batch["text"])
    flat_text = "".join(text)
    split_text = [
        x.strip()
        for x in flat_text.split("\n")
        if x.strip() and not x.strip()[-1] == ":"
    ]
    return pd.DataFrame(split_text, columns=["text"])


def tokenize(batch: pd.DataFrame) -> dict:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    ret = tokenizer(
        list(batch["text"]),
        truncation=True,
        max_length=block_size,
        padding="max_length",
        return_tensors="np",
    )
    ret["labels"] = ret["input_ids"].copy()
    return dict(ret)


processed_datasets = {
    key: (
        ds.map_batches(split_text, batch_format="pandas")
        .map_batches(tokenize, batch_format="pandas")
    )
    for key, ds in ray_datasets.items()
}
processed_datasets

{'train': MapBatches(tokenize)
 +- MapBatches(split_text)
    +- Dataset(num_blocks=1, num_rows=1, schema={text: string}),
 'validation': MapBatches(tokenize)
 +- MapBatches(split_text)
    +- Dataset(num_blocks=1, num_rows=1, schema={text: string})}

In [27]:
import evaluate
import torch
from transformers import (
    Trainer,
    TrainingArguments,
    GPTJForCausalLM,
    AutoTokenizer,
    default_data_collator,
)
from transformers.utils.logging import disable_progress_bar, enable_progress_bar

from ray import train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback

def train_func(config):
    # Use the actual number of CPUs assigned by Ray
    os.environ["OMP_NUM_THREADS"] = str(
        train.get_context().get_trial_resources().bundles[-1].get("CPU", 1)
    )
    # Enable tf32 for better performance
    torch.backends.cuda.matmul.allow_tf32 = True

    batch_size = config.get("batch_size", 4)
    epochs = config.get("epochs", 2)
    warmup_steps = config.get("warmup_steps", 0)
    learning_rate = config.get("learning_rate", 0.00002)
    weight_decay = config.get("weight_decay", 0.01)
    steps_per_epoch = config.get("steps_per_epoch")

    deepspeed = {
        "fp16": {
            "enabled": "auto",
            "initial_scale_power": 8,
            "hysteresis": 4,
            "consecutive_hysteresis": True,
        },
        "bf16": {"enabled": "auto"},
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
            },
        },
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True,
            },
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            "stage3_prefetch_bucket_size": "auto",
            "stage3_param_persistence_threshold": "auto",
            "gather_16bit_weights_on_model_save": True,
            "round_robin_gradients": True,
        },
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": "auto",
        "steps_per_print": 10,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "wall_clock_breakdown": False,
    }

    print("Preparing training arguments")
    training_args = TrainingArguments(
        "output",
        logging_steps=1,
        save_strategy="steps",
        save_steps=steps_per_epoch,
        max_steps=steps_per_epoch * epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=1,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        label_names=["input_ids", "attention_mask"],
        push_to_hub=False,
        report_to="none",
        disable_tqdm=True,  # declutter the output a little
        fp16=True,
        gradient_checkpointing=True,
        deepspeed=deepspeed,
    )
    disable_progress_bar()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading model")

    model = GPTJForCausalLM.from_pretrained(model_name, use_cache=False)
    model.resize_token_embeddings(len(tokenizer))

    print("Model loaded")

    enable_progress_bar()

    metric = evaluate.load("accuracy")

    train_ds = train.get_dataset_shard("train")
    eval_ds = train.get_dataset_shard("validation")

    train_ds_iterable = train_ds.iter_torch_batches(
        batch_size=batch_size,
        local_shuffle_buffer_size=train.get_context().get_world_size() * batch_size,
    )
    eval_ds_iterable = eval_ds.iter_torch_batches(batch_size=batch_size)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    # Add callback to report checkpoints to Ray Train
    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)
    trainer.train()

[2025-02-15 09:29:12,651] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [30]:
storage_path = "gs://sllm_checkpoints/EleutherAI/gptj-deepspeed-finetune"  # TODO: Set up cloud storage

In [None]:
# import os, re

# artifact_storage = os.environ.get("ANYSCALE_ARTIFACT_STORAGE", "artifact_storage")
# user_name = re.sub(r"\s+", "__", os.environ.get("ANYSCALE_USERNAME", "user"))
# storage_path = f"{artifact_storage}/{user_name}/gptj-deepspeed-finetune"

In [32]:
batch_size = 16
train_ds_size = processed_datasets["train"].count()
steps_per_epoch = train_ds_size // (batch_size * num_workers)

2025-02-15 10:04:42,205	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(split_text)->MapBatches(tokenize)]
2025-02-15 10:04:42,206	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2025-02-15 10:04:42,207	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

2025-02-15 10:04:42,252	ERROR streaming_executor_state.py:496 -- An exception was raised from a task of operator "MapBatches(split_text)->MapBatches(tokenize)". Dataset execution will now abort. To ignore this exception and continue, set DataContext.max_errored_blocks.
