In [1]:
# Copyright  2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## DeepSpeed with Ray on Vertex AI

* Adapted from the Ray Train GPT-J DeepSpeed fine-tuning example: https://docs.ray.io/en/latest/train/examples/deepspeed/gptj_deepspeed_fine_tuning.html

### Configuration

In [1]:
# Install the Vertex AI SDK with its Ray extra and pin Ray to 2.33.0, the same
# pins that RUNTIME_ENV requests for the cluster side.
%pip install --user -q "google-cloud-aiplatform[ray]>=1.56.0" \
                        "ray[data,train,tune,serve]==2.33.0"

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import os

import ray

In [3]:
# Display the Ray version imported by this kernel to compare against the 2.33.0 pin.
ray.__version__

'2.33.0'

In [4]:
# @title Define constants
# Google Cloud project, given both as its numeric project number and its ID.
PROJECT_NBR = "721521243942"
PROJECT_ID = "ai-hangsik"

# Region used for the Vertex AI SDK and in the Ray cluster resource name.
REGION = "us-central1"

# Name of the Ray on Vertex AI cluster this notebook connects to.
RAY_CLUSTER_NM = "ray33-cluster-20250218-085159"

In [5]:
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project ID and region from the constants cell.
aiplatform.init(project=PROJECT_ID, location=REGION)

### Prepare the Ray on Vertex AI connection

In [6]:
# Drop any Ray connection left over from an earlier run of this notebook
# before ray.init() is called again further down.
ray.shutdown()

In [7]:
# Address of the Ray on Vertex AI cluster, assembled from the constants cell;
# it is passed to ray.init() as `address`.
RAY_ADDRESS = (
    f"vertex_ray://projects/{PROJECT_NBR}/locations/{REGION}"
    f"/persistentResources/{RAY_CLUSTER_NM}"
)
print(f"RAY_ADDRESS:{RAY_ADDRESS}")

# pip requirements passed to ray.init() as the runtime environment.
RUNTIME_ENV = {
    "pip": [
        # Ray / Vertex AI: same pins as the %pip install cell.
        "google-cloud-aiplatform[ray]>=1.56.0",
        "ray[data,train,tune,serve]==2.33.0",
        # Hugging Face stack.
        "datasets",
        "evaluate",
        "accelerate==0.18.0",
        "transformers==4.26.0",
        # The worker logs report CUDA 11.8 installed while this torch build was
        # compiled with CUDA 11.7; DeepSpeed accepts that combination.
        # Wheel matrix: https://pytorch.org/get-started/previous-versions/
        "torch==2.0.0",
        "deepspeed==0.14.4",  # https://github.com/huggingface/alignment-handbook/issues/180
        "numpy<2",
        # Build and runtime helpers.
        "setuptools",
        "ipython",
        "scikit-learn",
        "ninja",
        "triton<=3.1.0",  # https://github.com/deepspeedai/DeepSpeed/issues/7028
    ],
}

# pytorch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1  pytorch-cuda=11.8

# runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}}


RAY_ADDRESS:vertex_ray://projects/721521243942/locations/us-central1/persistentResources/ray33-cluster-20250218-085159


### Connect to Ray on Vertex AI

In [8]:
# Connect to the Vertex AI Ray cluster and request the pip runtime environment.
ray.init(address=RAY_ADDRESS, runtime_env=RUNTIME_ENV)

[Ray on Vertex AI]: Cluster State = State.RUNNING


0,1
Python version:,3.10.16
Ray version:,2.33.0
Vertex SDK version:,1.80.0
Dashboard:,0e24b26feffa2996-dot-us-central1.aiplatform-training.googleusercontent.com
Interactive Terminal Uri:,3e39a4149930c896-dot-us-central1.aiplatform-training.googleusercontent.com
Cluster Name:,ray33-cluster-20250218-085159


[36m(data_manage pid=1335, ip=10.127.0.21)[0m Loading tiny_shakespeare dataset


[36m(TunerInternal pid=4824)[0m AIR_VERBOSITY is set, ignoring passed-in ProgressReporter for now.


[36m(TunerInternal pid=4824)[0m 
[36m(TunerInternal pid=4824)[0m View detailed results here: sllm_checkpoints/tmp_store/deepspeed/TorchTrainer_2025-02-18_00-35-08
[36m(TunerInternal pid=4824)[0m To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-02-18_00-02-04_369354_12/artifacts/2025-02-18_00-35-16/TorchTrainer_2025-02-18_00-35-08/driver_artifacts`
[36m(TrainTrainable pid=1466, ip=10.127.0.21)[0m [2025-02-18 00:35:22,825] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)
[36m(TunerInternal pid=4824)[0m 
[36m(TunerInternal pid=4824)[0m Training started with configuration:
[36m(TunerInternal pid=4824)[0m ╭────────────────────────────────────────╮
[36m(TunerInternal pid=4824)[0m │ Training config                        │
[36m(TunerInternal pid=4824)[0m ├────────────────────────────────────────┤
[36m(TunerInternal pid=4824)[0m │ train_loop_config/batch_size        16 │
[36m(TunerInter

[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Setting up process group for: env:// [rank=0, world_size=3]
[36m(TorchTrainer pid=1466, ip=10.127.0.21)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=1466, ip=10.127.0.21)[0m - (ip=10.127.0.21, pid=1538) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=1466, ip=10.127.0.21)[0m - (ip=10.127.0.20, pid=1070) world_rank=1, local_rank=0, node_rank=1
[36m(TorchTrainer pid=1466, ip=10.127.0.21)[0m - (ip=10.127.0.22, pid=637) world_rank=2, local_rank=0, node_rank=2


[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:35:34,051] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m [2025-02-18 00:35:34,058] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m [2025-02-18 00:35:34,114] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m df: /root/.triton/autotune: No such file or directory






[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Preparing training arguments
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:35:41,385] [INFO] [comm.py:637:init_distributed] cdb=None
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m Preparing training arguments
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m [2025-02-18 00:35:41,385] [INFO] [comm.py:637:init_distributed] cdb=None
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m Preparing training arguments
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m [2025-02-18 00:35:41,386] [INFO] [comm.py:637:init_distributed] cdb=None
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m Loading model
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Loading model
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m Loading model
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:38:07,079] [INFO] [partition_parameters.py:345:__exit__] finished initializing model - num_params = 285, num_elems = 6.05B
[36m(R

[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m max_steps is given, it will override any value given in num_train_epochs
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m Using cuda_amp half precision backend
Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 16.2MB/s]
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m max_steps is given, it will override any value given in num_train_epochs
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m Using cuda_amp half precision backend


[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Model loaded


[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m max_steps is given, it will override any value given in num_train_epochs
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Using cuda_amp half precision backend


[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:38:39,858] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.4, git-hash=unknown, git-branch=unknown
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:38:39,868] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False


[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Using /root/.cache/torch_extensions/py310_cu117 as PyTorch extensions root...
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Emitting ninja build file /root/.cache/torch_extensions/py310_cu117/cpu_adam/build.ninja...
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Building extension module cpu_adam...
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m Using /root/.cache/torch_extensions/py310_cu117 as PyTorch extensions root...
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m Emitting ninja build file /root/.cache/torch_extensions/py310_cu117/cpu_adam/build.ninja...
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m Building extension module cpu_adam...
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m Allowing ninja to set a default number of workers... (overrida

[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m [1/3] c++ -MMD -MF cpu_adam.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/tmp/ray/session_2025-02-18_00-02-04_369354_12/runtime_resources/pip/57c8f880f9a251101ddb29a582a94a0beca2f588/virtualenv/lib/python3.10/site-packages/deepspee

[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m Loading extension module cpu_adam...
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m Loading extension module cpu_adam...


[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:16,939] [INFO] [utils.py:781:see_memory_usage] Stage 3 initialize beginning
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:16,940] [INFO] [utils.py:782:see_memory_usage] MA 3.87 GB         Max_MA 4.89 GB         CA 5.94 GB         Max_CA 6 GB 
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:16,940] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 5.83 GB, percent = 6.2%
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:16,942] [INFO] [stage3.py:130:__init__] Reduce bucket size 16777216
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:16,942] [INFO] [stage3.py:131:__init__] Prefetch bucket size 15099494
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:17,100] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00

[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m Loading extension module cpu_adam...


[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:17,454] [INFO] [utils.py:781:see_memory_usage] Before creating fp16 partitions
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:17,455] [INFO] [utils.py:782:see_memory_usage] MA 3.87 GB         Max_MA 3.87 GB         CA 5.94 GB         Max_CA 6 GB 
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:17,455] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 5.83 GB, percent = 6.2%
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:21,933] [INFO] [utils.py:781:see_memory_usage] After creating fp16 partitions: 3
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:21,934] [INFO] [utils.py:782:see_memory_usage] MA 3.87 GB         Max_MA 3.87 GB         CA 6.16 GB         Max_CA 6 GB 
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:21,934] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 5.83 GB, p

[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m ***** Running training *****
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m   Num examples = 3648
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m   Num Epochs = 9223372036854775807
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m   Instantaneous batch size per device = 16
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m   Total train batch size (w. parallel, distributed & accumulation) = 48
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m   Gradient Accumulation steps = 1
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m   Total optimization steps = 76
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m   Number of trainable parameters = 0
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m ***** Running training *****
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m   Num examples = 3648
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m   Num Epochs = 9223372036854775807
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m   Inst

[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:37,938] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:37,939] [INFO] [utils.py:782:see_memory_usage] MA 3.9 GB         Max_MA 4.67 GB         CA 11.43 GB         Max_CA 11 GB 
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:37,939] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 25.6 GB, percent = 27.1%
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:37,939] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer_Stage3
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:37,939] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:39:37,940] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler 

[36m(SplitCoordinator pid=1609, ip=10.127.0.21)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-02-18_00-02-04_369354_12/logs/ray-data
[36m(SplitCoordinator pid=1609, ip=10.127.0.21)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(split_text)->MapBatches(tokenize)] -> OutputSplitter[split(3, equal=True)]


[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:40:20,268] [INFO] [loss_scaler.py:197:update_scale] Consecutive hysteresis is enabled. Restoring hysteresis to 4
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m {'loss': 11.6172, 'learning_rate': 1.9736842105263158e-05, 'epoch': 0.01}
[36m(RayTrainWorker pid=1070, ip=10.127.0.20)[0m {'loss': 11.6172, 'learning_rate': 1.9736842105263158e-05, 'epoch': 0.01}
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m {'loss': 11.6172, 'learning_rate': 1.9736842105263158e-05, 'epoch': 0.01}
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m [2025-02-18 00:40:51,184] [INFO] [loss_scaler.py:197:update_scale] Consecutive hysteresis is enabled. Restoring hysteresis to 4
[36m(RayTrainWorker pid=637, ip=10.127.0.22)[0m {'loss': 6.0716, 'learning_rate': 1.9473684210526318e-05, 'epoch': 0.03}
[36m(RayTrainWorker pid=1538, ip=10.127.0.21)[0m {'loss': 6.0716, 'learning_rate': 1.9473684210526318e-05, 'epoch': 0.03}
[36m(RayTrainWorker 

### Model and cluster configuration

In [9]:
# Hugging Face model identifier used for both the tokenizer and the model.
model_name = "EleutherAI/gpt-j-6B"

# Ray Train scaling: worker count plus the resources requested per worker.
num_workers = 3
use_gpu = True
num_gpus = 2  # passed as the per-worker "GPU" resource
cpus_per_worker = 6  # g2-standard-24 : https://cloud.google.com/compute/docs/gpus#l4-gpus

### Dataset

In [10]:
# from datasets import load_dataset

# print("Loading tiny_shakespeare dataset")
# current_dataset = load_dataset("tatsu-lab/alpaca")
# current_dataset

In [11]:
# from datasets.dataset_dict import DatasetDict

# slice_dataset = DatasetDict({'train': current_dataset['train'].select(range(2000))})
# slice_dataset

In [12]:
# train_dataset, validation_dataset= slice_dataset['train'].train_test_split(test_size=0.1).values()
# dataset = DatasetDict({'train': train_dataset, 'validation': validation_dataset})
# dataset

In [13]:
import ray.data

@ray.remote
def data_manage():
    """Load a slice of the Alpaca dataset as a Ray task and split it for training.

    The first 2000 rows of the ``train`` split of ``tatsu-lab/alpaca`` are
    divided 90/10 into train and validation parts with a fixed seed, so that
    re-running the notebook yields the same split.

    Returns:
        dict: ``{"train": ..., "validation": ...}`` where each value is the
        Ray Dataset created from the corresponding Hugging Face split.
    """
    # Imported inside the task so the package is resolved where the task runs.
    from datasets import load_dataset

    dataset_name = "tatsu-lab/alpaca"
    print(f"Loading {dataset_name} dataset")
    subset = load_dataset(dataset_name)["train"].select(range(2000))

    # Index the result by key rather than unpacking .values(), so the
    # train/validation assignment does not depend on key order.
    split = subset.train_test_split(test_size=0.1, seed=42)

    return {
        "train": ray.data.from_huggingface(split["train"]),
        "validation": ray.data.from_huggingface(split["test"]),
    }


In [14]:
# Run data_manage as a remote Ray task and block until its dict of
# "train"/"validation" datasets is returned.
ray_datasets = ray.get(data_manage.remote())

### Training

In [15]:
# Token length that tokenize() truncates and pads every line to (its max_length).
block_size = 512


In [16]:
from transformers import AutoTokenizer

def split_text(batch: pd.DataFrame) -> pd.DataFrame:
    """Break the ``text`` column of a batch into one stripped line per row.

    Lines that are empty after stripping, or that end with ``":"``, are
    dropped.

    Args:
        batch: DataFrame with a ``text`` column of strings.

    Returns:
        A DataFrame with a single ``text`` column holding the kept lines.
    """
    # Join rows with a newline (not ""), otherwise the last line of one record
    # is fused with the first line of the next record in the same batch.
    flat_text = "\n".join(batch["text"])
    kept_lines = [
        stripped
        for stripped in (line.strip() for line in flat_text.split("\n"))
        if stripped and not stripped.endswith(":")
    ]
    return pd.DataFrame(kept_lines, columns=["text"])


def tokenize(batch: pd.DataFrame) -> dict:
    """Tokenize the ``text`` column of a batch for causal language modelling.

    The tokenizer for ``model_name`` is loaded on each call, with its EOS
    token reused as the padding token. Every text is truncated and padded to
    ``block_size`` tokens, and ``labels`` is a copy of ``input_ids``.

    Args:
        batch: DataFrame with a ``text`` column.

    Returns:
        The tokenizer output converted to a plain dict, plus a ``labels`` entry.
    """
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    tok.pad_token = tok.eos_token

    texts = list(batch["text"])
    encoded = tok(
        texts,
        truncation=True,
        max_length=block_size,
        padding="max_length",
        return_tensors="np",
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return dict(encoded)


def _split_and_tokenize(ds):
    """Chain the line-splitting and tokenization stages onto one Ray Dataset."""
    line_ds = ds.map_batches(split_text, batch_format="pandas")
    return line_ds.map_batches(tokenize, batch_format="pandas")


# Apply the same two-stage preprocessing to every split in ray_datasets.
processed_datasets = {
    split_name: _split_and_tokenize(split_ds)
    for split_name, split_ds in ray_datasets.items()
}
processed_datasets


{'train': MapBatches(tokenize)
 +- MapBatches(split_text)
    +- Dataset(
          num_rows=1800,
          schema={
             instruction: string,
             input: string,
             output: string,
             text: string
          }
       ),
 'validation': MapBatches(tokenize)
 +- MapBatches(split_text)
    +- Dataset(
          num_rows=200,
          schema={
             instruction: string,
             input: string,
             output: string,
             text: string
          }
       )}

In [17]:
import evaluate
import torch
from transformers import (
    Trainer,
    TrainingArguments,
    GPTJForCausalLM,
    AutoTokenizer,
    default_data_collator,
)
from transformers.utils.logging import disable_progress_bar, enable_progress_bar

from ray import train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback

def train_func(config):
    """Fine-tune GPT-J with the Hugging Face Trainer and DeepSpeed ZeRO stage 3.

    This is the per-worker training loop handed to ``TorchTrainer``.

    Args:
        config: The ``train_loop_config`` dict. ``steps_per_epoch`` is
            required. Optional keys and their defaults: ``batch_size`` (4),
            ``epochs`` (2), ``warmup_steps`` (0), ``learning_rate`` (2e-5)
            and ``weight_decay`` (0.01).

    Raises:
        ValueError: If ``steps_per_epoch`` is missing from ``config``.
    """
    batch_size = config.get("batch_size", 4)
    epochs = config.get("epochs", 2)
    warmup_steps = config.get("warmup_steps", 0)
    learning_rate = config.get("learning_rate", 0.00002)
    weight_decay = config.get("weight_decay", 0.01)
    steps_per_epoch = config.get("steps_per_epoch")

    # Fail fast with a clear message. Without this check a missing value only
    # surfaces later as an opaque `None * int` TypeError when max_steps is built.
    if steps_per_epoch is None:
        raise ValueError(
            "train_loop_config must provide 'steps_per_epoch'; it is used for "
            "both save_steps and max_steps."
        )

    # Use the actual number of CPUs assigned by Ray
    os.environ["OMP_NUM_THREADS"] = str(
        train.get_context().get_trial_resources().bundles[-1].get("CPU", 1)
    )
    # Enable tf32 for better performance
    torch.backends.cuda.matmul.allow_tf32 = True

    # DeepSpeed configuration: ZeRO stage 3 with the optimizer offloaded to
    # CPU. Entries set to "auto" are left for the Hugging Face Trainer's
    # DeepSpeed integration to fill in from TrainingArguments.
    deepspeed_config = {
        "fp16": {
            "enabled": "auto",
            "initial_scale_power": 8,
            "hysteresis": 4,
            "consecutive_hysteresis": True,
        },
        "bf16": {"enabled": "auto"},
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
            },
        },
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True,
            },
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            "stage3_prefetch_bucket_size": "auto",
            "stage3_param_persistence_threshold": "auto",
            "gather_16bit_weights_on_model_save": True,
            "round_robin_gradients": True,
        },
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": "auto",
        "steps_per_print": 10,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "wall_clock_breakdown": False,
    }

    print("Preparing training arguments")
    training_args = TrainingArguments(
        "output",
        logging_steps=1,
        save_strategy="steps",
        save_steps=steps_per_epoch,
        # The datasets are streamed as iterables, so the run length is given
        # in steps rather than epochs.
        max_steps=steps_per_epoch * epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=1,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        # NOTE(review): label_names lists model inputs rather than "labels";
        # kept unchanged - confirm before relying on compute_metrics.
        label_names=["input_ids", "attention_mask"],
        push_to_hub=False,
        report_to="none",
        disable_tqdm=True,  # declutter the output a little
        fp16=True,
        gradient_checkpointing=True,
        deepspeed=deepspeed_config,
    )
    disable_progress_bar()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading model")

    model = GPTJForCausalLM.from_pretrained(model_name, use_cache=False)
    model.resize_token_embeddings(len(tokenizer))

    print("Model loaded")

    enable_progress_bar()

    metric = evaluate.load("accuracy")

    # Each worker reads only its own shard of the datasets given to TorchTrainer.
    train_ds = train.get_dataset_shard("train")
    eval_ds = train.get_dataset_shard("validation")

    train_ds_iterable = train_ds.iter_torch_batches(
        batch_size=batch_size,
        local_shuffle_buffer_size=train.get_context().get_world_size() * batch_size,
    )
    eval_ds_iterable = eval_ds.iter_torch_batches(batch_size=batch_size)

    def compute_metrics(eval_pred):
        """Score arg-max predictions against the labels with the accuracy metric."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    # Add callback to report checkpoints to Ray Train
    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)
    trainer.train()

[2025-02-18 00:34:48,484] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


I0000 00:00:1739838888.323429 3800410 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1739838888.352192 3800410 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status


In [18]:

# Cloud Storage location handed to RunConfig(storage_path=...) for the training run.
storage_path = "gs://sllm_checkpoints/tmp_store/deepspeed/"


In [19]:

# Per-device batch size; forwarded to train_func through train_loop_config.
batch_size = 16

# Row count of the raw "train" split, hard-coded instead of computing
#   processed_datasets["train"].count()
# NOTE(review): split_text emits one row per kept line, so the processed
# dataset may not have 1800 rows - confirm this is the intended size.
train_ds_size = 1800

# Samples consumed per optimisation step across all workers.
samples_per_step = batch_size * num_workers
steps_per_epoch = round(train_ds_size / samples_per_step)
steps_per_epoch

38

In [20]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig

# Values delivered to train_func as its `config` argument.
train_loop_config = {
    "epochs": 2,
    "batch_size": batch_size,  # per device
    "steps_per_epoch": steps_per_epoch,
}

# Worker count and the GPU/CPU resources requested for each worker.
scaling_config = ScalingConfig(
    num_workers=num_workers,
    use_gpu=use_gpu,
    resources_per_worker={"GPU": num_gpus, "CPU": cpus_per_worker},
)

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config=train_loop_config,
    scaling_config=scaling_config,
    datasets=processed_datasets,
    run_config=RunConfig(storage_path=storage_path),
)

In [None]:
# Launch the distributed training run on the cluster and keep its result object.
results = trainer.fit()

In [None]:
# checkpoint = results.checkpoint
# checkpoint

In [None]:
# import os

# os.system(f"aws s3 sync s3://{checkpoint.path} /mnt/local_storage/")

In [None]:
# from transformers import pipeline, AutoTokenizer, GPTJForCausalLM

# model = GPTJForCausalLM.from_pretrained("/mnt/local_storage/checkpoint")
# tokenizer = AutoTokenizer.from_pretrained("/mnt/local_storage/checkpoint")

# pipe = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     torch_dtype=torch.float16,
#     device_map="auto",
# )

In [None]:
# # Generate from prompts!
# for sentence in pipe(
#     ["Romeo and Juliet", "Romeo", "Juliet"], do_sample=True, min_length=20
# ):
#     print(sentence)