In [None]:
# Copyright  2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## PyTorch distributed training with Vertex AI Reduction Server
* https://cloud.google.com/blog/topics/developers-practitioners/optimize-training-performance-reduction-server-vertex-ai?e=13802955
* https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/reduction_server/pytorch_distributed_training_reduction_server.ipynb


### Dataset

In this tutorial, we use [`imdb`](https://huggingface.co/datasets/imdb) dataset from Hugging Face. `imdb` is a large movie review dataset for binary sentiment classification containing a set of 25,000 highly polar movie reviews for training, and 25,000 for testing.

### Install required packages


In [1]:
%pip install --user --quiet google-cloud-aiplatform

Note: you may need to restart the kernel to use updated packages.


### Configuration

In [2]:
PROJECT_ID = "ai-hangsik"  
LOCATION = "us-central1"  

In [3]:
BUCKET_URI = f"gs://sllm_checkpoints/reduction_server"

In [4]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

#### Recommended training application structure

You can structure your training application in any way you like. However, the [following structure](https://cloud.google.com/vertex-ai/docs/training/create-python-pre-built-container#structure) is commonly used in Vertex AI samples, and having your project's organization be similar to the samples can make it easier for you to follow the samples.

```
.
├── python_package
│   ├── README.md
│   ├── setup.py
│   └── trainer
│       ├── __init__.py
│       └── task.py
└── pytorch-distributed-training-reduction-server.ipynb    --> This notebook
```

1. Main project directory contains your `setup.py` file with the dependencies. 
2. Use a subdirectory named `trainer` to store your main application module and `scripts` to submit training jobs locally or cloud
3. Inside `trainer` directory:
    - `task.py` - Main application module 1) initializes PyTorch distributed training environment, and 2) Runs the model training and evaluation experiment, and exports the final model.
    - `__init__.py` is required to make Python treat directories containing the file as packages.

### Define variables for the training application

In [5]:
%cd /home/jupyter/llmOps_vertexAI/training/custom_training/reduction_server

/home/jupyter/llmOps_vertexAI/training/custom_training/reduction_server


In [21]:
APP_NAME = "pytorch-bert"
PYTHON_PACKAGE_DIR = "python_package"
PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI = ("us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-9:latest")

source_package_file_name = f"{PYTHON_PACKAGE_DIR}/dist/trainer-0.1.tar.gz"
python_package_gcs_uri = (f"{BUCKET_URI}/pytorch-on-gcp/{APP_NAME}/train/python_package/trainer-0.1.tar.gz")
python_module_name = "trainer.task"

#### Create file structure of the training application

In [22]:
! mkdir {PYTHON_PACKAGE_DIR}
! touch {PYTHON_PACKAGE_DIR}/README.md

! mkdir {PYTHON_PACKAGE_DIR}/trainer
! touch {PYTHON_PACKAGE_DIR}/trainer/__init__.py

mkdir: cannot create directory ‘python_package’: File exists
mkdir: cannot create directory ‘python_package/trainer’: File exists


#### Create the `setup.py` file for the training application


In [12]:
%%writefile ./{PYTHON_PACKAGE_DIR}/setup.py

import os
from setuptools import find_packages
from setuptools import setup
import setuptools

from distutils.command.build import build as _build
import subprocess


REQUIRED_PACKAGES = [
    'transformers==4.28.0',
    'datasets',
    'evaluate',
]

setup(
    name='trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='Vertex AI | Training | PyTorch | Text Classification | Python Package'
)


Overwriting ./python_package/setup.py


#### Create training application code

`task.py` is the main application module. It initializes the PyTorch distributed training environment and runs the model training and evaluation experiment, and exports the final model.


In [13]:
%%writefile ./{PYTHON_PACKAGE_DIR}/trainer/task.py
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the \"License\");
# you may not use this file except in compliance with the License.\n",
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an \"AS IS\" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np
import pandas as pd
from datetime import datetime
import argparse

import torch
import torch.distributed as dist
torch.cuda.empty_cache()

import datasets
from datasets import ClassLabel, Sequence, load_dataset

import transformers
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer,
    EvalPrediction, 
    Trainer, 
    TrainingArguments,
    default_data_collator)

from google.cloud import storage


def main():

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--epochs", type=int, help="Number of training epochs.", default=2)
    parser.add_argument("--batch_size", type=int, help="Training batch size for one process.", default=32)
    parser.add_argument("--model_dir", type=str, help="Directory for saving models.", default=os.environ['AIP_MODEL_DIR'] if 'AIP_MODEL_DIR' in os.environ else "")
    argv = parser.parse_args()

    model_name_or_path = "bert-large-uncased"
    padding = "max_length"
    max_seq_length = 128

    datasets = load_dataset("imdb", verification_mode='no_checks')
    label_list = datasets["train"].unique("label")
    label_to_id = {1: 1, 0: 0, -1: 0}

    tokenizer = AutoTokenizer.from_pretrained(
      model_name_or_path,
      use_fast=True,
    )

    def preprocess_function(examples):
        """
        Tokenize the input example texts
        """
        args = (examples["text"],)
        result = tokenizer(
          *args, padding=padding, max_length=max_seq_length, truncation=True
        )

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
          result["label"] = [label_to_id[example] for example in examples["label"]]

        return result

    # apply preprocessing function to input examples
    datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=True)

    model = AutoModelForSequenceClassification.from_pretrained(
      model_name_or_path, 
      num_labels=len(label_list)
    )

    ngpus_per_node = torch.cuda.device_count()
    world_size = int(os.environ["WORLD_SIZE"])

    # Since we have ngpus_per_node processes per node, the total world_size
    # needs to be adjusted accordingly
    world_size =  world_size * ngpus_per_node

    start = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    print(f'Starting distributed training: {start}') 

    # Use torch.multiprocessing.spawn to launch distributed processes
    torch.multiprocessing.spawn(main_worker,
    args = (ngpus_per_node, world_size, datasets, model, tokenizer, argv),
    nprocs = ngpus_per_node,
    join = True)

    end = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    print(f'Distributed training complete: {end}')

def main_worker(local_rank, ngpus_per_node, world_size, datasets, model, tokenizer, argv):

    # This is the (global) rank of the current process
    rank = int(os.environ["RANK"])

    # For multiprocessing distributed training, rank needs to be the
    # global rank among all the processes
    rank = rank * ngpus_per_node + local_rank
    print (f"Distributed and Multi-processing. Setting rank for each worker. rank={rank}")

    dist.init_process_group(
      backend="nccl", 
      init_method="env://",
      world_size=world_size, 
      rank=rank)

    per_device_batch_size = int(argv.batch_size / ngpus_per_node)

    training_args = TrainingArguments(
      output_dir="/tmp/output/",
      num_train_epochs=argv.epochs, 
      per_device_train_batch_size=per_device_batch_size,
      per_device_eval_batch_size=per_device_batch_size,
      local_rank=local_rank,
    )

    def compute_metrics(p: EvalPrediction):
        
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(preds, axis=1)
        
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets["train"],
        eval_dataset=datasets["test"],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    trainer.train()

    # Save the trained model locally
    model_filename = "pytorch-bert-model"
    local_path = os.path.join("/tmp", model_filename)
    trainer.save_model(local_path)

    if (os.path.exists(local_path)):
        # Upload the trained model to Cloud storage
        model_directory = argv.model_dir
        storage_path = os.path.join(model_directory, model_filename)
        blob = storage.blob.Blob.from_string(storage_path, client=storage.Client())

        files = [f for f in os.listdir(local_path) if os.path.isfile(os.path.join(local_path, f))]

        for file in files:
            local_file = os.path.join(local_path, file)
            blob.upload_from_filename(local_file)

        print(f"Saved model files in {model_directory}/{model_filename}")


if __name__ == "__main__":
    main()


Overwriting ./python_package/trainer/task.py


#### Create a source distribution

Create a source distribution `dist/trainer-0.1.tar.gz`, and upload the source distribution with training application to Cloud Storage bucket, and then validate the source distribution exists on Cloud Storage bucket.

In [14]:
! cd {PYTHON_PACKAGE_DIR} && python3 setup.py sdist --formats=gztar

! gsutil cp {source_package_file_name} {python_package_gcs_uri}

! gsutil ls -l {python_package_gcs_uri}

running sdist
running egg_info
writing trainer.egg-info/PKG-INFO
writing dependency_links to trainer.egg-info/dependency_links.txt
writing requirements to trainer.egg-info/requires.txt
writing top-level names to trainer.egg-info/top_level.txt
reading manifest file 'trainer.egg-info/SOURCES.txt'
writing manifest file 'trainer.egg-info/SOURCES.txt'
running check
creating trainer-0.1
creating trainer-0.1/trainer
creating trainer-0.1/trainer.egg-info
copying files to trainer-0.1...
copying README.md -> trainer-0.1
copying setup.py -> trainer-0.1
copying trainer/__init__.py -> trainer-0.1/trainer
copying trainer/task.py -> trainer-0.1/trainer
copying trainer.egg-info/PKG-INFO -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/SOURCES.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/dependency_links.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/requires.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/top_level.txt -> trainer-0.1/trainer.egg-info


### Run custom training job with Reduction Server on Vertex AI

Configure a custom job with the pre-built container image for PyTorch and training code packaged as Python source distribution.

In [15]:
print(f"APP_NAME={APP_NAME}")
print(
    f"PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI={PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI}"
)
print(f"python_package_gcs_uri={python_package_gcs_uri}")
print(f"python_module_name={python_module_name}")

APP_NAME=pytorch-bert
PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI=us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-9:latest
python_package_gcs_uri=gs://sllm_checkpoints/reduction_server/pytorch-on-gcp/pytorch-bert/train/python_package/trainer-0.1.tar.gz
python_module_name=trainer.task


#### Create a training job
* https://cloud.google.com/vertex-ai/docs/start/client-libraries#client_libraries

In [16]:
JOB_NAME = f"pytorch-bert-reduction-server"
print(f"JOB_NAME={JOB_NAME}")

job = aiplatform.CustomPythonPackageTrainingJob(
    display_name=f"{JOB_NAME}",
    python_package_gcs_uri=python_package_gcs_uri,
    python_module_name=python_module_name,
    container_uri=PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI,
)

JOB_NAME=pytorch-bert-reduction-server


#### Define the training cluster worker pool and experiment configuration parameters
* https://cloud.google.com/vertex-ai/docs/training/distributed-training

In [17]:
# Training cluster worker pool configuration
REPLICA_COUNT = 3
MACHINE_TYPE = "g2-standard-48"
ACCELERATOR_TYPE = "NVIDIA_L4"
ACCELERATOR_COUNT = 4

# Reduction Server configuration
REDUCTION_SERVER_COUNT = 4
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
REDUCTION_SERVER_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai-restricted/training/reductionserver:latest"
)
ENVIRONMENT_VARIABLES = {"NCCL_DEBUG": "INFO"}

# Training experiment parameters
EPOCHS = 2
BATCH_SIZE = 32
MODEL_DIR = f"{BUCKET_URI}/{JOB_NAME}"

training_args = [
    "--epochs",
    str(EPOCHS),
    "--batch_size",
    str(BATCH_SIZE),
    "--model_dir",
    MODEL_DIR,
]

#### Submit the training job

In [None]:
model = job.run(
    replica_count=REPLICA_COUNT,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    reduction_server_replica_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
    reduction_server_container_uri=REDUCTION_SERVER_IMAGE_URI,
    environment_variables=ENVIRONMENT_VARIABLES,
    args=training_args,
    sync=True,
)

Training Output directory:
gs://sllm_checkpoints/reduction_server/aiplatform-custom-training-2025-02-19-09:47:26.698 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/4048210464088260608?project=721521243942
CustomPythonPackageTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/4048210464088260608 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/2865558165066350592?project=721521243942
CustomPythonPackageTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/4048210464088260608 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPythonPackageTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/4048210464088260608 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPythonPackageTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/4048210464088260608 

#### Validate the model artifacts

In [19]:
print(f"Model artifacts are available at {MODEL_DIR}")

Model artifacts are available at gs://sllm_checkpoints/reduction_server/pytorch-bert-reduction-server


### Cleaning up

In [None]:
import os

delete_custom_job = True
delete_bucket = False

if delete_custom_job:
    try:
        job.delete()
    except Exception as e:
        print(e)

if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI