In [None]:
# Copyright  2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## XGBoost parallel training with Dask On Vertex AI

* https://cloud.google.com/vertex-ai/docs/training/overview
* IRIS dataset: https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html
* https://distributed.dask.org/en/stable/install.html
* https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/xgboost_data_parallel_training_on_cpu_using_dask.ipynb


### Install required packages


In [1]:
%pip install --user --quiet google-cloud-aiplatform

Note: you may need to restart the kernel to use updated packages.


### Configuration

In [2]:
PROJECT_ID = "ai-hangsik" 
LOCATION = "us-central1"  
BUCKET_URI= f"gs://sllm_checkpoints/xgboost_dask" 

In [3]:
import os
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_URI, location=LOCATION)

### Training
* The `train.py` file checks whether the current node is the chief node or a worker node and runs `dask-scheduler` for the chief node and `dask-worker` for worker nodes. Worker nodes connect to the chief node through the IP address and port number specified in `CLUSTER_SPEC`.

* After the Dask scheduler is set up and connected to worker nodes, call `xgb.dask.train` to train a model through Dask. Once model training is complete, the model is uploaded to `AIP_MODEL_DIR`.

In [6]:
%cd /home/jupyter/llmOps_vertexAI/training/custom_training/dask

/home/jupyter/llmOps_vertexAI/training/custom_training/dask


In [7]:
%%writefile train.py
from dask.distributed import Client, wait
from xgboost.dask import DaskDMatrix
from google.cloud import storage
import xgboost as xgb
import dask.dataframe as dd
import sys
import os
import subprocess
import time
import json

IRIS_DATA_FILENAME = 'gs://cloud-samples-data/ai-platform/iris/iris_data.csv'
IRIS_TARGET_FILENAME = 'gs://cloud-samples-data/ai-platform/iris/iris_target.csv'
MODEL_FILE = 'model.bst'
MODEL_DIR = os.getenv("AIP_MODEL_DIR")
XGB_PARAMS = {
    'verbosity': 2,
    'learning_rate': 0.1,
    'max_depth': 8,
    'objective': 'reg:squarederror',
    'subsample': 0.6,
    'gamma': 1,
    'verbose_eval': True,
    'tree_method': 'hist',
    'nthread': 1
}

def launch(cmd):
    """ launch dask workers
    """
    return subprocess.check_call(cmd, stdout=sys.stdout, stderr=sys.stderr, shell=True)


def get_chief_ip(cluster_config_dict):
    if 'workerpool0' in cluster_config_dict['cluster']:
      ip_address = cluster_config_dict['cluster']['workerpool0'][0].split(":")[0]
    else:
      # if the job is not distributed, 'chief' will be populated instead of
      # workerpool0.
      ip_address = cluster_config_dict['cluster']['chief'][0].split(":")[0]

    print('The ip address of workerpool 0 is : {}'.format(ip_address))
    return ip_address

def get_chief_port(cluster_config_dict):

    if "open_ports" in cluster_config_dict:
      port = cluster_config_dict['open_ports'][0]
    else:
      # Use any port for the non-distributed job.
      port = 7777
    print("The open port is: {}".format(port))

    return port

if __name__ == '__main__':
    
    cluster_config_str = os.environ.get('CLUSTER_SPEC')
    cluster_config_dict  = json.loads(cluster_config_str)
    print(json.dumps(cluster_config_dict, indent=2))
    print('The workerpool type is:', flush=True)
    print(cluster_config_dict['task']['type'], flush=True)
    workerpool_type = cluster_config_dict['task']['type']
    chief_ip = get_chief_ip(cluster_config_dict)
    chief_port = get_chief_port(cluster_config_dict)
    chief_address = "{}:{}".format(chief_ip, chief_port)

    if workerpool_type == "workerpool0":
        print('Running the dask scheduler.', flush=True)
        proc_scheduler = launch('dask-scheduler --dashboard --dashboard-address 8888 --port {} &'.format(chief_port))
        print('Done the dask scheduler.', flush=True)

        client = Client(chief_address, timeout=1200)
        print('Waiting the scheduler to be connected.', flush=True)
        client.wait_for_workers(1)

        X = dd.read_csv(IRIS_DATA_FILENAME, header=None)
        y = dd.read_csv(IRIS_TARGET_FILENAME, header=None)
        
        X.persist()
        y.persist()
        wait(X)
        wait(y)
        dtrain = DaskDMatrix(client, X, y)

        output = xgb.dask.train(client, XGB_PARAMS, dtrain,  num_boost_round=100, evals=[(dtrain, 'train')])
        print("Output: {}".format(output), flush=True)
        print("Saving file to: {}".format(MODEL_FILE), flush=True)
        
        output['booster'].save_model(MODEL_FILE)
        bucket_name = MODEL_DIR.replace("gs://", "").split("/", 1)[0]
        folder = MODEL_DIR.replace("gs://", "").split("/", 1)[1]
        bucket = storage.Client().bucket(bucket_name)
        print("Uploading file to: {}/{}{}".format(bucket_name, folder, MODEL_FILE), flush=True)
        
        blob = bucket.blob('{}{}'.format(folder, MODEL_FILE))
        blob.upload_from_filename(MODEL_FILE)
        print("Saved file to: {}/{}".format(MODEL_DIR, MODEL_FILE), flush=True)

        # Waiting 10 mins to connect the Dask dashboard
        #time.sleep(60 * 10)        
        client.shutdown()

    else:
        print('Running the dask worker.', flush=True)
        client = Client(chief_address, timeout=1200)
        print('client: {}.'.format(client), flush=True)
        launch('dask-worker {}'.format(chief_address))
        print('Done with the dask worker.', flush=True)

        # Waiting 10 mins to connect the Dask dashboard
        #time.sleep(60 * 10)


Overwriting train.py


### Write the docker file
The docker file is used to build the custom training container and passed to the Vertex Training.

In [8]:
%pwd

'/home/jupyter/llmOps_vertexAI/training/custom_training/dask'

In [9]:
%%writefile Dockerfile
FROM us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-9:latest
WORKDIR /root

# Install sudo
RUN apt-get update && apt-get -y install sudo

# Update the keyring in order to run apt-get update.
RUN rm -rf /usr/share/keyrings/cloud.google.gpg
RUN rm -rf /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
RUN echo "deb https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list

# Install packages (without sudo)
RUN apt-get update && apt-get install -y telnet netcat iputils-ping net-tools

# Determine the default Python version
RUN echo python3 --version

# Install Python packages using the identified version
RUN python3 -m pip install 'xgboost>=1.4.2' 'dask-ml[complete]==2022.5.27' 'dask[complete]==2022.01.0' --upgrade
RUN python3 -m pip install dask==2022.01.0 distributed==2022.01.0 bokeh==2.4.3 dask-cuda==22.2.0 click==8.0.1 --upgrade
RUN python3 -m pip install gcsfs --upgrade

# Make sure gsutil will use the default service account
RUN echo '[GoogleCompute]\nservice_account = default' > /etc/boto.cfg

# Copies the trainer code
RUN mkdir /root/trainer
COPY train.py /root/trainer/train.py

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python3", "trainer/train.py"]


Overwriting Dockerfile


### Build a custom training container

In [10]:
! gcloud services enable artifactregistry.googleapis.com

In [11]:
PRIVATE_REPO = "xgboost-distributed-training-repo"

! gcloud artifacts repositories create {PRIVATE_REPO} --repository-format=docker --location={LOCATION} --description="Docker repository"

! gcloud artifacts repositories list

[1;31mERROR:[0m (gcloud.artifacts.repositories.create) ALREADY_EXISTS: the repository already exists
Listing items under project ai-hangsik, across all locations.

                                                                                            ARTIFACT_REGISTRY
REPOSITORY                         FORMAT  MODE                 DESCRIPTION                               LOCATION         LABELS  ENCRYPTION          CREATE_TIME          UPDATE_TIME          SIZE (MB)
cloud-run-source-deploy            DOCKER  STANDARD_REPOSITORY  Cloud Run Source Deployments              asia-northeast3          Google-managed key  2024-03-01T14:59:17  2024-03-01T23:38:33  1505.522
kubeflow-test                      DOCKER  STANDARD_REPOSITORY                                            asia-northeast3          Google-managed key  2024-11-10T07:54:48  2024-11-10T09:23:27  604.943
cpr-handler-prediction             DOCKER  STANDARD_REPOSITORY                                            us-central1 

In [12]:
TRAIN_IMAGE = (
    f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{PRIVATE_REPO}/xgboost-dask-train:latest"
)
print("Deployment:", TRAIN_IMAGE)

Deployment: us-central1-docker.pkg.dev/ai-hangsik/xgboost-distributed-training-repo/xgboost-dask-train:latest


### Authenticate Docker to your repository

In [13]:
! gcloud auth configure-docker {LOCATION}-docker.pkg.dev --quiet


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


### Set the custom Docker container image
Set the custom Docker container image.

1. Pull the corresponding CPU or GPU Docker image from Docker Hub.
2. Create a tag for registering the image with Artifact Registry
3. Register the image with Artifact Registry.

In [14]:
! docker build -t $TRAIN_IMAGE -f Dockerfile .
! docker push $TRAIN_IMAGE

Sending build context to Docker daemon  164.9kB
Step 1/16 : FROM us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-9:latest
 ---> 659118eb6058
Step 2/16 : WORKDIR /root
 ---> Using cache
 ---> d5e5c617052a
Step 3/16 : RUN apt-get update && apt-get -y install sudo
 ---> Using cache
 ---> dcbb7c84bec6
Step 4/16 : RUN rm -rf /usr/share/keyrings/cloud.google.gpg
 ---> Using cache
 ---> 541c29fc4009
Step 5/16 : RUN rm -rf /etc/apt/sources.list.d/google-cloud-sdk.list
 ---> Using cache
 ---> 5e41fece9bfa
Step 6/16 : RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
 ---> Using cache
 ---> 0159eb62e71e
Step 7/16 : RUN echo "deb https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
 ---> Using cache
 ---> 963de0f3618d
Step 8/16 : RUN apt-get update && apt-get install -y telnet netcat iputils-ping net-tools
 ---> Using cache
 ---> 9699d0df05b8
Step 9/16 : RUN echo python3 --version
 ---> Using cache
 ---> 

### Run a Vertex AI SDK CustomContainerTrainingJob

In [15]:
gcs_output_uri_prefix = f"{BUCKET_URI}/output"
replica_count = 3
machine_type = "n1-standard-4"
display_name = "xgboost-distributed-training"
DEPLOY_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest"

custom_container_training_job = aiplatform.CustomContainerTrainingJob(
    display_name=display_name,
    model_serving_container_image_uri=DEPLOY_IMAGE,
    container_uri=TRAIN_IMAGE,
)


In [16]:
custom_container_training_job.run(
    base_output_dir=gcs_output_uri_prefix,
    replica_count=replica_count,
    machine_type=machine_type,
    enable_dashboard_access=True,
    enable_web_access=True,
    sync=False,
)

Training Output directory:
gs://sllm_checkpoints/xgboost_dask/output 


<google.cloud.aiplatform.models.Model object at 0x7ffa2f45d390> is waiting for upstream dependencies to complete.

View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/8799683992824578048?project=721521243942
CustomContainerTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/8799683992824578048 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6615227067317354496?project=721521243942
CustomContainerTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/8799683992824578048 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/8799683992824578048 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/8799683992824578048 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/721521243942/locations/us-central1/trainingPip

### Test prediction

In [17]:
!gsutil ls $gcs_output_uri_prefix/model/

gs://sllm_checkpoints/xgboost_dask/output/model/model.bst


In [18]:
!gsutil -m cp -r gs://sllm_checkpoints/xgboost_dask/output/model .

Copying gs://sllm_checkpoints/xgboost_dask/output/model/model.bst...
/ [1/1 files][ 21.5 KiB/ 21.5 KiB] 100% Done                                    
Operation completed over 1 objects/21.5 KiB.                                     


In [19]:
import xgboost as xgb
from sklearn.datasets import load_iris

_class_names = load_iris().target_names

model = xgb.XGBClassifier()
model.load_model('model/model.bst')

outputs = model.predict([[6.7, 3.1, 4.7, 1.5],[4.6, 3.1, 1.5, 0.2]])

print(f"predictions {[_class_names[class_num] for class_num in outputs]}")


predictions ['versicolor', 'setosa']


### Cleaning up

In [None]:
import logging
import traceback

# Set this to true only if you'd like to delete your bucket
delete_bucket = False
delete_application_directory = False

! gsutil rm -rf $gcs_output_uri_prefix

if delete_bucket:
    ! gsutil rm -r $BUCKET_URI

try:
    custom_container_training_job.delete()
except Exception as e:
    logging.error(traceback.format_exc())
    print(e)

# Delete application directory
if delete_application_directory:
    ! rm -rf trainer config.yaml Dockerfile

! gcloud artifacts repositories delete {PRIVATE_REPO} --location={LOCATION} --quiet