# Quick Start

In [4]:
from google.cloud import aiplatform

def create_custom_job_sample(
    project: str = 'sabeti-encode',
    display_name: str = 'simple-test',
    container_image_uri: str = 'gcr.io/sabeti-encode/boda/production:0.0.3',
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
    """Submit a single-replica CustomJob that runs the training container.

    The job requests one ``n1-standard-4`` worker with one NVIDIA Tesla V100
    and passes a fixed set of training arguments to the container.

    Args:
        project: Project ID used to build the parent resource path.
        display_name: Display name recorded on the CustomJob.
        container_image_uri: Container image the worker pool runs.
        location: Region used to build the parent resource path.
        api_endpoint: Regional API endpoint handed to the client. The default
            targets ``us-central1``; pass a matching endpoint when overriding
            ``location``.

    Returns:
        A ``(client, custom_job)`` tuple: the ``JobServiceClient`` that sent
        the request and the job specification dict that was submitted. The
        service response is printed, not returned.
    """
    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}
    # Initialize client that will be used to create and send requests.
    # This client only needs to be created once, and can be reused for multiple requests.
    client = aiplatform.gapic.JobServiceClient(client_options=client_options)

    # Command-line arguments forwarded verbatim to the container.
    container_args = [
        '--data_module=BODA2_DataModule',
        '--datafile_path=gs://syrgoth/data/BODA.MPRA.txt',
        '--valid_pct=5',
        '--test_pct=5',
        '--batch_size=32',
        '--padded_seq_len=600',
        # NOTE(review): single leading dash, unlike every other flag here --
        # confirm against the container's argument parser before changing.
        '-num_workers=1',
        '--model_module=Basset',
        '--n_outputs=3',
        '--loss_criterion=MSELoss',
        '--graph_module=CNNBasicTraining',
        '--gpus=1',
        '--min_epochs=5',
        '--max_epochs=5',
        '--default_root_dir=/tmp/output/artifacts',
        '--artifact_path=gs://haddath/sgosai/deposit_test'
    ]

    custom_job = {
        "display_name": display_name,
        "job_spec": {
            "worker_pool_specs": [
                {
                    "machine_spec": {
                        "machine_type": "n1-standard-4",
                        "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_V100,
                        "accelerator_count": 1,
                    },
                    "replica_count": 1,
                    "container_spec": {
                        # Honor the caller-supplied image instead of a
                        # hard-coded URI, so the parameter actually works.
                        "image_uri": container_image_uri,
                        "command": [],
                        "args": container_args,
                    },
                }
            ]
        },
    }
    parent = f"projects/{project}/locations/{location}"
    response = client.create_custom_job(parent=parent, custom_job=custom_job)
    print("response:", response)

    return client, custom_job

In [5]:
def cancel_job(client, name):
    """Request cancellation of a CustomJob and print the outcome.

    Args:
        client: Job service client exposing ``cancel_custom_job``, for example
            the client returned by ``create_custom_job_sample``.
        name: Full resource name of the job, of the form
            ``projects/<project>/locations/<location>/customJobs/<id>``.

    Any exception raised by the request is caught and printed rather than
    propagated, so the call never raises.
    """
    try:
        # Use the client handed in by the caller; do not rely on a
        # notebook-global ``clients`` dict that may not exist yet.
        response = client.cancel_custom_job(name=name)
        print(response)
    except Exception as e:
        print(e)


In [6]:
# Submit the job with all default settings; keep the client and the submitted
# job specification for later cells.
j_client, j_specs = create_custom_job_sample()

response: name: "projects/482032041325/locations/us-central1/customJobs/9074955559291060224"
display_name: "simple-test"
job_spec {
  worker_pool_specs {
    machine_spec {
      machine_type: "n1-standard-4"
      accelerator_type: NVIDIA_TESLA_V100
      accelerator_count: 1
    }
    replica_count: 1
    disk_spec {
      boot_disk_type: "pd-ssd"
      boot_disk_size_gb: 100
    }
    container_spec {
      image_uri: "gcr.io/sabeti-encode/boda/production:0.0.3"
      args: "--data_module=BODA2_DataModule"
      args: "--datafile_path=gs://syrgoth/data/BODA.MPRA.txt"
      args: "--valid_pct=5"
      args: "--test_pct=5"
      args: "--batch_size=32"
      args: "--padded_seq_len=600"
      args: "-num_workers=1"
      args: "--model_module=Basset"
      args: "--n_outputs=3"
      args: "--loss_criterion=MSELoss"
      args: "--graph_module=CNNBasicTraining"
      args: "--gpus=1"
      args: "--min_epochs=5"
      args: "--max_epochs=5"
      args: "--default_root_dir=/tmp/output/

# Training a BODA model with a CustomJob

## Overview
This tutorial demonstrates how to use the AI Platform (Unified) Python client library to train a custom MPRA sequence function model.

### Dataset
We will be using a private MPRA dataset stashed at `gs://syrgoth/data/BODA.MPRA.txt`.

In [6]:
import os
import sys
import time

from google.cloud.aiplatform import gapic as aip
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

In [2]:
# Project, region and Cloud Storage bucket shared by the cells below.
PROJECT_ID, REGION, BUCKET_NAME = (
    'sabeti-encode',  # Google Cloud project ID
    "us-central1",    # region used for the API endpoint and resource paths
    'syrgoth',        # bucket name, without the gs:// prefix
)

In [4]:
from datetime import datetime

# Compact local-time stamp (YYYYmmddHHMMSS) captured once when this cell runs.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

In [5]:
# List the top level of the bucket named by BUCKET_NAME (shell command via IPython `!`).
! gsutil ls -al gs://$BUCKET_NAME

                                 gs://syrgoth/checkpoints/
                                 gs://syrgoth/data/
                                 gs://syrgoth/my_test/


#### AI Platform constants

Set some constants for AI Platform:

- API_ENDPOINT: The AI Platform API service endpoint for the Job, Model, Endpoint, and Prediction services.
- PARENT: The AI Platform location root path for dataset, model and endpoint resources.

In [None]:
# Regional API service endpoint, derived from REGION
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

# AI Platform (Unified) location root path for your dataset, model and endpoint resources
PARENT = f"projects/{PROJECT_ID}/locations/{REGION}"

#### Machine Type

Next, set the machine type to use for training and prediction.

- Set the variable `TRAIN_COMPUTE` to configure the compute resources for the VMs you will use for training.
    - machine type
        - n1-standard: 3.75 GB of memory per vCPU
        - n1-highmem: 6.5 GB of memory per vCPU
        - n1-highcpu: 0.9 GB of memory per vCPU
    - vCPUs: one of [2, 4, 8, 16, 32, 64, 96]

Note: The following are not supported for training:

- standard: 2 vCPUs
- highcpu: 2, 4 and 8 vCPUs

Note: You may also use n2 and e2 machine types for training and deployment, but they do not support GPUs

#### Hardware Accelerators

Set the hardware accelerators (e.g., GPU), if any, for training and prediction.

Set the variables TRAIN_GPU/TRAIN_NGPU and DEPLOY_GPU/DEPLOY_NGPU to use a container image supporting a GPU and the number of GPUs allocated to the virtual machine (VM) instance. For example, to use a GPU container image with 4 Nvidia Tesla K80 GPUs allocated to each VM, specify:

(aip.AcceleratorType.NVIDIA_TESLA_K80, 4)

For GPU, available accelerators include:

- aip.AcceleratorType.NVIDIA_TESLA_K80
- aip.AcceleratorType.NVIDIA_TESLA_P100
- aip.AcceleratorType.NVIDIA_TESLA_P4
- aip.AcceleratorType.NVIDIA_TESLA_T4
- aip.AcceleratorType.NVIDIA_TESLA_V100

Otherwise specify (None, None) to use a container image to run on a CPU.

In [None]:
# Machine family and vCPU count; joined with "-" they form the machine type name.
MACHINE_TYPE, VCPU = "n1-standard", "4"
TRAIN_COMPUTE = f"{MACHINE_TYPE}-{VCPU}"
print("Train machine type", TRAIN_COMPUTE)

# Accelerator type and count for training (see "Hardware Accelerators" above).
TRAIN_GPU = aip.AcceleratorType.NVIDIA_TESLA_V100
TRAIN_NGPU = 1

### Clients
The AI Platform Python client library works as a client/server model. On your side, the Python script, you will create a client that sends requests and receives responses from the server -- AI Platform.

You will use several clients in this tutorial, so set them up upfront.

- Job Service for custom jobs.
- Model Service for managed models.


In [None]:
# Every service uses the same regional endpoint; the two option dicts are
# built separately so they remain distinct objects.
client_options = dict(api_endpoint=API_ENDPOINT)
predict_client_options = dict(api_endpoint=API_ENDPOINT)


def create_job_client():
    """Return a new ``JobServiceClient`` built with the shared ``client_options``."""
    return aip.JobServiceClient(client_options=client_options)


def create_model_client():
    """Return a new ``ModelServiceClient`` built with the shared ``client_options``."""
    return aip.ModelServiceClient(client_options=client_options)

# Create each service client once and keep them in a dict keyed by service name.
clients = {
    "job": create_job_client(),
    "model": create_model_client(),
}

# Each item printed here is a (service name, client) pair.
for client in clients.items():
    print(client)

## Prepare your `CustomJob` specification

Now that your clients are ready, your first step is to create a `CustomJob` specification for your custom training job.

To practice using the Job service, start by training an **empty job**. In other words, create a `CustomJob` specification that provisions resources for training a job, and initiate the job using the client library's Job service, but configure the `CustomJob` so it doesn't actually train an ML model.

This lets you focus on understanding the basic steps. Afterwards, you can create another `CustomJob` with a focus on adding the Python training package for training the custom BODA model.

### Define a container specification

Let's first start by defining a job name and then a container specification:

- `JOB_NAME`: A unique name for your custom training job. For convenience, append a timestamp to make the name unique.
- `MODEL_DIR`: A location in your Cloud Storage bucket for storing the model artifacts.
- `image_uri`: The location of the container image in Artifact Registry, Container Registry, or Docker Hub. This can be either a Google Cloud pre-built image or your own container image.
- `--model-dir`: A command-line parameter to the container indicating the location to store the model.