In [1]:
import wandb

# Authenticate this kernel with Weights & Biases. `wandb.login()` is used
# rather than `wandb.init()` because nothing in this notebook logs to a run;
# the run itself is presumably created by the training script inside the
# SageMaker job (project/group names are forwarded as hyperparameters).
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mshossain035[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Export the local W&B credentials for the training job. Per the W&B SageMaker
# integration docs this writes a `secrets.env` file under `path` -- presumably
# picked up inside the container because "./" is also the estimator's source
# directory (TODO confirm). Keep that file out of version control.
wandb.sagemaker_auth(path="./")

In [1]:
# ---------------------------------------------------------------------------
# Run configuration: bucket, model, dataset and W&B project.
# ---------------------------------------------------------------------------
s3_bucket = "unwind.dev.data"

# Model to fine-tune. Previously used alternatives:
#   'deep-learning-analytics/segformer_semantic_segmentation'
#   "nvidia/mit-b3"
#   "facebook/mask2former-swin-large-cityscapes-semantic"
#   "facebook/mask2former-swin-base-IN21k-cityscapes-semantic"
#   "openmmlab/upernet-swin-large"
#   "openmmlab/upernet-swin-base"
pretrained_model_name = "decapoda-research/llama-13b-hf"

# NOTE(review): "semantic_segmentation" and the "cv/segmentation" prefix below
# look like leftovers from the vision experiments -- confirm they are what the
# training script expects for this model. Alternative: "universal".
model_type = "semantic_segmentation"

# Dataset identifier. Previously used alternatives:
#   "aws-grey-sm-cp-1024-704"
#   'v-2-grey-lg-1024-704'
#   'v4-aws-tu-lg-cp-1024-1024'
dataset_name = "alpaca-data-cleaned"

# Key prefix of the dataset inside the bucket.
# Alternative: f'cv/segmentation/{dataset_name}_pre_crop_sample'
s3_prefix_key = "/".join(("cv", "segmentation", dataset_name))

# Fully qualified S3 locations derived from the prefix.
s3_prefix_upload = "s3://" + s3_bucket + "/" + s3_prefix_key
training_input_path, test_input_path = (
    f"{s3_prefix_upload}/data/{split}/" for split in ("train", "test")
)
s3_config_key = s3_prefix_key + "/config/id2label.json"

# W&B project that receives the runs. Alternative: "aws-drone-seg-debug"
WANDB_PROJECT_NAME = "alpoca-test"

In [2]:
# Prefix that the estimator cell prepends to the SageMaker job name.
base_job_prefix = "llm-1"

# Hyperparameters forwarded to the training job. Commented entries are
# optional switches that can be re-enabled by uncommenting them.
hyperparameters = dict(
    # schedule / batching
    epochs=20,
    train_batch_size=1,
    eval_batch_size=4,
    num_loader_workers=3,
    # model selection (values come from the configuration cell)
    model_name=pretrained_model_name,
    model_type=model_type,
    # config file location: bucket + key
    config_bucket=s3_bucket,
    config_key=s3_config_key,
    # learning-rate settings
    learning_rate=1e-6,
    warmup_step_ratio=0.3,
    # loss selection; other values used before:
    #   "generalized_dice", "soft_dice", "weighted_ce", "lovasz_softmax", "combo"
    loss_type="cross_entropy",
    # label_smoothing_factor=0.01,
    # do_loss_weight_averaging=True,
    # do_full_train=True,
    # freeze_base_model=True,
    # checkpoint_index=4,
    do_swa=True,
    swa_start_step_ratio=0.75,
)

In [6]:
import os
import re

import boto3
import sagemaker
from dotenv import load_dotenv
from sagemaker import Session, get_execution_role

# from sagemaker.huggingface import HuggingFace
from sagemaker.pytorch import PyTorch

# Load variables from ./.env into the environment (AWS_ROLE_NAME is read below).
load_dotenv("./.env")

# Resolve the execution role ARN through IAM by role name. Fail early with a
# readable message when the variable is missing instead of surfacing a boto3
# parameter-validation error for `RoleName=None`.
role_name = os.getenv("AWS_ROLE_NAME")
if not role_name:
    raise RuntimeError(
        "AWS_ROLE_NAME is not set; define it in ./.env or the environment so "
        "the SageMaker execution role can be looked up."
    )
iam_client = boto3.client("iam")
role = iam_client.get_role(RoleName=role_name)["Role"]["Arn"]
# role = get_execution_role()

sess = Session()
# Set a bucket name here to override the session's default bucket.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# SageMaker job names may only contain alphanumerics and hyphens, so every
# other character (the "/" in hub model ids, but also "_" or ".") is mapped
# to "-". Names that were already valid are left unchanged.
base_job_name = re.sub(
    r"[^a-zA-Z0-9-]",
    "-",
    f"{base_job_prefix}-{dataset_name}-{hyperparameters.get('model_name', '')}",
)

# The job name doubles as the W&B group; both are forwarded to the training job.
hyperparameters["group_name"] = base_job_name
hyperparameters["project_name"] = WANDB_PROJECT_NAME

# Checkpoint location handed to the estimator together with the spot settings.
checkpoint_s3_uri = (
    f"s3://{sagemaker_session_bucket}/{base_job_name}/checkpoints"
)

env = {
    "SAGEMAKER_REQUIREMENTS": "requirements.txt",  # path relative to `source_dir` below.
}

# Configuration for running training on smdistributed Data Parallel.
# Only takes effect when `distribution=distribution` is uncommented below.
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

# Spot config: run for at most 5 days, and allow one extra hour of waiting.
max_run = 86400 * 5
max_wait = max_run + 3600

hf_estimator = PyTorch(
    entry_point="train.py",
    source_dir=".",
    # --- instance ---------------------------------------------------------
    # Alternatives used before:
    #   'local', "ml.p3.2xlarge", "ml.p3.8xlarge", "ml.p3.16xlarge",
    #   "ml.p3dn.24xlarge", "ml.p4d.24xlarge", "ml.g4dn.4xlarge",
    #   "ml.g4dn.12xlarge", "ml.g5.16xlarge", "ml.g5.24xlarge",
    #   "ml.trn1.2xlarge"
    # p3.16xlarge 8 X 128 spot $10
    instance_type="ml.g5.xlarge",
    # volume_size=150,
    max_run=max_run,
    # --- cluster ----------------------------------------------------------
    instance_count=1,
    # instance_count=2,
    # distribution=distribution,
    role=role,
    env=env,
    framework_version="1.13",
    py_version="py39",
    hyperparameters=hyperparameters,
    # metric_definitions=metric_definitions,
    base_job_name=base_job_name,
    # --- spot settings ------------------------------------------------------
    checkpoint_s3_uri=checkpoint_s3_uri,
    use_spot_instances=True,  # enables spot training
    max_wait=max_wait,  # max time including spot start + training time
)

In [7]:
# Display the resolved job-name prefix as a sanity check before launching.
base_job_name

'llm-1-alpaca-data-cleaned-decapoda-research-llama-13b-hf'

In [8]:
# Launch the training job without blocking the notebook (wait=False).
# Only the "train" channel is supplied; the commented entries are optional
# channels kept as ready-to-uncomment alternatives.
hf_estimator.fit(
    inputs={
        "train": "s3://unwind.dev.data/llm/alpaca_data_cleaned/",
        # "test": test_input_path,
        # "model": "s3://unwind.dev.data/cv/models/segformer-aws-drone/checkpoint-1988/",
        # "model": "s3://unwind.dev.data/cv/models/mask2former-large/1675089631-10/",
        # "model": "s3://unwind.dev.data/cv/models/segformer-aws-drone/best_model_6259/",
        # "model": "s3://sagemaker-us-east-1-053991337303/s1e6-v3-grey-lg-cp-1024-1024-nvidia-mit-b5/checkpoints/0-29/",
        # "model": "s3://sagemaker-us-east-1-053991337303/lovasz-v3-grey-lg-cp-1024-1024-nvidia-mit-b5/checkpoints/0-23/",
        # "model": "s3://unwind.dev.data/cv/models/segformer-aws-drone/best_model_6488/",
        # "model": "s3://unwind.dev.data/cv/models/segformer-aws-drone/classifier/",
    },
    logs="Rules",
    wait=False,
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: llm-1-alpaca-data-cleaned-decapoda-rese-2023-03-30-00-25-12-751
