In [1]:
import wandb

# Authenticate this kernel with Weights & Biases. Only credentials are needed
# here (they are exported for the training job by `wandb.sagemaker_auth`), so
# use `wandb.login()` rather than `wandb.init()`: the latter would also open a
# run in this notebook that nothing logs to or finishes.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mshossain035[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Export the current W&B credentials next to the training code ("./" is the
# same directory that is later used as the estimator's `source_dir`).
# Per the W&B SageMaker integration docs this writes a `secrets.env` file under
# `path` that is read by `wandb.init()` inside the training job; keep that file
# out of version control.
wandb.sagemaker_auth(
    path="./",
)

In [6]:
# W&B project name; forwarded to the training job as the `project_name`
# hyperparameter.
# NOTE(review): "alpoca" looks like a typo of "alpaca" — confirm before
# renaming, since changing it would send runs to a different project.
WANDB_PROJECT_NAME = "alpoca-test"

# Identifiers of the training dataset and the base model (presumably
# Hugging Face Hub ids — the training script is not visible here).
dataset_name = "yahma/alpaca-cleaned"
pretrained_model_name = "decapoda-research/llama-7b-hf"

In [7]:
# Values handed to the training job as hyperparameters. Two more keys
# (`group_name`, `project_name`) are added once the job name has been derived.
hyperparameters = {
    "epochs": 10,
    "model_name": pretrained_model_name,
    "data_path": dataset_name,
    "learning_rate": 1e-6,
    "warmup_step_ratio": 0.3,
}

# Leading component of the SageMaker job name.
base_job_prefix = "llama-01"

In [8]:
from sagemaker import get_execution_role, Session

# from sagemaker.huggingface import HuggingFace

from sagemaker.pytorch import PyTorch

import sagemaker
import boto3
import os

from dotenv import load_dotenv

# Pull local configuration (at least AWS_ROLE_NAME) from ./.env into the
# process environment.
load_dotenv("./.env")

# Resolve the IAM role ARN that the training job will run under.
# Fail fast with an actionable message when the variable is missing; otherwise
# `None` is passed to boto3 and surfaces as an opaque parameter-validation error.
role_name = os.getenv("AWS_ROLE_NAME")
if not role_name:
    raise RuntimeError(
        "AWS_ROLE_NAME is not set; define it in ./.env or export it before running this cell."
    )

iam_client = boto3.client("iam")
role = iam_client.get_role(RoleName=role_name)["Role"]["Arn"]
# Alternative kept from the original notebook: use the notebook's own execution role.
# role = get_execution_role()

sess = Session()

# Leave as None to use the session's default bucket; assign a bucket name here
# to override it.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Derive the job-name prefix from the run prefix, dataset id and model id.
# SageMaker training-job names may only contain ASCII letters, digits and
# hyphens, while dataset/model ids can also contain "/", "_" or ".". Map every
# other character to "-" (not just "/") and trim hyphens from both ends so the
# name stays valid for any id. For the ids used in this notebook the result is
# unchanged.
_raw_job_name = f"{base_job_prefix}-{dataset_name}-{hyperparameters.get('model_name', '')}"
base_job_name = "".join(
    ch if (ch.isascii() and ch.isalnum()) else "-" for ch in _raw_job_name
).strip("-")

# Extra hyperparameters for the training job (presumably used by train.py to
# group and file its W&B runs — confirm against the training script).
hyperparameters["group_name"] = base_job_name
hyperparameters["project_name"] = WANDB_PROJECT_NAME

# Checkpoint location for spot training. The prefix depends only on the bucket
# and the base job name, so every job launched with the same base name shares it.
checkpoint_s3_uri = f"s3://{sagemaker_session_bucket}/{base_job_name}/checkpoints"

# Environment variables for the training container.
env = {
    "SAGEMAKER_REQUIREMENTS": "requirements.txt",  # path relative to `source_dir` below.
}

# smdistributed data-parallel settings. Defined for convenience; it only takes
# effect if it is passed to the estimator as `distribution=distribution`.
distribution = {
    "smdistributed": {
        "dataparallel": {"enabled": True},
    },
}

# Spot timing, in seconds: up to 5 days of training, plus one extra hour of
# allowance for waiting on spot capacity.
max_run = 5 * 24 * 60 * 60
max_wait = max_run + 60 * 60

# SageMaker PyTorch estimator describing the training job (despite the `hf_`
# prefix this is the generic PyTorch estimator, not the HuggingFace one).
hf_estimator = PyTorch(
    # --- code and runtime ---
    entry_point="train.py",
    source_dir=".",
    framework_version="1.13",
    py_version="py39",
    env=env,
    hyperparameters=hyperparameters,
    # --- identity ---
    role=role,
    base_job_name=base_job_name,
    # --- cluster ---
    # Single instance. To scale out, raise `instance_count` (e.g. 2) and also
    # pass `distribution=distribution`.
    instance_type="ml.g5.xlarge",
    instance_count=1,
    # --- managed spot training ---
    use_spot_instances=True,  # enables spot training
    max_run=max_run,
    max_wait=max_wait,  # max time including spot start + training time
    checkpoint_s3_uri=checkpoint_s3_uri,
)

In [9]:
# Display the derived job-name prefix as a quick sanity check.
base_job_name

'llama-01-yahma-alpaca-cleaned-decapoda-research-llama-7b-hf'

In [10]:
# Submit the training job without blocking the notebook (`wait=False`).
#
# The "train" channel is not needed for training since the dataset is
# downloaded from Hugging Face. But SageMaker seems to complain if you don't
# provide a downloadable training set, so point it at a path that is
# accessible by your SageMaker role.
hf_estimator.fit(
    inputs={"train": "s3://unwind.dev.data/llm/alpaca_data_cleaned/"},
    wait=False,
    logs="Rules",
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: llama-01-yahma-alpaca-cleaned-decapoda--2023-04-10-00-00-14-168
