In [1]:
import tarfile
import constants

In [2]:
def untar_llama_finetuning_recipe_tarball(tarball_path: str, target: str) -> None:
    """Extract the Llama fine-tuning recipe tarball into ``target``.

    Members that would be written outside ``target`` (absolute paths, ``..``
    traversal) are rejected instead of being extracted.

    Args:
        tarball_path: Path of the tar archive to read; mode ``"r"`` lets
            ``tarfile`` detect the compression transparently.
        target: Directory the archive members are extracted into.

    Raises:
        tarfile.TarError: If the archive cannot be read or a member would land
            outside ``target``.
        OSError: If the archive or the target directory cannot be accessed.
    """
    import os  # local import: only the fallback branch below needs it

    with tarfile.open(tarball_path, "r") as llama_recipe_tar:
        if hasattr(tarfile, "data_filter"):
            # Interpreters that ship extraction filters: "data" refuses members
            # that escape the destination and also silences the deprecation
            # warning raised for unfiltered extraction.
            llama_recipe_tar.extractall(target, filter="data")
        else:
            # Older interpreters: validate every member path before writing anything.
            root = os.path.realpath(target)
            for member in llama_recipe_tar.getmembers():
                destination = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, destination]) != root:
                    raise tarfile.TarError(
                        f"Refusing to extract {member.name!r} outside of {target!r}."
                    )
            llama_recipe_tar.extractall(target)

In [6]:
# Unpack the recipe archive named in `constants` into the current working directory.
untar_llama_finetuning_recipe_tarball(
    target=".",
    tarball_path=constants.LLAMA_RECIPES_TARBALL_FILENAME,
)

In [None]:
import argparse
from typing import Optional
from typing import Union


# Sentinel spellings used by the argparse converter functions in this cell.
LOW_TRUE_STR, LOW_FALSE_STR = "true", "false"  # matched against lower-cased input
NONE_STR = "None"  # matched exactly (case-sensitive)


def str2bool(v: str) -> bool:
    """Parse a boolean command-line string.

    The input is lower-cased and compared with ``LOW_TRUE_STR`` and
    ``LOW_FALSE_STR``, so the match is case-insensitive.

    Raises:
        argparse.ArgumentTypeError: If the input matches neither spelling.
    """
    lowered = v.lower()
    if lowered == LOW_TRUE_STR:
        return True
    if lowered == LOW_FALSE_STR:
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")


def str2optionalint(v: str) -> Optional[int]:
    """Convert a string argument to an optional int.

    Args:
        v: Raw argument text. Text equal to ``NONE_STR`` (case-sensitive)
            means "no value".

    Returns:
        ``None`` when ``v`` equals ``NONE_STR``, otherwise ``int(v)``.

    Raises:
        argparse.ArgumentTypeError: If ``v`` is neither ``NONE_STR`` nor
            something ``int`` can convert.
    """
    if v == NONE_STR:
        return None
    try:
        return int(v)
    except (TypeError, ValueError) as e:
        # Translate only int()'s own conversion failures; `from e` records the
        # original error as the explicit cause instead of an incidental context.
        raise argparse.ArgumentTypeError(f"Integer or None expected. Error: {e}.") from e


def str2optionalstr(v: str) -> Optional[str]:
    """Map the ``NONE_STR`` sentinel to ``None`` and pass other strings through.

    Raises:
        argparse.ArgumentTypeError: If ``v`` does not equal ``NONE_STR`` and is
            not a ``str``.
    """
    if v == NONE_STR:
        return None
    if not isinstance(v, str):
        raise argparse.ArgumentTypeError("None or string value expected.")
    return v

In [None]:
# SM HUGGING FACE Training Job Args

def _parse_args():
    """Parse the arguments of the domain adaption and instruction tuning transfer_learning.py script.

    Environment-derived defaults (``SM_*`` variables) are read when this function
    runs; every other default comes from the ``constants`` module.

    NOTE: a later cell in this notebook defines another ``_parse_args``; once
    that cell has run, the name refers to the later definition.

    NOTE(review): several boolean-looking flags (e.g. ``--fp16``, ``--bf16``,
    ``--instruction_tuned``) are parsed as plain strings rather than with
    ``str2bool`` - presumably the consumer converts them; confirm before changing.

    Returns:
        The ``(namespace, unknown_args)`` tuple from
        ``ArgumentParser.parse_known_args``; unrecognised arguments are
        collected in the second element instead of causing an error.
    """
    # Imported locally so the function does not rely on another cell having
    # imported these modules first.
    import json
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAINING"))
    parser.add_argument("--train-alt", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--validation", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
    # SM_HOSTS is decoded as JSON. Command-line values are decoded the same way
    # (``type=list`` would split the raw string into single characters), and an
    # unset/empty variable falls back to an empty list instead of raising.
    parser.add_argument("--hosts", type=json.loads, default=json.loads(os.environ.get("SM_HOSTS") or "[]"))
    # argparse applies ``type`` to string defaults, so the environment value is
    # converted to int; a missing variable leaves the default as None.
    parser.add_argument("--num_gpus", type=int, default=os.environ.get("SM_NUM_GPUS"))
    parser.add_argument("--current-host", type=str, default=os.environ.get("SM_CURRENT_HOST"))
    parser.add_argument(
        "--pretrained-model",
        type=str,
        default=os.environ.get("SM_CHANNEL_MODEL") or os.environ.get("SAGEMAKER_ADDITIONAL_S3_DATA_PATH"),
    )
    parser.add_argument("--peft_type", type=str, default=constants.DEFAULT_PEFT_TYPE)
    parser.add_argument("--lora_r", type=int, default=constants.DEFAULT_LORA_R)
    parser.add_argument("--lora_alpha", type=float, default=constants.DEFAULT_LORA_ALPHA)
    parser.add_argument("--lora_dropout", type=float, default=constants.DEFAULT_LORA_DROPOUT)
    parser.add_argument("--bits", type=int, default=constants.DEFAULT_LORA_QUANTIZATION_BITS)
    parser.add_argument("--double_quant", type=str2bool, default=constants.DEFAULT_DOUBLE_QUANT)
    parser.add_argument("--quant_type", type=str, default=constants.DEFAULT_QUANT_TYPE)

    parser.add_argument("--deepspeed", type=str2bool, default=constants.DEFAULT_DEEPSPEED)
    parser.add_argument("--instruction_tuned", type=str, default=constants.DEFAULT_INSTRUCTION_TUNING)
    parser.add_argument("--train_from_scratch", type=str, default=constants.DEFAULT_TRAIN_FROM_SCRATCH)
    parser.add_argument("--fp16", type=str, default=constants.DEFAULT_FP16)
    parser.add_argument("--bf16", type=str, default=constants.DEFAULT_BF16)
    parser.add_argument("--evaluation_strategy", type=str, default=constants.DEFAULT_EVALUATION_STRATEGY)
    parser.add_argument("--eval_steps", type=int, default=constants.DEFAULT_EVAL_STEPS)
    parser.add_argument("--epoch", type=int, default=constants.DEFAULT_EPOCH)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=constants.DEFAULT_GRADIENT_ACCUMULATION_STEP)
    parser.add_argument(
        "--per_device_train_batch_size", type=int, default=constants.DEFAULT_PER_DEVICE_TRAIN_BATCH_SIZE
    )
    parser.add_argument("--per_device_eval_batch_size", type=int, default=constants.DEFAULT_PER_DEVICE_EVAL_BATCH_SIZE)
    parser.add_argument("--logging_steps", type=int, default=constants.DEFAULT_LOGGING_STEPS)
    parser.add_argument("--warmup_ratio", type=float, default=constants.DEFAULT_WARMUP_RATIO)
    parser.add_argument("--learning_rate", type=float, default=constants.DEFAULT_LEARNING_RATE)
    parser.add_argument("--weight_decay", type=float, default=constants.DEFAULT_WEIGHT_DECAY)
    parser.add_argument("--load_best_model_at_end", type=str, default=constants.DEFAULT_LOAD_BEST_MODEL_AT_END)
    parser.add_argument("--max_train_samples", type=int, default=constants.DEFAULT_MAX_TRAIN_SAMPLES)
    parser.add_argument("--max_val_samples", type=int, default=constants.DEFAULT_MAX_VAL_SAMPLES)
    parser.add_argument("--seed", type=int, default=constants.DEFAULT_SEED_VALUE)
    parser.add_argument("--max_input_length", type=int, default=constants.DEFAULT_MAX_INPUT_LENGTH)
    parser.add_argument("--validation_split_ratio", type=float, default=constants.DEFAULT_VALIDATION_SPLIT_RATIO)
    parser.add_argument("--train_data_split_seed", type=int, default=constants.DEFAULT_TRAIN_DATA_SPLIT_SEED)
    parser.add_argument(
        "--preprocessing_num_workers", type=str2optionalint, default=constants.DEFAULT_PREPROCESSING_NUM_WORKERS
    )
    parser.add_argument("--max_steps", type=int, default=constants.DEFAULT_MAX_STEPS)
    parser.add_argument(
        "--gradient_checkpointing",
        type=str,
        default=constants.DEFAULT_GRADIENT_CHECKPOINTING,
        choices=constants.BOOLEAN_CHOICES,
    )
    parser.add_argument("--early_stopping_patience", type=int, default=constants.DEFAULT_EARLY_STOPPING_PATIENCE)
    parser.add_argument("--early_stopping_threshold", type=float, default=constants.DEFAULT_EARLY_STOPPING_THRESHOLD)
    parser.add_argument(
        "--adam_beta1",
        type=float,
        default=constants.DEFAULT_ADAM_BETA1,
    )
    parser.add_argument("--adam_beta2", type=float, default=constants.DEFAULT_ADAM_BETA2)
    parser.add_argument("--adam_epsilon", type=float, default=constants.DEFAULT_ADAM_EPSILON)
    parser.add_argument("--max_grad_norm", type=float, default=constants.DEFAULT_MAX_GRAD_NORM)
    parser.add_argument(
        "--label_smoothing_factor",
        type=float,
        default=constants.DEFAULT_LABEL_SMOOTHING_FACTOR,
    )
    parser.add_argument(
        "--logging_strategy",
        type=str,
        default=constants.DEFAULT_LOGGING_STRATEGY,
        choices=constants.LOGGING_STRATEGY_CHOICES,
    )
    parser.add_argument(
        "--logging_first_step",
        type=str,
        default=constants.DEFAULT_LOGGING_FIRST_STEP,
        choices=constants.BOOLEAN_CHOICES,
    )
    parser.add_argument(
        "--logging_nan_inf_filter",
        type=str,
        default=constants.DEFAULT_LOGGING_NAN_INF_FILTER,
        choices=constants.BOOLEAN_CHOICES,
    )
    parser.add_argument(
        "--save_strategy",
        type=str,
        default=constants.DEFAULT_SAVE_STRATEGY,
        choices=constants.SAVE_STRATEGY_CHOICES,
    )
    parser.add_argument("--save_steps", type=int, default=constants.DEFAULT_SAVE_STEPS)
    parser.add_argument("--save_total_limit", type=int, default=constants.DEFAULT_SAVE_TOTAL_LIMIT)
    parser.add_argument(
        "--dataloader_drop_last",
        type=str,
        default=constants.DEFAULT_DATALOADER_DROP_LAST,
        choices=constants.BOOLEAN_CHOICES,
    )
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=constants.DEFAULT_DATALOADER_NUM_WORKERS,
        help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process",
    )
    parser.add_argument(
        "--eval_accumulation_steps", type=str2optionalint, default=constants.DEFAULT_EVAL_ACCUMULATION_STEPS
    )

    parser.add_argument(
        "--auto_find_batch_size",
        type=str,
        default=constants.DEFAULT_AUTO_FIND_BATCH_SIZE,
        help="Whether to automatically decrease the batch size in half and rerun the training loop again each time"
        " a CUDA Out-of-Memory was reached.",
        choices=constants.BOOLEAN_CHOICES,
    )
    parser.add_argument(
        "--lr_scheduler_type",
        type=str,
        default=constants.DEFAULT_LR_SCHEDULER_TYPE,
        help="The scheduler type to use.",
        choices=constants.LR_SCHEDULER_TYPE_CHOICES,
    )
    parser.add_argument("--warmup_steps", type=int, default=constants.DEFAULT_WARMUP_STEPS)
    parser.add_argument(
        "--add_input_output_demarcation_key", type=str2bool, default=constants.DEFAULT_ADD_INPUT_OUTPUT_DEMARCATION_KEY
    )
    # parse_known_args tolerates extra arguments instead of exiting on them.
    return parser.parse_known_args()

In [None]:
# LLama Finetuning Args

def _parse_args():
    """Parse the Llama fine-tuning arguments passed to the script.

    NOTE: an earlier cell in this notebook also defines ``_parse_args``; running
    this cell rebinds the name to this definition.

    Returns:
        The ``(namespace, unknown_args)`` tuple from
        ``ArgumentParser.parse_known_args``; unrecognised arguments are
        collected in the second element instead of causing an error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lora_r", type=int, default=constants.DEFAULT_LORA_R)
    parser.add_argument("--lora_alpha", type=int, default=constants.DEFAULT_LORA_ALPHA)
    # Without an explicit type a command-line value such as "0.05" would stay a
    # string; parse it as a float, as the other parser in this notebook does.
    parser.add_argument("--lora_dropout", type=float, default=constants.DEFAULT_LORA_DROPOUT)
    parser.add_argument("--int8_quantization", type=str2bool, default=constants.DEFAULT_INT8_QUANTIZATION)
    parser.add_argument("--enable_fsdp", type=str2bool, default=constants.DEFAULT_ENABLE_FSDP)
    parser.add_argument("--chat_dataset", type=str2bool, default=constants.DEFAULT_CHAT_DATASET)
    parser.add_argument("--target_modules", type=str, default=constants.DEFAULT_TARGET_MODULES)
    return parser.parse_known_args()

In [7]:
import torch

# Check if CUDA (GPU support) is available.
# `num_gpus` is bound on both branches so that later cells can use it even on a
# CPU-only host (previously it was left undefined when CUDA was unavailable).
if torch.cuda.is_available():
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    print("Number of GPUs:", num_gpus)
else:
    num_gpus = 0
    print("CUDA is not available. No GPUs detected.")

Number of GPUs: 4


In [None]:
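# The launcher prefix depends on --enable_fsdp:
#   --enable_fsdp == False -> the command starts with `python`
#   --enable_fsdp == True  -> the command starts with
#       `torchrun --nnodes 1 --nproc_per_node <num_gpus>`
#       (where <num_gpus> stands for the number of GPUs to use)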

In [None]:
%%sh

# python

# NOTE(review): this line is only a launcher prefix - no training script or
# script arguments follow it - and `num_gpus` is a literal word here, not a
# substituted value (presumably a placeholder for the GPU count; confirm before
# running this cell).
torchrun --nnodes 1 --nproc_per_node num_gpus

In [3]:
# argv-style list for a torchrun launch of llama_finetuning.py; each flag is kept
# on one line with its value, and bare switches stand alone.
# NOTE(review): "--preprocessing_num_workers" is followed by "--None" (with leading
# dashes) - possibly meant to be the value "None"; confirm against whatever
# produced this list before reusing it.
x_args = [
    # launcher and its options
    "torchrun",
    "--nnodes", "1",
    "--nproc_per_node", "4",
    # script to run
    "llama_finetuning.py",
    # script arguments
    "--model_name", "None",
    "--num_gpus", "4",
    "--pure_bf16",
    "--dist_checkpoint_root_folder", "model_checkpoints",
    "--dist_checkpoint_folder", "fine-tuned",
    "--batch_size_training", "2",
    "--micro_batch_size", "2",
    "--train_file", "/opt/ml/input/data/training",
    "--lr", "0.0001",
    "--do_train",
    "--output_dir", "saved_peft_model",
    "--num_epochs", "1",
    "--use_peft",
    "--peft_method", "lora",
    "--max_train_samples", "-1",
    "--max_val_samples", "-1",
    "--seed", "10",
    "--per_device_eval_batch_size", "1",
    "--max_input_length", "2048",
    "--preprocessing_num_workers", "--None",
    "--validation_split_ratio", "0.2",
    "--train_data_split_seed", "0",
    "--num_workers_dataloader", "0",
    "--weight_decay", "0.1",
    "--lora_r", "8",
    "--lora_alpha", "32",
    "--lora_dropout", "0.05",
    "--target_modules", "q_proj,v_proj",
    "--enable_fsdp",
    "--add_input_output_demarcation_key",
    "--instruction_tuned",
]

In [6]:
import json
# Show the argument list as indented JSON, one element per line.
x_args_pretty = json.dumps(x_args, indent=4)
print(x_args_pretty)

[
    "torchrun",
    "--nnodes",
    "1",
    "--nproc_per_node",
    "4",
    "llama_finetuning.py",
    "--model_name",
    "None",
    "--num_gpus",
    "4",
    "--pure_bf16",
    "--dist_checkpoint_root_folder",
    "model_checkpoints",
    "--dist_checkpoint_folder",
    "fine-tuned",
    "--batch_size_training",
    "2",
    "--micro_batch_size",
    "2",
    "--train_file",
    "/opt/ml/input/data/training",
    "--lr",
    "0.0001",
    "--do_train",
    "--output_dir",
    "saved_peft_model",
    "--num_epochs",
    "1",
    "--use_peft",
    "--peft_method",
    "lora",
    "--max_train_samples",
    "-1",
    "--max_val_samples",
    "-1",
    "--seed",
    "10",
    "--per_device_eval_batch_size",
    "1",
    "--max_input_length",
    "2048",
    "--preprocessing_num_workers",
    "--None",
    "--validation_split_ratio",
    "0.2",
    "--train_data_split_seed",
    "0",
    "--num_workers_dataloader",
    "0",
    "--weight_decay",
    "0.1",
    "--lora_r",
    "8",
 