# Fine-tuning Large Language Model on Amazon SageMaker



This notebook demonstrates how to fine-tune a large language model (hypothetically LLaMA 3) on Amazon SageMaker using the Hugging Face Transformers library and Fully Sharded Data Parallel (FSDP).

## Contents
 1. Setup
 2. Set SageMaker session and execution role
 3. Prepare and upload dataset
 4. Configure hyperparameters
 5. Prepare training script
 6. Configure and launch training job
 7. Monitor and analyze results

### Note: This notebook assumes you have the necessary permissions to access SageMaker resources and the model.

## 1. Setup
### Install required packages

In [11]:
!pip install -q U sagemaker transformers datasets torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


### Import required libraries

In [14]:
import os
import sagemaker
from sagemaker.huggingface import HuggingFace
import boto3
import pandas as pd
from datasets import Dataset
import boto3
import sagemaker
from sagemaker.huggingface import HuggingFace
#from sagemaker.tensorboard import TensorBoardCallback
from sagemaker.s3 import S3Downloader
from sagemaker.interactive_apps import tensorboard

## 2. Set SageMaker session and execution role

In [15]:
# Bootstrap session: only needed to look up the default bucket.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket, which
# SageMaker creates automatically if it does not exist yet.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # No execution role could be resolved (e.g. when running locally):
    # look the role ARN up by name through IAM instead.
    iam = boto3.client('iam')
    role_details = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20220929T161862')
    role = role_details['Role']['Arn']

# Final session pinned to the chosen bucket, plus a low-level SageMaker
# client created for the same region.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
sm_client = boto3.client('sagemaker', region_name=sess.boto_region_name)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::786045444066:role/service-role/AmazonSageMaker-ExecutionRole-20220929T161862
sagemaker bucket: sagemaker-us-west-2-786045444066
sagemaker session region: us-west-2


## 3. Prepare and upload dataset

This section demonstrates how to prepare your dataset locally and upload it to S3.

### Load and prepare your dataset
Assuming you have a CSV file with your data. Adjust as needed for your data format.

In [7]:
# Load the raw examples from a local CSV file. Adjust the path (and the
# reader) as needed for your data format.
df = pd.read_csv('path/to/your/local/dataset.csv')

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Hold out 10% of the rows for validation. The seed is fixed so that
# re-running the notebook produces the same train/validation split.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

print(f"Train set size: {len(dataset['train'])}")
print(f"Validation set size: {len(dataset['test'])}")


### Prepare data for upload
Create temporary files for train and validation sets

In [None]:
# Write each split to a local JSON Lines file (one record per line):
# the 'train' split goes to train.jsonl, the 'test' split to val.jsonl.
for split_name, jsonl_file in (('train', 'train.jsonl'), ('test', 'val.jsonl')):
    dataset[split_name].to_json(jsonl_file, orient='records', lines=True)

### Upload data to S3

In [None]:
s3_client = boto3.client('s3')

# Destination bucket and object keys; replace the placeholders with your own.
bucket_name = 'your-s3-bucket-name'
train_key = 'path/in/bucket/train.jsonl'
val_key = 'path/in/bucket/val.jsonl'

# Upload the train set first, then the validation set.
for local_file, object_key in (('train.jsonl', train_key), ('val.jsonl', val_key)):
    s3_client.upload_file(local_file, bucket_name, object_key)


### Set S3 paths for SageMaker

In [None]:
# Full S3 URIs of the two uploaded objects.
train_data_path = f's3://{bucket_name}/{train_key}'
eval_data_path = f's3://{bucket_name}/{val_key}'

print(f"Training data uploaded to: {train_data_path}")
print(f"Validation data uploaded to: {eval_data_path}")

# The local copies are no longer needed once the data is in S3.
for leftover_file in ('train.jsonl', 'val.jsonl'):
    os.remove(leftover_file)

print("Local temporary files cleaned up.")

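Now you can pass `train_data_path` and `eval_data_path` to the `fit()` method of your SageMaker estimator. Note that the example training script in this notebook loads its dataset by name (the `dataset_name` hyperparameter) and does not read these input channels, so adapt the script if you want to train on the uploaded data.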

## 4. Configure hyperparameters

In [1]:
# Values forwarded to the training script as command-line arguments.
hyperparameters = dict(
    model_name_or_path='meta-llama/Llama-3.3-70b',  # Hypothetical model name
    output_dir='/opt/ml/model',
    dataset_name='wikitext',
    dataset_config_name='wikitext-2-raw-v1',
    max_train_samples=1000,  # Adjust as needed
    max_eval_samples=100,  # Adjust as needed
    block_size=1024,
)

# Echo the configuration, one indented "name: value" line per entry.
print("Hyperparameters:")
print("\n".join(f"  {key}: {value}" for key, value in hyperparameters.items()))

Hyperparameters:
  model_name_or_path: meta-llama/Llama-3.3-70b
  output_dir: /opt/ml/model
  num_train_epochs: 3
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  warmup_steps: 500
  weight_decay: 0.01
  logging_dir: /opt/ml/output/logs
  logging_steps: 10
  evaluation_strategy: steps
  eval_steps: 500
  save_steps: 1000
  save_total_limit: 3
  fp16: True
  fsdp: full_shard auto_wrap
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer


## 5. Prepare training script

In [5]:
# Create a directory for our training script
!mkdir -p scripts
#Create a subdirectory for training scripts
!mkdir -p scripts/training

In [8]:
%%writefile scripts/training/train_llama_33_70b_fsdp.py
import os
import sys
import argparse
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)
from datasets import load_dataset
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    MixedPrecision,
    BackwardPrefetch,
    ShardingStrategy,
    CPUOffload,
)
from torch.distributed.fsdp.wrap import (
    transformer_auto_wrap_policy,
    size_based_auto_wrap_policy,
    enable_wrap,
    wrap,
)

def parse_args():
    """
    Parse the command-line arguments of the training script.

    Returns:
        argparse.Namespace with ``model_name_or_path`` (required),
        ``dataset_name``, ``dataset_config_name``, ``max_train_samples``,
        ``max_eval_samples``, ``block_size`` and ``output_dir``.
    """
    cli = argparse.ArgumentParser()

    # The only mandatory option.
    cli.add_argument("--model_name_or_path", type=str, required=True)

    # Optional flags as (flag, type, default), registered in declaration order.
    optional_flags = (
        ("--dataset_name", str, "wikitext"),
        ("--dataset_config_name", str, "wikitext-2-raw-v1"),
        ("--max_train_samples", int, None),
        ("--max_eval_samples", int, None),
        ("--block_size", int, 1024),
        ("--output_dir", str, "/opt/ml/model"),
    )
    for flag, value_type, fallback in optional_flags:
        cli.add_argument(flag, type=value_type, default=fallback)

    return cli.parse_args()

def setup_fsdp(model):
    """
    Set up Fully Sharded Data Parallel (FSDP) for the model.

    Initializes the NCCL process group if it is not initialized yet, then
    wraps ``model`` with full sharding, float16 mixed precision, parameter
    CPU offload and one FSDP unit per ``LlamaDecoderLayer``.

    Args:
        model: The module to shard (expected to contain ``LlamaDecoderLayer``
            sub-modules for the auto-wrap policy to have an effect).

    Returns:
        The FSDP-wrapped model.
    """
    import functools

    from transformers.models.llama.modeling_llama import LlamaDecoderLayer

    # Initializing the default process group twice raises, so only do it
    # when no one else (e.g. the caller's framework) has done it already.
    if not torch.distributed.is_initialized():
        # Bind this process to its own GPU before NCCL starts; falls back to
        # device 0 when LOCAL_RANK is not set in the environment.
        torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", 0)))
        torch.distributed.init_process_group(backend="nccl")

    mixed_precision_policy = MixedPrecision(
        param_dtype=torch.float16,
        reduce_dtype=torch.float16,
        buffer_dtype=torch.float16,
    )

    # FSDP calls the policy itself as policy(module, recurse, nonwrapped_numel),
    # so only the layer class is bound here. Calling
    # transformer_auto_wrap_policy(...) directly with just this keyword fails
    # because its positional arguments are missing.
    llama_wrap_policy = functools.partial(
        transformer_auto_wrap_policy,
        transformer_layer_cls={LlamaDecoderLayer},
    )

    fsdp_model = FSDP(
        model,
        sharding_strategy=ShardingStrategy.FULL_SHARD,
        mixed_precision=mixed_precision_policy,
        device_id=torch.cuda.current_device(),
        auto_wrap_policy=llama_wrap_policy,
        backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
        cpu_offload=CPUOffload(offload_params=True),
    )

    return fsdp_model

def main():
    """
    Entry point of the training script.

    Parses the CLI arguments, loads the pretrained causal LM and tokenizer,
    wraps the model with FSDP, tokenizes the requested dataset, then trains
    and evaluates with the Hugging Face ``Trainer``. The model and the
    train/eval metrics are saved under ``args.output_dir``.
    """
    # Local import: only this function needs the collator.
    from transformers import DataCollatorForLanguageModeling

    args = parse_args()

    # SageMaker specific: Set the output directory
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=1,  # Reduced batch size for FSDP
        per_device_eval_batch_size=1,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='/opt/ml/output/tensorboard',
        logging_steps=10,
        fp16=True,  # Enable mixed precision training
        gradient_checkpointing=True,  # Enable gradient checkpointing
        dataloader_num_workers=4,
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=1000,
        save_total_limit=3,
    )

    # Set seed for reproducibility
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    if tokenizer.pad_token is None:
        # Batches have to be padded by the collator; reuse EOS as the padding
        # token when the tokenizer does not define one.
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): the weights are loaded in float16 while fp16 mixed
    # precision is also enabled above -- confirm that gradient scaling works
    # with half-precision parameters on the targeted library versions.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        config=config,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )

    # Set up FSDP
    # NOTE(review): the model is FSDP-wrapped manually before being handed to
    # Trainer, which has its own distributed wrapping (and native FSDP support
    # through TrainingArguments(fsdp=...)). Confirm the two do not conflict.
    model = setup_fsdp(model)

    # Load and preprocess the dataset
    raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)

    def tokenize_function(examples):
        """Tokenize a batch of raw texts, truncating to ``block_size`` tokens."""
        return tokenizer(examples["text"], truncation=True, max_length=args.block_size)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=["text"])

    train_dataset = tokenized_datasets["train"]
    if args.max_train_samples:
        # Never request more rows than the split holds (select() would fail).
        train_limit = min(args.max_train_samples, len(train_dataset))
        train_dataset = train_dataset.select(range(train_limit))

    eval_dataset = tokenized_datasets["validation"]
    if args.max_eval_samples:
        eval_limit = min(args.max_eval_samples, len(eval_dataset))
        eval_dataset = eval_dataset.select(range(eval_limit))

    # The tokenized examples carry no "labels" column, so without this
    # collator the model returns no loss. With mlm=False the collator pads
    # each batch and copies input_ids into labels (padding positions ignored).
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    train_result = trainer.train()
    trainer.save_model()
    trainer.log_metrics("train", train_result.metrics)
    trainer.save_metrics("train", train_result.metrics)

    # Evaluation
    metrics = trainer.evaluate()
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

# Run the training only when the file is executed as a script.
# Nothing else belongs below this guard: every line of this cell is written
# into the script file by %%writefile and would run inside the training job.
if __name__ == "__main__":
    main()

Overwriting scripts/training/train_llama_33_70b_fsdp.py


## 6. Configure and launch training job

In [16]:
# Configure the estimator.
# entry_point is resolved relative to source_dir, so the job runs
# ./scripts/training/train_llama_33_70b_fsdp.py.
# NOTE(review): Llama 3.x checkpoints may require a newer transformers release
# than 4.28, and gated models need a Hugging Face access token -- confirm both
# before launching.
huggingface_estimator = HuggingFace(
    entry_point='training/train_llama_33_70b_fsdp.py',
    source_dir='./scripts',
    instance_type='ml.p4d.24xlarge',
    instance_count=2,
    role=role,
    sagemaker_session=sess,  # reuse the session (and bucket) configured above
    transformers_version='4.28',
    pytorch_version='2.0',
    py_version='py310',  # Python version paired with transformers 4.28 / PyTorch 2.0 here
    hyperparameters=hyperparameters,
    # Launch the entry point with torch.distributed across all GPUs/instances.
    distribution={
        'torch_distributed': {
            'enabled': True
        }
    }
)


In [None]:
# S3 locations for train and evaluation data.
# train_data_path and eval_data_path were set in the "Set S3 paths for
# SageMaker" step; they are intentionally not reassigned here, so the URIs of
# the uploaded files are not replaced by placeholder values. If your data
# already lives elsewhere in S3, assign both variables before this point.
print(f"Training data S3 location: {train_data_path}")
print(f"Evaluation data S3 location: {eval_data_path}")

In [None]:
# Launch the training job, passing one S3 location per named input
# ('train' and 'eval').
data_channels = {'train': train_data_path, 'eval': eval_data_path}
huggingface_estimator.fit(data_channels)

## 7. Monitor and analyze results

In [None]:
# Get the name of the training job
training_job_name = huggingface_estimator.latest_training_job.job_name
print(f"Training job name: {training_job_name}")

# You can use the SageMaker console to monitor the progress of your training job.
# The region comes from `sess`, the session created in section 2 (no variable
# named `sagemaker_session` exists in this notebook).
region = sess.boto_region_name
print(f"Monitor your training job at: https://{region}.console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{training_job_name}")

# After training is complete, you can analyze the results
# This might include evaluating the model, reviewing logs, or examining saved checkpoints

print("Training complete. Review the SageMaker console for detailed logs and results.")