# Model Fine-tuning Service Demo



This notebook demonstrates how to use the Model Fine-tuning Service to fine-tune language models using Amazon Bedrock and create provisioned throughput for the fine-tuned models.

## Setup



First, let's install the required packages and import the necessary modules.

In [None]:
# Install the IDP common package in editable mode (-e) from the repository's
# lib directory; -q keeps pip's output quiet. The relative path is resolved
# from the notebook's current working directory.
%pip install -q -e "../lib/idp_common_pkg"

In [None]:
# Read key/value pairs from a local .env file into the process environment
# (python-dotenv).
# NOTE(review): presumably this supplies AWS credential/profile settings for
# the boto3 calls later in the notebook — confirm.
from dotenv import load_dotenv
load_dotenv()

In [None]:
import os
import json
import boto3
import time
import logging
from typing import Dict, Any
from botocore.exceptions import ClientError
from datetime import datetime

# Import the model fine-tuning service
from idp_common.model_finetuning import (
    ModelFinetuningService,
    FinetuningJobConfig,
    FinetuningJobResult,
    JobStatus,
    ProvisionedThroughputConfig,
    ProvisionedThroughputResult
)

# Configure logging: emit INFO-level (and higher) records from the root logger
# and create a logger named after this module for use in the notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Create or Update IAM Role for Bedrock Fine-tuning



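This section creates the IAM role that Amazon Bedrock assumes during fine-tuning, or updates its S3 policy if the role already exists. The attached policy allows `s3:GetObject`, `s3:PutObject`, and `s3:ListBucket` on ANY S3 bucket, which is convenient for a demo; restrict the policy's `Resource` list to your own training and output buckets before using this role outside a sandbox.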

In [None]:
# AWS region used by the IAM client in the role helper below and passed to
# ModelFinetuningService. The S3 URIs and base-model ARN in the configuration
# cell also hard-code us-east-1, so change them together.
# NOTE(review): the availability claim on the next line may be out of date —
# confirm against current Bedrock model-customization region support.
region = "us-east-1"  # Amazon Bedrock fine-tuning is currently only available in us-east-1

def _create_or_refresh_s3_policy(iam_client, role_arn, policy_name, policy_document):
    """
    Ensures a customer managed policy exists whose default version is
    ``policy_document`` and returns its ARN.

    The policy is created when it is missing. When a policy with the same name
    already exists (attached to the role or not), a new default version is
    published instead of deleting and recreating the policy, so the role never
    loses its permissions and stale or unattached policies do not cause an
    ``EntityAlreadyExists`` failure.

    Args:
        iam_client: A boto3 IAM client
        role_arn: ARN of the role; used to derive the partition and account ID
        policy_name: Name of the managed policy
        policy_document: The policy document as a dict

    Returns:
        The ARN of the managed policy

    Raises:
        ClientError: If any IAM call fails for a reason other than the policy
            already existing
    """
    document_json = json.dumps(policy_document)

    try:
        response = iam_client.create_policy(
            PolicyName=policy_name,
            PolicyDocument=document_json,
            Description='Policy for S3 access for Bedrock fine-tuning (any bucket)'
        )
        return response['Policy']['Arn']
    except ClientError as e:
        if e.response['Error']['Code'] != 'EntityAlreadyExists':
            raise

    # The policy already exists. It was created without a path, so its ARN can
    # be built from the partition and account ID embedded in the role ARN
    # (arn:<partition>:iam::<account-id>:role/...).
    arn_parts = role_arn.split(':')
    partition, account_id = arn_parts[1], arn_parts[4]
    policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}"

    # IAM keeps at most five versions per managed policy; remove the oldest
    # non-default version when the limit has been reached.
    versions = iam_client.list_policy_versions(PolicyArn=policy_arn)['Versions']
    if len(versions) >= 5:
        removable = [v for v in versions if not v['IsDefaultVersion']]
        oldest = min(removable, key=lambda v: v['CreateDate'])
        iam_client.delete_policy_version(
            PolicyArn=policy_arn,
            VersionId=oldest['VersionId']
        )

    iam_client.create_policy_version(
        PolicyArn=policy_arn,
        PolicyDocument=document_json,
        SetAsDefault=True
    )
    print(f"Updated existing policy: {policy_name}")
    return policy_arn


def create_or_update_model_customization_role(role_name_base="IDPModelCustomizationRole"):
    """
    Creates or updates an IAM role with permissions to access ANY S3 bucket
    for use with Amazon Bedrock fine-tuning.

    The role name is ``role_name_base`` followed by the module-level ``region``
    with hyphens removed. If the role is missing it is created under the
    ``/service-role/`` path with a trust policy for ``bedrock.amazonaws.com``.
    In both cases the S3 access policy is created or refreshed in place and
    attached to the role, so the function can be re-run safely.

    Args:
        role_name_base: The base name for the IAM role

    Returns:
        The ARN of the IAM role, or None if an IAM call failed (the error is
        printed rather than raised)
    """
    # Initialize the IAM client
    iam_client = boto3.client('iam', region_name=region)

    # Add region suffix to role name for regional isolation
    region_suffix = region.replace('-', '')
    role_name = f"{role_name_base}{region_suffix}"
    policy_name = f"{role_name}S3AccessPolicy"

    # Define the trust policy - allows Bedrock service to assume this role
    trust_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {
                    "Service": "bedrock.amazonaws.com"
                },
                "Action": "sts:AssumeRole"
            }
        ]
    }

    # Define the S3 access policy with access to ANY bucket
    s3_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": [
                    "s3:GetObject",
                    "s3:PutObject",
                    "s3:ListBucket"
                ],
                "Resource": [
                    "arn:aws:s3:::*",
                    "arn:aws:s3:::*/*"
                ]
            }
        ]
    }

    try:
        # Look the role up first; only a NoSuchEntity error means "create it".
        try:
            role = iam_client.get_role(RoleName=role_name)
            role_created = False
            print(f"Role {role_name} already exists")
        except ClientError as e:
            if e.response['Error']['Code'] != 'NoSuchEntity':
                raise
            print(f"Role {role_name} doesn't exist. Creating...")
            role = iam_client.create_role(
                Path="/service-role/",
                RoleName=role_name,
                AssumeRolePolicyDocument=json.dumps(trust_policy),
                Description="Role for Amazon Bedrock fine-tuning with S3 access to ANY bucket"
            )
            role_created = True

        # Full role ARN, including the path
        role_arn = role['Role']['Arn']

        policy_arn = _create_or_refresh_s3_policy(
            iam_client, role_arn, policy_name, s3_policy
        )

        # Attaching a policy that is already attached is a no-op, so this is
        # safe for both the create and the update path.
        iam_client.attach_role_policy(
            RoleName=role_name,
            PolicyArn=policy_arn
        )

        if role_created:
            print(f"Created role {role_name} with S3 access policy for ANY bucket")
            print("Allow some time for the role to propagate in AWS")
        else:
            print(f"Updated role {role_name} with new S3 access policy for ANY bucket")

        return role_arn
    except ClientError as e:
        print(f"Error creating/updating role: {e}")
        return None

# Create/update the role with expanded S3 permissions to allow access to ANY bucket.
# role_arn is None when an IAM call failed: the function prints the error and
# returns None instead of raising.
role_arn = create_or_update_model_customization_role()

## Configuration



Set up the configuration for the fine-tuning job.

In [None]:
# Stop here if the role could not be created: every later cell needs a valid
# role ARN, and failing now is clearer than a rejected job request later.
if not role_arn:
    raise RuntimeError("IAM role creation failed; fix the error reported above before continuing")

# Print the role ARN being used
print(f"Using role ARN: {role_arn}")

# Set S3 URIs for training and validation data
training_data_uri = "s3://test-idp-finetuning-data-us-east-1/rvl-cdip-sampled/train.jsonl"  # Replace with your training data URI
validation_data_uri = "s3://test-idp-finetuning-data-us-east-1/rvl-cdip-sampled/validation.jsonl"  # Optional: Replace with your validation data URI
output_uri = "s3://test-idp-finetuning-data-us-east-1/"  # Replace with your output URI

# Set job and model names
# Generate a human-readable timestamp (format: YYYYMMDD-HHMMSS). A hyphen is
# used as the separator because underscores are not permitted in job names.
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
job_name = f"test-ft-benchmark-1-job-{timestamp}"

# Validate the job name up front; explicit exceptions are used instead of
# assert so the checks still run when Python is started with -O.
if len(job_name) > 63:
    raise ValueError("Job name must be 63 characters or fewer")
if not (job_name[0].isascii() and job_name[0].isalnum()):
    raise ValueError("Job name must start with an alphanumeric character")
if not all((c.isascii() and c.isalnum()) or c in '-+.' for c in job_name):
    raise ValueError("Job name can only contain alphanumeric, hyphen, plus, and period characters")
print(f"Job name: {job_name}")

model_name = "test-ft-benchmark-1-nova-lite"
if len(model_name) > 63:
    raise ValueError("Model name must be 63 characters or fewer")

# Choose the base model to fine-tune
base_model = "arn:aws:bedrock:us-east-1::foundation-model/amazon.nova-lite-v1:0:300k"

# Set hyperparameters (all values are passed as strings)
hyperparameters = {
    "epochCount": "1",
    "learningRate": "0.0001",
    "batchSize": "1"
}

## Initialize the Model Fine-tuning Service



Create an instance of the ModelFinetuningService.

In [None]:
# Create configuration dictionary
# NOTE(review): "base_models" (plural) is given a single ARN string here;
# confirm whether ModelFinetuningService expects a string, a list, or a mapping.
# NOTE(review): these "default" hyperparameters differ from the per-job
# `hyperparameters` dict defined in the configuration cell (epochCount "2" vs
# "1", learningRate "0.00001" vs "0.0001"); presumably the per-job values
# passed in FinetuningJobConfig take precedence — confirm.
config = {
    "model_finetuning": {
        "base_models": base_model,
        "hyperparameters": {
            "default": {
                "epochCount": "2",
                "learningRate": "0.00001",
                "batchSize": "1"
            }
        }
    }
}

# Initialize the service for the region chosen earlier in the notebook
finetuning_service = ModelFinetuningService(region=region, config=config)
print("Model Fine-tuning Service initialized.")

## Option 1: Create a Fine-tuning Job with Separate Validation Data



Create a fine-tuning job with separate training and validation data.

In [None]:
# Create fine-tuning job configuration from the values defined in the
# configuration cell. role_arn must be a valid ARN here; it is None if the
# role helper failed earlier.
job_config = FinetuningJobConfig(
    base_model=base_model,
    training_data_uri=training_data_uri,
    validation_data_uri=validation_data_uri,  # Optional: Remove if not using separate validation data
    output_uri=output_uri,
    role_arn=role_arn,
    job_name=job_name,
    model_name=model_name,
    hyperparameters=hyperparameters,
    model_type="nova"  # Specify the model type
)

# Create fine-tuning job; job_result.job_arn identifies the job in the
# status, wait, and provisioning cells that follow.
job_result = finetuning_service.create_finetuning_job(job_config)
print(f"Created fine-tuning job: {job_result.job_arn}")

## Check Fine-tuning Job Status



Check the status of the fine-tuning job.

In [None]:
# Check job status: a one-off query for the job created above. This cell can
# be re-run at any time to see the current status.
status = finetuning_service.get_job_status(job_result.job_arn, model_type="nova")
print(f"Job status: {status.status}")

## Wait for Job Completion



Wait for the fine-tuning job to complete.

In [None]:
# Wait for job completion, checking every 60 seconds for up to one hour.
# NOTE(review): what happens when max_wait_time elapses (exception vs. a
# non-terminal status being returned) is not visible here — confirm before
# relying on final_status.model_id.
final_status = finetuning_service.wait_for_job_completion(
    job_result.job_arn,
    model_type="nova",
    polling_interval=60,
    max_wait_time=3600  # 1 hour
)
print(f"Job completed with status: {final_status.status}")
print(f"Model ID: {final_status.model_id}")

## Create Provisioned Throughput



Create provisioned throughput for the fine-tuned model.

In [None]:
# Look up the customization job to obtain the ARN of the resulting custom
# model. The client is pinned to the same region as the fine-tuning service so
# the job is found even when the environment's default AWS region differs.
client = boto3.client("bedrock", region_name=region)
job = client.get_model_customization_job(jobIdentifier=job_result.job_arn)

# Request one model unit of provisioned throughput for the custom model
throughput_config = ProvisionedThroughputConfig(
    model_id=job["outputModelArn"],
    provisioned_model_name=f"{model_name}-provisioned",
    model_units=1,
    model_type="nova"
)

# Create provisioned throughput
throughput_result = finetuning_service.create_provisioned_throughput(throughput_config)
print(f"Created provisioned throughput: {throughput_result.provisioned_model_id}")

In [None]:
import time

# Poll the provisioned throughput once a minute until it leaves the
# 'Creating' state. The provisioned model ARN returned by the service is used
# as the identifier on every call; the customization-job response (`job`) does
# not describe the provisioned model and must not be used here.
status_provisioning = client.get_provisioned_model_throughput(
    provisionedModelId=throughput_result.provisioned_model_arn
)['status']

while status_provisioning == 'Creating':
    time.sleep(60)
    status_provisioning = client.get_provisioned_model_throughput(
        provisionedModelId=throughput_result.provisioned_model_arn
    )['status']
    print(status_provisioning)

## Wait for Provisioning Completion



Wait for the provisioning to complete.

In [None]:
# Wait for provisioning to complete, checking every 5 seconds for up to
# 30 minutes.
# NOTE(review): behaviour on timeout is not visible here — confirm whether the
# service raises or returns the last observed status.
final_throughput_status = finetuning_service.wait_for_provisioning_completion(
    throughput_result.provisioned_model_arn,
    model_type="nova",
    polling_interval=5,
    max_wait_time=1800  # 30 minutes
)
print(f"Provisioning completed with status: {final_throughput_status.status}")

## Clean Up Resources



Delete the provisioned throughput to avoid incurring costs.

In [None]:
# Delete provisioned throughput, identified by its ARN.
# `response` is kept for inspection only; nothing in this notebook reads it.
response = finetuning_service.delete_provisioned_throughput(
    throughput_result.provisioned_model_arn,
    model_type="nova"
)
print(f"Deleted provisioned throughput: {throughput_result.provisioned_model_id}")

## Option 2: Create a Fine-tuning Job with Automatic Data Splitting



Create a fine-tuning job with automatic data splitting.

In [None]:
# Create fine-tuning job configuration with automatic data splitting.
# No validation_data_uri is given; validation_split is passed instead.
# The job and model names reuse the earlier names with an "-auto-split" suffix.
# NOTE(review): the suffix adds 11 characters and the 63-character name limit
# checked in the configuration cell is not re-checked for these names.
job_config_auto_split = FinetuningJobConfig(
    base_model=base_model,
    training_data_uri=training_data_uri,  # Only provide training data
    output_uri=output_uri,
    role_arn=role_arn,
    job_name=f"{job_name}-auto-split",
    model_name=f"{model_name}-auto-split",
    hyperparameters=hyperparameters,
    validation_split=0.2,  # Specify validation split ratio
    model_type="nova"
)

# Uncomment to create fine-tuning job with automatic data splitting
# job_result_auto_split = finetuning_service.create_finetuning_job(job_config_auto_split)
# print(f"Created fine-tuning job with automatic data splitting: {job_result_auto_split.job_arn}")