## Setting up Libraries

In [None]:
!pip install --upgrade pip
%pip install --no-build-isolation --force-reinstall \
    boto3==1.28.57 \
    awscli==1.29.57 \
    botocore==1.31.57
!pip install -q --force-reinstall langchain typing_extensions pypdf urllib3==2.1.0
!pip install -qU 'ipywidgets>=7,<8'
!pip install jsonlines
!pip install datasets==2.15.0
!pip install pandas==2.1.3
!pip install matplotlib==3.8.2
!pip install tokenizers==0.12.1
!pip install -qU fmeval==0.3.0

In [None]:
import os
import IPython

def restart_kernel():
    """Terminate the current kernel process immediately with exit status 0.

    ``os._exit`` ends the process without running cleanup handlers or flushing
    buffers, so nothing after this call executes.
    """
    os._exit(0)

# Kill the current kernel process. Presumably this is done so the packages
# installed in the first cell are picked up by a fresh kernel; continue with the
# next cell once the kernel is back.
restart_kernel()

In [None]:
import boto3
import json
import jsonlines
import os
import pprint
import random
import sys
import time
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

## Preparing the Session and Making Configurations

In [None]:
# Resolve the region configured for the default session and the account id of
# the calling identity; both are embedded in resource names below.
session = boto3.session.Session()
region = session.region_name

sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity()["Account"]

# Bucket name is derived from region + account id.
s3_suffix = f"{region}-{account_id}"
bucket_name = f"bedrock-fine-tuning-custom-{s3_suffix}"

# Service clients used throughout the notebook.
s3_client = boto3.client('s3')
iam = boto3.client('iam', region_name=region)
bedrock = boto3.client("bedrock", region_name=region)
bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)

In [None]:
# Names of the IAM role and the customer-managed policy created further down.
role_name = "AmazonBedrockFineTuningCustomRole"
s3_bedrock_finetuning_access_policy="AmazonBedrockFineTuningCustomPolicy"
# ARN the role will have once created, assembled from the account id and role name.
customization_role = f"arn:aws:iam::{account_id}:role/{role_name}"

In [None]:
# S3 only accepts a create_bucket call without a LocationConstraint in
# us-east-1; every other region must be named explicitly, otherwise the request
# is rejected. Use the region the S3 client is actually bound to.
bucket_region = s3_client.meta.region_name
if bucket_region in (None, "us-east-1", "aws-global"):
    s3bucket = s3_client.create_bucket(
        Bucket=bucket_name,
    )
else:
    s3bucket = s3_client.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={"LocationConstraint": bucket_region},
    )

In [None]:
# Trust policy document (JSON text) for the fine-tuning role: it allows the
# bedrock.amazonaws.com service principal to call sts:AssumeRole, restricted to
# this account and to model-customization-job ARNs in `region`.
# Doubled braces are literal JSON braces; single braces are f-string fields.
s3_bedrock_role = f"""{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Effect": "Allow",
            "Principal": {{
                "Service": "bedrock.amazonaws.com"
            }},
            "Action": "sts:AssumeRole",
            "Condition": {{
                "StringEquals": {{
                    "aws:SourceAccount": "{account_id}"
                }},
                "ArnEquals": {{
                    "aws:SourceArn": "arn:aws:bedrock:{region}:{account_id}:model-customization-job/*"
                }}
            }}
        }}
    ]
}}
"""

In [None]:
# Create the IAM role using the trust policy document defined earlier and show
# the raw API response.
response = iam.create_role(
    Description="Role for Bedrock to access S3 for finetuning",
    AssumeRolePolicyDocument=s3_bedrock_role,
    RoleName=role_name,
)
pprint.pp(response)

In [None]:
# Permissions policy document (JSON text) granting the listed S3 object and
# bucket actions on `bucket_name` and on every key inside it.
# Doubled braces are literal JSON braces; single braces are f-string fields.
s3_access_policy = f"""{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Effect": "Allow",
            "Action": [
                "s3:AbortMultipartUpload",
                "s3:DeleteObject",
                "s3:PutObject",
                "s3:GetObject",
                "s3:GetBucketAcl",
                "s3:GetBucketNotification",
                "s3:ListBucket",
                "s3:PutBucketNotification"
            ],
            "Resource": [
                "arn:aws:s3:::{bucket_name}",
                "arn:aws:s3:::{bucket_name}/*"
            ]
        }}
    ]
}}"""


In [None]:
# Register the S3 access policy document as a customer-managed IAM policy and
# show the raw API response (its "Policy" entry carries the new policy's ARN).
response = iam.create_policy(
    PolicyDocument=s3_access_policy,
    PolicyName=s3_bedrock_finetuning_access_policy,
)
pprint.pp(response)

In [None]:
# At this point `response` holds the create_policy result, which has no "Role"
# key, so the role ARN is looked up by name instead of read from `response`.
role_arn = iam.get_role(RoleName=role_name)["Role"]["Arn"]
policy_arn = response["Policy"]["Arn"]

# Attach the S3 access policy to the fine-tuning role.
iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn=policy_arn,
)

## Loading the GovReport Dataset

In [None]:
# Load the "ccdv/govreport-summarization" dataset via the `datasets` library;
# later cells read its 'train', 'validation' and 'test' splits.
dataset = load_dataset("ccdv/govreport-summarization")

In [None]:
# Print the loaded dataset object to inspect what was loaded.
print(dataset)

In [None]:
# Prompt prefix shared by every record: the report text is appended directly
# after the trailing "input:" section when the datapoints are built.
instruction='''Below is an instruction which describes a task, paired with an input which will provide further context. Write a response that appropriately completes the request.

instruction:

Summarize the report provided below.

input:

'''

In [None]:
# Build one prompt/completion record per training example.
# The completion prefix is 'response:\n\n', matching the validation and test
# records, so all three splits share the same format.
datapoints_train = [
    {
        'prompt': instruction + data['report'],
        'completion': 'response:\n\n' + data['summary'],
    }
    for data in dataset['train']
]

In [None]:
# Inspect one training prompt. The list built above is named `datapoints_train`.
print(datapoints_train[4]['prompt'])

In [None]:
# Validation records: instruction prefix + report as the prompt, and the summary
# behind a 'response:' marker as the completion.
datapoints_valid = [
    {
        'prompt': instruction + example['report'],
        'completion': 'response:\n\n' + example['summary'],
    }
    for example in dataset['validation']
]

# Test records are built the same way from the 'test' split.
datapoints_test = [
    {
        'prompt': instruction + example['report'],
        'completion': 'response:\n\n' + example['summary'],
    }
    for example in dataset['test']
]

In [None]:
def data_transform(data_points, num_data, max_data_length):
    """Filter records by size, shuffle them, and keep the first ``num_data``.

    Args:
        data_points: iterable of dicts with 'prompt' and 'completion' entries.
        num_data: upper bound on the number of records returned (applied as a
            slice, so fewer are returned when fewer qualify).
        max_data_length: maximum allowed ``len`` of prompt + completion.

    Returns:
        A list with the selected records in shuffled order. Shuffling uses the
        global ``random`` state, so results vary unless it is seeded.
    """
    kept = [
        record
        for record in data_points
        if len(record['prompt'] + record['completion']) <= max_data_length
    ]
    random.shuffle(kept)
    return kept[:num_data]

In [None]:
def jsonl_converter(dataset, file_name):
    """Write every item of ``dataset`` to ``file_name`` through a jsonlines writer.

    The destination path is printed first. The file is opened in 'w' mode.

    Args:
        dataset: iterable of objects passed one by one to the writer.
        file_name: path of the output file.
    """
    print(file_name)
    with jsonlines.open(file_name, 'w') as jsonl_out:
        for record in dataset:
            jsonl_out.write(record)

In [None]:
# Keep only records whose prompt + completion is at most 10000 characters long,
# then sample up to 5000 / 999 / 10 records for train / validation / test.
train = data_transform(datapoints_train, num_data=5000, max_data_length=10000)
validation = data_transform(datapoints_valid, num_data=999, max_data_length=10000)
test = data_transform(datapoints_test, num_data=10, max_data_length=10000)

## Using Local Directories for Fine-tuning Datasets

In [None]:
# Local folder and file names for the JSONL splits.
dataset_folder = "fine-tuning-datasets"
train_file_name = "train-govreport.jsonl"
validation_file_name = "validation-govreport.jsonl"
test_file_name = "test-govreport.jsonl"

# Create the folder from Python instead of shelling out to `mkdir`: this reuses
# `dataset_folder` rather than repeating the literal, and does not error when
# the cell is re-run and the folder already exists.
os.makedirs(dataset_folder, exist_ok=True)
abs_path = os.path.abspath(dataset_folder)

In [None]:
# Write each split to its JSONL file inside the local dataset folder.
for split_records, split_file_name in (
    (train, train_file_name),
    (validation, validation_file_name),
    (test, test_file_name),
):
    jsonl_converter(split_records, f'{abs_path}/{split_file_name}')

In [None]:
# Object keys under which the three splits are stored in the bucket.
train_key = f'fine-tuning-datasets/train/{train_file_name}'
validation_key = f'fine-tuning-datasets/validation/{validation_file_name}'
test_key = f'fine-tuning-datasets/test/{test_file_name}'

# Upload the local JSONL files.
s3_client.upload_file(f'{abs_path}/{train_file_name}', bucket_name, train_key)
s3_client.upload_file(f'{abs_path}/{validation_file_name}', bucket_name, validation_key)
s3_client.upload_file(f'{abs_path}/{test_file_name}', bucket_name, test_key)

# s3:// URIs of the uploaded objects, used by the fine-tuning job configuration.
s3_train_uri = f's3://{bucket_name}/{train_key}'
s3_validation_uri = f's3://{bucket_name}/{validation_key}'
s3_test_uri = f's3://{bucket_name}/{test_key}'

## Initialize Bedrock Session for Fine-Tuning

In [None]:
# Re-create the session and clients for the fine-tuning part of the notebook.
session = boto3.session.Session()

# NOTE(review): the region is pinned to us-west-2 here, while the bucket and the
# role's trust policy were created with the session's configured region —
# confirm both refer to the same region before launching the job.
region = 'us-west-2'

sts_client = boto3.client('sts')
aws_account_id = sts_client.get_caller_identity()["Account"]
s3_client = boto3.client('s3')

bedrock = boto3.client("bedrock", region_name=region)
bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)

In [None]:
# File name and (relative) folder of the local test split; these are combined
# into the path from which a sample prompt is read later.
test_file_name = "test-govreport.jsonl"
data_folder = "fine-tuning-datasets"

## Creating the Fine-tuning Job

In [None]:
from datetime import datetime

# Base model and customization mode for the job, plus the role Bedrock assumes.
base_model_id = "meta.llama2-13b-v1:0:4k"
customization_type = "FINE_TUNING"
customization_role = role_arn

# A timestamp suffix keeps job and model names distinct between runs.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
customization_job_name = f"llama2-finetune-sm-test-model-{timestamp}"
custom_model_name = f"llama2-finetune-{timestamp}"

In [None]:
# Hyperparameters are passed to the API as strings.
hyper_parameters = dict(
    epochCount="2",
    batchSize="1",
    learningRate="0.00005",
)

# Locations of the training / validation data and of the job output in S3.
training_data_config = dict(s3Uri=s3_train_uri)
validation_data_config = dict(validators=[dict(s3Uri=s3_validation_uri)])
output_data_config = dict(
    s3Uri=f's3://{bucket_name}/outputs/output-{custom_model_name}'
)

In [None]:
# Collect the request parameters first, then submit the customization job.
customization_job_request = {
    "customizationType": customization_type,
    "jobName": customization_job_name,
    "customModelName": custom_model_name,
    "roleArn": customization_role,
    "baseModelIdentifier": base_model_id,
    "hyperParameters": hyper_parameters,
    "trainingDataConfig": training_data_config,
    "validationDataConfig": validation_data_config,
    "outputDataConfig": output_data_config,
}
bedrock.create_model_customization_job(**customization_job_request)

## Creating Provisioned Throughput

In [None]:
# Request one model unit of provisioned throughput for the custom model.
# Replace the modelId placeholder with the id of the fine-tuned model first.
provisioned_throughput_response = bedrock.create_provisioned_model_throughput(
    modelUnits=1,
    provisionedModelName='provisioned_model_fine_tuning_1',
    modelId='<INSERT_CUSTOM_MODEL_ID_HERE>',
)
# Despite the variable name, the value kept here is the provisioned model's ARN.
provisioned_model_id = provisioned_throughput_response['provisionedModelArn']

In [None]:
# Read the local test split and take its first record as the sample to run.
file_path_for_testing = f'{data_folder}/{test_file_name}'

# Decode explicitly as UTF-8 instead of relying on the platform default, and
# split records by iterating the file: str.splitlines() also breaks on Unicode
# line separators (e.g. U+2028), which may occur unescaped inside a JSON string
# and would cut a record in half.
with open(file_path_for_testing, encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f]

# Parse the first record once and read both fields from it.
first_record = json.loads(lines[0])
test_prompt = first_record['prompt']
reference_summary = first_record['completion']


In [None]:
# Target model and content negotiation headers for the runtime call.
modelId = provisioned_model_id
accept = 'application/json'
contentType = 'application/json'

# JSON request payload: the test prompt plus generation settings.
body = json.dumps({
    "prompt": test_prompt,
    "max_gen_len": 300,
    "temperature": 0.5,
    "top_p": 0.5,
})

response = bedrock_runtime.invoke_model(
    body=body,
    modelId=modelId,
    accept=accept,
    contentType=contentType,
)

# The 'body' entry is read fully and decoded from JSON before printing.
response_body = json.loads(response.get('body').read())
print(response_body)

## Clean Up

In [None]:
# Delete the provisioned throughput created earlier; `provisioned_model_id`
# holds the ARN returned when it was created.
bedrock.delete_provisioned_model_throughput(provisionedModelId=provisioned_model_id)

In [None]:
# Empty the bucket before deleting it. A single list call returns at most 1000
# keys, so page through the listing to make sure every object is removed;
# otherwise delete_bucket fails because the bucket is not empty.
paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name):
    for obj in page.get('Contents', []):
        s3_client.delete_object(Bucket=bucket_name, Key=obj['Key'])
s3_client.delete_bucket(Bucket=bucket_name)

In [None]:
# Remove the IAM resources created by this notebook. The policy must be detached
# before it can be deleted; deleting it as well avoids leaving the
# customer-managed policy behind (which would make create_policy fail on a re-run).
iam.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn)
iam.delete_policy(PolicyArn=policy_arn)
iam.delete_role(RoleName=role_name)