In [1]:
!pip install docker-compose --quiet
!pip install unzip --quiet

# DATASET

The dataset we're using is a BBC news dataset that can be downloaded here:

https://www.kaggle.com/datasets/pariza/bbc-news-summary?resource=download

This dataset for extractive text summarization contains four hundred and seventeen political news articles published by the BBC from 2004 to 2005, stored in the News Articles folder. For each article, five summaries are provided in the Summaries folder. The first clause of each article's text is its title.

## Upload dataset zip file to S3

In [2]:
import os
import boto3
import sagemaker

# Start from a default session to discover the region and default bucket,
# and look up the execution role.
sess = sagemaker.Session()
region = sess.boto_region_name
sagemaker_session_bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Re-create the session, this time passing the bucket explicitly.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sagemaker_session_bucket}",
    f"sagemaker session region: {region}",
    sep="\n",
)

# Dataset archive: expected under ./data relative to the current working
# directory; s3_prefix is the prefix it will be stored under in the bucket.
s3_prefix = "model-fine-tuning-data"
filename = "BBC_news_summary.zip"
path_to_file = os.path.join(os.getcwd(), "data", filename)


sagemaker role arn: arn:aws:iam::327216439222:role/Sagemaker
sagemaker bucket: sagemaker-us-east-1-327216439222
sagemaker session region: us-east-1


In [3]:
# Upload the local dataset archive to the session bucket.
# S3 object keys and URIs always use "/" as the separator, so they are built
# with explicit "/" joins instead of os.path.join, whose separator depends on
# the host operating system.
s3_key = f"{s3_prefix}/{filename}"
s3_client = boto3.client('s3')
s3_client.upload_file(path_to_file, sagemaker_session_bucket, s3_key)

# S3 location with the BBC news data
training_data_s3 = f"s3://{sagemaker_session_bucket}/{s3_key}"
print(training_data_s3)

s3://sagemaker-us-east-1-327216439222/model-fine-tuning-data/BBC_news_summary.zip


{
  "id": "13818513",
  "summary": "Amanda baked cookies and will bring Jerry some tomorrow.",
  "dialogue": "Amanda: I baked cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)"
}

In [4]:
from pipelines.finetuning_pipeline import get_pipeline

# Identifiers for the model package group, the pipeline, and its jobs.
base_job_prefix = "FineTunedModelsJob"
pipeline_name = "FineTunedModelsPipeline"
model_package_group_name = "FineTunedModels"

# Build the pipeline over the uploaded dataset. The role, region and bucket
# are the values obtained from the SageMaker session earlier in this notebook.
pipeline = get_pipeline(
    training_data_s3,
    role=role,
    region=region,
    default_bucket=sagemaker_session_bucket,
    pipeline_name=pipeline_name,
    base_job_prefix=base_job_prefix,
    model_package_group_name=model_package_group_name,
)

In [5]:
# Register the pipeline under the execution role: upsert creates the pipeline,
# or updates it if one with this name already exists. The returned dict
# (displayed as the cell output) carries the PipelineArn.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'FineTunedModelsPipeline'}

In [6]:
# Kick off a pipeline run, passing the model approval status as a pipeline
# parameter. "PendingManualApproval" is the alternative value noted by the
# author -- presumably it holds the model for manual review; confirm against
# the pipeline definition.
execution = pipeline.start(parameters={"ModelApprovalStatus": "Approved"})

# Show the execution's metadata (ARN, status, timestamps) as the cell output.
execution.describe()

Starting execution for pipeline FineTunedModelsPipeline. Execution ID is ffc42ff8-7208-4a9a-8080-51f32a50c480
Starting pipeline step: 'FineTunedModelsProcess'
Creating tyki8fsdyn-algo-1-vyq4v ... 
Creating tyki8fsdyn-algo-1-vyq4v ... done
Attaching to tyki8fsdyn-algo-1-vyq4v
[36mtyki8fsdyn-algo-1-vyq4v |[0m Collecting huggingface-hub==0.15.1 (from -r /opt/ml/processing/input/code/dataset_preparation/requirements.txt (line 1))
[36mtyki8fsdyn-algo-1-vyq4v |[0m   Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m31m?[0m eta [36m-:--:--[0m
[36mtyki8fsdyn-algo-1-vyq4v |[0m [?25hCollecting transformers==4.30.2 (from -r /opt/ml/processing/input/code/dataset_preparation/requirements.txt (line 2))
[36mtyki8fsdyn-algo-1-vyq4v |[0m   Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.

{'CreationTime': 1688800264.312161,
 'LastModifiedTime': 1688800276.708361,
 'PipelineArn': 'FineTunedModelsPipeline',
 'PipelineExecutionArn': 'ffc42ff8-7208-4a9a-8080-51f32a50c480',
 'PipelineExecutionStatus': 'Succeeded'}