In [1]:
!pip install transformers datasets sagemaker accelerate evaluate --quiet
!pip install -U sagemaker
!pip install transformers datasets

Collecting sagemaker
  Using cached sagemaker-2.237.0-py3-none-any.whl.metadata (16 kB)
Collecting boto3<2.0,>=1.35.75 (from sagemaker)
  Using cached boto3-1.35.76-py3-none-any.whl.metadata (6.7 kB)
Collecting omegaconf<2.3,>=2.2 (from sagemaker)
  Using cached omegaconf-2.2.3-py3-none-any.whl.metadata (3.9 kB)
Collecting sagemaker-core<2.0.0,>=1.0.17 (from sagemaker)
  Using cached sagemaker_core-1.0.17-py3-none-any.whl.metadata (4.9 kB)
Collecting botocore<1.36.0,>=1.35.76 (from boto3<2.0,>=1.35.75->sagemaker)
  Using cached botocore-1.35.76-py3-none-any.whl.metadata (5.7 kB)
Collecting pydantic<3.0.0,>=2.0.0 (from sagemaker-core<2.0.0,>=1.0.17->sagemaker)
  Using cached pydantic-2.10.3-py3-none-any.whl.metadata (172 kB)
Collecting platformdirs (from sagemaker)
  Using cached platformdirs-4.3.6-py3-none-any.whl.metadata (11 kB)
Collecting mock<5.0,>4.0 (from sagemaker-core<2.0.0,>=1.0.17->sagemaker)
  Using cached mock-4.0.3-py3-none-any.whl.metadata (2.8 kB)
Collecting annotated-ty

2.1 Data Loading

In [2]:
# Import dependencies
import boto3
import sagemaker
from sagemaker import get_execution_role
import os 




sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
# Initialize SageMaker session and role.
# The session is created once and handed to the role lookup so both use the
# same session object instead of the lookup building a second one implicitly.
sagemaker_session = sagemaker.Session()
role = get_execution_role(sagemaker_session=sagemaker_session)

In [4]:
# Where the raw training file lives in S3 and where it is mirrored locally.
bucket_name = "squad-training-data"
s3_data_key = "datasets/training_data.jsonl"  # object key inside the bucket
local_data_dir = "./data"

# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(local_data_dir, exist_ok=True)

# Local destination for the downloaded file.
local_file_path = os.path.join(local_data_dir, "training_data.jsonl")


In [5]:
# Download the raw training file from S3 to local_file_path.
try:
    s3 = boto3.client("s3")
    s3.download_file(bucket_name, s3_data_key, local_file_path)
    print(f"File downloaded successfully to {local_file_path}")
except Exception as e:
    # Report the failure, then re-raise: the following cells read this file,
    # so carrying on after a failed download only defers the error.
    print(f"Error downloading file: {e}")
    raise


File downloaded successfully to ./data/training_data.jsonl


In [6]:
from datasets import load_dataset

# Parse the downloaded JSONL file with the generic "json" loader.
dataset = load_dataset(path="json", data_files=local_file_path)

# Show the available splits, their columns and row counts.
print(dataset)


Downloading and preparing dataset json/default to /home/sagemaker-user/.cache/huggingface/datasets/json/default-5be3238443283528/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/sagemaker-user/.cache/huggingface/datasets/json/default-5be3238443283528/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'squad'],
        num_rows: 197990
    })
})


Data Preprocessing



In [7]:
from datasets import DatasetDict, load_dataset

# Read the raw JSONL file again so this cell starts from the on-disk data.
dataset = load_dataset(path="json", data_files=local_file_path)

# Define preprocessing function
def preprocess_data(example):
    """Copy the raw fields of one record under the names used downstream.

    ``example["prompt"]`` is stored as ``"input_text"`` and
    ``example["squad"]`` as ``"target_text"``. The mapping is modified in
    place and the same object is returned; the original keys are left as-is.
    """
    for source_key, renamed_key in (("prompt", "input_text"), ("squad", "target_text")):
        example[renamed_key] = example[source_key]
    return example

# Apply preprocessing: add input_text/target_text and drop the original columns.
processed_dataset = dataset["train"].map(preprocess_data, remove_columns=["prompt", "squad"])

# First split: 80% for training, 20% held out.
split_dataset = processed_dataset.train_test_split(test_size=0.2, seed=42)

# Second split: halve the held-out 20% into validation and test.
holdout_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)

# Assemble the final three-way split once, from the two splits above.
final_dataset = DatasetDict({
    "train": split_dataset["train"],
    "validation": holdout_split["train"],
    "test": holdout_split["test"],
})

# Sanity check: the three splits together account for every processed row.
assert sum(len(split) for split in final_dataset.values()) == len(processed_dataset)

# Save processed dataset locally
final_dataset.save_to_disk("./data/processed_dataset")

# Verify dataset
print(final_dataset)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/197990 [00:00<?, ?ex/s]

Flattening the indices:   0%|          | 0/159 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/20 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/20 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 158392
    })
    validation: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 19799
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 19799
    })
})


In [8]:
from datasets import load_from_disk

# Read back the DatasetDict that was saved under ./data/processed_dataset.
processed_dataset = load_from_disk(dataset_path="./data/processed_dataset")

# Print the splits, columns and row counts as a quick structural check.
print(processed_dataset)


DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 158392
    })
    validation: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 19799
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 19799
    })
})


In [9]:
# Display the first 5 samples of each split, in train / validation / test order.
for split_name in ("train", "validation", "test"):
    print(processed_dataset[split_name][:5])


{'input_text': ['Streetwear is a style of casual clothing which became global in the 1990s. It grew from New York hip...\n\nAdditional details:\nTopic: Streetwear\nSubtopics: History, Hypebeast culture, Sneaker collecting, 1980s in fashion, 19th century in fashion, 21st century in fashion, Category:1990s fashion, Category:2000s fashion, Category:2010s fashion', 'Streaming media refers to multimedia for playback using an offline or online media player that is de...\n\nAdditional details:\nTopic: Streaming media\nSubtopics: Etymology, Precursors, History, A&M Records, Inc. v. Napster, Inc., ActiveMovie, Adam Yauch, Category:All Wikipedia articles in need of updating, Category:All articles containing potentially dated statements, Category:Applications of distributed computing', 'Fashion is a term used interchangeably to describe the creation of clothing, footwear, accessories, ...\n\nAdditional details:\nTopic: Fashion\nSubtopics: Definitions, History of fashion, Fashion industry, 1300–14

In [10]:
from datasets import load_from_disk

# Re-read the saved DatasetDict from disk (repeats the earlier load cell).
processed_dataset = load_from_disk(dataset_path="./data/processed_dataset")

# Structural check: splits, columns and row counts.
print(processed_dataset)


DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 158392
    })
    validation: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 19799
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 19799
    })
})


In [11]:
# Define S3 bucket and key
bucket_name = "squad-training-data"
processed_data_s3_key = "datasets/processed_dataset"
processed_dataset_dir = "./data/processed_dataset"

# Upload every file under the processed dataset directory to S3, keeping the
# directory layout beneath processed_data_s3_key.
s3 = boto3.client("s3")
for root, _dirs, files in os.walk(processed_dataset_dir):
    for file_name in files:
        local_path = os.path.join(root, file_name)
        # S3 keys use "/" as the separator regardless of the local os.sep.
        relative_key = os.path.relpath(local_path, processed_dataset_dir).replace(os.sep, "/")
        s3.upload_file(local_path, bucket_name, f"{processed_data_s3_key}/{relative_key}")

print(f"Processed dataset uploaded to s3://{bucket_name}/{processed_data_s3_key}")


Processed dataset uploaded to s3://squad-training-data/datasets/processed_dataset


In [12]:
from sagemaker.image_uris import retrieve

# Probe settings: ask the SDK for a PyTorch training image URI for each
# (Python version, framework version) pair on the given region and instance.
framework = "pytorch"
region = "us-west-1"  # Specify your AWS region
instance_type = "ml.p3.2xlarge"
py_versions = ["py39", "py38"]
framework_versions = ["1.13", "2.0", "2.1"]  # Add other versions you want to test

# Python version varies slowest, so results are grouped by Python version.
version_grid = [(py, fw) for py in py_versions for fw in framework_versions]

# Report every combination; a failed lookup is printed with its reason
# rather than stopping the scan.
for py_version, framework_version in version_grid:
    try:
        uri = retrieve(
            framework=framework,
            region=region,
            version=framework_version,
            py_version=py_version,
            instance_type=instance_type,
            image_scope="training",
        )
        print(f"Compatible: PyTorch {framework_version}, Python {py_version} -> {uri}")
    except Exception as e:
        print(f"Not compatible: PyTorch {framework_version}, Python {py_version} -> {e}")


Compatible: PyTorch 1.13, Python py39 -> 763104351884.dkr.ecr.us-west-1.amazonaws.com/pytorch-training:1.13-gpu-py39
Not compatible: PyTorch 2.0, Python py39 -> Unsupported Python version: py39. You may need to upgrade your SDK version (pip install -U sagemaker) for newer Python versions. Supported Python version(s): py310.
Not compatible: PyTorch 2.1, Python py39 -> Unsupported Python version: py39. You may need to upgrade your SDK version (pip install -U sagemaker) for newer Python versions. Supported Python version(s): py310.
Not compatible: PyTorch 1.13, Python py38 -> Unsupported Python version: py38. You may need to upgrade your SDK version (pip install -U sagemaker) for newer Python versions. Supported Python version(s): py39.
Not compatible: PyTorch 2.0, Python py38 -> Unsupported Python version: py38. You may need to upgrade your SDK version (pip install -U sagemaker) for newer Python versions. Supported Python version(s): py310.
Not compatible: PyTorch 2.1, Python py38 -> Uns

In [13]:
import sys

import torch
import transformers

# Report the library and interpreter versions available in this kernel.
for version_string in (transformers.__version__, torch.__version__, sys.version):
    print(version_string)


4.38.2
2.4.1.post100
3.11.10 | packaged by conda-forge | (main, Oct 16 2024, 01:27:36) [GCC 13.3.0]


In [14]:
pip install torch==2.4.0


Collecting torch==2.4.0
  Using cached torch-2.4.0-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.4.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.4.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.4.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.4.0)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.4.0)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.4.0)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-no

In [15]:
import sagemaker
from sagemaker import image_uris

framework = "pytorch"
transformers_version = "4.38.2"
pytorch_version = "2.4.0"
py_version = "py311"
region = sagemaker.Session().boto_region_name

try:
    image_uri = image_uris.retrieve(
        framework=framework,
        region=region,
        version=pytorch_version,
        py_version=py_version,
        image_scope="training",
        base_framework_version=transformers_version,
    )
    print(f"Compatible SageMaker image: {image_uri}")
except Exception as e:
    print(f"No compatible SageMaker image found: {e}")


No compatible SageMaker image found: Empty SageMaker instance type. For options, see: https://aws.amazon.com/sagemaker/pricing/instance-types


In [16]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters passed to the estimator for the entry-point script.
hyperparameters = {
    "epochs": 3,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "model_name": "t5-large",
    "max_seq_length": 128
}

# Estimator settings gathered in one mapping, then unpacked into the constructor.
estimator_settings = dict(
    entry_point="train_t5.py",  # Script to run
    source_dir="/home/sagemaker-user/SIGROPM1/model/sigropm/major_model",
    instance_type="ml.g4dn.4xlarge",
    instance_count=1,
    role=role,
    transformers_version="4.28",
    pytorch_version="2.0",
    py_version="py310",
    hyperparameters=hyperparameters,
)
huggingface_estimator = HuggingFace(**estimator_settings)

# S3 location supplied as the "train" input channel.
# NOTE(review): this is the raw JSONL key, not the processed dataset uploaded
# under datasets/processed_dataset — confirm which one train_t5.py expects.
train_channel_uri = f"s3://{bucket_name}/{s3_data_key}"

# Start training job
huggingface_estimator.fit({"train": train_channel_uri})


Evaluating the model

In [None]:
# from datasets import load_from_disk

# # Load processed test dataset
# dataset_path = "./data/processed_dataset"
# datasets = load_from_disk(dataset_path)
# test_dataset = datasets["test"]


In [None]:
# def generate_predictions(example):
#     # Tokenize the input text
#     input_ids = tokenizer.encode(example["input_text"], return_tensors="pt", truncation=True)
    
#     # Generate prediction
#     outputs = model.generate(input_ids)
    
#     # Decode and save the predicted text
#     example["predicted_text"] = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return example

# # Apply the function to the test dataset
# test_results = test_dataset.map(generate_predictions)


In [None]:
# # NOTE: `datasets.load_metric` is deprecated in favor of `evaluate.load`.
# import evaluate

# # Load the ROUGE metric
# rouge = evaluate.load("rouge")

# # Prepare predictions and references
# predictions = [example["predicted_text"] for example in test_results]
# references = [example["target_text"] for example in test_results]

# # Compute ROUGE scores
# results = rouge.compute(predictions=predictions, references=references)

# # Print results
# print("ROUGE Results:", results)

