In [None]:
!pip install transformers datasets sagemaker accelerate evaluate --quiet
!pip install -U sagemaker
!pip install transformers datasets

2.1 Data Loading

In [None]:
# Import dependencies
import boto3
import sagemaker
from sagemaker import get_execution_role
import os 


In [None]:
# Initialize SageMaker session and role
# `role` is the execution role later handed to the HuggingFace estimator.
# NOTE(review): `sagemaker_session` is not referenced again in the visible
# notebook (a later cell builds its own sagemaker.Session()) — confirm whether
# it is still needed.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

In [None]:
# S3 bucket and file details
# `bucket_name` and `s3_data_key` locate the raw JSONL in S3; they are used by
# the download cell and again as the training-job input channel.
bucket_name = "squad-training-data"
s3_data_key = "datasets/training_data.jsonl"  # Path in S3
# Local working directory and the path the raw file is downloaded to.
local_data_dir = "./data"
local_file_path = os.path.join(local_data_dir, "training_data.jsonl")

# Create local directory if it doesn't exist
# (exist_ok=True makes this cell safe to re-run)
os.makedirs(local_data_dir, exist_ok=True)


In [None]:
# Download the raw training file from S3 to `local_file_path`.
try:
    s3 = boto3.client("s3")
    s3.download_file(bucket_name, s3_data_key, local_file_path)
    print(f"File downloaded successfully to {local_file_path}")
except Exception as e:
    # Report the failure, then re-raise: every later cell reads
    # `local_file_path`, so continuing would either fail with a confusing
    # file-not-found error or silently reuse a stale file from an earlier run.
    print(f"Error downloading file: {e}")
    raise


In [None]:
from datasets import load_dataset

# Load the dataset from the local JSONL file
# `data_files` is given a single path; the preprocessing cell reads the
# result through the "train" key.
dataset = load_dataset("json", data_files=local_file_path)

# Preview the dataset
print(dataset)


Data Preprocessing



In [None]:
from datasets import load_dataset, DatasetDict

# Load the dataset from the JSONL file
# NOTE(review): this repeats the identical load_dataset call made in the
# previous cell; it keeps this cell runnable on its own but is otherwise
# redundant.
dataset = load_dataset("json", data_files=local_file_path)

# Define preprocessing function
def preprocess_data(example):
    """Copy the raw fields of one example into the model's field names.

    ``example["prompt"]`` is copied to ``example["input_text"]`` and
    ``example["squad"]`` to ``example["target_text"]``. The mapping is
    mutated in place and the same object is returned; the source keys are
    left untouched (the caller drops them via ``remove_columns``).
    """
    field_map = (("input_text", "prompt"), ("target_text", "squad"))
    for new_key, source_key in field_map:
        example[new_key] = example[source_key]
    return example

# Apply preprocessing: add input_text/target_text and drop the raw columns
processed_dataset = dataset["train"].map(preprocess_data, remove_columns=["prompt", "squad"])

# First split: 80% train, 20% held out (seeded so the split is reproducible)
split_dataset = processed_dataset.train_test_split(test_size=0.2, seed=42)

# Second split: divide the held-out 20% evenly into validation and test.
# The result gets its own name instead of temporarily rebinding
# `final_dataset`, so each variable refers to exactly one thing.
holdout_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)

final_dataset = DatasetDict({
    "train": split_dataset["train"],
    "validation": holdout_split["train"],
    "test": holdout_split["test"],
})

# Sanity check: the three splits together must account for every row
assert sum(len(part) for part in final_dataset.values()) == len(processed_dataset)

# Save processed dataset locally
final_dataset.save_to_disk("./data/processed_dataset")

# Verify dataset
print(final_dataset)


In [None]:
from datasets import load_from_disk

# Load the preprocessed dataset from disk
# Note: this rebinds `processed_dataset`. In the preprocessing cell the name
# held the result of `.map(...)`; from here on it holds whatever was saved at
# ./data/processed_dataset (the train/validation/test splits).
processed_dataset = load_from_disk("./data/processed_dataset")

# Verify the structure
print(processed_dataset)


In [None]:
# Display the first 5 samples of each split, in train / validation / test order
for split_name in ("train", "validation", "test"):
    print(processed_dataset[split_name][:5])


In [None]:
from datasets import load_from_disk

# Load the preprocessed dataset from disk
# NOTE(review): this cell repeats an earlier cell's reload and print verbatim;
# re-running it is harmless but it can likely be removed.
processed_dataset = load_from_disk("./data/processed_dataset")

# Verify the structure
print(processed_dataset)


In [None]:
# Destination bucket and key prefix for the processed dataset
bucket_name = "squad-training-data"
processed_data_s3_key = "datasets/processed_dataset"

# Mirror every file under the saved dataset directory to the S3 prefix,
# keeping each file's path relative to the dataset root as its key suffix.
s3 = boto3.client("s3")
for dirpath, _subdirs, filenames in os.walk("./data/processed_dataset"):
    for filename in filenames:
        local_path = os.path.join(dirpath, filename)
        relative_path = os.path.relpath(local_path, './data/processed_dataset')
        s3.upload_file(local_path, bucket_name, f"{processed_data_s3_key}/{relative_path}")

print(f"Processed dataset uploaded to s3://{bucket_name}/{processed_data_s3_key}")


In [None]:
from sagemaker.image_uris import retrieve

# Parameters for the PyTorch training-image lookup
framework = "pytorch"
region = "us-west-1"  # Specify your AWS region
instance_type = "ml.p3.2xlarge"
py_versions = ["py39", "py38"]
framework_versions = ["1.13", "2.0", "2.1"]  # Add other versions you want to test

# Every (Python, PyTorch) pairing, with the Python version varying slowest
version_pairs = [(py, fw) for py in py_versions for fw in framework_versions]

# Try to resolve a training image for each pairing and report the outcome
for py_version, framework_version in version_pairs:
    try:
        uri = retrieve(
            framework=framework,
            region=region,
            version=framework_version,
            py_version=py_version,
            instance_type=instance_type,
            image_scope="training",
        )
    except Exception as e:
        # A failed lookup is reported as "not compatible" along with the error
        print(f"Not compatible: PyTorch {framework_version}, Python {py_version} -> {e}")
    else:
        print(f"Compatible: PyTorch {framework_version}, Python {py_version} -> {uri}")


In [None]:
# Report the library and interpreter versions available in this kernel
import sys

import torch
import transformers

print(transformers.__version__)
print(torch.__version__)
print(sys.version)


In [None]:
pip install torch==2.4.0


In [None]:
import sagemaker
from sagemaker import image_uris

# Versions to look up; the region comes from the current SageMaker session
framework = "pytorch"
transformers_version = "4.38.2"
pytorch_version = "2.4.0"
py_version = "py311"
region = sagemaker.Session().boto_region_name

# NOTE(review): `base_framework_version` is passed the transformers version
# while `framework` is "pytorch", and no instance_type is supplied — confirm
# against the SageMaker SDK docs that this combination is meaningful.
lookup_args = dict(
    framework=framework,
    region=region,
    version=pytorch_version,
    py_version=py_version,
    image_scope="training",
    base_framework_version=transformers_version,
)

try:
    image_uri = image_uris.retrieve(**lookup_args)
except Exception as e:
    print(f"No compatible SageMaker image found: {e}")
else:
    print(f"Compatible SageMaker image: {image_uri}")


In [None]:
from sagemaker.huggingface import HuggingFace

# Define hyperparameters and settings
# This dict is passed to the estimator via `hyperparameters=`.
# NOTE(review): how train_t5.py interprets each key is not visible here.
hyperparameters = {
    "epochs": 3,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "model_name": "t5-large",
    "max_seq_length": 128
}

# Set up Hugging Face Estimator
# NOTE(review): `source_dir` is an absolute path under /home/sagemaker-user,
# so this cell only works where that directory exists — consider making it
# configurable.
huggingface_estimator = HuggingFace(
   entry_point="train_t5.py",  # Script to run
    source_dir="/home/sagemaker-user/SIGROPM1/model/sigropm/major_model",
    instance_type="ml.g4dn.4xlarge",
    instance_count=1,
    role=role,  
    transformers_version="4.28",
    pytorch_version="2.0",
    py_version="py310",
    hyperparameters=hyperparameters,
)

# Start training job
# NOTE(review): the "train" channel points at the raw JSONL (`s3_data_key`),
# not at the processed dataset uploaded under datasets/processed_dataset —
# confirm this is the input train_t5.py expects.
huggingface_estimator.fit({"train": f"s3://{bucket_name}/{s3_data_key}"})


Evaluating the model

In [None]:
# Import restored (it was commented out) so the evaluation section does not
# depend on an earlier cell having already imported load_from_disk.
from datasets import load_from_disk

# Load processed test dataset
dataset_path = "./data/processed_dataset"
# The saved dataset holds several splits; only "test" is used for evaluation.
datasets = load_from_disk(dataset_path)
test_dataset = datasets["test"]


In [None]:
def generate_predictions(example):
    """Attach a generated ``predicted_text`` field to one dataset example.

    Encodes ``example["input_text"]`` with the notebook-level ``tokenizer``,
    calls ``generate`` on the notebook-level ``model`` with default
    generation settings, decodes the first returned sequence (special tokens
    skipped) into ``example["predicted_text"]``, and returns the example.

    NOTE(review): neither ``tokenizer`` nor ``model`` is defined in the
    visible notebook; both must be loaded before this function is called.
    """
    # Encode the prompt as a PyTorch tensor, truncating over-long inputs
    encoded_prompt = tokenizer.encode(example["input_text"], return_tensors="pt", truncation=True)

    # Generate, then keep only the first returned sequence
    generated = model.generate(encoded_prompt)
    example["predicted_text"] = tokenizer.decode(generated[0], skip_special_tokens=True)
    return example

# Apply the function to the test dataset
# map() calls generate_predictions once per example (no batching requested),
# so each test example triggers its own model.generate call.
test_results = test_dataset.map(generate_predictions)


In [None]:
# `load_metric` was used below without ever being imported (its import was
# commented out), which raises NameError. Import it, falling back to the
# `evaluate` package — installed by this notebook's setup cell — for
# `datasets` releases that no longer ship `load_metric`.
try:
    from datasets import load_metric
except ImportError:
    from evaluate import load as load_metric

# Load the ROUGE metric
# NOTE(review): the ROUGE metric may need the `rouge_score` package, which the
# setup cell does not install — confirm it is available in this environment.
rouge = load_metric("rouge")

# Prepare predictions and references (one string per test example, same order)
predictions = [example["predicted_text"] for example in test_results]
references = [example["target_text"] for example in test_results]

# Compute ROUGE scores
results = rouge.compute(predictions=predictions, references=references)

# Print results
print("ROUGE Results:", results)

