In [3]:
import sagemaker
from sagemaker.pytorch import PyTorch

# --- Configuration -----------------------------------------------------------
# Locations a reader is most likely to change, kept together in one place.
DATA_BUCKET_URI = 's3://csml-data-bucket'
SOURCE_DIR = '/home/ec2-user/SageMaker/code/'

# Set up the SageMaker session. It is handed to the estimator below so the
# training job is created through this exact session.
sagemaker_session = sagemaker.Session()

# Define the S3 paths to the preprocessed data (one TrainingInput per channel).
train_data = sagemaker.inputs.TrainingInput(
    s3_data=f'{DATA_BUCKET_URI}/preprocessed/train/',
    content_type='application/x-image'
)

validation_data = sagemaker.inputs.TrainingInput(
    s3_data=f'{DATA_BUCKET_URI}/preprocessed/validation/',
    content_type='application/x-image'
)

# Channel name -> input. Built once and passed to fit(); the estimator
# constructor has no `inputs` parameter, so channels are supplied only at
# fit() time.
input_channels = {
    'training': train_data,
    'validation': validation_data,
}

# Set up the PyTorch estimator.
pytorch_estimator = PyTorch(
    entry_point='train_with_resnet50.py',   # Must exist inside SOURCE_DIR
    source_dir=SOURCE_DIR,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    # NOTE(review): the job log for this cell reports "No GPUs detected" on
    # this instance type - confirm CPU-only training is intended.
    instance_type='ml.m4.10xlarge',
    framework_version='2.0.0',              # PyTorch 2.0.0
    py_version='py310',
    dependencies=[f'{SOURCE_DIR}requirements.txt'],
    output_path=f'{DATA_BUCKET_URI}/output/',
    sagemaker_session=sagemaker_session,
)

# Start the training job (blocks and streams the job logs into the cell output).
pytorch_estimator.fit(input_channels)


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-11-24-18-22-30-126


2024-11-24 18:22:31 Starting - Starting the training job...
2024-11-24 18:22:45 Starting - Preparing the instances for training...
2024-11-24 18:23:22 Downloading - Downloading input data...
2024-11-24 18:23:42 Downloading - Downloading the training image......
2024-11-24 18:24:48 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-11-24 18:25:08,113 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-11-24 18:25:08,114 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-11-24 18:25:08,114 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-11-24 18:25:08,123 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-11-24 18:25:08,124 sagemak

[34mTRAIN LOG :: Attempting to access S3 bucket: csml-data-bucket...[0m
[34mTRAIN LOG :: Successfully accessed S3 bucket: csml-data-bucket and found 1148 objects in the training folder.[0m
[34mDownloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth[0m
[34m0%|          | 0.00/97.8M [00:00<?, ?B/s][0m
[34m39%|███▊      | 37.8M/97.8M [00:00<00:00, 396MB/s][0m
[34m83%|████████▎ | 81.5M/97.8M [00:00<00:00, 433MB/s][0m
[34m100%|██████████| 97.8M/97.8M [00:00<00:00, 432MB/s][0m
[34mEPOCH 1: Train Loss: 0.4274, Val Loss: 14.7954, Val Acc: 58.21%
 VALIDATION METRICS :: Precision: 0.92, Recall: 0.18, F1-Score: 0.30[0m
[34mTRAIN LOG :: Best model saved at epoch 1[0m
[34mEPOCH 2: Train Loss: 0.2974, Val Loss: 0.6383, Val Acc: 54.10%
 VALIDATION METRICS :: Precision: 0.54, Recall: 0.60, F1-Score: 0.57[0m
[34mTRAIN LOG :: Best model saved at epoch 2[0m
[34mEPOCH 3: Train Loss: 0.2098, Val Loss: 0.0775