In [None]:
# https://docs.nvidia.com/nim/financial-fraud-training/1.0.0/configuration/config-json.html#overview-of-parameters-and-hyperparameters
# https://github.com/NVIDIA-AI-Blueprints/financial-fraud-detection/blob/main/notebooks/financial-fraud-usage.ipynb
# https://aws.amazon.com/ec2/instance-types/g5/
import os
import json
import boto3
import subprocess
import getpass
import time
from datetime import datetime
import logging
logger = logging.getLogger(__name__)


# ---------- PARAMETERS ----------
region = boto3.Session().region_name
account_id = boto3.client('sts').get_caller_identity()['Account']
ecr_image_name = "financial-fraud-training"
ecr_image_tag = "1.0.1"
ecr_repo_name = f"{account_id}.dkr.ecr.us-east-1.amazonaws.com/{ecr_image_name}:{ecr_image_tag}"
bucket_name = f"sagemaker-{region}-{account_id}"
from pathlib import Path
import json
# Load from repo root credentials file
cred_paths = [Path('../../nvidia_credentials.json'), Path('../nvidia_credentials.json'), Path('nvidia_credentials.json')]
NGC_API_KEY = None
for p in cred_paths:
    if p.exists():
        with open(p, 'r', encoding='utf-8') as f:
            NGC_API_KEY = json.load(f).get('ngc_api_key')
        break
if not NGC_API_KEY:
    raise FileNotFoundError('nvidia_credentials.json not found or missing ngc_api_key. Place it at repo root with your real key.')
scale_pos_weight = 27.434310083918007

print(ecr_repo_name)
!aws s3 ls {bucket_name}

176843580427.dkr.ecr.us-east-1.amazonaws.com/financial-fraud-training:1.0.1
                           PRE /
                           PRE config/
                           PRE datasets/
                           PRE output/
                           PRE processed/
                           PRE s3:/


In [None]:
!pip install --upgrade sagemaker pandas

In [66]:
import pandas, sagemaker
from sagemaker.estimator import Estimator
from sagemaker.debugger import ProfilerConfig, FrameworkProfile, DetailedProfilingConfig

sagemaker_training_role = sagemaker.get_execution_role()
print(sagemaker_training_role)

arn:aws:iam::176843580427:role/SageMakerExecutionRole


In [17]:
# Upload wrapper script to S3
def upload_file(s3_bucket_name, s3_key, local_filepath):
    s3_client = boto3.client('s3')
    s3_client.upload_file(
        local_filepath,
        s3_bucket_name,
        s3_key
    )
    
    s3_path = f"s3://{s3_bucket_name}/{s3_key}"
    print(f"File uploaded from {local_filepath} to: {s3_path}")
    !aws s3 ls {s3_path} --recursive

In [14]:
# run the training in Sagemaker
# send training job to parameter store
session = sagemaker.Session()

region = session.boto_region_name

ssm_client = boto3.client("ssm")

#ssm_client.put_parameter(Name="/triton/model", Value=training_job_name, Type="String", Overwrite=True)

sagemaker_client = boto3.client('sagemaker')

BASE_DRIVE = "./"
RAW_DATA_PATH = os.path.join(BASE_DRIVE, 'datasets/ieee-fraud-detection/') 
PROCESSED_DATA_PATH = os.path.join(BASE_DRIVE, 'processed/ieee-fraud-detection/')   # output used by Docker
S3_PREPROCESS_DATA_PATH = os.path.join("s3://", bucket_name, 'processed/ieee-fraud-detection/')   # output used by Docker
S3_OUTPUT_DATA_PATH = os.path.join("s3://", bucket_name, 'output/ieee-fraud-detection/')   # output used by Docker

S3_PREPROCESS_DATA_PATH, S3_OUTPUT_DATA_PATH

('s3://sagemaker-us-east-1-176843580427/processed/ieee-fraud-detection/',
 's3://sagemaker-us-east-1-176843580427/output/ieee-fraud-detection/')

In [38]:
training_config = {
    "paths": {
        "data_dir": "/opt/ml/input/data/gnn",   # container mount path -> maps to PROCESSED_DATA_PATH on host
        "output_dir": "/opt/ml/model"     # container path -> maps to MODEL_OUTPUT_PATH on host 
    },
    "models": [
        {
            "kind": "GraphSAGE_XGBoost",
            "gpu": "single",
            "hyperparameters": {
                "gnn": {
                    "hidden_channels": 32,
                    "n_hops": 2,
                    "dropout_prob": 0.2,
                    "batch_size": 1024,
                    "fan_out": 32,
                    "num_epochs": 20
                },
                "xgb": {
                    # Only these 5 parameters are allowed for GraphSAGE_XGBoost
                    "max_depth": 8,
                    "learning_rate": 0.1,
                    "num_parallel_tree": 1,
                    "num_boost_round": 1000,
                    "gamma": 1.0
                    # Removed unsupported params:
                    # - tree_method (not allowed)
                    # - enable_categorical (not allowed)
                    # - min_child_weight (not allowed)
                    # - subsample (not allowed)
                    # - colsample_bytree (not allowed)
                    # - scale_pos_weight (not allowed)
                    # - eval_metric (not allowed)
                    # - early_stopping_rounds (not allowed)
                }
            }
        }
    ]
}

config_path_local = os.path.join(PROCESSED_DATA_PATH, "config")
config_path_filename = os.path.join(config_path_local, "config.json")
!mkdir -p {config_path_local}

with open(config_path_filename, 'w') as f:
    json.dump(training_config, f, indent=2)
    
print("Training config created at", config_path_filename)
print("\nNote: Only 5 XGBoost hyperparameters are supported for GraphSAGE_XGBoost model:")
print("  - max_depth, learning_rate, num_parallel_tree, num_boost_round, gamma")

Training config created at ./processed/ieee-fraud-detection/config/config.json

Note: Only 5 XGBoost hyperparameters are supported for GraphSAGE_XGBoost model:
  - max_depth, learning_rate, num_parallel_tree, num_boost_round, gamma


In [39]:
s3_config_key = os.path.join("processed/ieee-fraud-detection/config/config.json")
upload_file(bucket_name, s3_config_key, config_path_filename)

File uploaded from ./processed/ieee-fraud-detection/config/config.json to: s3://sagemaker-us-east-1-176843580427/processed/ieee-fraud-detection/config/config.json
2025-10-19 02:54:32        594 processed/ieee-fraud-detection/config/config.json


In [40]:
!aws s3 ls {S3_PREPROCESS_DATA_PATH}gnn/train_gnn/

                           PRE edges/
                           PRE nodes/


In [71]:
%%writefile wrapper.sh
#!/bin/bash
set -e
set -x

echo "=== Starting wrapper script ==="

# Verify GPU is accessible
echo "=== Checking GPU availability ==="
nvidia-smi || echo "WARNING: nvidia-smi failed"
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}'); print(f'CUDA device count: {torch.cuda.device_count()}'); print(f'CUDA device name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"N/A\"}')"

# Fix pyg-lib compatibility
echo "=== Fixing pyg-lib compatibility ==="
pip uninstall -y pyg-lib
pip install --no-cache-dir pyg-lib -f https://data.pyg.org/whl/torch-2.6.0+cu124.html

# Verify installation
echo "=== Verifying pyg-lib installation ==="
python -c "import pyg_lib; print(f'pyg-lib version: {pyg_lib.__version__}')"

# Set up environment
echo "=== Setting up environment ==="
export PYTHONPATH=/opt/nim:${PYTHONPATH}

# Warm up CUDA and verify GPU operations work
echo "=== Warming up CUDA ==="
python << 'WARMUP'
import torch
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    # Create a simple tensor operation to initialize CUDA context
    x = torch.randn(100, 100, device=device)
    y = torch.randn(100, 100, device=device)
    z = torch.matmul(x, y)
    print(f"CUDA warmup successful. Device: {device}, Result shape: {z.shape}")
    # Force synchronization
    torch.cuda.synchronize()
else:
    print("WARNING: CUDA not available!")
WARMUP

echo "=== Inspecting /opt/ml recursively ==="
ls -R /opt/ml/

# Create training launcher
echo "=== Creating training launcher ==="
cat > /tmp/launch_training.py << 'EOF'
import json
import logging
import sys
import torch
import os
import shutil
import subprocess
from datetime import datetime

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Verify CUDA is available before training
logging.info(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    logging.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
    logging.info(f"CUDA version: {torch.version.cuda}")
    # Initialize CUDA context
    torch.cuda.init()
    logging.info("CUDA context initialized")

# Add the library path
sys.path.insert(0, '/opt/nim/lib/financial_fraud_training')

# Load the config file
config_path = '/opt/ml/input/data/config/config.json'
logging.info(f"Loading config from: {config_path}")

with open(config_path, 'r') as f:
    config_dict = json.load(f)

logging.info("Config loaded successfully")

# Store training start time
training_start_time = datetime.now()

try:
    # Import and call the training function
    from src.validate_and_launch import validate_config_and_run_training
    
    logging.info("=" * 60)
    logging.info("STARTING TRAINING")
    logging.info("=" * 60)
    
    validate_config_and_run_training(config_dict)
    
    training_end_time = datetime.now()
    training_duration = training_end_time - training_start_time
    training_success = True
    
    logging.info("=" * 60)
    logging.info("TRAINING COMPLETED SUCCESSFULLY")
    logging.info("=" * 60)
    
except Exception as e:
    training_end_time = datetime.now()
    training_duration = training_end_time - training_start_time
    training_success = False
    
    logging.error("=" * 60)
    logging.error("TRAINING FAILED")
    logging.error("=" * 60)
    logging.error(f"Error: {e}", exc_info=True)
    
finally:
    # CREATE SNAPSHOT AFTER TRAINING (SUCCESS OR FAILURE)
    logging.info("=" * 60)
    logging.info("CREATING TRAINING SNAPSHOT")
    logging.info("=" * 60)
    
    snapshot_dir = '/opt/ml/model/training_snapshot'
    os.makedirs(snapshot_dir, exist_ok=True)
    
    # 1. Save training configuration
    logging.info("Saving configuration...")
    shutil.copy(config_path, os.path.join(snapshot_dir, 'config.json'))
    
    # 2. Save INPUT DATA (all data channels)
    logging.info("Saving input data snapshot...")
    input_data_dir = os.path.join(snapshot_dir, 'input_data')
    os.makedirs(input_data_dir, exist_ok=True)
    
    # Copy all data channels from /opt/ml/input/data/
    sagemaker_data_dir = '/opt/ml/input/data'
    if os.path.exists(sagemaker_data_dir):
        for item in os.listdir(sagemaker_data_dir):
            # Skip manifest files
            if item.endswith('-manifest'):
                continue
            
            src_path = os.path.join(sagemaker_data_dir, item)
            dest_path = os.path.join(input_data_dir, item)
            
            try:
                if os.path.isdir(src_path):
                    # For directories, copy the entire tree
                    shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
                    
                    # Count files in this directory
                    file_count = sum([len(files) for r, d, files in os.walk(dest_path)])
                    dir_size = sum([os.path.getsize(os.path.join(r, f)) for r, d, files in os.walk(dest_path) for f in files])
                    
                    logging.info(f"Copied input channel '{item}': {file_count} files, {dir_size / 1e6:.2f} MB")
                elif os.path.isfile(src_path):
                    # For files, just copy
                    shutil.copy2(src_path, dest_path)
                    file_size = os.path.getsize(dest_path)
                    logging.info(f"Copied input file '{item}': {file_size / 1e6:.2f} MB")
            except Exception as e:
                logging.warning(f"Could not copy input data '{item}': {e}")
    
    # 3. Save training metadata
    logging.info("Saving training metadata...")
    metadata = {
        'training_start': training_start_time.isoformat(),
        'training_end': training_end_time.isoformat(),
        'training_duration_seconds': training_duration.total_seconds(),
        'training_success': training_success,
        'pytorch_version': torch.__version__,
        'cuda_available': torch.cuda.is_available(),
    }
    
    if torch.cuda.is_available():
        metadata['cuda_version'] = torch.version.cuda
        metadata['gpu_name'] = torch.cuda.get_device_name(0)
        metadata['gpu_memory_total_gb'] = torch.cuda.get_device_properties(0).total_memory / 1e9
        metadata['gpu_memory_allocated_gb'] = torch.cuda.memory_allocated(0) / 1e9
        metadata['gpu_memory_reserved_gb'] = torch.cuda.memory_reserved(0) / 1e9
    
    with open(os.path.join(snapshot_dir, 'training_metadata.json'), 'w') as f:
        json.dump(metadata, f, indent=2)
    
    # 4. Save environment/package information
    logging.info("Saving environment info...")
    result = subprocess.run(['pip', 'list', '--format=freeze'], capture_output=True, text=True)
    with open(os.path.join(snapshot_dir, 'requirements.txt'), 'w') as f:
        f.write(result.stdout)
    
    # 5. Save system information
    logging.info("Saving system info...")
    with open(os.path.join(snapshot_dir, 'system_info.txt'), 'w', encoding='utf-8') as f:
        f.write(f"Training Status: {'SUCCESS' if training_success else 'FAILED'}\n")
        f.write(f"Training Duration: {training_duration}\n")
        f.write(f"Start Time: {training_start_time}\n")
        f.write(f"End Time: {training_end_time}\n")
        f.write(f"\n{'=' * 40}\n")
        f.write(f"PyTorch version: {torch.__version__}\n")
        f.write(f"CUDA available: {torch.cuda.is_available()}\n")
        if torch.cuda.is_available():
            f.write(f"CUDA version: {torch.version.cuda}\n")
            f.write(f"GPU: {torch.cuda.get_device_name(0)}\n")
            f.write(f"GPU memory total: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB\n")
            f.write(f"GPU memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB\n")
            f.write(f"GPU memory reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB\n")
    
    # 6. Save source code snapshot
    logging.info("Saving source code...")
    code_snapshot_dir = os.path.join(snapshot_dir, 'code')
    os.makedirs(code_snapshot_dir, exist_ok=True)
    
    source_dirs = [
        '/opt/nim/lib/financial_fraud_training/src',
        '/opt/nim/lib/financial_fraud_training'
    ]
    
    for src_dir in source_dirs:
        if os.path.exists(src_dir):
            dest_name = os.path.basename(src_dir)
            dest_dir = os.path.join(code_snapshot_dir, dest_name)
            try:
                shutil.copytree(src_dir, dest_dir, dirs_exist_ok=True)
                logging.info(f"Copied {src_dir} to snapshot")
            except Exception as e:
                logging.warning(f"Could not copy {src_dir}: {e}")
    
    # 7. List model artifacts that were created
    logging.info("Cataloging model artifacts...")
    model_files = []
    if os.path.exists('/opt/ml/model'):
        for root, dirs, files in os.walk('/opt/ml/model'):
            # Skip the snapshot directory itself to avoid recursion
            if 'training_snapshot' in root:
                continue
            for file in files:
                file_path = os.path.join(root, file)
                file_size = os.path.getsize(file_path)
                relative_path = os.path.relpath(file_path, '/opt/ml/model')
                model_files.append({
                    'path': relative_path,
                    'size_bytes': file_size,
                    'size_mb': file_size / 1e6
                })
    
    with open(os.path.join(snapshot_dir, 'model_artifacts.json'), 'w') as f:
        json.dump({'artifacts': model_files, 'total_files': len(model_files)}, f, indent=2)
    
    # 8. Create input data catalog
    logging.info("Cataloging input data...")
    input_files = []
    if os.path.exists(input_data_dir):
        for root, dirs, files in os.walk(input_data_dir):
            for file in files:
                file_path = os.path.join(root, file)
                file_size = os.path.getsize(file_path)
                relative_path = os.path.relpath(file_path, input_data_dir)
                input_files.append({
                    'path': relative_path,
                    'size_bytes': file_size,
                    'size_mb': file_size / 1e6
                })
    
    with open(os.path.join(snapshot_dir, 'input_data_catalog.json'), 'w') as f:
        json.dump({
            'input_files': input_files, 
            'total_files': len(input_files),
            'total_size_mb': sum(f['size_mb'] for f in input_files)
        }, f, indent=2)
    
    logging.info(f"Training snapshot saved to {snapshot_dir}")
    logging.info(f"Total model artifacts: {len(model_files)}")
    logging.info(f"Total input files: {len(input_files)}")
    
    # 9. Create a summary file 
    with open(os.path.join(snapshot_dir, 'SUMMARY.txt'), 'w', encoding='utf-8') as f:
        f.write("=" * 60 + "\n")
        f.write("TRAINING SNAPSHOT SUMMARY\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"Status: {'[SUCCESS]' if training_success else '[FAILED]'}\n")
        f.write(f"Duration: {training_duration}\n")
        f.write(f"Start: {training_start_time}\n")
        f.write(f"End: {training_end_time}\n\n")
        f.write(f"Files in snapshot:\n")
        f.write(f"  - config.json (training configuration)\n")
        f.write(f"  - training_metadata.json (detailed metadata)\n")
        f.write(f"  - system_info.txt (system and GPU info)\n")
        f.write(f"  - requirements.txt (Python packages)\n")
        f.write(f"  - model_artifacts.json (list of model files)\n")
        f.write(f"  - input_data_catalog.json (list of input files)\n")
        f.write(f"  - code/ (source code snapshot)\n")
        f.write(f"  - input_data/ (all input data channels)\n")
        f.write(f"\nTotal model artifacts: {len(model_files)}\n")
        f.write(f"Total input files: {len(input_files)}\n")
        if input_files:
            f.write(f"Total input data size: {sum(f['size_mb'] for f in input_files):.2f} MB\n")
    
    logging.info("Snapshot creation completed")
    
    # Re-raise exception if training failed
    if not training_success:
        raise

EOF

echo "=== Inspecting config file ==="
cat /opt/ml/input/data/config/config.json

# Run training with our launcher
echo "=== Starting training ==="
torchrun --standalone --nproc_per_node=1 /tmp/launch_training.py

# List output directory after everything completes
echo "=== Final model directory contents ==="
ls -lah /opt/ml/model/
echo ""
echo "=== Snapshot contents ==="
ls -lah /opt/ml/model/training_snapshot/
echo ""
echo "=== Input data snapshot ==="
ls -lah /opt/ml/model/training_snapshot/input_data/ 2>/dev/null || echo "No input data found"
echo ""
echo "=== Snapshot summary ==="
cat /opt/ml/model/training_snapshot/SUMMARY.txt 2>/dev/null || echo "No summary file found"

echo "=== Wrapper script completed ==="

Overwriting wrapper.sh


In [72]:
# Upload wrapper script to S3
s3_wrapper_key = os.path.join("processed/ieee-fraud-detection/scripts/wrapper.sh")

wrapper_s3_path = f"s3://{bucket_name}/{s3_wrapper_key}"
print(f"Wrapper script uploaded to: {wrapper_s3_path}")

upload_file(bucket_name, s3_wrapper_key, 'wrapper.sh')

# !aws s3 ls {S3_PREPROCESS_DATA_PATH} --recursive

Wrapper script uploaded to: s3://sagemaker-us-east-1-176843580427/processed/ieee-fraud-detection/scripts/wrapper.sh
File uploaded from wrapper.sh to: s3://sagemaker-us-east-1-176843580427/processed/ieee-fraud-detection/scripts/wrapper.sh
2025-10-19 04:23:13      12791 processed/ieee-fraud-detection/scripts/wrapper.sh


In [73]:
timestamp = datetime.now().strftime("%d-%b-%Y-%H-%M-%S")
# Create Estimator object using SageMaker SDK
estimator = Estimator(
    image_uri=ecr_repo_name,
    role=sagemaker_training_role,
    instance_count=1,
    # instance_type="ml.g4dn.2xlarge", # The T4 GPU (16GB) rans out of memory
    instance_type="ml.g5.xlarge",  # 24GB GPU (A10G) instead of ml.g4dn.2xlarge (16GB T4) but increase quota at https://176843580427-dzcgepfa.us-east-1.console.aws.amazon.com/servicequotas/home/services/sagemaker/quotas/L-B6D80D9C. More info at https://aws.amazon.com/ec2/instance-types/g5/
    # OR
    # instance_type="ml.g5.2xlarge",  # 24GB GPU
    # OR  
    # instance_type="ml.p3.2xlarge",  # 16GB V100 (might be enough with memory cleanup)
    volume_size=30,
    max_run=86400,
    base_job_name="fraud-detection-gnn",
    output_path=S3_OUTPUT_DATA_PATH,
    sagemaker_session=session,
    # For 1.0.1
    container_entry_point=[
        "bash",
        "/opt/ml/input/data/scripts/wrapper.sh"
    ],
    # container_log_level=logging.DEBUG,
    # https://docs.nvidia.com/nim/financial-fraud-training/1.0.0/getting-started/cont-running.html
    environment={
        "NIM_DISABLE_MODEL_DOWNLOAD": "true",
        "NGC_API_KEY": NGC_API_KEY,
        "PYTHONUNBUFFERED": "1"  # helpful for real-time logs
    },
    # Enable profiling for GPU metrics
    profiler_config=ProfilerConfig(
        system_monitor_interval_millis=500,  # Collect every 500ms
        framework_profile_params=FrameworkProfile(
            detailed_profiling_config=DetailedProfilingConfig(
                start_step=0,
                num_steps=10
            )
        )
    )
)

# Define input channels (S3 URIs)
inputs = {
    "gnn": sagemaker.inputs.TrainingInput(
        s3_data=os.path.join(S3_PREPROCESS_DATA_PATH, "gnn/train_gnn/"),
        content_type="application/x-directory",
        input_mode="File"
    ),
    "config": sagemaker.inputs.TrainingInput(
        s3_data=os.path.join(S3_PREPROCESS_DATA_PATH, "config"),
        content_type="application/x-directory",
        input_mode="File"
    ),
    "scripts": sagemaker.inputs.TrainingInput(
        s3_data=wrapper_s3_path,
        content_type="text/x-sh",
        input_mode="File"
    )
}

# Launch the training job and track in notebook
estimator.fit(
    inputs=inputs,
    job_name=f"fraud-detection-gnn-{timestamp}",
    logs=["All"],  # stream logs directly to the notebook
    wait=True   # wait for job completion
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: fraud-detection-gnn-19-Oct-2025-04-23-13


2025-10-19 04:23:14 Starting - Starting the training job
2025-10-19 04:23:14 Pending - Training job waiting for capacity...
2025-10-19 04:23:45 Pending - Preparing the instances for training...
2025-10-19 04:24:13 Downloading - Downloading input data.........
2025-10-19 04:25:39 Downloading - Downloading the training image.............................................
2025-10-19 04:33:03 Training - Training image download completed. Training in progress.[34m+ echo '=== Starting wrapper script ==='[0m
[34m+ echo '=== Checking GPU availability ==='[0m
[34m+ nvidia-smi[0m
[34m=== Starting wrapper script ===[0m
[34m=== Checking GPU availability ===[0m
[34mSun Oct 19 04:33:15 2025       [0m
[34m+-----------------------------------------------------------------------------------------+[0m
[34m| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |[0m
[34m|-----------------------------------------+------------------------+-------------------

In [74]:
!aws s3 ls --recursive {estimator.output_path}{estimator._current_job_name}/

2025-10-19 04:35:00          0 output/ieee-fraud-detection/fraud-detection-gnn-19-Oct-2025-04-23-13/debug-output/training_job_end.ts
2025-10-19 04:34:57  269508998 output/ieee-fraud-detection/fraud-detection-gnn-19-Oct-2025-04-23-13/output/model.tar.gz
2025-10-19 04:35:00          0 output/ieee-fraud-detection/fraud-detection-gnn-19-Oct-2025-04-23-13/profiler-output/framework/training_job_end.ts
2025-10-19 04:26:01     209439 output/ieee-fraud-detection/fraud-detection-gnn-19-Oct-2025-04-23-13/profiler-output/system/incremental/2025101904/1760847840.algo-1.json
2025-10-19 04:26:00     272836 output/ieee-fraud-detection/fraud-detection-gnn-19-Oct-2025-04-23-13/profiler-output/system/incremental/2025101904/1760847900.algo-1.json
2025-10-19 04:27:00     272284 output/ieee-fraud-detection/fraud-detection-gnn-19-Oct-2025-04-23-13/profiler-output/system/incremental/2025101904/1760847960.algo-1.json
2025-10-19 04:28:00     271807 output/ieee-fraud-detection/fraud-detection-gnn-19-Oct-2025-04-

In [75]:
!aws s3 cp {estimator.output_path}{estimator._current_job_name}/output/model.tar.gz ./model.tar.gz

download: s3://sagemaker-us-east-1-176843580427/output/ieee-fraud-detection/fraud-detection-gnn-19-Oct-2025-04-23-13/output/model.tar.gz to ./model.tar.gz


In [76]:
!mkdir ./output
!tar -xvzf ./model.tar.gz -C ./output/

mkdir: cannot create directory ‘./output’: File exists
tar: Ignoring unknown extended header keyword `LIBARCHIVE.creationtime'
python_backend_model_repository/
tar: Ignoring unknown extended header keyword `LIBARCHIVE.creationtime'
python_backend_model_repository/prediction_and_shapley/
tar: Ignoring unknown extended header keyword `LIBARCHIVE.creationtime'
python_backend_model_repository/prediction_and_shapley/1/
tar: Ignoring unknown extended header keyword `LIBARCHIVE.creationtime'
python_backend_model_repository/prediction_and_shapley/1/state_dict_gnn_model.pth
tar: Ignoring unknown extended header keyword `LIBARCHIVE.creationtime'
python_backend_model_repository/prediction_and_shapley/1/model.py
tar: Ignoring unknown extended header keyword `LIBARCHIVE.creationtime'
python_backend_model_repository/prediction_and_shapley/1/embedding_based_xgboost.json
tar: Ignoring unknown extended header keyword `LIBARCHIVE.creationtime'
python_backend_model_repository/prediction_and_shapley/confi

In [77]:
!cat ./output/training_snapshot/SUMMARY.txt

TRAINING SNAPSHOT SUMMARY

Status: [SUCCESS]
Duration: 0:00:48.999265
Start: 2025-10-19 04:33:36.488819
End: 2025-10-19 04:34:25.488084

Files in snapshot:
  - config.json (training configuration)
  - training_metadata.json (detailed metadata)
  - system_info.txt (system and GPU info)
  - requirements.txt (Python packages)
  - model_artifacts.json (list of model files)
  - input_data_catalog.json (list of input files)
  - code/ (source code snapshot)
  - input_data/ (all input data channels)

Total model artifacts: 8
Total input files: 8
Total input data size: 2446.58 MB


In [78]:
!ls -R ./output/

./output/:
model_repository  python_backend_model_repository  training_snapshot

./output/model_repository:
model  xgboost

./output/model_repository/model:
1  config.pbtxt

./output/model_repository/model/1:
graph_sage_node_embedder.onnx

./output/model_repository/xgboost:
1  config.pbtxt

./output/model_repository/xgboost/1:
xgboost_on_embeddings.json

./output/python_backend_model_repository:
prediction_and_shapley

./output/python_backend_model_repository/prediction_and_shapley:
1  config.pbtxt

./output/python_backend_model_repository/prediction_and_shapley/1:
embedding_based_xgboost.json  model.py	state_dict_gnn_model.pth

./output/training_snapshot:
code	     input_data_catalog.json  SUMMARY.txt
config.json  model_artifacts.json     system_info.txt
input_data   requirements.txt	      training_metadata.json

./output/training_snapshot/code:
financial_fraud_training  src

./output/training_snapshot/code/financial_fraud_training:
Dockerfile					__init__.py  tests
docs						main.py 