# Create a training job using the realistic loan approval dataset

In [None]:
import boto3
import time
from sagemaker import image_uris

# Low-level SageMaker API client; every request below goes through it.
sagemaker_client = boto3.client(service_name="sagemaker", region_name="us-east-1")

# Training job name, suffixed with the current epoch second so that
# re-running this cell produces a fresh name each time.
job_timestamp = int(time.time())
training_job_name = f"real-loan-predictor-xgboost-training-{job_timestamp}"

# S3 locations (replace the bucket and the INSERT-PATH placeholder with
# your own values before running).
s3_bucket = "adgu-datasets"
s3_root = f"s3://{s3_bucket}"
training_data_s3_uri = f"{s3_root}/INSERT-PATH"
output_s3_uri = f"{s3_root}/real-loan-predictor-output/"

# IAM role passed to the training job as RoleArn (replace the INSERT-ARN
# placeholder with your SageMaker execution role ARN).
sagemaker_role = "INSERT-ARN"

In [None]:
# XGBoost Training Image URI (Region Specific)
# https://docs.aws.amazon.com/sagemaker/latest/dg-ecr-paths/sagemaker-algo-docker-registry-paths.html
#
# The region is read from the SageMaker client instead of being repeated as a
# literal, so the image is always resolved for the same region the training
# job is submitted to.
xgboost_image_uri = image_uris.retrieve(
    framework="xgboost",
    region=sagemaker_client.meta.region_name,
    version="1.7-1",
)
print(f"XGBoost image uri: {xgboost_image_uri}")

In [None]:
# Define Training Job Configuration
# The CreateTrainingJob request is built section by section and then
# combined into `training_params`.

# Training container image and input mode.
algorithm_spec = {
    "TrainingImage": xgboost_image_uri,
    "TrainingInputMode": "File",
}

# XGBoost hyperparameters; every value is given as a string.
# NOTE(review): "reg:squarederror" is a regression objective, while the
# notebook title mentions a loan approval dataset -- confirm the label is
# numeric rather than a yes/no outcome.
xgb_hyperparameters = {
    "num_round": "100",  # number of boosting rounds
    "eta": "0.2",  # learning rate
    "objective": "reg:squarederror",
    "max_depth": "6",  # maximum tree depth
    "subsample": "0.8",
    "eval_metric": "rmse",
}

# Single "train" channel read from the S3 prefix.
# NOTE(review): the built-in XGBoost CSV input is documented as taking the
# label from the first column; confirm that "label_column=3" in the content
# type is actually honoured by the container.
train_channel = {
    "ChannelName": "train",
    "DataSource": {
        "S3DataSource": {
            "S3DataType": "S3Prefix",
            "S3Uri": training_data_s3_uri,
            "S3DataDistributionType": "FullyReplicated",
        }
    },
    "ContentType": "text/csv; label_column=3",
}

# Compute resources for the job.
resource_config = {
    "InstanceType": "ml.m5.large",
    "InstanceCount": 1,
    "VolumeSizeInGB": 10,
}

training_params = {
    "TrainingJobName": training_job_name,
    "AlgorithmSpecification": algorithm_spec,
    "RoleArn": sagemaker_role,
    "HyperParameters": xgb_hyperparameters,
    "InputDataConfig": [train_channel],
    "OutputDataConfig": {"S3OutputPath": output_s3_uri},
    "ResourceConfig": resource_config,
    "StoppingCondition": {"MaxRuntimeInSeconds": 3600},  # one hour cap
}
print(training_params)

In [None]:
# Start SageMaker Training Job
# Submits the request assembled in `training_params`.
response = sagemaker_client.create_training_job(**training_params)

# Print Training Job Details
# The value read from the response is the job's ARN, so it is labelled as
# such (it was previously printed under a "Status" label). The current status
# can be looked up separately with describe_training_job.
print(f"Training Job Name: {training_job_name}")
print(f"Training Job ARN: {response['TrainingJobArn']}")