In [1]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

# S3 locations. Change s3_bucket to point the notebook at a different bucket;
# the dataset path below is derived from it.
s3_bucket = "apartment-pricing"
# NOTE(review): s3_prefix is not referenced by any visible cell — confirm
# whether it is still needed.
s3_prefix = "xgboost-dataset"

# Training currently uses a single CSV part file. To train on the whole
# dataset, point at the folder instead and use a larger instance
# (r5.12xlarge was noted as the size to use for the full dataset):
# s3_data_path = f"s3://{s3_bucket}/preprocessed/data/csv/"
s3_data_path = (
    f"s3://{s3_bucket}/preprocessed/data/csv/"
    "part-00009-0afa893a-e1f7-408c-9dc3-a22b0207eb95-c000.csv"
)

# IAM role the notebook is running under; passed on to the training job.
role = get_execution_role()

# Look up the XGBoost container image for the session's region.
region = sagemaker.Session().boto_region_name
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.5-1",
)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Configure the training job that runs the XGBoost container.
# Relies on `container`, `role` and `s3_bucket` defined in the setup cell.
xgboost_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,  # Increase for distributed training if needed
    instance_type="ml.m5.xlarge",  # Choose instance type
    volume_size=30,  # GB, size of EBS volume
    max_run=3600,  # Max training time in seconds
    input_mode="Pipe",  # Use Pipe mode for large datasets
    # Model artifacts are written under the same bucket as the dataset.
    output_path=f"s3://{s3_bucket}/xgboost-output/",
)

# Hyperparameters forwarded to XGBoost.
xgboost_estimator.set_hyperparameters(
    objective="reg:squarederror",  # Regression task
    num_round=10,  # Number of boosting rounds
    max_depth=6,  # Maximum tree depth
    eta=0.3,  # Learning rate
    subsample=0.8,  # Subsampling ratio
    colsample_bytree=0.8,  # Subsampling of features
)


In [6]:
# Describe the training channel: CSV data at s3_data_path, read in Pipe mode
# (matching the estimator's input_mode).
training_input = TrainingInput(
    s3_data=s3_data_path,
    input_mode="Pipe",
    content_type="text/csv",
)


In [7]:
# Launch training, supplying the CSV input as the "train" channel.
xgboost_estimator.fit(inputs={"train": training_input})

print("Training job completed.")


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-11-24-05-20-36-140


2024-11-24 05:20:37 Starting - Starting the training job...
2024-11-24 05:20:51 Starting - Preparing the instances for training...
2024-11-24 05:21:18 Downloading - Downloading input data...
2024-11-24 05:21:43 Downloading - Downloading the training image...
  from pandas import MultiIndex, Int64Index[0m
[34m[2024-11-24 05:22:31.805 ip-10-2-88-184.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-11-24 05:22:31.826 ip-10-2-88-184.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-11-24:05:22:32:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-11-24:05:22:32:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-11-24:05:22:32:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-11-24:05:22:32:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2024-11-24:05:22:32:INFO] Determined 0 GP

In [None]:
# Deploy the trained estimator to an endpoint (estimator.deploy).
# For this run, the deployment was done from the console instead of in code.

In [1]:
# Send sample inputs to the endpoint and check the predictions.

In [3]:
import boto3
import json

# Runtime client used to call the deployed endpoint.
sagemaker_runtime = boto3.client("sagemaker-runtime")

# Name of the deployed endpoint (replace with your actual endpoint name).
endpoint_name = "xg-boost-endpoint"

# One sample as a single header-less CSV row.
# NOTE(review): the column order is not shown here — presumably it must match
# the training data's feature columns; confirm.
input_data = "1098,1,1,0,1,1,2.56,2.2,16"

# Send the row to the endpoint as text/csv.
response = sagemaker_runtime.invoke_endpoint(
    Body=input_data,
    ContentType="text/csv",
    EndpointName=endpoint_name,
)

# The response body is a stream; read it fully and decode it to text.
result = response["Body"].read().decode("utf-8")
print("Predicted Rent:", result)


Predicted Rent: 2765.92333984375



In [None]:
# If there are too many samples for real-time inference, use a batch transform job instead; it provisions its own compute automatically, so no persistent endpoint is needed.