In [1]:
# Check if sagemaker is installed using pip
!pip show sagemaker

Name: sagemaker
Version: 2.232.2
Summary: Open source library for training and deploying models on Amazon SageMaker.
Home-page: https://github.com/aws/sagemaker-python-sdk
Author: Amazon Web Services
Author-email: 
License: 
Location: c:\Users\shefa\Documents\GitHub\Advanced-Classification-for-Imbalanced-Data\venv_aws\Lib\site-packages
Requires: attrs, boto3, cloudpickle, docker, google-pasta, importlib-metadata, jsonschema, numpy, packaging, pandas, pathos, platformdirs, protobuf, psutil, pyyaml, requests, sagemaker-core, sagemaker-mlflow, schema, smdebug-rulesconfig, tblib, tqdm, urllib3
Required-by: 


In [None]:
import boto3
import sagemaker
import sagemaker.session
import sys
from sagemaker.workflow.pipeline_context import PipelineSession

In [None]:
# Shared SageMaker context used by the cells below.
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
pipeline_session = PipelineSession()

# Bare bucket name only (no "s3://" scheme): every consumer below builds its URI
# as "s3://{bucket}/...", so including the scheme here would yield "s3://s3://...".
bucket = "amazon-product-dataset-2024"
model_package_group_name = f"MLImbalancedClassificationAmazonDataset-{region}"
prefix = "processing-artifacts"  # Key prefix under the bucket where step outputs are stored

# Alternative: use the session's default bucket instead of a dedicated one.
# bucket = session.default_bucket()

In [None]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

# Pipeline parameters. Each one can be overridden when a pipeline execution is started;
# the default values below are used otherwise.

# Name of the model registry group.
model_registry_package = ParameterString(
    name="ComparativeMLModelGroup",
    default_value="default-registry",
)

# Data parameters
# `bucket` is expected to be a bare bucket name; the "s3://" scheme is added here.
input_data = ParameterString(
    name="InputData",
    default_value="s3://{}/transformed/transformed-data.parquet".format(bucket),
)  # S3 URI of the input data

# Script parameters for preprocessing and evaluation
preprocessing_script = ParameterString(
    name="PreprocessScript",
    default_value="s3://{}/scripts/preprocess/preprocess.py".format(bucket),
)  # S3 URI of the preprocessing script

# No "TrainingScript" parameter is defined: the training step in SageMaker does not
# allow a script to be passed in as a pipeline parameter.

evaluation_script = ParameterString(
    name="EvaluationScript",
    default_value="s3://{}/scripts/xgboost/evaluate.py".format(bucket),
)  # S3 URI of the evaluation script

################ WE WILL NEED TO CHANGE THIS FOR EACH MODEL PIPELINE ################

# Hyperparameters for XGBoost
# ParameterFloat validates that a default is a real `float`, so float defaults are
# written with a decimal point (4.0, 1.0) rather than as ints.
max_depth = ParameterInteger(name="MaxDepth", default_value=5)  # Maximum depth of the tree
eta = ParameterFloat(name="Eta", default_value=0.2)  # Step size shrinkage used in updates to prevent overfitting
gamma = ParameterFloat(name="Gamma", default_value=4.0)  # Minimum loss reduction required to make a further partition on a leaf node
min_child_weight = ParameterInteger(name="MinChildWeight", default_value=6)  # Minimum sum of instance weight (hessian) needed in a child
subsample = ParameterFloat(name="Subsample", default_value=0.8)  # Subsample ratio of the training instances
num_round = ParameterInteger(name="NumRound", default_value=100)  # Number of boosting rounds
objective = ParameterString(name="Objective", default_value="binary:logistic")  # Learning task and corresponding objective
eval_metric = ParameterString(name="EvalMetric", default_value="auc")  # Evaluation metric for validation data
# Early stopping: the validation metric must improve at least once in every
# `early_stopping_rounds` round(s) for training to continue.
early_stopping_rounds = ParameterInteger(name="EarlyStoppingRounds", default_value=10)
# Balance of positive and negative weights, useful for unbalanced classes.
# Heuristic: count(negative examples) / count(positive examples), or its square root.
# Source: https://stats.stackexchange.com/questions/243207/what-is-the-proper-usage-of-scale-pos-weight-in-xgboost-for-imbalanced-datasets
# NOTE(review): the original note quoted 1364475/8112 with a result of 21.5, but
# 1364475/8112 is about 168.2 and its square root about 13.0 - confirm the intended value.
scale_pos_weight = ParameterFloat(name="ScalePosWeight", default_value=1.0)

# Resource parameters
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)  # number of processing instances
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.large")  # processing instance type
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")  # training instance type
# NOTE(review): "Maxium" in the two parameter names below looks like a typo for "Maximum".
# The names are left unchanged because callers may already pass overrides under them.
# Maximum number of training jobs allowed in the hyperparameter tuning job.
max_training_jobs = ParameterInteger(name="MaxiumTrainingJobs", default_value=1)
# Maximum number of training jobs that may run in parallel.
max_parallel_training_jobs = ParameterInteger(name="MaxiumParallelTrainingJobs", default_value=1)

# No accuracy-threshold parameter is defined: it is not relevant for comparative
# models, because all models are registered with the Model Registry.



In [None]:
# ProcessingStep

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.functions import Join
from sagemaker.workflow.execution_variables import ExecutionVariables


# scikit-learn processor that runs the preprocessing job.
# It is bound to the pipeline session, so calling .run() on it yields step
# arguments for the pipeline instead of launching a job immediately.
sklearn_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name="sklearn-abalone-process",
    framework_version="1.2-1",
    instance_count=processing_instance_count,  # overridable pipeline parameter
    instance_type="ml.m5.xlarge",
)

In [None]:
# One ProcessingOutput per data split. Each split is written by the script to
# /opt/ml/processing/<split> and uploaded to
# s3://<bucket>/<prefix>/<pipeline execution id>/<split>.
# `bucket` is expected to be a bare bucket name (the scheme is added here).
split_outputs = [
    ProcessingOutput(
        output_name=split,
        source="/opt/ml/processing/{}".format(split),
        destination=Join(
            on="/",
            values=[
                "s3://{}".format(bucket),
                prefix,
                ExecutionVariables.PIPELINE_EXECUTION_ID,
                split,
            ],
        ),
    )
    for split in ("train", "validation", "test")
]

processor_args = sklearn_processor.run(
    inputs=[
        ProcessingInput(
            source=input_data,
            destination="/opt/ml/processing/input",
        )
    ],
    outputs=split_outputs,
    # `code` must be a concrete S3 URI or local path: the SDK rejects a pipeline
    # variable here, so the parameter's default value is passed instead of the
    # ParameterString object itself.
    code=preprocessing_script.default_value,
)

step_process = ProcessingStep(
    name="Prepare-Data",
    display_name="Preprocessing",
    step_args=processor_args,
)

In [None]:
# Training and tuning step

from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter
from sagemaker.workflow.steps import TuningStep

# Resolve the built-in XGBoost container image for this region.
# image_uris.retrieve runs now, at pipeline-definition time, and does not accept
# pipeline variables, so the concrete default of the TrainingInstanceType
# parameter is passed rather than the ParameterString itself.
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",  # TODO: check for a newer supported version
    py_version="py3",
    instance_type=training_instance_type.default_value,
)


# Generic estimator wrapping the XGBoost container image; the tuner below
# launches its training jobs from this definition.
xgb_estimator = Estimator(
    role=role,
    image_uri=image_uri,
    instance_count=1,
    instance_type=training_instance_type,  # overridable pipeline parameter
    disable_profiler=True,
)


# Search space explored by the tuning job.
# TODO: these ranges are provisional and still need to be edited.
xgb_hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 0.5),
    "alpha": ContinuousParameter(0, 1000),
    "min_child_weight": ContinuousParameter(1, 120),
    "max_depth": IntegerParameter(1, 10),
    "num_round": IntegerParameter(1, 2000),
    "subsample": ContinuousParameter(0.5, 1),
    "scale_pos_weight": ContinuousParameter(15, 50),
}

# Bayesian hyperparameter search over the estimator above.
# AUCPR is the metric to optimize because the classes are extremely imbalanced.
xgb_tuner = HyperparameterTuner(
    estimator=xgb_estimator,
    objective_metric_name="validation:aucpr",
    hyperparameter_ranges=xgb_hyperparameter_ranges,
    strategy="Bayesian",
    early_stopping_type="Auto",
    max_jobs=max_training_jobs,  # total number of training jobs to run
    max_parallel_jobs=max_parallel_training_jobs,
)
        

In [None]:
# Input channels for the tuning job: each channel reads the CSV output that the
# preprocessing step produced under the output of the same name.
tuning_inputs = {
    channel: TrainingInput(
        s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
            channel
        ].S3Output.S3Uri,
        content_type="text/csv",
    )
    for channel in ("train", "validation")
}

# Pipeline step that runs the hyperparameter tuning job.
step_tuning = TuningStep(
    name="Train-And-Tune-Model",
    display_name="Train-And-Tune-Model",
    tuner=xgb_tuner,
    inputs=tuning_inputs,
)

In [None]:
# Evaluate step

# Create a ScriptProcessor to evaluate the model
from sagemaker.processing import ScriptProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Container image for the evaluation job. The evaluation code in this notebook
# imports xgboost and unpickles the trained model, so the XGBoost image (same
# version as used for training) is resolved here with concrete arguments.
evaluation_image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
    py_version="py3",
    instance_type="ml.m5.xlarge",
)

# Processor that runs the evaluation script with python3 inside that image.
evaluate_model_processor = ScriptProcessor(
    image_uri=evaluation_image_uri,
    command=["python3"],
    instance_type="ml.m5.xlarge",
    instance_count=1,
    # Plain hyphenated name: no undefined prefix variable and no "/" character.
    base_job_name="xgboost-evaluation",
    role=role,
    sagemaker_session=pipeline_session,
)

# Script that will be used for model evaluation: the concrete S3 URI held by the
# EvaluationScript pipeline parameter (a plain string, as `code=` requires).
evaluate_script = evaluation_script.default_value

# Create property file to store evaluation metrics
from sagemaker.workflow.properties import PropertyFile

# Exposes evaluation.json (written to the "evaluation" processing output) so
# later pipeline steps can read values from it.
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)


In [None]:
import json
import pathlib
import pickle
import tarfile
import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    roc_curve,
    balanced_accuracy_score
)


# Evaluate the trained model on the test split and write the metrics report.
# NOTE(review): all paths are under /opt/ml/processing, so this code presumably is
# the body of the evaluation script executed inside the processing container,
# not something meant to run in the notebook kernel - confirm.
model_path = "/opt/ml/processing/model/model.tar.gz"

# Unpack into the working directory so the relative open() below finds the file.
with tarfile.open(model_path) as tar:
    tar.extractall(path=".")

# pickle can execute arbitrary code while loading; only load artifacts that were
# produced by a trusted training job.
with open("xgboost-model", "rb") as model_file:
    model = pickle.load(model_file)

# The test split is a CSV file, so it is read with read_csv.
# NOTE(review): header=None assumes the file has no header row - confirm against
# the preprocessing script.
test_path = "/opt/ml/processing/test/test.csv"
df = pd.read_csv(test_path, header=None)

# Column 0 holds the label; all remaining columns are features.
y_test = df.iloc[:, 0].to_numpy()
X_test = xgboost.DMatrix(df.iloc[:, 1:].values)

# Rounding the model output turns it into 0/1 class predictions.
prediction_probabilities = model.predict(X_test)
predictions = np.round(prediction_probabilities)

precision = precision_score(y_test, predictions, zero_division=1)
recall = recall_score(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
balanced_accuracy = balanced_accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
fpr, tpr, _ = roc_curve(y_test, prediction_probabilities)

report_dict = {
    "binary_classification_metrics": {
        "accuracy": {"value": accuracy, "standard_deviation": "NaN"},
        "balanced_accuracy": {"value": balanced_accuracy, "standard_deviation": "NaN"},
        "precision": {"value": precision, "standard_deviation": "NaN"},
        "recall": {"value": recall, "standard_deviation": "NaN"},
        "confusion_matrix": {
            "0": {"0": int(conf_matrix[0][0]), "1": int(conf_matrix[0][1])},
            "1": {"0": int(conf_matrix[1][0]), "1": int(conf_matrix[1][1])},
        },
        "receiver_operating_characteristic_curve": {
            "false_positive_rates": fpr.tolist(),
            "true_positive_rates": tpr.tolist(),
        },
    },
}

# Write the report where the processing job's "evaluation" output is collected.
output_dir = "/opt/ml/processing/evaluation"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

evaluation_path = f"{output_dir}/evaluation.json"
with open(evaluation_path, "w") as f:
    json.dump(report_dict, f)

In [None]:
# Evaluation step: runs the evaluation script against the best model found by the
# tuning step and the test split produced by the preprocessing step.
step_evaluate_model = ProcessingStep(
    name="EvaluateModel",
    processor=evaluate_model_processor,
    inputs=[
        ProcessingInput(
            # This notebook defines `step_tuning` (a TuningStep), not `step_train`.
            # top_k=0 selects the best-performing training job of the tuning run.
            # NOTE(review): xgb_estimator sets no output_path, so its artifacts
            # presumably land in the session's default bucket - confirm, or set an
            # explicit output_path and pass the matching bucket/prefix here.
            source=step_tuning.get_top_model_s3_uri(
                top_k=0,
                s3_bucket=session.default_bucket(),
            ),
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs[
                "test"
            ].S3Output.S3Uri,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation",
            # `bucket` is expected to be a bare bucket name (the scheme is added here).
            destination=Join(
                on="/",
                values=[
                    "s3://" + bucket,
                    prefix,
                    ExecutionVariables.PIPELINE_EXECUTION_ID,
                    "evaluation",
                ],
            ),
        ),
    ],
    code=evaluate_script,
    property_files=[evaluation_report],
)