In [6]:
from sagemaker.workflow.function_step import step
from sagemaker.workflow.pipeline import Pipeline
import sagemaker
from sagemaker.workflow.parameters import ParameterInteger
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.fail_step import FailStep

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


# GLOBAL VARIABLES

In [7]:
# Pipeline stages, in order: data_pull, model_training, model_evaluation, model_registration
# Pipeline name; also handed to register() as the name of the registered model.
pipeline_name = "pipeline-train"
# Execution role of the current session, passed to pipeline.upsert() as role_arn.
role = sagemaker.get_execution_role()
# Instance type shared by every @step-decorated function in this notebook.
instance_type = "ml.m5.large"
# Pipeline parameters bounding the load period. The DataPull query compares them
# (inclusive, via BETWEEN) against cod_month, a yyyymm integer.
# No default_value is declared for either parameter.
cod_month_start = ParameterInteger(name="PeriodoCargaInicio")
cod_month_end = ParameterInteger(name="PeriodoCargaFin")
# MLflow tracking server every step points mlflow.set_tracking_uri() at.
tracking_server_arn = 'arn:aws:sagemaker:us-east-1:635106763104:mlflow-tracking-server/mlops-utec-mlflow-server3'
# MLflow experiment under which every pipeline execution is recorded.
experiment_name = "pipeline-train-experiment"

# DATA PULL

In [8]:
%%writefile data_pull_requirements.txt
# awswrangler runs the Athena query; mlflow + sagemaker-mlflow are needed because
# the DataPull step opens the parent MLflow run on the SageMaker tracking server.
# The mlflow pins match model_training_requirements.txt so all steps agree.
awswrangler==3.12.0
mlflow==2.13.2
sagemaker-mlflow==0.1.0

Overwriting data_pull_requirements.txt


In [9]:
@step(
    name="DataPull",
    instance_type=instance_type,
    dependencies="./data_pull_requirements.txt"
)
def data_pull(experiment_name: str, run_name: str,
              cod_month_start: int, cod_month_end: int) -> tuple[str, str, str]:
    """Query the labelled transactions for a month range and stage them on S3.

    Opens the parent MLflow run for the pipeline execution (named ``run_name``)
    and, inside a nested "DataPull" run, reads the training set from Athena,
    writes it as CSV to S3 and logs it as an MLflow dataset input.

    Args:
        experiment_name: MLflow experiment to record the run under.
        run_name: Name given to the parent MLflow run.
        cod_month_start: First month to load, as a yyyymm integer (inclusive).
        cod_month_end: Last month to load, as a yyyymm integer (inclusive).

    Returns:
        ``(train_s3_path, experiment_name, run_id)`` where ``run_id`` is the id
        of the parent MLflow run that later steps resume.

    Raises:
        ValueError: If the bounds are not integers, if the start month is after
            the end month, or if the query returns no rows.
    """
    import awswrangler as wr
    import mlflow

    # The bounds are interpolated into SQL text below, so force them to plain
    # integers first; anything that is not an integer fails here instead of
    # reaching Athena.
    cod_month_start = int(cod_month_start)
    cod_month_end = int(cod_month_end)
    if cod_month_start > cod_month_end:
        # BETWEEN with inverted bounds matches nothing; fail before a run is opened.
        raise ValueError(
            "cod_month_start ({}) must not be greater than cod_month_end ({})"
            .format(cod_month_start, cod_month_end)
        )

    mlflow.set_tracking_uri(tracking_server_arn)
    mlflow.set_experiment(experiment_name)
    TARGET_COL = "is_fraud"
    query = """
    WITH TRAIN as
    (
        SELECT  transaction_id
                ,customer_id
                ,amount
                ,merchant_category
                ,merchant_country
                ,cast(card_present as int) card_present
                ,cast(is_fraud as int) is_fraud
                ,cast(date_format(timestamp,'%Y%m') as int) as cod_month
                ,COUNT(1) OVER(PARTITION BY customer_id ORDER BY timestamp RANGE BETWEEN INTERVAL '1' month PRECEDING AND CURRENT ROW) as trx_vel_last_1mths
                ,COUNT(1) OVER(PARTITION BY customer_id ORDER BY timestamp RANGE BETWEEN INTERVAL '2' MONTH PRECEDING AND CURRENT ROW) as trx_vel_last_2mths
                ,SUM(amount) OVER(PARTITION BY customer_id ORDER BY timestamp RANGE BETWEEN INTERVAL '1' MONTH PRECEDING AND CURRENT ROW) as amt_vel_last_1mths
                ,SUM(amount) OVER(PARTITION BY customer_id ORDER BY timestamp RANGE BETWEEN INTERVAL '2' MONTH PRECEDING AND CURRENT ROW) as amt_vel_last_2mths
        FROM    RISK_MANAGEMENT.CREDIT_CARD_TRANSACTIONS
        WHERE   is_fraud is not null
    )
    SELECT  *
    FROM    TRAIN
    WHERE   cod_month between {} and {}
    """.format(cod_month_start, cod_month_end)
    train_s3_path = "s3://mlops-utec-rpa/fraud-detection/train_data/train.csv"
    with mlflow.start_run(run_name=run_name) as run:
        # Parent run id: downstream steps reopen this run and nest under it.
        run_id = run.info.run_id
        with mlflow.start_run(run_name="DataPull", nested=True):
            df = wr.athena.read_sql_query(sql=query, database="risk_management")
            if df.empty:
                # An empty training set would only surface later as an obscure
                # failure inside the training step.
                raise ValueError(
                    "No transactions found for cod_month between {} and {}"
                    .format(cod_month_start, cod_month_end)
                )
            df.to_csv(train_s3_path, index=False)
            mlflow.log_input(
                mlflow.data.from_pandas(df, train_s3_path,
                                        targets=TARGET_COL),
                context="DataPull"
            )
    return train_s3_path, experiment_name, run_id

# MODEL TRAINING

In [10]:
%%writefile model_training_requirements.txt
# Shared by the ModelTraining, ModelEvaluation and ModelRegistration steps.
mlflow==2.13.2
sagemaker-mlflow==0.1.0

Overwriting model_training_requirements.txt


In [12]:
@step(
    name="ModelTraining",
    instance_type=instance_type,
    dependencies="./model_training_requirements.txt"
)
def model_training(train_s3_path: str, experiment_name: str,
                   run_id: str) -> tuple[str, str, str, str]:
    """Train the XGBoost fraud classifier on the staged training data.

    Reads the CSV produced by the data-pull step, splits it into train and
    hold-out sets, writes the hold-out set to S3 and fits the classifier
    inside a nested "ModelTraining" MLflow run with XGBoost autologging on.

    Args:
        train_s3_path: S3 URI of the training CSV returned by the data-pull step.
        experiment_name: MLflow experiment to record the run under.
        run_id: Id of the parent MLflow run to resume.

    Returns:
        ``(test_s3_path, experiment_name, run_id, training_run_id)`` where
        ``training_run_id`` is the nested run that holds the logged model.
    """
    import pandas as pd
    import mlflow
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier
    TARGET_COL = "is_fraud"
    SEED = 42
    TRAIN_SPLIT = 0.7
    FEATURES = ['card_present', 'trx_vel_last_1mths', 'trx_vel_last_2mths',
                'amt_vel_last_1mths', 'amt_vel_last_2mths']
    mlflow.set_tracking_uri(tracking_server_arn)
    mlflow.set_experiment(experiment_name)
    # Read what the upstream step actually produced instead of a hard-coded URI,
    # so the two steps cannot drift apart.
    df = pd.read_csv(train_s3_path)
    X = df[FEATURES]
    y = df[TARGET_COL]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=TRAIN_SPLIT,
                                                        random_state=SEED)
    use_gpu = False
    param = dict(
        objective="binary:logistic",
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        tree_method="gpu_hist" if use_gpu else "hist",
        n_estimators=50
    )
    # Resume the parent run opened by the data-pull step and nest training under it.
    with mlflow.start_run(run_id=run_id):
        with mlflow.start_run(run_name="ModelTraining",
                              nested=True) as training_run:
            training_run_id = training_run.info.run_id
            test_s3_path = "s3://mlops-utec-rpa/fraud-detection/test_data/test.csv"
            # Persist the hold-out set (features + target) for the evaluation step.
            df_test = pd.concat([X_test, y_test], axis=1)
            df_test.to_csv(test_s3_path, index=False)
            mlflow.log_input(
                mlflow.data.from_pandas(df_test, test_s3_path,
                                        targets=TARGET_COL),
                context="ModelTraining"
            )
            # Autologging must be enabled before fit() so the model is logged
            # to this nested run.
            mlflow.xgboost.autolog(
                log_input_examples=True,
                log_model_signatures=True,
                log_models=True,
                log_datasets=True,
                model_format="xgb",
            )
            xgb = XGBClassifier(**param)
            xgb.fit(X_train, y_train)
    return test_s3_path, experiment_name, run_id, training_run_id

# MODEL EVALUATION

In [13]:
@step(
    name="ModelEvaluation",
    instance_type=instance_type,
    dependencies="./model_training_requirements.txt"
)
def evaluate(
    test_s3_path: str,
    experiment_name: str,
    run_id: str,
    training_run_id: str,
) -> dict:
    """Score the trained model on the hold-out set and report its F1.

    Resumes the parent MLflow run, opens a nested "ModelEvaluation" run, loads
    the model logged by the training run and evaluates it as a classifier with
    MLflow's default evaluator.

    Args:
        test_s3_path: S3 URI of the hold-out CSV written by the training step.
        experiment_name: MLflow experiment to record the run under.
        run_id: Id of the parent MLflow run to resume.
        training_run_id: Id of the nested run that holds the logged model.

    Returns:
        A dict with a single key, ``"f1_score"``.
    """
    import pandas as pd
    import mlflow

    label_column = "is_fraud"
    trained_model_uri = f"runs:/{training_run_id}/model"

    mlflow.set_tracking_uri(tracking_server_arn)
    mlflow.set_experiment(experiment_name)

    # Parent run first, then the nested evaluation run; both close on exit.
    with mlflow.start_run(run_id=run_id), \
            mlflow.start_run(run_name="ModelEvaluation", nested=True):
        holdout = pd.read_csv(test_s3_path)
        fraud_model = mlflow.pyfunc.load_model(trained_model_uri)
        evaluation = mlflow.evaluate(
            model=fraud_model,
            data=holdout,
            targets=label_column,
            model_type="classifier",
            evaluators=["default"],
        )
        f1 = evaluation.metrics["f1_score"]

    return {"f1_score": f1}

# MODEL REGISTRATION

In [14]:
@step(
    name="ModelRegistration",
    instance_type=instance_type,
    dependencies="./model_training_requirements.txt"
)
def register(
    pipeline_name: str,
    experiment_name: str,
    run_id: str,
    training_run_id: str,
):
    """Register the model logged by the training run in the MLflow registry.

    Args:
        pipeline_name: Name to register the model under.
        experiment_name: MLflow experiment to record the run under.
        run_id: Id of the parent MLflow run to resume.
        training_run_id: Id of the nested run that holds the logged model.
    """
    import mlflow

    trained_model_uri = f"runs:/{training_run_id}/model"

    mlflow.set_tracking_uri(tracking_server_arn)
    mlflow.set_experiment(experiment_name)

    # Registration is recorded as its own nested run under the parent run.
    with mlflow.start_run(run_id=run_id), \
            mlflow.start_run(run_name="ModelRegistration", nested=True):
        mlflow.register_model(trained_model_uri, pipeline_name)

# PIPELINE

In [15]:
# Stage 1: pull the training data; the execution id doubles as the MLflow run name.
data_pull_step = data_pull(
    experiment_name=experiment_name,
    run_name=ExecutionVariables.PIPELINE_EXECUTION_ID,
    cod_month_start=cod_month_start,
    cod_month_end=cod_month_end,
)

# Stage 2: train on the outputs of stage 1 (train path, experiment, parent run id).
model_training_step = model_training(
    train_s3_path=data_pull_step[0],
    experiment_name=data_pull_step[1],
    run_id=data_pull_step[2],
)

# Stage 3: evaluate the model produced by stage 2.
evaluation_step = evaluate(
    test_s3_path=model_training_step[0],
    experiment_name=model_training_step[1],
    run_id=model_training_step[2],
    training_run_id=model_training_step[3],
)

# Stage 4: registration, only reached when the quality gate below passes.
registration_step = register(
    pipeline_name=pipeline_name,
    experiment_name=model_training_step[1],
    run_id=model_training_step[2],
    training_run_id=model_training_step[3],
)

# Quality gate: register when f1_score >= 0.6, otherwise fail the execution.
f1_is_good_enough = ConditionGreaterThanOrEqualTo(
    left=evaluation_step["f1_score"],
    right=0.6,
)
conditional_register_step = ConditionStep(
    name="ConditionalRegister",
    conditions=[f1_is_good_enough],
    if_steps=[registration_step],
    else_steps=[
        FailStep(
            name="Fail",
            error_message="Model performance is not good enough",
        )
    ],
)

In [16]:
# Assemble the pipeline from its top-level steps and declared parameters,
# then create or update its definition using the session's execution role.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[cod_month_start, cod_month_end],
    steps=[
        data_pull_step,
        model_training_step,
        conditional_register_step,
    ],
)
pipeline.upsert(role_arn=role)

2025-06-01 08:18:13,435 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-635106763104/pipeline-train/DataPull/2025-06-01-08-18-13-152/function
2025-06-01 08:18:13,512 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-1-635106763104/pipeline-train/DataPull/2025-06-01-08-18-13-152/arguments
2025-06-01 08:18:13,812 sagemaker.remote_function INFO     Copied dependencies file at './data_pull_requirements.txt' to '/tmp/tmp4jkf6b_z/data_pull_requirements.txt'
2025-06-01 08:18:13,840 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-1-635106763104/pipeline-train/DataPull/2025-06-01-08-18-13-152/pre_exec_script_and_dependencies'
2025-06-01 08:18:13,843 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-635106763104/pipeline-train/ModelTraining/2025-06-01-08-18-13-152/function
20

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:635106763104:pipeline/pipeline-train',
 'ResponseMetadata': {'RequestId': '329f02fd-00bb-4f9f-845d-f2046073a2e7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '329f02fd-00bb-4f9f-845d-f2046073a2e7',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '82',
   'date': 'Sun, 01 Jun 2025 08:18:16 GMT'},
  'RetryAttempts': 0}}

In [17]:
# Launch one execution over the 202410-202412 load period.
pipeline.start(
    parameters={
        "PeriodoCargaInicio": 202410,
        "PeriodoCargaFin": 202412,
    },
    execution_display_name="test-train-full",
    execution_description="Testando training full",
)

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:635106763104:pipeline/pipeline-train/execution/agn7atl60u19', sagemaker_session=<sagemaker.session.Session object at 0x7fd040aabfe0>)

In [None]:
# The pipeline parameters are declared without default values, so each
# execution must be given both load-period bounds explicitly.
execution = pipeline.start(parameters={"PeriodoCargaInicio": 202410,
                                       "PeriodoCargaFin": 202412})
execution.describe()
# Block until the execution finishes, then show the per-step results.
execution.wait()
execution.list_steps()