In [2]:
import os
import time
import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.workflow.pipeline_context import PipelineSession
import json
from sagemaker import ModelPackage
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.estimator import Estimator
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.processing import ScriptProcessor
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
sess = boto3.Session()
sm = sess.client("sagemaker")
role = get_execution_role()
sagemaker_session = sagemaker.Session(boto_session=sess)
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

pipeline_session = PipelineSession()

print(bucket)

sagemaker-us-east-1-836402295281


In [4]:
GROUP_NAME = 'GROUP6' # CHANGE THIS TO YOUR FIRST NAME
S3_PATH = f's3://{bucket}/' # S3 path prefix
DATA_PREFIX = f'{GROUP_NAME}/data/' # S3 prefix to store data
MODEL_PREFIX = f'{GROUP_NAME}/model/' # S3 prefix to store the XGBoost training information and model.

BASE_JOB_PROCESSING_NAME = f'{GROUP_NAME}-processing'  # base_job_name for preprocessing
BASE_JOB_TRAINING_NAME = f'{GROUP_NAME}-training'  # base_job_name for training
BASE_JOB_EVALUATION_NAME = f'{GROUP_NAME}-evaluation'  # base_job_name for evaluation

PIPELINE_NAME = f'{GROUP_NAME}-pipeline'  # SageMaker Pipeline name
MODEL_PACKAGE_GROUP_NAME = f'{GROUP_NAME}-ModelPackageGroup'  # Model package group name in the Model Registry

print(f'DATA_PREFIX: {DATA_PREFIX}')
print(f'PIPELINE_NAME: {PIPELINE_NAME}')
print(f'MODEL_PACKAGE_GROUP_NAME: {MODEL_PACKAGE_GROUP_NAME}')

DATA_PREFIX: GROUP6/data/
PIPELINE_NAME: GROUP6-pipeline
MODEL_PACKAGE_GROUP_NAME: GROUP6-ModelPackageGroup


In [5]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat

# Define initial parameters
input_data_path1 = f'{DATA_PREFIX}/US_flights_2023.csv'
input_data_path2 = f'{DATA_PREFIX}/airports_geolocation.csv'
input_data_path3 = f'{DATA_PREFIX}/weather_meteo_by_airport.csv'

# raw input data
input_data1 = ParameterString(name="InputData1", default_value=input_data_path1)
input_data2 = ParameterString(name="InputData2", default_value=input_data_path2)
input_data3 = ParameterString(name="InputData3", default_value=input_data_path3)

# status of newly trained model in registry
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")

# processing step parameters
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value="ml.m5.xlarge"
)
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# training step parameters
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
training_epochs = ParameterString(name="TrainingEpochs", default_value="100")

# model performance step parameters
accuracy_rmse_threshold = ParameterFloat(name="AccuracyRMSEThreshold", default_value=2.0)

In [6]:
s3_bucket_path = 's3://' + sagemaker.Session().default_bucket() + '/'

s3_bucket_path

's3://sagemaker-us-east-1-836402295281/'

In [7]:
# Specify the local paths to your files and the S3 prefix (directory) to upload to
files = ['US_flights_2023.csv', 'airports_geolocation.csv', 'weather_meteo_by_airport.csv']

# Upload files to S3
for file in files:
    sagemaker_session.upload_data(path=file, key_prefix=DATA_PREFIX)
    
# Specify the paths to your uploaded files
file_paths = [f'{DATA_PREFIX}/US_flights_2023.csv', f'{DATA_PREFIX}/airports_geolocation.csv', f'{DATA_PREFIX}/weather_meteo_by_airport.csv']

In [54]:
s3 = boto3.client('s3')
s3.download_file(bucket,'GROUP6/data/test.csv','test.csv')

In [16]:
%%writefile code/preprocess.py
import numpy as np
import pandas as pd
import os
import joblib
from io import StringIO
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
import tarfile
import logging

# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

try:
    from sagemaker_containers.beta.framework import (
        content_types,
        encoders,
        env,
        modules,
        transformer,
        worker,
        server,
    )
except ImportError:
    pass

RANDOM_STATE = 2024
LABEL_COLUMN = 'Dep_Delay'
feature_columns = ['Day_Of_Week', 'Airline', 'Dep_Airport', 'Dep_CityName', 'DepTime_label', 'Distance_type', 'Manufacturer', 'Model', 'Aicraft_age',
                   'STATE', 'LATITUDE', 'LONGITUDE', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres', 'FlightMonth']

one_hot_columns = ['Day_Of_Week', 'Airline', 'Dep_Airport', 'Dep_CityName', 'DepTime_label', 'Distance_type', 'Manufacturer', 'Model', 'STATE', 'FlightMonth']
non_one_hot_columns = ['Aicraft_age', 'LATITUDE', 'LONGITUDE', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']

base_dir = "/opt/ml/processing"
base_output_dir = "/opt/ml/processing/output/"

if __name__ == "__main__":
    logger.debug("Starting preprocessing script")

    # Define the input data path within the processing environment
    input_data_path1 = f'{base_dir}/input/US_flights_2023.csv'
    input_data_path2 = f'{base_dir}/input/airports_geolocation.csv'
    input_data_path3 = f'{base_dir}/input/weather_meteo_by_airport.csv'
    
    # Read the CSV file from the input path
    logger.debug("Reading input data")
    df1 = pd.read_csv(input_data_path1)
    df2 = pd.read_csv(input_data_path2)
    df3 = pd.read_csv(input_data_path3)
    
    # Sample the data to reduce size (e.g., take a 2% sample)
    logger.debug("Sampling data to reduce size")
    df1_sample = df1.sample(frac=0.02, random_state=RANDOM_STATE)

    # Merge sampled data
    logger.debug("Merging dataframes")
    merged_df = pd.merge(df1_sample, df2, left_on='Dep_Airport', right_on='IATA_CODE', how='left')
    
    merged_df['FlightDate'] = pd.to_datetime(merged_df['FlightDate'])
    df3['time'] = pd.to_datetime(df3['time'])
    merged_df['FlightMonth'] = merged_df['FlightDate'].dt.month
    
    df = pd.merge(merged_df, df3, left_on=['Dep_Airport', 'FlightDate'], right_on=['airport_id', 'time'], how='left')

    # Include only relevant columns in feature_data
    feature_data = df[feature_columns]
    label_data = df[LABEL_COLUMN]

    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(), one_hot_columns),
            ('scaler', StandardScaler(), non_one_hot_columns)
        ],
        remainder='passthrough'
    )

    logger.debug("Applying transformations to the data")
    feature_data_transformed = preprocessor.fit_transform(feature_data)

    # Ensure transformed features have correct shape
    logger.debug(f"Transformed feature shape: {feature_data_transformed.shape}")

    logger.debug("Splitting data into train, validation, and test sets")
    x_train, x_temp, y_train, y_temp = train_test_split(feature_data_transformed, label_data, test_size=0.2, random_state=42)
    x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

    # Ensure y arrays are 2D
    y_train = y_train.values.reshape(-1, 1)
    y_val = y_val.values.reshape(-1, 1)
    y_test = y_test.values.reshape(-1, 1)

    # Verify the shapes
    logger.debug(f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}")
    logger.debug(f"x_val shape: {x_val.shape}, y_val shape: {y_val.shape}")
    logger.debug(f"x_test shape: {x_test.shape}, y_test shape: {y_test.shape}")

    # Print a few rows of each to verify
    logger.debug(f"First 5 rows of x_train: {x_train[:5]}")
    logger.debug(f"First 5 rows of y_train: {y_train[:5]}")
    
    logger.debug("Combining features and labels")
    train_dataset = pd.DataFrame(np.hstack((y_train, x_train.toarray())))
    val_dataset = pd.DataFrame(np.hstack((y_val, x_val.toarray())))
    test_dataset = pd.DataFrame(np.hstack((y_test, x_test.toarray())))

    # Get feature columns after transformation
    transformed_feature_columns = preprocessor.get_feature_names_out()

    train_dataset.columns = [LABEL_COLUMN] + list(transformed_feature_columns)
    val_dataset.columns = [LABEL_COLUMN] + list(transformed_feature_columns)
    test_dataset.columns = [LABEL_COLUMN] + list(transformed_feature_columns)
    
    logger.debug("Creating output directories if they don't exist")
    os.makedirs(f'{base_output_dir}train', exist_ok=True)
    os.makedirs(f'{base_output_dir}validation', exist_ok=True)
    os.makedirs(f'{base_output_dir}test', exist_ok=True)
    os.makedirs(f'{base_output_dir}preprocessor', exist_ok=True)

    logger.debug("Saving datasets to CSV")
    train_dataset.to_csv(f'{base_output_dir}train/train.csv', header=False, index=False)
    val_dataset.to_csv(f'{base_output_dir}validation/validation.csv', header=False, index=False)
    test_dataset.to_csv(f'{base_output_dir}test/test.csv', header=False, index=False)

    logger.debug("Saving preprocessor model")
    joblib.dump(preprocessor, f'{base_output_dir}preprocessor/preprocessor.joblib')
    
    with tarfile.open(f'{base_output_dir}preprocessor/preprocessor.tar.gz', 'w:gz') as tar_handle:
        tar_handle.add(f'{base_output_dir}preprocessor/preprocessor.joblib', arcname='preprocessor.joblib')

    logger.debug("Preprocessing script completed successfully")

def input_fn(input_data, content_type):
    if content_type == "text/csv":
        df = pd.read_csv(StringIO(input_data), header=None)
        df.columns = transformed_feature_columns if len(df.columns) == len(transformed_feature_columns) else transformed_feature_columns + [LABEL_COLUMN]
        return df
    else:
        raise ValueError("{} not supported by script!".format(content_type))

def output_fn(prediction, accept):
    if accept == "application/json":
        instances = [row.tolist() for row in prediction]
        json_output = {"instances": instances}
        return worker.Response(json.dumps(json_output), mimetype=accept)
    elif accept == "text/csv":
        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)
    else:
        print(f"Warning: {accept} accept type is not supported by this script. Defaulting to text/csv.")
        return worker.Response(encoders.encode(prediction, "text/csv"), mimetype="text/csv")

def predict_fn(input_data, model):
    features = model.transform(input_data)
    if LABEL_COLUMN in input_data:
        return np.insert(features, 0, input_data[LABEL_COLUMN], axis=1)
    else:
        return features

def model_fn(model_dir):
    preprocessor = joblib.load(os.path.join(model_dir, "preprocessor.joblib"))
    return preprocessor


Overwriting code/preprocess.py


In [17]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

sklearn_processor = SKLearnProcessor(
    framework_version='1.2-1',
    role=role,
    instance_type='ml.c5.xlarge',
    instance_count=1 
)

sklearn_processor.run(
    code="code/preprocess.py",
    inputs=[
        ProcessingInput(
            source=S3_PATH + DATA_PREFIX,
            destination="/opt/ml/processing/input"
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="scaler_model",
            source="/opt/ml/processing/output/preprocessor",
            destination=S3_PATH + MODEL_PREFIX
        ),
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/output/train",
            destination=S3_PATH + DATA_PREFIX
        ),
        ProcessingOutput(
            output_name="validation",
            source="/opt/ml/processing/output/validation",
            destination=S3_PATH + DATA_PREFIX
        ),
        ProcessingOutput(
            output_name="test",
            source="/opt/ml/processing/output/test",
            destination=S3_PATH + DATA_PREFIX
        ),
    ],
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-06-20-13-18-29-805


.............[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mDEBUG:__main__:Starting preprocessing script[0m
[34mDEBUG:__main__:Reading input data[0m
[34mDEBUG:__main__:Sampling data to reduce size[0m
[34mDEBUG:__main__:Merging dataframes[0m
[34mDEBUG:__main__:Applying transformations to the data[0m
[34mDEBUG:__main__:Transformed feature shape: (134868, 813)[0m
[34mDEBUG:__main__:Splitting data into train, validation, and test sets[0m
[34mDEBUG:__main__:x_train shape: (107894, 813), y_train shape: (107894, 1)[0m
[34mDEBUG:__main__:x_val shape: (13487, 813), y_val shape: (13487, 1)[0m
[34mDEBUG:__main__:x_test shape: (13487, 813), y_test shape: (13487, 1)[0m
[34mDEBUG:__main__:First 5 rows of x_train:   (0, 2)#0111.0
  (0, 18)#0111.0
  (0, 120)#0111.0
  (0, 452)#0111.0
  (0, 706)#0111.0
  (0, 712)#0111.0
  (0, 715)#0111.0
  (0, 735)#0111.0
  (0, 758)#0111.0
  (0, 790)#0111.0
  (0, 802)#0111.3382639788382422
  (0, 803)#0110.93544

In [17]:
sklearn_processor.jobs[0].describe()

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-202646161072/GROUP6/data/',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-202646161072/sagemaker-scikit-learn-2024-06-20-02-35-30-277/input/code/preprocess.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'scaler_model',
    'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-202646161072/GROUP6/model/',
     'LocalPath': '/opt/ml/processing/output/preprocessor',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False},
   {'OutputName'

### XGBoost

In [34]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.image_uris import retrieve
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from time import gmtime, strftime

sess = sagemaker.Session()

container = retrieve("xgboost", region, "1.7-1")

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    base_job_name=BASE_JOB_TRAINING_NAME + '-xgboost',
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=S3_PATH + MODEL_PREFIX + 'xgboost',
    sagemaker_session=sess,
)

xgb.set_hyperparameters(objective='reg:squarederror',num_round=100)

hyperparameter_ranges = {
    'eta': ContinuousParameter(0.001, 0.2),
    'max_depth': IntegerParameter(3, 50),
    'min_child_weight': IntegerParameter(1, 10),
    'colsample_bytree': ContinuousParameter(0.5, 1)
}

objective_metric_name = 'validation:rmse'

tuner = HyperparameterTuner(estimator=xgb,
                            objective_metric_name=objective_metric_name,
                            hyperparameter_ranges=hyperparameter_ranges,
                            objective_type='Minimize',
                            max_jobs=5,
                            max_parallel_jobs=4,
                            strategy='Random')

# Specify training data location
s3_train_data = S3_PATH + DATA_PREFIX + 'train.csv'
s3_validation_data = S3_PATH + DATA_PREFIX + 'validation.csv'

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

# Fit the tuner
tuner.fit(
    inputs={
        'train': train_data,
        'validation': validation_data
    },
    job_name="xgb-randsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
)

# train_args = xgb.fit(
#     inputs={
#         "train": TrainingInput(
#             s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
#             content_type="text/csv",
#         ),
#         "validation": TrainingInput(
#             s3_data=step_process.properties.ProcessingOutputConfig.Outputs["val"].S3Output.S3Uri,
#             content_type="text/csv",
#         )
#     }
# )

# step_train_model = TrainingStep(name="TrainXGBModel1", step_args=train_args)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating hyperparameter tuning job with name: xgb-randsearch-20240620-07-03-31


.

KeyboardInterrupt: 

In [25]:
df_tuner = sagemaker.HyperparameterTuningJobAnalytics(
    tuner.latest_tuning_job.job_name
).dataframe()
df_tuner

Unnamed: 0,colsample_bytree,eta,max_depth,min_child_weight,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.941825,0.117626,30.0,7.0,xgb-randsearch-20240620-05-16-16-005-5ab7deb7,Completed,56.492619,2024-06-20 05:37:49+00:00,2024-06-20 05:58:22+00:00,1233.0
1,0.940992,0.001262,26.0,10.0,xgb-randsearch-20240620-05-16-16-004-e652a66e,Completed,55.47694,2024-06-20 05:17:14+00:00,2024-06-20 05:37:22+00:00,1208.0
2,0.81616,0.036128,31.0,3.0,xgb-randsearch-20240620-05-16-16-003-fbd05ca3,Completed,55.984032,2024-06-20 05:17:09+00:00,2024-06-20 05:38:13+00:00,1264.0
3,0.723725,0.003638,49.0,9.0,xgb-randsearch-20240620-05-16-16-002-05678414,Completed,54.787861,2024-06-20 05:17:05+00:00,2024-06-20 05:40:48+00:00,1423.0
4,0.535194,0.016688,42.0,8.0,xgb-randsearch-20240620-05-16-16-001-be493fef,Completed,53.900372,2024-06-20 05:17:02+00:00,2024-06-20 05:35:00+00:00,1078.0


In [26]:
best_model_name = df_tuner.sort_values('FinalObjectiveValue',ascending=True).iloc[0]['TrainingJobName']
best_model_name

'xgb-randsearch-20240620-05-16-16-001-be493fef'

In [31]:
best_model_name = 'xgb-randsearch-20240619-14-39-43-001-2a4b13da'
best_model_path = f'{S3_PATH}{MODEL_PREFIX}xgboost/{best_model_name}/output/model.tar.gz'
best_model_path

's3://sagemaker-us-east-1-836402295281/GROUP6/model/xgboost/xgb-randsearch-20240619-14-39-43-001-2a4b13da/output/model.tar.gz'

In [32]:
%%writefile code/evaluate_xgboost.py

import json
import logging
import math
import pickle
import tarfile

import numpy as np
import pandas as pd
import xgboost
from sklearn.metrics import mean_squared_error

from pathlib import Path

label_column = 'Dep_Delay'

model_tar_path = '/opt/ml/processing/model/model.tar.gz'

if __name__ == "__main__":
    ## Your code to perform model evaluation on testing dataset, and 
    ## store the evaluation report
    with tarfile.open(model_tar_path, 'r:gz') as tar:
        tar.extractall(path='./model')
    
    xgb_model = xgboost.Booster()
    xgb_model.load_model('./model/xgboost-model')
    
    test_path = "/opt/ml/processing/test/"
    df = pd.read_csv(test_path + "/test.csv")

    x_test = xgboost.DMatrix(df.iloc[:,1:].values)
    y_test = df.iloc[:,0].values
    y_pred = xgb_model.predict(x_test)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    print("\nTest RMSE :", score)

    # Available metrics to add to model: https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html
    report_dict = {
        "regression_metrics": {
            "rmse": {"value": score, "standard_deviation": "NaN"},
        },
    }

    output_dir = "/opt/ml/processing/evaluation"
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

Overwriting code/evaluate_xgboost.py


In [33]:
from sagemaker.workflow.properties import PropertyFile
from sagemaker.sklearn.processing import ScriptProcessor

s3_test_data = f'{S3_PATH}{DATA_PREFIX}test.csv'

xgb_eval_image = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version='1.7-1'
)

evaluate_model_processor = ScriptProcessor(
    role=role,
    image_uri=xgb_eval_image,
    command=["python3"],
    instance_count=1,
    instance_type='ml.m5.xlarge',
    sagemaker_session=sess,
)

# Create a PropertyFile
# A PropertyFile is used to be able to reference outputs from a processing step, for instance to use in a condition step.
# For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html
evaluation_report = PropertyFile(
    name="EvaluationReport", output_name="evaluation", path="evaluation.json"
)

# eval_args = 
evaluate_model_processor.run(
    inputs=[
        ProcessingInput(
            source=best_model_path,
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            source=s3_test_data,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation", destination=f'{S3_PATH}{GROUP_NAME}/evaluation/xgboost/'),
    ],
    code="code/evaluate_xgboost.py",
)

# step_evaluate_model = ProcessingStep(
#     name="EvaluateXgboost",
#     step_args=eval_args,
#     property_files=[evaluation_report],
# )

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating processing-job with name sagemaker-xgboost-2024-06-20-15-23-50-390


.............[34mTest RMSE : 49.99842436026933[0m



### Linear Regression

In [None]:
%%writefile code/train_linear.py

import argparse
import numpy as np
import os
import tensorflow as tf
import pandas as pd

feature_columns = [
    "longitude",
    "latitude",
    "housingMedianAge",
    "totalRooms",
    "totalBedrooms",
    "population",
    "households",
    "medianIncome",
]
label_column = "medianHouseValue"


def parse_args():

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--learning_rate", type=float, default=0.1)

    # data directories
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))

    # model directory
    parser.add_argument("--sm-model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))

    return parser.parse_known_args()


def get_train_data(train_dir):
    train_data = pd.read_csv(os.path.join(train_dir, "train.csv"))
    x_train = train_data[feature_columns].to_numpy()
    y_train = train_data[label_column].to_numpy()
    print("x train", x_train.shape, "y train", y_train.shape)

    return x_train, y_train


def get_test_data(test_dir):

    test_data = pd.read_csv(os.path.join(test_dir, "test.csv"))
    x_test = test_data[feature_columns].to_numpy()
    y_test = test_data[label_column].to_numpy()
    print("x test", x_test.shape, "y test", y_test.shape)

    return x_test, y_test


def get_model():

    inputs = tf.keras.Input(shape=(8,))
    hidden_1 = tf.keras.layers.Dense(8, activation="tanh")(inputs)
    hidden_2 = tf.keras.layers.Dense(4, activation="sigmoid")(hidden_1)
    outputs = tf.keras.layers.Dense(1)(hidden_2)
    return tf.keras.Model(inputs=inputs, outputs=outputs)


if __name__ == "__main__":

    args, _ = parse_args()

    print("Training data location: {}".format(args.train))
    print("Test data location: {}".format(args.test))
    x_train, y_train = get_train_data(args.train)
    x_test, y_test = get_test_data(args.test)

    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    print(
        "batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
    )

    model = get_model()
    optimizer = tf.keras.optimizers.SGD(learning_rate)
    model.compile(optimizer=optimizer, loss="mse")
    model.fit(
        x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test)
    )

    # evaluate on test set
    scores = model.evaluate(x_test, y_test, batch_size, verbose=2)
    print("\nTest MSE :", scores)

    # save model
    model.save(args.sm_model_dir + "/1")

In [36]:
# Linear Regression
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.amazon.randomcutforest import RandomCutForest
from time import gmtime, strftime
from sagemaker.image_uris import retrieve

sess = sagemaker.Session()
container = retrieve("linear-learner", region, version="1")

# Setup the Random Forest estimator
lr = sagemaker.estimator.Estimator(
    container,
    role,
    input_mode='File',
    base_job_name=BASE_JOB_TRAINING_NAME + 'linear',
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=S3_PATH + MODEL_PREFIX + 'linear',
    sagemaker_session=sess,
)

lr.set_hyperparameters(
    predictor_type="regressor",
    optimizer="adam",
    mini_batch_size=100,
    use_lr_scheduler=True
)

# Hyperparameters can be tuned as per the specific requirements
hyperparameter_ranges = {
    'learning_rate': ContinuousParameter(0.01, 0.2),
    'wd': ContinuousParameter(0.0, 0.01)
}

objective_metric_name = 'validation:rmse'  # Adjust the metric name based on what is relevant for Random Forest in SageMaker

tuner_lr = HyperparameterTuner(estimator=lr,
                            objective_metric_name=objective_metric_name,
                            hyperparameter_ranges=hyperparameter_ranges,
                            objective_type='Minimize',
                            max_jobs=5,
                            max_parallel_jobs=5,
                            strategy='Random')

# Define training and validation datasets
# Specify training data location
s3_train_data = S3_PATH + DATA_PREFIX + 'train.csv'
s3_validation_data = S3_PATH + DATA_PREFIX + 'validation.csv'

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

tuner_lr.fit(
    inputs={
        'train': train_data,
        'validation': validation_data
    },
    job_name="lr-randsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating hyperparameter tuning job with name: lr-randsearch-20240620-15-39-11


...

KeyboardInterrupt: 

In [19]:
df_tuner_lr = sagemaker.HyperparameterTuningJobAnalytics(
    tuner_lr.latest_tuning_job.job_name
).dataframe()
df_tuner_lr

Unnamed: 0,learning_rate,wd,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.041387,0.007256,lr-randsearch-20240620-13-40-45-005-03b1ccb1,Completed,54.109787,2024-06-20 13:41:55+00:00,2024-06-20 13:53:37+00:00,702.0
1,0.039622,0.007937,lr-randsearch-20240620-13-40-45-004-4a059713,Completed,54.105003,2024-06-20 13:41:55+00:00,2024-06-20 13:53:36+00:00,701.0
2,0.195047,0.000752,lr-randsearch-20240620-13-40-45-003-6d7fd290,Completed,54.368877,2024-06-20 13:41:39+00:00,2024-06-20 13:55:36+00:00,837.0
3,0.015839,0.008427,lr-randsearch-20240620-13-40-45-002-81033a3e,Completed,54.115536,2024-06-20 13:41:48+00:00,2024-06-20 13:52:23+00:00,635.0
4,0.118939,0.006393,lr-randsearch-20240620-13-40-45-001-d346b7c3,Completed,54.113075,2024-06-20 13:41:59+00:00,2024-06-20 13:54:30+00:00,751.0


In [34]:
best_model_name = df_tuner_lr.sort_values('FinalObjectiveValue',ascending=True).iloc[0]['TrainingJobName']
best_model_path = f'{S3_PATH}{MODEL_PREFIX}linear/{best_model_name}/output/model.tar.gz'
best_model_path

's3://sagemaker-us-east-1-836402295281/GROUP6/model/linear/lr-randsearch-20240620-13-40-45-004-4a059713/output/model.tar.gz'

In [37]:
best_model_path

's3://sagemaker-us-east-1-836402295281/GROUP6/model/linear/lr-randsearch-20240620-13-40-45-004-4a059713/output/model.tar.gz'

In [38]:
# Retrieve the image URI for the linear-learner algorithm
image_uri = retrieve(framework='linear-learner', region=region, version='1')

# Create a SageMaker model
model = sagemaker.Model(
    image_uri=image_uri,
    model_data=best_model_path,
    role=role,
    sagemaker_session=sess
)

# Deploy the model
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge'
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: linear-learner-2024-06-20-15-51-04-998
INFO:sagemaker:Creating endpoint-config with name linear-learner-2024-06-20-15-51-05-845
INFO:sagemaker:Creating endpoint with name linear-learner-2024-06-20-15-51-05-845


-------!

AttributeError: 'NoneType' object has no attribute 'serializer'

In [45]:
endpoint_name = 'linear-learner-2024-06-20-15-51-05-845'

# Initialize a predictor
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name
)

In [55]:
test_df = pd.read_csv('test.csv')
test_df.shape

(13486, 814)

In [57]:
payload = test_df.iloc[:5,1:].to_csv(header=False, index=False)
p = predictor.predict(payload, initial_args={"ContentType": "text/csv"})
print(p.decode("utf-8"))

{"predictions": [{"score": 5.982748985290527}, {"score": 31.250896453857422}, {"score": 15.27232551574707}, {"score": 1.4978976249694824}, {"score": 12.278738021850586}]}


In [58]:
test_df.iloc[:5,1]

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: 0.0, dtype: float64

### Factorization Machine

In [10]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.image_uris import retrieve
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from time import gmtime, strftime

fm_container = retrieve('factorization-machines',region)

fm = sagemaker.estimator.Estimator(
    fm_container,
    role,
    base_job_name=BASE_JOB_TRAINING_NAME + '-fm',
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=S3_PATH + MODEL_PREFIX + 'fm',
    sagemaker_session=sagemaker_session,
)

fm.set_hyperparameters(feature_dim=813,
                       num_factors=64,
                       predictor_type='regressor',
                       mini_batch_size=1000)

hyperparameter_ranges = {
    'factors_wd': ContinuousParameter(1e-8, 1e-4),
    'linear_wd': ContinuousParameter(1e-8, 1e-4),
    'bias_wd': ContinuousParameter(1e-8, 1e-4),
    'epochs': IntegerParameter(10, 100)
}

objective_metric_name = 'validation:rmse'

fm_tuner = HyperparameterTuner(estimator=fm,
                            objective_metric_name='test:rmse',
                            hyperparameter_ranges=hyperparameter_ranges,
                            objective_type='Minimize',
                            max_jobs=5,
                            max_parallel_jobs=5,
                            strategy='Random')

# Specify training data location
s3_train_data = S3_PATH + DATA_PREFIX + 'train.csv'
s3_validation_data = S3_PATH + DATA_PREFIX + 'validation.csv'

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

# Fit the tuner
fm_tuner.fit(
    inputs={
        'train': train_data,
        'test': validation_data
    },
    job_name="fm-randsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating hyperparameter tuning job with name: fm-randsearch-20240621-03-28-46


.................................................................................................................................................................................................................!


In [11]:
df_tuner_fm = sagemaker.HyperparameterTuningJobAnalytics(
    fm_tuner.latest_tuning_job.job_name
).dataframe()
df_tuner_fm

Unnamed: 0,bias_wd,epochs,factors_wd,linear_wd,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,9.768333e-07,21.0,4.933619e-05,2e-06,fm-randsearch-20240621-03-28-46-005-29bc72cf,Completed,149.103622,2024-06-21 03:29:41+00:00,2024-06-21 03:39:11+00:00,570.0
1,5.242554e-05,29.0,1.475626e-08,5e-05,fm-randsearch-20240621-03-28-46-004-1f596440,Completed,102.370323,2024-06-21 03:29:41+00:00,2024-06-21 03:40:16+00:00,635.0
2,3.816215e-06,41.0,1.000964e-06,1e-06,fm-randsearch-20240621-03-28-46-003-5c321590,Completed,94.614632,2024-06-21 03:29:41+00:00,2024-06-21 03:42:52+00:00,791.0
3,3.218829e-08,48.0,5.05809e-05,6e-06,fm-randsearch-20240621-03-28-46-002-148f5a50,Completed,80.167862,2024-06-21 03:29:34+00:00,2024-06-21 03:45:11+00:00,937.0
4,2.630834e-07,50.0,1.659995e-08,1.9e-05,fm-randsearch-20240621-03-28-46-001-7acd1054,Completed,78.112427,2024-06-21 03:29:35+00:00,2024-06-21 03:45:12+00:00,937.0


In [13]:
best_fm_name = df_tuner_fm.sort_values('FinalObjectiveValue',ascending=True).iloc[0]['TrainingJobName']
best_fm_path = f'{S3_PATH}{MODEL_PREFIX}linear/{best_fm_name}/output/model.tar.gz'
best_fm_path

's3://sagemaker-us-east-1-836402295281/GROUP6/model/linear/fm-randsearch-20240621-03-28-46-001-7acd1054/output/model.tar.gz'

### LightGBM

In [18]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.image_uris import retrieve
from sagemaker import image_uris, model_uris, script_uris
from sagemaker import hyperparameters
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from time import gmtime, strftime

train_model_id, train_model_version, train_scope = "lightgbm-regression-model", "*", "training"
training_instance_type = "ml.m5.xlarge"

# Retrieve the docker image
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    model_id=train_model_id,
    model_version=train_model_version,
    image_scope=train_scope,
    instance_type='ml.m5.xlarge'
)

# Retrieve the training script
train_source_uri = script_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, script_scope=train_scope
)

train_model_uri = model_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, model_scope=train_scope
)

hyperparameters = hyperparameters.retrieve_default(
    model_id=train_model_id, model_version=train_model_version
)


lgbm = sagemaker.estimator.Estimator(
    role=role,
    base_job_name=BASE_JOB_TRAINING_NAME + '-lgbm',
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    model_uri=train_model_uri,
    entry_point="transfer_learning.py",
    instance_count=1,
    instance_type='ml.m5.xlarge',
    max_run=360000,
    hyperparameters=hyperparameters,
    output_path=S3_PATH + MODEL_PREFIX + 'lgbm',
)

fm.set_hyperparameters(feature_dim=813,
                       num_factors=64,
                       predictor_type='regressor',
                       mini_batch_size=1000)

hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(1e-4, 1, scaling_type="Logarithmic"),
    "num_boost_round": IntegerParameter(2, 30),
    "early_stopping_rounds": IntegerParameter(2, 30),
    "num_leaves": IntegerParameter(10, 50),
    "feature_fraction": ContinuousParameter(0, 1),
    "bagging_fraction": ContinuousParameter(0, 1),
    "bagging_freq": IntegerParameter(1, 10),
    "max_depth": IntegerParameter(5, 30),
    "min_data_in_leaf": IntegerParameter(5, 50),
}

# objective_metric_name = 'validation:rmse'

# lgbm_tuner = HyperparameterTuner(
#     lgbm,
#     "rmse",
#     hyperparameter_ranges,
#     [{"Name": "rmse", "Regex": "rmse: ([0-9\\.]+)"}],
#     max_jobs=5,
#     max_parallel_jobs=5,
#     objective_type="Minimize",
#     strategy='Random'
# )

# lgbm_tuner.fit({"training": training_dataset_s3_path}, logs=True)

lgbm_tuner = HyperparameterTuner(estimator=lgbm,
                            objective_metric_name='rmse',
                            hyperparameter_ranges=hyperparameter_ranges,
                            metric_definitions=[{"Name": "rmse", "Regex": "rmse: ([0-9\\.]+)"}],
                            objective_type='Minimize',
                            max_jobs=5,
                            max_parallel_jobs=5,
                            strategy='Random')

# Specify training data location
s3_train_data = S3_PATH + DATA_PREFIX + 'train.csv'
s3_validation_data = S3_PATH + DATA_PREFIX + 'validation.csv'

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

# Fit the tuner
lgbm_tuner.fit(
    inputs={
        'train': train_data,
        'validation': validation_data
    },
    job_name="lgbm-rand-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
)

INFO:sagemaker:Creating hyperparameter tuning job with name: lgbm-rand-20240621-04-14-39


...................................!


In [19]:
df_tuner_lgbm = sagemaker.HyperparameterTuningJobAnalytics(
    lgbm_tuner.latest_tuning_job.job_name
).dataframe()
df_tuner_lgbm

Unnamed: 0,bagging_fraction,bagging_freq,early_stopping_rounds,feature_fraction,learning_rate,max_depth,min_data_in_leaf,num_boost_round,num_leaves,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.571666,5.0,8.0,0.161938,0.007978,29.0,30.0,19.0,45.0,lgbm-rand-20240621-04-14-39-005-af474496,Completed,53.001099,2024-06-21 04:15:34+00:00,2024-06-21 04:17:18+00:00,104.0
1,0.560695,9.0,10.0,0.295746,0.000187,28.0,11.0,8.0,17.0,lgbm-rand-20240621-04-14-39-004-543173dd,Completed,53.100601,2024-06-21 04:15:34+00:00,2024-06-21 04:17:12+00:00,98.0
2,0.258755,6.0,14.0,0.381989,0.000261,15.0,17.0,13.0,37.0,lgbm-rand-20240621-04-14-39-003-fccd40f6,Completed,53.098,2024-06-21 04:15:32+00:00,2024-06-21 04:17:15+00:00,103.0
3,0.2462,7.0,25.0,0.923566,0.000251,18.0,27.0,10.0,21.0,lgbm-rand-20240621-04-14-39-002-3f679207,Completed,53.098701,2024-06-21 04:15:30+00:00,2024-06-21 04:17:09+00:00,99.0
4,0.019957,2.0,11.0,0.46404,0.379247,22.0,6.0,16.0,34.0,lgbm-rand-20240621-04-14-39-001-2f125bcc,Completed,53.174,2024-06-21 04:15:27+00:00,2024-06-21 04:17:11+00:00,104.0


In [20]:
best_lgbm_name = df_tuner_fm.sort_values('FinalObjectiveValue',ascending=True).iloc[0]['TrainingJobName']
best_lgbm_path = f'{S3_PATH}{MODEL_PREFIX}linear/{best_lgbm_name}/output/model.tar.gz'
best_lgbm_path

's3://sagemaker-us-east-1-836402295281/GROUP6/model/linear/fm-randsearch-20240621-03-28-46-001-7acd1054/output/model.tar.gz'