In [13]:
import os
import time
import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.workflow.pipeline_context import PipelineSession
import json
from sagemaker import ModelPackage
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.estimator import Estimator
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.processing import ScriptProcessor
from sagemaker import get_execution_role

In [6]:
sess = boto3.Session()
sm = sess.client("sagemaker")
role = get_execution_role()
sagemaker_session = sagemaker.Session(boto_session=sess)
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

pipeline_session = PipelineSession()

print(bucket)

sagemaker-us-east-1-412510750633


In [7]:
GROUP_NAME = 'GROUP6' # CHANGE THIS TO YOUR FIRST NAME
DATA_PREFIX = f'group6/{GROUP_NAME}/data/' # S3 prefix to store data
MODEL_OUTPUT_S3_PATH = f's3://{bucket}/assignment2/{GROUP_NAME}/model/' # S3 prefix to store the XGBoost training information and model.

BASE_JOB_PROCESSING_NAME = f'{GROUP_NAME}-processing'  # base_job_name for preprocessing
BASE_JOB_TRAINING_NAME = f'{GROUP_NAME}-training'  # base_job_name for training
BASE_JOB_EVALUATION_NAME = f'{GROUP_NAME}-evaluation'  # base_job_name for evaluation

PIPELINE_NAME = f'{GROUP_NAME}-pipeline'  # SageMaker Pipeline name
MODEL_PACKAGE_GROUP_NAME = f'{GROUP_NAME}-ModelPackageGroup'  # Model package group name in the Model Registry

print(f'DATA_PREFIX: {DATA_PREFIX}')
print(f'PIPELINE_NAME: {PIPELINE_NAME}')
print(f'MODEL_PACKAGE_GROUP_NAME: {MODEL_PACKAGE_GROUP_NAME}')

DATA_PREFIX: group6/GROUP6/data/
PIPELINE_NAME: GROUP6-pipeline
MODEL_PACKAGE_GROUP_NAME: GROUP6-ModelPackageGroup


In [None]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat

# Define initial parameters
DATA_PREFIX = 's3://your-bucket/path/to/data'
input_data_path1 = f'{DATA_PREFIX}/US_flights_2023.csv'
input_data_path2 = f'{DATA_PREFIX}/airports_geolocation.csv'
input_data_path3 = f'{DATA_PREFIX}/weather_meteo_by_airport.csv'

# raw input data
input_data1 = ParameterString(name="InputData1", default_value=input_data_path1)
input_data2 = ParameterString(name="InputData2", default_value=input_data_path2)
input_data3 = ParameterString(name="InputData3", default_value=input_data_path3)

# status of newly trained model in registry
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")

# processing step parameters
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value="ml.m5.xlarge"
)
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# training step parameters
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
training_epochs = ParameterString(name="TrainingEpochs", default_value="100")

# model performance step parameters
accuracy_mse_threshold = ParameterFloat(name="AccuracyMseThreshold", default_value=0.75)

In [15]:
# Specify the local paths to your files and the S3 prefix (directory) to upload to
files = ['US_flights_2023.csv', 'airports_geolocation.csv', 'weather_meteo_by_airport.csv']

# Upload files to S3
for file in files:
    sagemaker_session.upload_data(path=file, key_prefix=DATA_PREFIX)
    
# Specify the paths to your uploaded files
file_paths = [f'{DATA_PREFIX}/US_flights_2023.csv', f'{DATA_PREFIX}/airports_geolocation.csv', f'{DATA_PREFIX}/weather_meteo_by_airport.csv']

FileNotFoundError: [Errno 2] No such file or directory: 'US_flights_2023.csv'

In [None]:
%%writefile code/preprocess.py

import numpy as np
import pandas as pd
import os
import json
import joblib
from io import StringIO
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tarfile
import sys

try:
    from sagemaker_containers.beta.framework import (
        content_types,
        encoders,
        env,
        modules,
        transformer,
        worker,
        server,
    )
except ImportError:
    pass

RANDOM_STATE = 2024
LABEL_COLUMN = 'Dep_Delay'
feature_columns = ['Day_Of_Week','Airline','Dep_Airport','Dep_CityName','DepTime_label','Distance_type','Manufacturer','Model','Aicraft_age,
                    'STATE','LATITUDE','LONGITUDE','tavg','tmin','tmax','prcp','snow','wdir','wspd','pres','FlightMonth']

# Define the feature columns and those to be one-hot encoded
one_hot_columns = ['Day_Of_Week', 'Airline', 'Dep_Airport', 'Dep_CityName', 'DepTime_label', 'Distance_type', 'Manufacturer', 'Model', 'STATE', 'FlightMonth']
non_one_hot_columns = ['Aicraft_age', 'LATITUDE', 'LONGITUDE', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']

base_dir = "/opt/ml/processing"
base_output_dir = "/opt/ml/output/"


if __name__ == "__main__":
    ## Your preprocessing code

    # Define the input data path within the processing environment
    input_data_path1 = f'{base_dir}/input/US_flights_2023.csv'
    input_data_path2 = f'{base_dir}/input/airports_geolocation.csv'
    input_data_path3 = f'{base_dir}/input/weather_meteo_by_airport.csv'
    
    # Read the CSV file from the input path
    df1 = pd.read_csv(input_data_path1)
    df2 = pd.read_csv(input_data_path2)
    df3 = pd.read_csv(input_data_path3)
    
    merged_df = pd.merge(df1, df2, left_on='Dep_Airport', right_on='IATA_CODE', how='left')
    
    # For example, 'date' in both DataFrames and 'time' in both DataFrames
    merged_df['FlightDate'] = pd.to_datetime(merged_df['FlightDate'])
    df3['time'] = pd.to_datetime(df_weather['time'])
    
    # Extract month from FlightDate
    merged_df['FlightMonth'] = merged_df['FlightDate'].dt.month
    
    # Then, merge the result with weather data (different column names)
    df = pd.merge(merged_df, df3, left_on=['Dep_Airport', 'FlightDate'], right_on=['airport_id', 'time'], how='left')

    feature_data = df.drop(LABEL_COLUMN, axis=1, inplace=False)
    label_data = df[LABEL_COLUMN]
    
    # Create a ColumnTransformer for one-hot encoding
    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(), one_hot_columns),
            ('scaler', StandardScaler(), non_one_hot_columns)
        ],
        remainder='passthrough'
    )
    
    # Create a list of feature column names
    feature_columns = list(feature_data.columns)
    
    # First split: 80% training, 20% test
    x_train, x_temp, y_train, y_temp = train_test_split(feature_data, label_data, test_size=0.2, random_state=42)

    # Second split: 50% validation, 50% test from the 20% test set
    x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

    scaler = StandardScaler() #scaling using z-transformation (0 mean, 1 sd), can use minmax scaling to get range between 0 and 1 (cannot do 

    scaler.fit(x_train) #have to save scaler, just have to fit scaler to training data, assumption is range of value is based on training data
    x_train = scaler.transform(x_train)
    x_val = scaler.transform(x_val)
    x_test = scaler.transform(x_test)

    train_dataset = pd.concat([y_train.reset_index(drop=True), pd.DataFrame(x_train)], axis=1)
    val_dataset = pd.concat([y_val.reset_index(drop=True), pd.DataFrame(x_val)], axis=1)
    test_dataset = pd.concat([y_test.reset_index(drop=True), pd.DataFrame(x_test)], axis=1)

    # Assign column names to the datasets
    feature_columns = list(feature_data.columns)  # Get the feature column names
    train_dataset.columns = feature_columns + [LABEL_COLUMN]
    val_dataset.columns = feature_columns + [LABEL_COLUMN]
    test_dataset.columns = feature_columns + [LABEL_COLUMN]

    train_dataset.to_csv('/opt/ml/processing/train/train.csv', header=True, index=False)
    val_dataset.to_csv('/opt/ml/processing/validation/val.csv', header=True, index=False)
    test_dataset.to_csv('/opt/ml/processing/test/test.csv', header=True, index=False)
    
    # Save the scaler model locally
    joblib.dump(scaler, '/opt/ml/processing/scaler_model/model.joblib')
    
    with tarfile.open('/opt/ml/processing/scaler_model/model.tar.gz', 'w:gz') as tar_handle:
        tar_handle.add('/opt/ml/processing/scaler_model/model.joblib', arcname='model.joblib')

def input_fn(input_data, content_type):
    """Parse input data payload."""
    if content_type == "text/csv":
        df = pd.read_csv(StringIO(input_data), header=None)
        df.columns = feature_columns if len(df.columns) == len(feature_columns) else feature_columns + [LABEL_COLUMN]
        return df
    else:
        raise ValueError("{} not supported by script!".format(content_type))


def output_fn(prediction, accept):
    """Format prediction output
    The default accept/content-type between containers for serial inference is JSON.
    But, our XGBoost uses text/csv. We want to set the ContentType or mimetype as text/csv so the XGBoost
    container can read the response payload correctly.
    """
    ## Your code to process output to be sent to XGBoost
    if accept == "application/json":
        instances = [row.tolist() for row in prediction]
        json_output = {"instances": instances}

        return worker.Response(json.dumps(json_output), mimetype=accept)
    elif accept == "text/csv":
        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)
    else:
        # Log a warning and default to text/csv
        print(f"Warning: {accept} accept type is not supported by this script. Defaulting to text/csv.")
        return worker.Response(encoders.encode(prediction, "text/csv"), mimetype="text/csv")


def predict_fn(input_data, model):
    """Preprocess input data
    We implement this because the default predict_fn uses .predict(), but our model is a preprocessor
    so we want to use .transform().
    """
    ## Your code to perform scaling using StandardScaler model
    features = model.transform(input_data)

    if LABEL_COLUMN in input_data:
        # Return the label (as the first column) and the set of features.
        return np.insert(features, 0, input_data[LABEL_COLUMN], axis=1)
    else:
        # Return only the set of features
        return features
    

def model_fn(model_dir):
    """Deserialize fitted StandardScaler model"""
    ## Your code to load the StandardScaler model
    preprocessor = joblib.load(os.path.join(model_dir, "model.joblib"))
    return preprocessor


In [14]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Create the SKLearnProcessor instance with parameterized values
sklearn_processor = SKLearnProcessor(
    framework_version="1.2-1", 
    role=role, 
    instance_type=ml.m5.xlarge, 
    instance_count=1
)

# Define the processing job
sklearn_processor.run(
    code="code/preprocess.py",
    inputs=[
        ProcessingInput(source=input_data1, destination="/opt/ml/processing/input/US_flights_2023.csv"),
        ProcessingInput(source=input_data2, destination="/opt/ml/processing/input/airports_geolocation.csv"),
        ProcessingInput(source=input_data3, destination="/opt/ml/processing/input/weather_meteo_by_airport.csv")
    ],
    outputs=[
        ProcessingOutput(output_name="scaler_model", source="/opt/ml/processing/scaler_model"),
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
    ],
    
)


NameError: name 'sklearn_processor' is not defined

In [10]:
sklearn_processor.jobs[0].describe()

NameError: name 'sklearn_processor' is not defined

In [9]:
import boto3

s3_client = boto3.client("s3")
default_bucket = sagemaker.Session().default_bucket()
for i in range(1, 4):
    prefix = s3_client.list_objects(
        Bucket=default_bucket, Prefix="sagemaker-scikit-learn"
    )["Contents"][-i]["Key"]
    print("s3://" + default_bucket + "/" + prefix)

s3://sagemaker-us-east-1-412510750633/sagemaker-scikit-learn-2024-06-07-09-32-01-595/sourcedir.tar.gz
s3://sagemaker-us-east-1-412510750633/sagemaker-scikit-learn-2024-06-07-09-05-12-627/sourcedir.tar.gz
s3://sagemaker-us-east-1-412510750633/sagemaker-scikit-learn-2024-06-07-08-54-41-796/sourcedir.tar.gz
