In [None]:
# Primary dataset file for this project.
# NOTE(review): the original cell left this assignment without a value (a SyntaxError).
# 'US_flights_2023.csv' is assumed because it is the first file in the upload list
# further down — confirm this is the file the pipeline should consume.
DATASET_FILE = 'US_flights_2023.csv'

GROUP_NAME = 'GROUP6'  # Group identifier; every S3 path and job/pipeline name below is derived from it
DATA_PREFIX = f'group6/{GROUP_NAME}/data/'  # S3 prefix to store data
# S3 URI for the XGBoost training information and model. `bucket` is not defined in
# this cell and must already exist in the kernel.
MODEL_OUTPUT_S3_PATH = f's3://{bucket}/assignment2/{GROUP_NAME}/model/'

BASE_JOB_PROCESSING_NAME = f'{GROUP_NAME}-processing'  # base_job_name for preprocessing
BASE_JOB_TRAINING_NAME = f'{GROUP_NAME}-training'  # base_job_name for training
BASE_JOB_EVALUATION_NAME = f'{GROUP_NAME}-evaluation'  # base_job_name for evaluation

PIPELINE_NAME = f'{GROUP_NAME}-pipeline'  # SageMaker Pipeline name
MODEL_PACKAGE_GROUP_NAME = f'{GROUP_NAME}-ModelPackageGroup'  # Model package group name in the Model Registry

# Echo the derived names so the reader can verify them before anything is created.
print(f'DATA_PREFIX: {DATA_PREFIX}')
print(f'PIPELINE_NAME: {PIPELINE_NAME}')
print(f'MODEL_PACKAGE_GROUP_NAME: {MODEL_PACKAGE_GROUP_NAME}')

In [None]:
# Local data files to publish to S3 under DATA_PREFIX.
# DATA_PREFIX comes from the configuration cell; it is deliberately NOT reassigned
# here (the placeholder 'your-s3-prefix' used to silently override it).
files = ['US_flights_2023.csv', 'airports_geolocation.csv', 'weather_meteo_by_airport.csv']

# upload_data() joins key_prefix and the file name with '/', so strip a trailing
# slash from the prefix to avoid keys containing '//'.
upload_prefix = DATA_PREFIX.rstrip('/')

# Upload files to S3 (session default bucket) and keep the S3 URI returned for each one.
uploaded_uris = {}
for file in files:
    uploaded_uris[file] = sagemaker_session.upload_data(path=file, key_prefix=upload_prefix)

uploaded_uris

In [None]:
# Build S3 URIs for the uploaded files. upload_data() was called without a bucket
# argument, so the objects are in the session's default bucket under DATA_PREFIX.
# A bare '<prefix>/<file>' string would be treated by pandas as a local path.
# Reading s3:// URIs with pandas requires the optional s3fs package.
data_root = 's3://{}/{}'.format(sagemaker_session.default_bucket(), DATA_PREFIX.rstrip('/'))
file_paths = [
    f'{data_root}/US_flights_2023.csv',
    f'{data_root}/airports_geolocation.csv',
    f'{data_root}/weather_meteo_by_airport.csv',
]

# Read files into DataFrames
df_flights = pd.read_csv(file_paths[0])
df_airports = pd.read_csv(file_paths[1])
df_weather = pd.read_csv(file_paths[2])

# Merge DataFrames
# First, attach airport attributes to each flight via its departure airport
# (the key columns are named differently in the two files).
merged_df = pd.merge(df_flights, df_airports, left_on='Dep_Airport', right_on='IATA_CODE', how='left')

# Then, attach weather data, again keyed on the departure airport.
# NOTE(review): this join uses the airport only. If the weather file holds more than
# one row per airport (e.g. one per day), every flight is repeated once per weather
# row — confirm the schema and add the date column to the join keys if so.
final_merged_df = pd.merge(merged_df, df_weather, left_on='Dep_Airport', right_on='airport_id', how='left')

# Now `final_merged_df` contains the combined data from all three files based on specified columns

In [None]:
%%writefile code/preprocess.py

import numpy as np
import pandas as pd
import os
import json
import joblib
from io import StringIO
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tarfile
import sys

# Best-effort import of the SageMaker serving helpers. `worker` and `encoders` are
# used by output_fn; the other names are not referenced in the visible part of the script.
# NOTE(review): presumably the package is only installed in the inference container
# and not where the __main__ preprocessing runs — confirm.
try:
    from sagemaker_containers.beta.framework import (
        content_types,
        encoders,
        env,
        modules,
        transformer,
        worker,
        server,
    )
except ImportError:
    # Deliberately ignored: the script still loads, but output_fn will raise
    # NameError if it is called without these helpers.
    pass

# Random seed constant for the script.
RANDOM_STATE = 2024
# Name of the target column; it is separated from the features in __main__ and
# looked up in the request payload by predict_fn.
LABEL_COLUMN = 'Dep_Delay'
# Column names that input_fn assigns to header-less CSV payloads.
# The __main__ block rebinds this name to the columns of the input CSV, but only
# when the file is executed as a script; on import this literal list is used.
# NOTE(review): these names do not look like flight-delay columns although
# LABEL_COLUMN is 'Dep_Delay' — confirm the list matches the data being served.
feature_columns = [
    "T_atm", "Humidity", "Distance", "T_offset1", "Max1R13_1", "Max1L13_1", "aveAllR13_1", "aveAllL13_1", "T_RC1", "T_RC_Dry1", "T_RC_Wet1", "T_RC_Max1", "T_LC1", "T_LC_Dry1", "T_LC_Wet1", "T_LC_Max1", "RCC1", "LCC1",
    "canthiMax1", "canthi4Max1", "T_FHCC1", "T_FHRC1", "T_FHLC1", "T_FHBC1", "T_FHTC1", "T_FH_Max1", "T_FHC_Max1", "T_Max1", "T_OR1", "T_OR_Max1"
]

# Root directory of the processing container's input and output folders.
base_dir = "/opt/ml/processing"
# Not referenced in the part of the script visible here.
base_output_dir = "/opt/ml/output/"

if __name__ == "__main__":
    # Processing-job entry point: read the raw CSV, split it 80/10/10 into
    # train/validation/test, standardise the features with a scaler fitted on the
    # training split, write the three CSVs and persist the fitted scaler.

    # Input file inside the processing container.
    # NOTE(review): the file name looks like a leftover from a different dataset
    # (LABEL_COLUMN is 'Dep_Delay') — confirm it matches what the job's input provides.
    input_data_path = os.path.join(base_dir, "input", "raw_irt_dataset.csv")

    df = pd.read_csv(input_data_path)

    # Separate the target from the features.
    feature_data = df.drop(columns=[LABEL_COLUMN])
    label_data = df[LABEL_COLUMN]

    # Rebind the module-level list to the columns actually present in the input.
    feature_columns = list(feature_data.columns)

    # First split: 80% training, 20% held out. The shared RANDOM_STATE constant
    # replaces the previously hard-coded seed so there is a single place to change it.
    x_train, x_temp, y_train, y_temp = train_test_split(
        feature_data, label_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # Second split: the held-out 20% is halved into validation and test.
    x_val, x_test, y_val, y_test = train_test_split(
        x_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE
    )

    # Z-score scaling (zero mean, unit standard deviation). The scaler is fitted on
    # the training split only and then applied unchanged to validation and test.
    # NOTE(review): StandardScaler needs numeric input; this assumes every feature
    # column in the CSV is numeric — confirm.
    scaler = StandardScaler()
    scaler.fit(x_train)

    # split directory -> (features, labels, output file name)
    splits = {
        "train": (x_train, y_train, "train.csv"),
        "validation": (x_val, y_val, "val.csv"),
        "test": (x_test, y_test, "test.csv"),
    }
    for split_dir, (x_split, y_split, file_name) in splits.items():
        scaled = pd.DataFrame(scaler.transform(x_split))
        # reset_index aligns the labels with the freshly built 0..n-1 feature index.
        dataset = pd.concat([scaled, y_split.reset_index(drop=True)], axis=1)
        dataset.columns = feature_columns + [LABEL_COLUMN]

        out_dir = os.path.join(base_dir, split_dir)
        # Create the folder in case the container did not pre-create it.
        os.makedirs(out_dir, exist_ok=True)
        dataset.to_csv(os.path.join(out_dir, file_name), header=True, index=False)

    # Persist the fitted scaler, both as a plain joblib file and packed into
    # model.tar.gz with model.joblib at the archive root (the name model_fn loads).
    scaler_dir = os.path.join(base_dir, "scaler_model")
    os.makedirs(scaler_dir, exist_ok=True)
    scaler_path = os.path.join(scaler_dir, "model.joblib")
    joblib.dump(scaler, scaler_path)

    with tarfile.open(os.path.join(scaler_dir, "model.tar.gz"), 'w:gz') as tar_handle:
        tar_handle.add(scaler_path, arcname='model.joblib')

def input_fn(input_data, content_type):
    """Parse a header-less CSV request payload into a DataFrame with named columns.

    Args:
        input_data: CSV text (``str``) or its UTF-8 encoded ``bytes``.
        content_type: MIME type of the payload; only ``"text/csv"`` is accepted.

    Returns:
        pandas.DataFrame whose columns are ``feature_columns``, or
        ``feature_columns + [LABEL_COLUMN]`` when the payload has one extra
        (trailing) column.

    Raises:
        ValueError: if ``content_type`` is not ``"text/csv"`` or the number of
            columns matches neither layout.
    """
    if content_type != "text/csv":
        raise ValueError("{} not supported by script!".format(content_type))

    # StringIO only accepts str, so decode a bytes payload first.
    if isinstance(input_data, (bytes, bytearray)):
        input_data = input_data.decode("utf-8")

    df = pd.read_csv(StringIO(input_data), header=None)

    n_features = len(feature_columns)
    n_received = len(df.columns)
    if n_received == n_features:
        df.columns = feature_columns
    elif n_received == n_features + 1:
        # One extra column: treat the last one as the label.
        df.columns = feature_columns + [LABEL_COLUMN]
    else:
        raise ValueError(
            "Expected {} feature columns (optionally followed by '{}'), got {} columns".format(
                n_features, LABEL_COLUMN, n_received
            )
        )
    return df


def output_fn(prediction, accept):
    """Serialize the transformed rows into the response for the next container.

    Serial inference defaults to JSON between containers, whereas our XGBoost
    container consumes text/csv, so the response mimetype is set explicitly.

    Args:
        prediction: iterable of rows, each exposing ``tolist()``.
        accept: requested response MIME type.

    Returns:
        ``worker.Response`` carrying JSON (``{"instances": [...]}``) when
        ``accept`` is ``"application/json"``; otherwise a CSV-encoded body with
        mimetype ``"text/csv"`` (a warning is printed for any other ``accept``).
    """
    if accept == "application/json":
        rows = []
        for record in prediction:
            rows.append(record.tolist())
        body = json.dumps({"instances": rows})
        return worker.Response(body, mimetype=accept)

    csv_type = "text/csv"
    if accept != csv_type:
        # Unknown accept types are not an error: warn and fall back to CSV.
        print(f"Warning: {accept} accept type is not supported by this script. Defaulting to text/csv.")
    return worker.Response(encoders.encode(prediction, csv_type), mimetype=csv_type)


def predict_fn(input_data, model):
    """Scale the request features with the fitted preprocessor.

    Implemented because the default predict_fn calls ``.predict()``, while the
    model here is a preprocessor that exposes ``.transform()``.

    Args:
        input_data: DataFrame produced by input_fn; may contain ``LABEL_COLUMN``.
        model: fitted scaler returned by model_fn.

    Returns:
        Array of scaled features. When the payload contained ``LABEL_COLUMN``,
        the untouched label values are inserted as the first column.
    """
    has_label = LABEL_COLUMN in input_data

    # The scaler was fitted on feature columns only, so the label must be removed
    # before transform(); passing it through fails on the feature-count check.
    feature_frame = input_data.drop(columns=[LABEL_COLUMN]) if has_label else input_data
    features = model.transform(feature_frame)

    if has_label:
        # Return the label (as the first column) followed by the scaled features.
        return np.insert(features, 0, input_data[LABEL_COLUMN], axis=1)
    # Return only the set of features
    return features
    



def model_fn(model_dir):
    """Load the fitted preprocessor stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: directory containing the serialized model file.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    scaler_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(scaler_path)


Error uploading file: name 'os' is not defined
