## Predictive Maintenance Pipeline

In [1]:
import json
import time
import boto3
import string
import sagemaker
import pandas as pd
import numpy as np
import awswrangler as wr

from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.workflow.pipeline_context import PipelineSession

In [2]:
region = sagemaker.Session().boto_region_name
print("Using AWS Region: {}".format(region))

Using AWS Region: us-east-1


In [3]:
boto3.setup_default_session(region_name = region)
boto_session = boto3.Session(region_name = region)

s3_client = boto3.client("s3", region_name = region)

sagemaker_boto_client = boto_session.client("sagemaker")
sagemaker_session = sagemaker.session.Session(
    boto_session = boto_session, sagemaker_client = sagemaker_boto_client
)
account_id = boto3.client("sts").get_caller_identity()["Account"]
sagemaker_role = sagemaker.get_execution_role()
pipeline_session = PipelineSession()

In [4]:
sagemaker.get_execution_role()

'arn:aws:iam::451633145432:role/service-role/AmazonSageMaker-ExecutionRole-20220302T144665'

In [5]:
bucket = "ideaaiml-demo"
prefix = "mlops/predictive-maintenance"

### Upload raw data to S3

In [13]:
s3_client.upload_file(
    Filename="datasets/PdM_telemetry.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_telemetry.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_errors.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_errors.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_maint.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_maint.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_failures.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_failures.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_machines.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_machines.csv"
)

### Read Data from S3

In [224]:
telemetry_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_telemetry.csv"
errors_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_errors.csv"
maint_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_maint.csv"
failures_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_failures.csv"
machines_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_machines.csv"

In [73]:
response = s3_client.get_object(Bucket = bucket, Key = f"{prefix}/data/raw/PdM_telemetry.csv")
telemetry = pd.read_csv(response.get("Body"))
telemetry.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511


In [230]:
telemetry = pd.read_csv(telemetry_data_uri)
# errors = wr.s3.read_csv(errors_data_uri)
# maint = wr.s3.read_csv(maint_data_uri)
# failures = wr.s3.read_csv(failures_data_uri)
# machines = wr.s3.read_csv(machines_data_uri)

PermissionError: Forbidden

### Define Parameters to Parametrize Pipeline Execution

In [6]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

In [7]:
input_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/preprocessed.csv"
batch_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/PdM_inference_data.csv"

In [8]:
processing_instance_count = ParameterInteger(name = "ProcessingInstanceCount", default_value = 1)
instance_type = ParameterString(name = "TrainingInstanceType", default_value = "ml.m5.xlarge")

model_approval_status = ParameterString(
    name = "ModelApprovalStatus", default_value = "PendingManualApproval"
)

### Define a Processing Step for Feature Engineering

### Preprocessing Script

In [9]:
%%writefile scripts/preprocessing.py
import numpy as np
import pandas as pd
import boto3
from io import StringIO
import awswrangler as wr

base_dir = "/opt/ml/processing"
bucket = "ideaaiml-demo"
prefix = "mlops/predictive-maintenance"

def upload_file_s3(df, name):
    boto3.setup_default_session(region_name = "us-east-1")
    s3_client = boto3.client("s3", region_name = "us-east-1")
    with StringIO() as csv_buffer:
        df.to_csv(csv_buffer, index = False)

        response = s3_client.put_object(
            Bucket = bucket, Key = f"{prefix}/data/preprocessed/{name}.csv", Body = csv_buffer.getvalue()
        )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

        if status == 200:
            print(f"Successful S3 put_object response. Status - {status}")
        else:
            print(f"Unsuccessful S3 put_object response. Status - {status}")

# Convert to datetime datatype
def datetime_datatype(df):
    print("Converting to type datetime")
    df['datetime'] = pd.to_datetime(df['datetime'], format="%Y-%m-%d %H:%M:%S")
    return df


# Convert to category datatype
def category_datatype(df, column_name):
    print("Converting to type category")
    df[column_name] = df[column_name].astype('category')
    return df


# Lag Features from Telemetry
def telemetry_features(df):
    df = datetime_datatype(df)
    # Calculate mean values for telemetry features -- 3 hours rolling window
    print("Calculate mean values for telemetry features -- 3 hours rolling window")
    temp = []
    fields = ['volt', 'rotate', 'pressure', 'vibration']
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col).resample('3H', closed = 'left', label = 'right').mean().unstack())
    telemetry_mean_3h = pd.concat(temp, axis = 1)
    telemetry_mean_3h.columns = [i + 'mean_3h' for i in fields]
    telemetry_mean_3h.reset_index(inplace = True)

    # repeat for standard deviation
    print("Calculate standard deviation for telemetry features -- 3 hours rolling window")
    temp = []
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col).resample('3H', closed = 'left', label = 'right').std().unstack())
    telemetry_sd_3h = pd.concat(temp, axis = 1)
    telemetry_sd_3h.columns = [i + 'sd_3h' for i in fields]
    telemetry_sd_3h.reset_index(inplace = True)
    
    # Calculate mean values for telemetry features -- 24 hours rolling window
    print("Calculate mean values for telemetry features -- 24 hours rolling window")
    temp = []
    fields = ['volt', 'rotate', 'pressure', 'vibration']
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col)
                    .resample('3H', closed = 'left', label = 'right')
                    .first()
                    .unstack()
                    .rolling(window = 24, center = False).mean())
    telemetry_mean_24h = pd.concat(temp, axis = 1)
    telemetry_mean_24h.columns = [i + 'mean_24h' for i in fields]
    telemetry_mean_24h.reset_index(inplace = True)
    telemetry_mean_24h = telemetry_mean_24h.loc[-telemetry_mean_24h['voltmean_24h'].isnull()]

    # repeat for standard deviation
    print("Calculate standard deviation for telemetry features -- 24 hours rolling window")
    temp = []
    fields = ['volt', 'rotate', 'pressure', 'vibration']
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col)
                    .resample('3H', closed='left', label='right')
                    .first()
                    .unstack()
                    .rolling(window = 24, center = False).std())
    telemetry_sd_24h = pd.concat(temp, axis = 1)
    telemetry_sd_24h.columns = [i + 'sd_24h' for i in fields]
    telemetry_sd_24h = telemetry_sd_24h.loc[-telemetry_sd_24h['voltsd_24h'].isnull()]
    telemetry_sd_24h.reset_index(inplace = True)
    
    telemetry_feat = pd.concat([telemetry_mean_3h,
                            telemetry_sd_3h.iloc[:, 2:6],
                            telemetry_mean_24h.iloc[:, 2:6],
                            telemetry_sd_24h.iloc[:, 2:6]], axis = 1).dropna()

    upload_file_s3(telemetry_feat, "telemetry")
    
    return telemetry_feat


# Lag Features for Errors
def errors_lag_features(df):
    df = datetime_datatype(df)
    df = category_datatype(df, 'errorID')
    print("Lag features for errors")
    error_count = pd.get_dummies(df.set_index('datetime')).reset_index()
    error_count.columns = ['datetime', 'machineID', 'error1', 'error2', 'error3', 'error4', 'error5']
    error_count = error_count.groupby(['machineID', 'datetime']).sum().reset_index()
    error_count = telemetry[['datetime', 'machineID']].merge(error_count, on = ['machineID', 'datetime'], how = 'left').fillna(0.0)
    temp = []
    fields = ['error%d' % i for i in range(1, 6)]
    for col in fields:
        temp.append(pd.pivot_table(error_count,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col)
                    .resample('3H', closed='left', label='right')
                    .first()
                    .unstack()
                    .rolling(window = 24, center = False).sum())
    error_count = pd.concat(temp, axis = 1)
    error_count.columns = [i + 'count' for i in fields]
    error_count.reset_index(inplace = True)
    error_count = error_count.dropna()
    
    upload_file_s3(error_count, "errors")
    
    return error_count


# Maintenance Features
def maintenance_features(df):
    df = datetime_datatype(df)
    df = category_datatype(df, 'comp')
    print("Maintenance Features -- Days since last replacement")
    comp_rep = pd.get_dummies(df.set_index('datetime')).reset_index()
    comp_rep.columns = ['datetime', 'machineID', 'comp1', 'comp2', 'comp3', 'comp4']

    # combine repairs for a given machine in a given hour
    comp_rep = comp_rep.groupby(['machineID', 'datetime']).sum().reset_index()

    # add timepoints where no components were replaced
    comp_rep = telemetry[['datetime', 'machineID']].merge(comp_rep,
                                                          on=['datetime', 'machineID'],
                                                          how='outer').fillna(0).sort_values(by=['machineID', 'datetime'])
    components = ['comp1', 'comp2', 'comp3', 'comp4']
    for comp in components:
        comp_rep.loc[comp_rep[comp] < 1, comp] = None
        comp_rep.loc[-comp_rep[comp].isnull(),
                     comp] = comp_rep.loc[-comp_rep[comp].isnull(), 'datetime']
        comp_rep[comp] = comp_rep[comp].fillna(method = 'ffill')

    comp_rep = comp_rep.loc[comp_rep['datetime'] > pd.to_datetime('2015-01-01')]
    for comp in components:
        comp_rep[comp] = (comp_rep["datetime"] - pd.to_datetime(comp_rep[comp])) / np.timedelta64(1, "D")
        
    upload_file_s3(comp_rep, "maint")
    
    return comp_rep


# Failures Features
def failure_features(df):
    print("Failure features")
    df = datetime_datatype(df)
    df = category_datatype(df, 'failure')
    upload_file_s3(df, "failures")
    return df


# Final Features
def final_features(telemetry_df, errors_df, maint_df, machines_df):
    upload_file_s3(machines_df, "machines")
    print("Final features")
    final_feat = telemetry_df.merge(errors_df, on = ['datetime', 'machineID'], how = 'left')
    final_feat = final_feat.merge(maint_df, on = ['datetime', 'machineID'], how = 'left')
    final_feat = final_feat.merge(machines_df, on = ['machineID'], how = 'left')
    return final_feat


# Label Construction
def label_construct(tele_df, error_df, maint_df, machine_df, failure_df):
    print("----- Final Features -----")
    final_feat = final_features(tele_df, error_df, maint_df, machine_df)
    
    print("----- Label Construction -----")
    labeled_features = pd.DataFrame()
    labeled_features = final_feat.merge(
        failure_df, on = ['datetime', 'machineID'], how = 'left')
    labeled_features['failure'] = labeled_features['failure'].astype(str)
    labeled_features['failure'] = labeled_features['failure'].fillna(method = 'bfill', limit = 7)
    labeled_features['failure'] = labeled_features['failure'].replace('nan', 'none')
    print("----- Preprocessing completed -----")
    
    upload_file_s3(labeled_features, "preprocessed")
#     pd.DataFrame(labeled_features).to_csv(f"{base_dir}/preprocessed/final_data.csv", index = False)


if __name__ == "__main__":

    telemetry_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_telemetry.csv"
    errors_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_errors.csv"
    maint_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_maint.csv"
    failures_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_failures.csv"
    machines_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_machines.csv"
    
    telemetry = wr.s3.read_csv(telemetry_data_uri)
    errors = wr.s3.read_csv(errors_data_uri)
    maint = wr.s3.read_csv(maint_data_uri)
    failures = wr.s3.read_csv(failures_data_uri)
    machines = wr.s3.read_csv(machines_data_uri)
    
    telemetry_df = telemetry_features(telemetry)
    errors_df = errors_lag_features(errors)
    maint_df = maintenance_features(maint)
    failures_df = failure_features(failures)
    machines_df = category_datatype(machines, 'model')
    
    label_construct(telemetry_df, errors_df, maint_df, machines_df, failures_df)

Overwriting scripts/preprocessing.py


### Feature Store Creation Script

In [10]:
%%writefile scripts/featurestore.py
import numpy as np
import pandas as pd
import boto3
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
import awswrangler as wr

base_dir = "/opt/ml/processing"
bucket = "ideaaiml-demo"
prefix = "mlops/predictive-maintenance"

boto_session = boto3.Session(region_name = "us-east-1")
sagemaker_boto_client = boto_session.client("sagemaker")
featurestore_runtime = boto_session.client(
    service_name = "sagemaker-featurestore-runtime", region_name = "us-east-1"
)
try:
    sagemaker_role = sagemaker.get_execution_role()
    print(f"Sagemaker Role for Feature Store file: {sagemaker_role}")
except ValueError:
    sagemaker_role = 'arn:aws:iam::451633145432:role/service-role/AmazonSageMaker-ExecutionRole-20220302T144665'
    
feature_store_session = sagemaker.Session(
    boto_session = boto_session,
    sagemaker_client = sagemaker_boto_client,
    sagemaker_featurestore_runtime_client = featurestore_runtime,
)

# ------------------------------------ Read Data
telemetry_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/telemetry.csv"
errors_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/errors.csv"
maint_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/maint.csv"
failures_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/failures.csv"
machines_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/machines.csv"

telemetry = wr.s3.read_csv(telemetry_data_uri)
errors = wr.s3.read_csv(errors_data_uri)
maint = wr.s3.read_csv(maint_data_uri)
failures = wr.s3.read_csv(failures_data_uri)
machines = wr.s3.read_csv(machines_data_uri)

# ------------------------------------ Add Timestamp
telemetry['event_time'] = pd.to_datetime("now").timestamp()
errors['event_time'] = pd.to_datetime("now").timestamp()
maint['event_time'] = pd.to_datetime("now").timestamp()
failures['event_time'] = pd.to_datetime("now").timestamp()
machines['event_time'] = pd.to_datetime("now").timestamp()

# ------------------------------------ Create Feature Group
telemetry_feature_group = FeatureGroup(name = 'telemetry_fg', sagemaker_session = feature_store_session)
errors_feature_group = FeatureGroup(name = 'errors_fg', sagemaker_session = feature_store_session)
maintenance_feature_group = FeatureGroup(name = 'maintenance_fg', sagemaker_session = feature_store_session)
failures_feature_group = FeatureGroup(name = 'failures_fg', sagemaker_session = feature_store_session)
machines_feature_group = FeatureGroup(name = 'machines_fg', sagemaker_session = feature_store_session)

# ------------------------------------ Loading Definitions
telemetry_feature_group.load_feature_definitions(data_frame = telemetry)
errors_feature_group.load_feature_definitions(data_frame = errors)
maintenance_feature_group.load_feature_definitions(data_frame = maint)
failures_feature_group.load_feature_definitions(data_frame = failures)
machines_feature_group.load_feature_definitions(data_frame = machines)

record_identifier_feature_name = "machineID"
event_time_feature_name = "event_time"

# ------------------------------------ Telemetry Feature Store
try:
    telemetry_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "telemetry" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Errors Feature Store      
try:
    errors_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "errors" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Maintenance Feature Store
try:
    maintenance_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "maintenance" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Failures Feature Store
try:
    failures_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "failures" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Machines Feature Store
try:
    machines_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "machines" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Ingesting Data
while (
    telemetry_feature_group.describe()['FeatureGroupStatus'] == 'Creating'):
    print("Feature Group Creating")
    time.sleep(60)
else:
    print("Feature Group Created")
    ## Below code needs to run only once to ingest the data into feature store
    #---------------------------------------------------------------
#     telemetry_feature_group.ingest(data_frame = telemetry, max_workers = 3, wait = True)
#     errors_feature_group.ingest(data_frame = errors, max_workers = 3, wait = True)
#     maintenance_feature_group.ingest(data_frame = maint, max_workers = 3, wait = True)
#     failures_feature_group.ingest(data_frame = failures, max_workers = 3, wait = True)
#     machines_feature_group.ingest(data_frame = machines, max_workers = 3, wait = True)
#     print("Feature Data Ingested")
    #---------------------------------------------------------------

Overwriting scripts/featurestore.py


### Train Test Split Script

In [11]:
%%writefile scripts/train_test_split_data.py
import pandas as pd
import boto3
from io import StringIO
import awswrangler as wr

base_dir = "/opt/ml/processing"
bucket = "ideaaiml-demo"
prefix = "mlops/predictive-maintenance"

def upload_file_s3(df, name):
    boto3.setup_default_session(region_name = "us-east-1")
    s3_client = boto3.client("s3", region_name = "us-east-1")
    with StringIO() as csv_buffer:
        df.to_csv(csv_buffer, index = False)

        response = s3_client.put_object(
            Bucket = bucket, Key = f"{prefix}/data/train-test/{name}.csv", Body = csv_buffer.getvalue()
        )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

        if status == 200:
            print(f"Successful S3 put_object response. Status - {status}")
        else:
            print(f"Unsuccessful S3 put_object response. Status - {status}")

def train_test_split_script(labeled_features):
    threshold_dates = [[pd.to_datetime('2015-07-31 01:00:00'), pd.to_datetime('2015-08-01 01:00:00')],
                   [pd.to_datetime('2015-08-31 01:00:00'), pd.to_datetime('2015-09-01 01:00:00')],
                   [pd.to_datetime('2015-09-30 01:00:00'), pd.to_datetime('2015-10-01 01:00:00')]]
    
    for last_train_date, first_test_date in threshold_dates:
        # split out training and test data
        print(labeled_features['datetime'][0])
        train_y = labeled_features.loc[labeled_features['datetime'] < last_train_date, 'failure']
        train_data = pd.get_dummies(labeled_features.loc[labeled_features['datetime'] < last_train_date].drop(['datetime',
                                                                                                            'machineID',
                                                                                                              'failure'], axis = 1))
        test_y = labeled_features.loc[labeled_features['datetime'] > last_train_date, 'failure']
        test_data = pd.get_dummies(labeled_features.loc[labeled_features['datetime'] > first_test_date].drop(['datetime',
                                                                                                           'machineID',
                                                                                                             'failure'], axis = 1))
    
    train_data['failure'] = train_y
    test_data['failure'] = test_y
    
    upload_file_s3(train_data, "train")
    upload_file_s3(test_data, "test")

    pd.DataFrame(train_data).to_csv(f"{base_dir}/train/train.csv", index = False)
    pd.DataFrame(test_data).to_csv(f"{base_dir}/test/test.csv", index = False)
    
if __name__ == "__main__":
    final_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/preprocessed.csv"
    final_data = wr.s3.read_csv(final_data_uri)
    final_data['datetime'] = pd.to_datetime(final_data['datetime'], format="%Y-%m-%d %H:%M:%S")
    train_test_split_script(final_data)

Overwriting scripts/train_test_split_data.py


### Create an instance of a FrameworkProcessor

In [12]:
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn import SKLearn

framework_processor = FrameworkProcessor(
    estimator_cls = SKLearn,
    framework_version = "1.2-1",
    instance_type = "ml.m5.xlarge",
    instance_count = processing_instance_count,
    base_job_name = "sklearn-pred-maint-process",
    role = sagemaker_role,
    sagemaker_session = pipeline_session,
)

### Creating Processing Step

In [13]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

processor_args = framework_processor.run(
    code = "preprocessing.py",
    source_dir = "scripts"
)

preprocess_step = ProcessingStep(name = "PdM-Data-Read-And-PreProcessing", step_args = processor_args)



In [14]:
train_test_args = framework_processor.run(
    outputs = [
        ProcessingOutput(output_name = "train", source = "/opt/ml/processing/train"),
        ProcessingOutput(output_name = "test", source = "/opt/ml/processing/test"),
    ],
    code = "train_test_split_data.py",
    source_dir = "scripts"
)

train_test_split_step = ProcessingStep(name = "PdM-Train-Test-Data-Split", step_args = train_test_args, depends_on = [preprocess_step.name])

In [15]:
fs_data = framework_processor.run(
    code = "featurestore.py",
    source_dir = "scripts"
)
feature_store_step = ProcessingStep(name = "PdM-FeatureStore-Creation", step_args = fs_data, depends_on = [preprocess_step.name])

### Define a Training Step to Train a Model

In [16]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.sklearn.model import SKLearnModel

In [17]:
# SKLearn estimator is used for end-to-end training and deployment
sklearn_estimator = SKLearn(
    entry_point = "scripts/rf_script.py",
    role = sagemaker_role,
    instance_count = 1,
    instance_type = "ml.m5.xlarge",
    framework_version = "1.2-1",
    base_job_name = "rf-scikit",
    hyperparameters = {
        "n-estimators": 100,
        "min-samples-leaf": 3,
    },
    sagemaker_session = pipeline_session,
)

In [18]:
train_args = sklearn_estimator.fit(
    inputs = {
        "train": TrainingInput(
            s3_data = train_test_split_step.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type = "text/csv"
        ),
        "test": TrainingInput(
            s3_data = train_test_split_step.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
            content_type = "text/csv",
        )
    }
)

In [19]:
training_step = TrainingStep(
    name = "PdM-ModelTraininig",
    step_args = train_args,
    depends_on = [train_test_split_step.name]
)

### Define a Model Step to Create a Model

In [55]:
#     from sagemaker.model import Model
#     from sagemaker.workflow.model_step import ModelStep

#     model = SKLearnModel(
#         model_data = training_step.properties.ModelArtifacts.S3ModelArtifacts,
#         role = sagemaker_role,
#         entry_point = "scripts/rf_script.py",
#         framework_version = "1.2-1",
#         name = "Predictive-Maintenance-Model-DataRest",
#         sagemaker_session = pipeline_session,
#     )

#     create_model_step = ModelStep(
#         name = "PdM-PreDeployment",    
#         step_args = model.create(instance_type="ml.m5.xlarge")
#     )

In [34]:
# from sagemaker.sklearn.model import SKLearnModel
# from sagemaker.workflow.model_step import ModelStep

# model = Model(
# #     image_uri = sagemaker.image_uris.retrieve('sklearn', region = region, version = "1.2-1",),
#     image_uri = training_step.properties.AlgorithmSpecification.TrainingImage,
#     model_data = training_step.properties.ModelArtifacts.S3ModelArtifacts,
#     sagemaker_session = pipeline_session,
#     name = "Predictive-Maintenance-Model-DataRest",
#     role = sagemaker_role,
# )

# # Using the SKLearn Model to create model step
# # SKLearnModel is used for deployment-only workflows, where the model has already been trained
# model = SKLearnModel(
#     entry_point = "scripts/rf_script.py",
#     model_data = training_step.properties.ModelArtifacts.S3ModelArtifacts,
#     sagemaker_session = pipeline_session,
#     framework_version = "1.2-1",
#     role = sagemaker_role,
# )

# step_create_model = ModelStep(
#     name = "PdM-Create-Model",
#     step_args = model.create(instance_type = "ml.m5.large"),
#     depends_on = [training_step.name]
# )

### Define a Transform Step to Perform Batch Transformation

In [56]:
# Throwing AttributeError: 'NoneType' object has no attribute 'startswith'
## ----------------------------------
from sagemaker.transformer import Transformer
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep

transformer = Transformer(
    model_name = "pipelines-s3w0zfsb10v5-PdM-Model-Creation-C-3ONBZoPQPt",
    instance_type = "ml.m5.xlarge",
    instance_count = 1,
    output_path = f"s3://{bucket}/{prefix}/PdM-Batch-Transform",
    accept = 'text/csv',
    strategy = 'MultiRecord',
    base_transform_job_name = "PdM-Batch-Transform-Job",
    env = {
        'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT': 'text/csv',
        'SAGEMAKER_DEFAULT_INVOCATIONS_CONTENT_TYPE': 'text/csv'
    }
)

transform_step = TransformStep(
    name = "PdM-Batch-Transform", transformer = transformer, inputs = TransformInput(data = batch_data)
)
## ----------------------------------

transformer.transform(
    data = batch_data_uri,
    content_type = 'text/csv',
)

INFO:sagemaker:Creating transform job with name: PdM-Batch-Transform-Job-2023-05-02-06-14-51-647


..........................[34m2023-05-02 06:19:12,888 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-02 06:19:12,890 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-02 06:19:12,891 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
    

KeyboardInterrupt: 

### Run Bias Metrics with Clarify

In [59]:
import pathlib

# Initializes a configuration of both input and output datasets.
bias_data_config = sagemaker.clarify.DataConfig(
    s3_data_input_path = train_test_split_step.properties.ProcessingOutputConfig.Outputs[
        "train"
    ].S3Output.S3Uri,
    s3_output_path = f"s3://{bucket}/{prefix}/clarify-output/pipeline/bias",
    label = "failure",
    dataset_type = "text/csv",
)

# Initializes a configuration of the sensitive groups in the dataset.
bias_config = sagemaker.clarify.BiasConfig(
    label_values_or_threshold = [0],
    facet_name = "age",
    facet_values_or_threshold = [1],
)

analysis_config = bias_data_config.get_config()
analysis_config.update(bias_config.get_config())
analysis_config["methods"] = {"pre_training_bias": {"methods": "all"}}

clarify_config_dir = pathlib.Path("config")
clarify_config_dir.mkdir(exist_ok = True)
with open(clarify_config_dir / "analysis_config.json", "w") as f:
    json.dump(analysis_config, f)

s3_client.upload_file(
    Filename = "config/analysis_config.json",
    Bucket = bucket,
    Key = f"{prefix}/clarify-config/analysis_config.json",
)

In [62]:
clarify_processor = sagemaker.processing.Processor(
    base_job_name = "PdM-Clarify-Processor",
    image_uri = sagemaker.clarify.image_uris.retrieve(framework = "clarify", region = region, version = "1.2-1"),
    role = sagemaker_role,
    instance_count = 1,
    instance_type = "ml.c5.xlarge",
)

clarify_step = ProcessingStep(
    name = "PdM-ClarifyProcessor",
    processor = clarify_processor,
    inputs = [
        sagemaker.processing.ProcessingInput(
            input_name = "analysis_config",
            source = f"s3://{bucket}/{prefix}/clarify-config/analysis_config.json",
            destination = "/opt/ml/processing/input/config",
        ),
        sagemaker.processing.ProcessingInput(
            input_name = "dataset",
            source = train_test_split_step.properties.ProcessingOutputConfig.Outputs[
                "train"
            ].S3Output.S3Uri,
            destination = "/opt/ml/processing/input/data",
        ),
    ],
    outputs = [
        sagemaker.processing.ProcessingOutput(
            source = "/opt/ml/processing/output/analysis.json",
            destination = f"s3://{bucket}/{prefix}/clarify-output/pipeline/bias",
            output_name = "analysis_result",
        )
    ],
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


### Define a Register Model Step to Create a Model Package

In [None]:
mpg_name = prefix

model_metrics = demo_helpers.ModelMetrics(
    bias=sagemaker.model_metrics.MetricsSource(
        s3_uri=clarify_step.properties.ProcessingOutputConfig.Outputs[
            "analysis_result"
        ].S3Output.S3Uri,
        content_type="application/json",
    )
)

In [20]:
register_args = model.register(
    content_types = ["text/csv"],
    response_types = ["text/csv"],
    inference_instances = ["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances = ["ml.m5.xlarge"],
    model_package_group_name = "PdM-ModelPackageGroupName",
    approval_status = model_approval_status,
)
step_register = ModelStep(name = "PdM-RegisterModel", step_args = register_args)

NameError: name 'model' is not defined

In [21]:
from sagemaker.workflow.step_collections import RegisterModel

register_step = RegisterModel(
    name = "PdM-Register-Model",
    estimator = sklearn_estimator,
    model_data = training_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types = ["text/csv"],
    response_types = ["text/csv"],
    inference_instances = ["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances = ["ml.m5.xlarge"],
    model_package_group_name = "predictive-maintenance",
    approval_status = model_approval_status,
)

In [22]:
# Data Capture Config
deploy_args = framework_processor.run(
    code = "deploy_model.py",
    source_dir = "scripts",
    arguments = [
        "--model-data",
        training_step.properties.ModelArtifacts.S3ModelArtifacts,
        "--region",
        region,
        "--endpoint-instance-type",
        "ml.m5.xlarge",
        "--endpoint-name",
        "PdM-SKLearn-Pipeline-Endpoint",
    ],
)

deploy_step = ProcessingStep(
    name = "PdM-DeployModel",
    step_args = deploy_args
)

In [43]:
# deploy_args = framework_processor.run(
#     code = "deploy_model.py",
#     source_dir = "scripts",
#     arguments = [
#         "--model-name",
#         create_model_step.properties.ModelName,
#         "--region",
#         region,
#         "--endpoint-instance-type",
#         "ml.m5.xlarge",
#         "--endpoint-name",
#         "PdM-SKLearn-Pipeline-Endpoint",
#     ],
# )

# deploy_step = ProcessingStep(
#     name = "PdM-DeployModel",
#     step_args = deploy_args
# )

### Define a Pipeline of Parameters, Steps, and Conditions

In [23]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = "Predictive-Maintenance-Pipeline"
pipeline = Pipeline(
    name = pipeline_name,
    parameters = [
        processing_instance_count,
        instance_type,
        model_approval_status,
    ],
    steps = [preprocess_step, 
             feature_store_step, 
             train_test_split_step, 
             training_step, 
#              create_model_step,
             register_step, 
             deploy_step],
)

In [61]:
definition = json.loads(pipeline.definition())
definition

INFO:sagemaker.processing:Uploaded scripts to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/4e77a1652948c15f8238ad155f94d9b2/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/54f0ef6bee583ff9186b762aaf572190/runproc.sh
INFO:sagemaker.processing:Uploaded scripts to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/7c8208efe04bc0be19adf43987768eb5/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/efe76bc8a2d1c725e4512e8329a05ce5/runproc.sh
INFO:sagemaker.processing:Uploaded scripts to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/fd3a7b1fe5101af0047c8fa381d57c4f/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/203aada7b8aa37a41044caef741

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceCount',
   'Type': 'Integer',
   'DefaultValue': 1},
  {'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'PdM-Data-Read-And-PreProcessing',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.m5.xlarge',
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3',
     'ContainerEntrypoint': ['/bin/bash',
      '/opt/ml/processing/input/entrypoint/runproc.sh']},
    'RoleArn': 'arn:aws:iam::4516

In [62]:
pipeline.upsert(role_arn = sagemaker_role)

INFO:sagemaker.processing:Uploaded scripts to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/4e77a1652948c15f8238ad155f94d9b2/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/54f0ef6bee583ff9186b762aaf572190/runproc.sh
INFO:sagemaker.processing:Uploaded scripts to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/7c8208efe04bc0be19adf43987768eb5/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/efe76bc8a2d1c725e4512e8329a05ce5/runproc.sh
INFO:sagemaker.processing:Uploaded scripts to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/fd3a7b1fe5101af0047c8fa381d57c4f/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-451633145432/Predictive-Maintenance-Pipeline/code/203aada7b8aa37a41044caef741

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:451633145432:pipeline/predictive-maintenance-pipeline',
 'ResponseMetadata': {'RequestId': '63bdc9b3-a047-47a3-972b-d54eee9c5996',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '63bdc9b3-a047-47a3-972b-d54eee9c5996',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '99',
   'date': 'Thu, 04 May 2023 12:35:46 GMT'},
  'RetryAttempts': 0}}

In [63]:
execution = pipeline.start()

In [64]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:451633145432:pipeline/predictive-maintenance-pipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:451633145432:pipeline/predictive-maintenance-pipeline/execution/nfdikljekywz',
 'PipelineExecutionDisplayName': 'execution-1683203749099',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2023, 5, 4, 12, 35, 48, 990000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2023, 5, 4, 12, 35, 48, 990000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:451633145432:user-profile/d-l7econawrxww/aiml-sandbox',
  'UserProfileName': 'aiml-sandbox',
  'DomainId': 'd-l7econawrxww'},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:451633145432:user-profile/d-l7econawrxww/aiml-sandbox',
  'UserProfileName': 'aiml-sandbox',
  'DomainId': 'd-l7econawrxww'},
 'ResponseMetadata': {'RequestId': '37f1eabe-4732-4eeb-831f-aea4e6d82484',
  'HTTPStatusCode': 200,
  'HTTPH

In [66]:
execution.list_steps()

[{'StepName': 'PdM-Data-Read-And-PreProcessing',
  'StartTime': datetime.datetime(2023, 5, 4, 12, 35, 50, 159000, tzinfo=tzlocal()),
  'StepStatus': 'Executing',
  'AttemptCount': 0,
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:451633145432:processing-job/pipelines-nfdikljekywz-PdM-Data-Read-And-Pr-RZnUAJRQRg'}}}]

In [60]:
import time
from sagemaker.lineage.visualizer import LineageTableVisualizer


viz = LineageTableVisualizer(sagemaker.session.Session())
for execution_step in reversed(execution.list_steps()):
    print(execution_step)
    display(viz.show(pipeline_execution_step=execution_step))
    time.sleep(5)

{'StepName': 'PredictiveMaintenanceProcess', 'StartTime': datetime.datetime(2023, 4, 23, 17, 17, 26, 95000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2023, 4, 23, 17, 21, 36, 865000, tzinfo=tzlocal()), 'StepStatus': 'Failed', 'AttemptCount': 0, 'FailureReason': 'ClientError: AlgorithmError: See job logs for more information', 'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:451633145432:processing-job/pipelines-zz185sc95bbe-predictivemaintenanc-r6sgbfjtd8'}}}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...7d1d7483ff10e4a22a58f19/preprocessing.py,Input,DataSet,ContributedTo,artifact
1,s3://...ve-maintenance/data/train-test/train.csv,Input,DataSet,ContributedTo,artifact
2,68331...com/sagemaker-scikit-learn:1.2-1-cpu-py3,Input,Image,ContributedTo,artifact
3,s3://...PredictiveMaintenanceProcess/output/test,Output,DataSet,Produced,artifact
4,s3://...redictiveMaintenanceProcess/output/train,Output,DataSet,Produced,artifact


### List and Check Endpoint Status

In [25]:
endpoint_info = sagemaker_boto_client.describe_endpoint(EndpointName = "PdM-SKLearn-Pipeline-Endpoint")
endpoint_info["EndpointStatus"]

'InService'

In [27]:
sagemaker_boto_client.list_endpoints(NameContains = "PdM-SKLearn-Pipeline-Endpoint")[
    "Endpoints"
]

[{'EndpointName': 'PdM-SKLearn-Pipeline-Endpoint',
  'EndpointArn': 'arn:aws:sagemaker:us-east-1:451633145432:endpoint/pdm-sklearn-pipeline-endpoint',
  'CreationTime': datetime.datetime(2023, 5, 4, 12, 53, 4, 753000, tzinfo=tzlocal()),
  'LastModifiedTime': datetime.datetime(2023, 5, 4, 12, 54, 52, 972000, tzinfo=tzlocal()),
  'EndpointStatus': 'InService'}]

### Test the Endpoint

In [125]:
inference_data = [[170.301017,449.0369949,94.80520453,40.81679659,11.0616672,58.42505515,4.931305335,2.428740172,176.8443758,456.5981069,100.65744,39.20591517,13.01510513,53.2529348,9.681705707,5.916193396,0,0,0,0,0,28.875,13.875,118.875,28.875,18,0,0,1,0]]

In [121]:
# body = json.dumps(inference_data)
content_type = "text/csv"

# respnse type
accept = "text/plain"
endpoint_name = "PdM-SKLearn-Model-Pipeline"
print(body)
# res = sm_runtime.invoke_endpoint(
#     EndpointName=endpoint_name,
#     Body=body,  # encoded input data
#     ContentType=content_type,  # I told the endpoint what's the encode
#     Accept=accept,  # I told the endpoint how I want to decode its response
# )

"170.301017,449.0369949,94.80520453,40.81679659,11.0616672,58.42505515,4.931305335,2.428740172,176.8443758,456.5981069,100.65744,39.20591517,13.01510513,53.2529348,9.681705707,5.916193396,0,0,0,0,0,28.875,13.875,118.875,28.875,18,0,0,1,0"


In [122]:
# "170.301017,449.0369949,94.80520453,40.81679659,11.0616672,58.42505515,4.931305335,2.428740172,176.8443758,456.5981069,100.65744,39.20591517,13.01510513,53.2529348,9.681705707,5.916193396,0,0,0,0,0,28.875,13.875,118.875,28.875,18,0,0,1,0"
sm_runtime = boto3.client("sagemaker-runtime")

res = sm_runtime.invoke_endpoint(
    EndpointName = endpoint_name,
    Body = inference_data,  # encoded input data
    ContentType = content_type,  # I told the endpoint what's the encode
)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/PdM-SKLearn-Model-Pipeline in account 451633145432 for more information.

In [None]:
result = res['Body'].read().decode("utf-8")
print(result)

In [177]:
inference_data = pd.read_csv("datasets/train-test/test.csv")
inference_data = inference_data.drop("failure", axis = 1)
inference_data.head(5)

Unnamed: 0,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,voltmean_24h,rotatemean_24h,...,error5count,comp1,comp2,comp3,comp4,age,model_model1,model_model2,model_model3,model_model4
0,170.301017,449.036995,94.805205,40.816797,11.061667,58.425055,4.931305,2.42874,176.844376,456.598107,...,0.0,28.875,13.875,118.875,28.875,18,0,0,1,0
1,165.339972,435.660354,103.35132,31.892462,10.717864,26.009485,22.071933,6.020669,176.141499,453.900566,...,0.0,29.0,14.0,119.0,29.0,18,0,0,1,0
2,183.752875,463.05864,109.525083,41.945037,9.369264,43.646584,10.859804,9.395067,175.764202,451.753148,...,0.0,29.125,14.125,119.125,29.125,18,0,0,1,0
3,177.866822,506.692032,98.74526,39.861149,16.59609,38.086352,10.410456,5.418325,175.352459,455.124136,...,0.0,29.25,14.25,119.25,29.25,18,0,0,1,0
4,167.471524,425.963281,111.996389,39.396999,9.015089,63.342755,4.648154,6.365146,174.712824,451.436346,...,0.0,29.375,14.375,119.375,29.375,18,0,0,1,0


In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer, JSONSerializer

endpoint_name = "PdM-SKLearn-Pipeline-Endpoint"

inference_data = [[175.0811885,367.0124825,85.56419146,38.93223666,9.619609293,14.06342007,8.532316108,6.92274997,173.6695574,410.6964281,101.2469478,39.05261843,16.14284195,46.31450847,10.57037821,5.183643471,0,1,2,0,0,15,0,135,0,18,0,0,1,0]]
predictor = Predictor(
    endpoint_name = endpoint_name, 
    sagemaker_session = sagemaker_session, 
    serializer = JSONSerializer()
)
print(predictor.predict(inference_data))

In [None]:
for i in inference_data.values.tolist():
    data = []
    data.append(i)
    result = predictor.predict(data).decode("utf-8")
    print(f"Value Predictied: {result}")

Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
Value Predictied: ["none"]
V

### Testing with Json onput

In [238]:
sagemaker_runtime = boto3.client('sagemaker-runtime')

inference_data = [[175.0811885,367.0124825,85.56419146,38.93223666,9.619609293,14.06342007,8.532316108,6.92274997,173.6695574,410.6964281,101.2469478,39.05261843,16.14284195,46.31450847,10.57037821,5.183643471,0,1,2,0,0,15,0,135,0,18,0,0,1,0]]

# specify the endpoint name and content type
endpoint_name = "PdM-SKLearn-Pipeline-Endpoint"
content_type = 'application/json'

# convert the input data to JSON format
json_data = json.dumps(inference_data)
print(type(json_data))

# make a request to the SageMaker endpoint
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=json_data,
)

print(f"Response object: {response}")
print(type(response))

# parse the response
result = json.loads(response['Body'].read().decode())

# print the inference result
print(result)


<class 'str'>
Response object: {'ResponseMetadata': {'RequestId': 'f75a2e84-9bba-4047-9e81-edd47c0d7a98', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'f75a2e84-9bba-4047-9e81-edd47c0d7a98', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Tue, 09 May 2023 06:53:29 GMT', 'content-type': 'application/json', 'content-length': '9'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7fb9ffd4ae50>}
<class 'dict'>
['comp2']


### Testing with CSV input

In [247]:
inference_data = "175.0811885,367.0124825,85.56419146,38.93223666,9.619609293,14.06342007,8.532316108,6.92274997,173.6695574,410.6964281,101.2469478,39.05261843,16.14284195,46.31450847,10.57037821,5.183643471,0,1,2,0,0,15,0,135,0,18,0,0,1,0"

# specify the endpoint name and content type
endpoint_name = "PdM-SKLearn-Pipeline-Endpoint"
content_type = 'text/csv'

# make a request to the SageMaker endpoint
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=inference_data,
)

print(f"Response object: {response}")
print(type(response))

# parse the response
# result = json.loads(response['Body'].read().decode())

# print the inference result
# print(result)


ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/PdM-SKLearn-Pipeline-Endpoint in account 451633145432 for more information.

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/PdM-SKLearn-Pipeline-Endpoint in account 451633145432 for more information.

### Model Monitoring - Data Drift
- DefaultModelMonitor() - Data Drift
- ModelQualityMonitor() - Model Drift

In [28]:
from datetime import datetime

In [55]:
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

pdm_model_quality_monitor = DefaultModelMonitor(
    role = sagemaker_role,
    instance_count = 1,
    instance_type = "ml.m5.xlarge",
    volume_size_in_gb = 1,
    max_runtime_in_seconds = 360,
    sagemaker_session = sagemaker_session,
)

TypeError: __init__() got an unexpected keyword argument 'monitor_schedule_name'

In [61]:
baseline_data = "datasets/train-test/train.csv"
baseline_results_uri = f"s3://{bucket}/{prefix}/data/baselining/results"
baseline_job_name = f"PdM-Baseline-Job-Data-Monitor-{datetime.utcnow():%Y-%m-%d-%H%M}"

pdm_model_quality_monitor.suggest_baseline(
    job_name = baseline_job_name,
    baseline_dataset = baseline_data,
    dataset_format = DatasetFormat.csv(header = True),
    output_s3_uri = baseline_results_uri,
)

INFO:sagemaker:Creating processing-job with name PdM-Baseline-Job-Data-Monitor-2023-05-08-0649


.........................[34m2023-05-08 06:53:39,474 - matplotlib.font_manager - INFO - Generating new fontManager, this may take some time...[0m
[34m2023-05-08 06:53:40.027614: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory[0m
[34m2023-05-08 06:53:40.027647: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.[0m
[34m2023-05-08 06:53:41.603397: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory[0m
[34m2023-05-08 06:53:41.603429: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)[0m
[34m2023-05-08 06:53:41.603453: I tensorflow/stream_executor/cuda/cuda_diagnostics

<sagemaker.processing.ProcessingJob at 0x7fba158997d0>

In [65]:
baseline_job = pdm_model_quality_monitor.latest_baselining_job

In [230]:
print(baseline_job)

<sagemaker.model_monitor.model_monitoring.BaseliningJob object at 0x7fba158992d0>


In [66]:
schema_df = pd.io.json.json_normalize(baseline_job.baseline_statistics().body_dict["features"])
schema_df.head(10)

  """Entry point for launching an IPython kernel.


Unnamed: 0,name,inferred_type,numerical_statistics.common.num_present,numerical_statistics.common.num_missing,numerical_statistics.mean,numerical_statistics.sum,numerical_statistics.std_dev,numerical_statistics.min,numerical_statistics.max,numerical_statistics.distribution.kll.buckets,numerical_statistics.distribution.kll.sketch.parameters.c,numerical_statistics.distribution.kll.sketch.parameters.k,numerical_statistics.distribution.kll.sketch.data,string_statistics.common.num_present,string_statistics.common.num_missing,string_statistics.distinct_count,string_statistics.distribution.categorical.buckets
0,voltmean_3h,Fractional,217410.0,0.0,170.791034,37131680.0,9.492604,125.532506,241.420717,"[{'lower_bound': 125.53250571207518, 'upper_bo...",0.64,2048.0,"[[167.79487771268165, 171.603703637144, 177.55...",,,,
1,rotatemean_3h,Fractional,217410.0,0.0,446.655268,97107320.0,33.024742,211.811184,584.830768,"[{'lower_bound': 211.81118441944167, 'upper_bo...",0.64,2048.0,"[[463.1528971222156, 441.41769323748935, 468.3...",,,,
2,pressuremean_3h,Fractional,217410.0,0.0,100.826458,21920680.0,7.387576,72.118639,162.309656,"[{'lower_bound': 72.11863935291926, 'upper_bou...",0.64,2048.0,"[[101.34153968848203, 107.76510085160199, 94.6...",,,,
3,vibrationmean_3h,Fractional,217410.0,0.0,40.396874,8782684.0,3.488732,26.569635,69.311324,"[{'lower_bound': 26.569635352906833, 'upper_bo...",0.64,2048.0,"[[36.902166654524365, 43.71610625186734, 47.64...",,,,
4,voltsd_3h,Fractional,217410.0,0.0,13.283418,2887948.0,6.969838,0.025509,58.444332,"[{'lower_bound': 0.02550887572660243, 'upper_b...",0.64,2048.0,"[[19.113673369961358, 11.004017122591721, 8.00...",,,,
5,rotatesd_3h,Fractional,217410.0,0.0,44.42953,9659424.0,23.21495,0.078991,179.903039,"[{'lower_bound': 0.0789906042092925, 'upper_bo...",0.64,2048.0,"[[33.14340686390072, 37.45519735513709, 49.227...",,,,
6,pressuresd_3h,Fractional,217410.0,0.0,8.880941,1930805.0,4.649008,0.027417,34.910352,"[{'lower_bound': 0.02741697855984789, 'upper_b...",0.64,2048.0,"[[11.777238686887012, 5.018177177004166, 3.977...",,,,
7,vibrationsd_3h,Fractional,217410.0,0.0,4.442863,965922.9,2.322745,0.015375,18.305595,"[{'lower_bound': 0.015374693376591765, 'upper_...",0.64,2048.0,"[[2.7780427681845623, 4.04623180112535, 4.5948...",,,,
8,voltmean_24h,Fractional,217410.0,0.0,170.771986,37127540.0,4.175947,156.713608,206.333895,"[{'lower_bound': 156.71360772998275, 'upper_bo...",0.64,2048.0,"[[170.30209225913416, 170.76029599609637, 171....",,,,
9,rotatemean_24h,Fractional,217410.0,0.0,446.664528,97109340.0,15.660982,315.190034,491.081522,"[{'lower_bound': 315.1900341737185, 'upper_bou...",0.64,2048.0,"[[459.085308468445, 459.02234667320295, 460.04...",,,,


In [67]:
constraints_df = pd.io.json.json_normalize(baseline_job.suggested_constraints().body_dict["features"])
constraints_df.head(10)

  """Entry point for launching an IPython kernel.


Unnamed: 0,name,inferred_type,completeness,num_constraints.is_non_negative,string_constraints.domains
0,voltmean_3h,Fractional,1.0,True,
1,rotatemean_3h,Fractional,1.0,True,
2,pressuremean_3h,Fractional,1.0,True,
3,vibrationmean_3h,Fractional,1.0,True,
4,voltsd_3h,Fractional,1.0,True,
5,rotatesd_3h,Fractional,1.0,True,
6,pressuresd_3h,Fractional,1.0,True,
7,vibrationsd_3h,Fractional,1.0,True,
8,voltmean_24h,Fractional,1.0,True,
9,rotatemean_24h,Fractional,1.0,True,


### Create Monitoring Schedule

In [73]:
# Cron Expression Generator
from sagemaker.model_monitor import CronExpressionGenerator

monitor_schedule_name = "PdM-DataDrift-Monitoring-Schedule"

pdm_model_quality_monitor.create_monitoring_schedule(
    monitor_schedule_name = monitor_schedule_name,
    statistics = pdm_model_quality_monitor.baseline_statistics(),
    endpoint_input = endpoint_name,
    constraints = pdm_model_quality_monitor.suggested_constraints(),
    schedule_cron_expression = CronExpressionGenerator.hourly(),
    output_s3_uri = baseline_results_uri,
    enable_cloudwatch_metrics = True
)

INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: PdM-DataDrift-Monitoring-Schedule


In [246]:
CronExpressionGenerator.daily_every_x_hours(hour_interval = 0.01, starting_hour = 0)

'cron(0 0/0.01 ? * * *)'

In [72]:
sagemaker_boto_client.delete_monitoring_schedule(MonitoringScheduleName = "PdM-DataDrift-Monitoring-Schedule")

{'ResponseMetadata': {'RequestId': '958ae49b-196c-4d1b-bab3-d81592a93482',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '958ae49b-196c-4d1b-bab3-d81592a93482',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Mon, 08 May 2023 07:03:10 GMT'},
  'RetryAttempts': 0}}

In [135]:
executions = pdm_model_quality_monitor.list_executions()

In [144]:
desc_schedule_result = pdm_model_quality_monitor.describe_schedule()
print(f"Schedule status: {desc_schedule_result['MonitoringScheduleStatus']}")

Schedule status: Scheduled


In [236]:
desc_schedule_result["LastMonitoringExecutionSummary"]

{'MonitoringScheduleName': 'PdM-DataDrift-Monitoring-Schedule',
 'ScheduledTime': datetime.datetime(2023, 5, 8, 12, 0, tzinfo=tzlocal()),
 'CreationTime': datetime.datetime(2023, 5, 8, 12, 9, 29, 410000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2023, 5, 8, 12, 11, 14, 340000, tzinfo=tzlocal()),
 'MonitoringExecutionStatus': 'Failed',
 'EndpointName': 'PdM-SKLearn-Pipeline-Endpoint',
 'FailureReason': 'Job inputs had no data'}

In [237]:
ongoing_executions = pdm_model_quality_monitor.list_executions()
while len(executions) == 0:
    print("Waiting for the 1st execution to happen...")
    time.sleep(60)
    ongoing_executions = pdm_model_quality_monitor.list_executionst()
print(f"Ongoing Executions: {len(ongoing_executions)}")

Ongoing Executions: 9


In [233]:
ongoing_executions[-1].describe()

{'ProcessingInputs': [{'InputName': 'baseline',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://ideaaiml-demo/mlops/predictive-maintenance/data/baselining/results/statistics.json',
    'LocalPath': '/opt/ml/processing/baseline/stats',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated'}},
  {'InputName': 'constraints',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://ideaaiml-demo/mlops/predictive-maintenance/data/baselining/results/constraints.json',
    'LocalPath': '/opt/ml/processing/baseline/constraints',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated'}},
  {'InputName': 'endpoint_input_1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://ideaaiml-demo/mlops/predictive-maintenance/data-capture-model-monitor/PdM-SKLearn-Pipeline-Endpoint/AllTraffic/2023/05/09/05',
    'LocalPath': '/opt/ml/processing/input/endpoint/PdM-SKLearn-Pipeline-Endpoint/AllTraffic/20

In [None]:
suggested_constraints = pdm_model_quality_monitor.suggested_constraints()
baseline_statistics = pdm_model_quality_monitor.baseline_statistics()

print(f"Suggested constraints: {type(suggested_constraints)}")
print(f"Baseline Statistics: {type(baseline_statistics)}")

print(suggested_constraints.file_s3_uri)
print(baseline_statistics.file_s3_uri)

In [None]:
pdm_model_quality_monitor.constraints(suggested_constraints.file_s3_uri)
pdm_model_quality_monitor.cons

In [138]:
latest_monitoring_violations = pdm_model_quality_monitor.latest_monitoring_constraint_violations()
latest_monitoring_schedule = pdm_model_quality_monitor.latest_monitoring_schedule()
print(f"Latest Monitoring Violations: {latest_monitoring_constraint_violations}")
print(f"Latest Monitoring Schedule: {latest_monitoring_schedule}")


Could not retrieve constraints file at location 's3://ideaaiml-demo/mlops/predictive-maintenance/data/baselining/results/PdM-SKLearn-Pipeline-Endpoint/PdM-DataDrift-Monitoring-Schedule/2023/05/08/10/constraint_violations.json'. To manually retrieve ConstraintViolations object from a given uri, use 'my_model_monitor.constraints(my_s3_uri)' or 'ConstraintViolations.from_s3_uri(my_s3_uri)'


UnexpectedStatusException: The underlying job is not in 'Completed' state. You may only retrieve files for a job that has completed successfully.

In [None]:
from threading import Thread

def invoke_endpoint(ep_name, file_name):
    with open(file_name, "r") as f:
        i = 0
        for row in f:
            payload = row.rstrip("\n")
            print(payload)
            predictor.predict(inference_data)

def invoke_endpoint_forever():
    while True:
        try:
            invoke_endpoint(endpoint_name, "datasets/PdM_inference_data.csv")
        except sagemaker_session.sagemaker_runtime_client.exceptions.ValidationError:
            pass


thread = Thread(target = invoke_endpoint_forever)
thread.start()

#### List Captured Data

In [151]:
from sagemaker.s3 import S3Downloader

print("Waiting for captures to show up", end = "")
for _ in range(120):
    capture_files = sorted(S3Downloader.list(f"s3://{bucket}/{prefix}/data-capture-model-monitor/{endpoint_name}"))
    if capture_files:
        capture_file = S3Downloader.read_file(capture_files[-1]).split("\n")
        capture_record = json.loads(capture_file[0])
        if "inferenceId" in capture_record["eventMetadata"]:
            break
    print(".", end="", flush=True)
    time.sleep(1)
print()
print("Found Capture Files:")
print("\n ".join(capture_files[-3:]))

Waiting for captures to show up........................................................................................................................
Found Capture Files:
s3://ideaaiml-demo/mlops/predictive-maintenance/data-capture-model-monitor/PdM-SKLearn-Pipeline-Endpoint/AllTraffic/2023/05/08/09/53-57-217-09ca2202-0133-4b13-a6f9-20422a8bea78.jsonl
 s3://ideaaiml-demo/mlops/predictive-maintenance/data-capture-model-monitor/PdM-SKLearn-Pipeline-Endpoint/AllTraffic/2023/05/08/12/25-51-482-cdcc4a42-5088-4fec-9411-5110956fce3a.jsonl
 s3://ideaaiml-demo/mlops/predictive-maintenance/data-capture-model-monitor/PdM-SKLearn-Pipeline-Endpoint/AllTraffic/2023/05/08/12/27-13-319-a9a5f0cd-4319-4479-be04-c77734539bab.jsonl
