## Predictive Maintenance Pipeline

In [1]:
import json
import time
import boto3
import string
import sagemaker
import pandas as pd
import numpy as np
import awswrangler as wr

from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.workflow.pipeline_context import PipelineSession

In [None]:
region = sagemaker.Session().boto_region_name
print("Using AWS Region: {}".format(region))

In [3]:
boto3.setup_default_session(region_name = region)
boto_session = boto3.Session(region_name = region)

s3_client = boto3.client("s3", region_name = region)

sagemaker_boto_client = boto_session.client("sagemaker")
sagemaker_session = sagemaker.session.Session(
    boto_session = boto_session, sagemaker_client = sagemaker_boto_client
)
account_id = boto3.client("sts").get_caller_identity()["Account"]
sagemaker_role = sagemaker.get_execution_role()
pipeline_session = PipelineSession()

In [None]:
sagemaker.get_execution_role()

In [5]:
bucket = "BUCKET-NAME"
prefix = "mlops/predictive-maintenance"

### Upload raw data to S3

In [13]:
s3_client.upload_file(
    Filename="datasets/PdM_telemetry.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_telemetry.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_errors.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_errors.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_maint.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_maint.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_failures.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_failures.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_machines.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_machines.csv"
)

In [None]:
!pip install s3fs

### Read Data from S3

In [224]:
telemetry_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_telemetry.csv"
errors_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_errors.csv"
maint_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_maint.csv"
failures_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_failures.csv"
machines_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_machines.csv"

In [73]:
response = s3_client.get_object(Bucket = bucket, Key = f"{prefix}/data/raw/PdM_telemetry.csv")
telemetry = pd.read_csv(response.get("Body"))
telemetry.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511


In [None]:
telemetry = pd.read_csv(telemetry_data_uri)
# errors = wr.s3.read_csv(errors_data_uri)
# maint = wr.s3.read_csv(maint_data_uri)
# failures = wr.s3.read_csv(failures_data_uri)
# machines = wr.s3.read_csv(machines_data_uri)

### Define Parameters to Parametrize Pipeline Execution

In [6]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

In [7]:
input_data_uri = f"s3://{bucket}/{prefix}/data/final_data.csv"
batch_data_uri = f"s3://{bucket}/{prefix}/data/train-test/test.csv"

In [8]:
processing_instance_count = ParameterInteger(name = "ProcessingInstanceCount", default_value = 1)
instance_type = ParameterString(name = "TrainingInstanceType", default_value = "ml.m5.xlarge")

model_approval_status = ParameterString(
    name = "ModelApprovalStatus", default_value = "PendingManualApproval"
)

input_data = ParameterString(
    name = "InputData",
    default_value = input_data_uri,
)

telemetry_data = ParameterString(
    name = "TelemetryData",
    default_value = f"s3://{bucket}/{prefix}/data/raw/PdM_telemetry.csv",
)

errors_data = ParameterString(
    name = "ErrorsData",
    default_value = f"s3://{bucket}/{prefix}/data/raw/PdM_errors.csv",
)

maint_data = ParameterString(
    name = "MaintenanceData",
    default_value = f"s3://{bucket}/{prefix}/data/raw/PdM_maint.csv",
)

failures_data = ParameterString(
    name = "FailuresData",
    default_value = f"s3://{bucket}/{prefix}/data/raw/PdM_failures.csv",
)

machines_data = ParameterString(
    name = "MachinesData",
    default_value = f"s3://{bucket}/{prefix}/data/raw/PdM_machines.csv",
)

batch_data = ParameterString(
    name = "BatchData",
    default_value = batch_data_uri,
)

mse_threshold = ParameterFloat(name = "MseThreshold", default_value = 6.0)

### Define a Processing Step for Feature Engineering

### Preprocessing Script

In [9]:
%%writefile processing_scripts/preprocessing.py
import numpy as np
import pandas as pd
import boto3
from io import StringIO
import awswrangler as wr

base_dir = "/opt/ml/processing"
bucket = "BUCKET-NAME"
prefix = "mlops/predictive-maintenance"

def upload_file_s3(df, name):
    boto3.setup_default_session(region_name = "us-east-1")
    s3_client = boto3.client("s3", region_name = "us-east-1")
    with StringIO() as csv_buffer:
        df.to_csv(csv_buffer, index = False)

        response = s3_client.put_object(
            Bucket = bucket, Key = f"{prefix}/data/preprocessed/{name}.csv", Body = csv_buffer.getvalue()
        )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

        if status == 200:
            print(f"Successful S3 put_object response. Status - {status}")
        else:
            print(f"Unsuccessful S3 put_object response. Status - {status}")

# Convert to datetime datatype
def datetime_datatype(df):
    print("Converting to type datetime")
    df['datetime'] = pd.to_datetime(df['datetime'], format="%Y-%m-%d %H:%M:%S")
    return df


# Convert to category datatype
def category_datatype(df, column_name):
    print("Converting to type category")
    df[column_name] = df[column_name].astype('category')
    return df


# Lag Features from Telemetry
def telemetry_features(df):
    df = datetime_datatype(df)
    # Calculate mean values for telemetry features -- 3 hours rolling window
    print("Calculate mean values for telemetry features -- 3 hours rolling window")
    temp = []
    fields = ['volt', 'rotate', 'pressure', 'vibration']
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col).resample('3H', closed = 'left', label = 'right').mean().unstack())
    telemetry_mean_3h = pd.concat(temp, axis = 1)
    telemetry_mean_3h.columns = [i + 'mean_3h' for i in fields]
    telemetry_mean_3h.reset_index(inplace = True)

    # repeat for standard deviation
    print("Calculate standard deviation for telemetry features -- 3 hours rolling window")
    temp = []
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col).resample('3H', closed = 'left', label = 'right').std().unstack())
    telemetry_sd_3h = pd.concat(temp, axis = 1)
    telemetry_sd_3h.columns = [i + 'sd_3h' for i in fields]
    telemetry_sd_3h.reset_index(inplace = True)
    
    # Calculate mean values for telemetry features -- 24 hours rolling window
    print("Calculate mean values for telemetry features -- 24 hours rolling window")
    temp = []
    fields = ['volt', 'rotate', 'pressure', 'vibration']
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col)
                    .resample('3H', closed = 'left', label = 'right')
                    .first()
                    .unstack()
                    .rolling(window = 24, center = False).mean())
    telemetry_mean_24h = pd.concat(temp, axis = 1)
    telemetry_mean_24h.columns = [i + 'mean_24h' for i in fields]
    telemetry_mean_24h.reset_index(inplace = True)
    telemetry_mean_24h = telemetry_mean_24h.loc[-telemetry_mean_24h['voltmean_24h'].isnull()]

    # repeat for standard deviation
    print("Calculate standard deviation for telemetry features -- 24 hours rolling window")
    temp = []
    fields = ['volt', 'rotate', 'pressure', 'vibration']
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col)
                    .resample('3H', closed='left', label='right')
                    .first()
                    .unstack()
                    .rolling(window = 24, center = False).std())
    telemetry_sd_24h = pd.concat(temp, axis = 1)
    telemetry_sd_24h.columns = [i + 'sd_24h' for i in fields]
    telemetry_sd_24h = telemetry_sd_24h.loc[-telemetry_sd_24h['voltsd_24h'].isnull()]
    telemetry_sd_24h.reset_index(inplace = True)
    
    telemetry_feat = pd.concat([telemetry_mean_3h,
                            telemetry_sd_3h.iloc[:, 2:6],
                            telemetry_mean_24h.iloc[:, 2:6],
                            telemetry_sd_24h.iloc[:, 2:6]], axis = 1).dropna()

    upload_file_s3(telemetry_feat, "telemetry")
    
    return telemetry_feat


# Lag Features for Errors
def errors_lag_features(df):
    df = datetime_datatype(df)
    df = category_datatype(df, 'errorID')
    print("Lag features for errors")
    error_count = pd.get_dummies(df.set_index('datetime')).reset_index()
    error_count.columns = ['datetime', 'machineID', 'error1', 'error2', 'error3', 'error4', 'error5']
    error_count = error_count.groupby(['machineID', 'datetime']).sum().reset_index()
    error_count = telemetry[['datetime', 'machineID']].merge(error_count, on = ['machineID', 'datetime'], how = 'left').fillna(0.0)
    temp = []
    fields = ['error%d' % i for i in range(1, 6)]
    for col in fields:
        temp.append(pd.pivot_table(error_count,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col)
                    .resample('3H', closed='left', label='right')
                    .first()
                    .unstack()
                    .rolling(window = 24, center = False).sum())
    error_count = pd.concat(temp, axis = 1)
    error_count.columns = [i + 'count' for i in fields]
    error_count.reset_index(inplace = True)
    error_count = error_count.dropna()
    
    upload_file_s3(error_count, "errors")
    
    return error_count


# Maintenance Features
def maintenance_features(df):
    df = datetime_datatype(df)
    df = category_datatype(df, 'comp')
    print("Maintenance Features -- Days since last replacement")
    comp_rep = pd.get_dummies(df.set_index('datetime')).reset_index()
    comp_rep.columns = ['datetime', 'machineID', 'comp1', 'comp2', 'comp3', 'comp4']

    # combine repairs for a given machine in a given hour
    comp_rep = comp_rep.groupby(['machineID', 'datetime']).sum().reset_index()

    # add timepoints where no components were replaced
    comp_rep = telemetry[['datetime', 'machineID']].merge(comp_rep,
                                                          on=['datetime', 'machineID'],
                                                          how='outer').fillna(0).sort_values(by=['machineID', 'datetime'])
    components = ['comp1', 'comp2', 'comp3', 'comp4']
    for comp in components:
        comp_rep.loc[comp_rep[comp] < 1, comp] = None
        comp_rep.loc[-comp_rep[comp].isnull(),
                     comp] = comp_rep.loc[-comp_rep[comp].isnull(), 'datetime']
        comp_rep[comp] = comp_rep[comp].fillna(method = 'ffill')

    comp_rep = comp_rep.loc[comp_rep['datetime'] > pd.to_datetime('2015-01-01')]
    for comp in components:
        comp_rep[comp] = (comp_rep["datetime"] - pd.to_datetime(comp_rep[comp])) / np.timedelta64(1, "D")
        
    upload_file_s3(comp_rep, "maint")
    
    return comp_rep


# Failures Features
def failure_features(df):
    print("Failure features")
    df = datetime_datatype(df)
    df = category_datatype(df, 'failure')
    upload_file_s3(df, "failures")
    return df


# Final Features
def final_features(telemetry_df, errors_df, maint_df, machines_df):
    upload_file_s3(machines_df, "machines")
    print("Final features")
    final_feat = telemetry_df.merge(errors_df, on = ['datetime', 'machineID'], how = 'left')
    final_feat = final_feat.merge(maint_df, on = ['datetime', 'machineID'], how = 'left')
    final_feat = final_feat.merge(machines_df, on = ['machineID'], how = 'left')
    return final_feat


# Label Construction
def label_construct(tele_df, error_df, maint_df, machine_df, failure_df):
    print("----- Final Features -----")
    final_feat = final_features(tele_df, error_df, maint_df, machine_df)
    
    print("----- Label Construction -----")
    labeled_features = pd.DataFrame()
    labeled_features = final_feat.merge(
        failure_df, on = ['datetime', 'machineID'], how = 'left')
    labeled_features['failure'] = labeled_features['failure'].astype(str)
    labeled_features['failure'] = labeled_features['failure'].fillna(method = 'bfill', limit = 7)
    labeled_features['failure'] = labeled_features['failure'].replace('nan', 'none')
    print("----- Preprocessing completed -----")
    
    upload_file_s3(labeled_features, "final_data")
#     pd.DataFrame(labeled_features).to_csv(f"{base_dir}/preprocessed/final_data.csv", index = False)


if __name__ == "__main__":

    telemetry_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_telemetry.csv"
    errors_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_errors.csv"
    maint_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_maint.csv"
    failures_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_failures.csv"
    machines_data_uri = f"s3://{bucket}/{prefix}/data/raw/PdM_machines.csv"
    
    telemetry = wr.s3.read_csv(telemetry_data_uri)
    errors = wr.s3.read_csv(errors_data_uri)
    maint = wr.s3.read_csv(maint_data_uri)
    failures = wr.s3.read_csv(failures_data_uri)
    machines = wr.s3.read_csv(machines_data_uri)
    
    telemetry_df = telemetry_features(telemetry)
    errors_df = errors_lag_features(errors)
    maint_df = maintenance_features(maint)
    failures_df = failure_features(failures)
    machines_df = category_datatype(machines, 'model')
    
    label_construct(telemetry_df, errors_df, maint_df, machines_df, failures_df)

Overwriting processing_scripts/preprocessing.py


### Feature Store Creation Script

In [10]:
%%writefile processing_scripts/featurestore.py
import numpy as np
import pandas as pd
import boto3
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
import awswrangler as wr

base_dir = "/opt/ml/processing"
bucket = "BUCKET-NAME"
prefix = "mlops/predictive-maintenance"

boto_session = boto3.Session(region_name = "us-east-1")
sagemaker_boto_client = boto_session.client("sagemaker")
featurestore_runtime = boto_session.client(
    service_name = "sagemaker-featurestore-runtime", region_name = "us-east-1"
)
try:
    sagemaker_role = sagemaker.get_execution_role()
    print(f"Sagemaker Role for Feature Store file: {sagemaker_role}")
except ValueError:
    sagemaker_role = 'SAGEMAKER-ROLE'
    
feature_store_session = sagemaker.Session(
    boto_session = boto_session,
    sagemaker_client = sagemaker_boto_client,
    sagemaker_featurestore_runtime_client = featurestore_runtime,
)

# ------------------------------------ Read Data
telemetry_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/telemetry.csv"
errors_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/errors.csv"
maint_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/maint.csv"
failures_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/failures.csv"
machines_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/machines.csv"

telemetry = wr.s3.read_csv(telemetry_data_uri)
errors = wr.s3.read_csv(errors_data_uri)
maint = wr.s3.read_csv(maint_data_uri)
failures = wr.s3.read_csv(failures_data_uri)
machines = wr.s3.read_csv(machines_data_uri)

# ------------------------------------ Add Timestamp
telemetry['event_time'] = pd.to_datetime("now").timestamp()
errors['event_time'] = pd.to_datetime("now").timestamp()
maint['event_time'] = pd.to_datetime("now").timestamp()
failures['event_time'] = pd.to_datetime("now").timestamp()
machines['event_time'] = pd.to_datetime("now").timestamp()

# ------------------------------------ Create Feature Group
telemetry_feature_group = FeatureGroup(name = 'telemetry_fg', sagemaker_session = feature_store_session)
errors_feature_group = FeatureGroup(name = 'errors_fg', sagemaker_session = feature_store_session)
maintenance_feature_group = FeatureGroup(name = 'maintenance_fg', sagemaker_session = feature_store_session)
failures_feature_group = FeatureGroup(name = 'failures_fg', sagemaker_session = feature_store_session)
machines_feature_group = FeatureGroup(name = 'machines_fg', sagemaker_session = feature_store_session)

# ------------------------------------ Loading Definitions
telemetry_feature_group.load_feature_definitions(data_frame = telemetry)
errors_feature_group.load_feature_definitions(data_frame = errors)
maintenance_feature_group.load_feature_definitions(data_frame = maint)
failures_feature_group.load_feature_definitions(data_frame = failures)
machines_feature_group.load_feature_definitions(data_frame = machines)

record_identifier_feature_name = "machineID"
event_time_feature_name = "event_time"

# ------------------------------------ Telemetry Feature Store
try:
    telemetry_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "telemetry" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Errors Feature Store      
try:
    errors_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "errors" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Maintenance Feature Store
try:
    maintenance_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "maintenance" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Failures Feature Store
try:
    failures_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "failures" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Machines Feature Store
try:
    machines_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}/feature_store_data",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    time.sleep(30)
    print(f'Create "machines" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print("Using existing feature group")
    else:
        raise (e)

# ------------------------------------ Ingesting Data
while (
    telemetry_feature_group.describe()['FeatureGroupStatus'] == 'Creating'):
    print("Feature Group Creating")
    time.sleep(60)
else:
    print("Feature Group Created")
#     telemetry_feature_group.ingest(data_frame = telemetry, max_workers = 3, wait = True)
#     errors_feature_group.ingest(data_frame = errors, max_workers = 3, wait = True)
#     maintenance_feature_group.ingest(data_frame = maint, max_workers = 3, wait = True)
#     failures_feature_group.ingest(data_frame = failures, max_workers = 3, wait = True)
#     machines_feature_group.ingest(data_frame = machines, max_workers = 3, wait = True)
    print("Feature Data Ingested")

Overwriting processing_scripts/featurestore.py


### Train Test Split Script

In [12]:
%%writefile processing_scripts/train_test_split_data.py
import pandas as pd
import boto3
from io import StringIO
import awswrangler as wr

base_dir = "/opt/ml/processing"
bucket = "BUCKET-NAME"
prefix = "mlops/predictive-maintenance"

def upload_file_s3(df, name):
    boto3.setup_default_session(region_name = "us-east-1")
    s3_client = boto3.client("s3", region_name = "us-east-1")
    with StringIO() as csv_buffer:
        df.to_csv(csv_buffer, index = False)

        response = s3_client.put_object(
            Bucket = bucket, Key = f"{prefix}/data/train-test/{name}.csv", Body = csv_buffer.getvalue()
        )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

        if status == 200:
            print(f"Successful S3 put_object response. Status - {status}")
        else:
            print(f"Unsuccessful S3 put_object response. Status - {status}")

def train_test_split_script(labeled_features):
    threshold_dates = [[pd.to_datetime('2015-07-31 01:00:00'), pd.to_datetime('2015-08-01 01:00:00')],
                   [pd.to_datetime('2015-08-31 01:00:00'), pd.to_datetime('2015-09-01 01:00:00')],
                   [pd.to_datetime('2015-09-30 01:00:00'), pd.to_datetime('2015-10-01 01:00:00')]]
    
    for last_train_date, first_test_date in threshold_dates:
        # split out training and test data
        print(labeled_features['datetime'][0])
        train_y = labeled_features.loc[labeled_features['datetime'] < last_train_date, 'failure']
        train_data = pd.get_dummies(labeled_features.loc[labeled_features['datetime'] < last_train_date].drop(['datetime',
                                                                                                            'machineID',
                                                                                                              'failure'], axis = 1))
        test_y = labeled_features.loc[labeled_features['datetime'] > last_train_date, 'failure']
        test_data = pd.get_dummies(labeled_features.loc[labeled_features['datetime'] > first_test_date].drop(['datetime',
                                                                                                           'machineID',
                                                                                                             'failure'], axis = 1))
    
    train_data['failure'] = train_y
    test_data['failure'] = test_y
    
    upload_file_s3(train_data, "train")
    upload_file_s3(test_data, "test")

    pd.DataFrame(train_data).to_csv(f"{base_dir}/train/train.csv", index = False)
    pd.DataFrame(test_data).to_csv(f"{base_dir}/test/test.csv", index = False)
    
if __name__ == "__main__":
    final_data_uri = f"s3://{bucket}/{prefix}/data/preprocessed/final_data.csv"
    final_data = wr.s3.read_csv(final_data_uri)
    final_data['datetime'] = pd.to_datetime(final_data['datetime'], format="%Y-%m-%d %H:%M:%S")
    train_test_split_script(final_data)

Overwriting processing_scripts/train_test_split_data.py


### Create an instance of a SKLearnProcessor

In [13]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn import SKLearn
framework_version = "1.2-1"

sklearn_processor = FrameworkProcessor(
    estimator_cls = SKLearn,
    framework_version = framework_version,
    instance_type = "ml.m5.xlarge",
    instance_count = processing_instance_count,
    base_job_name = "sklearn-pred-maint-process",
    role = sagemaker_role,
    sagemaker_session = pipeline_session,
)

### Creating Processing Step

In [14]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

processor_args = sklearn_processor.run(
#     inputs = [
#         ProcessingInput(source = telemetry_data, destination = "/opt/ml/processing/input/telemetry"),
#         ProcessingInput(source = errors_data, destination = "/opt/ml/processing/input/errors"),
#         ProcessingInput(source = maint_data, destination = "/opt/ml/processing/input/maint"),
#         ProcessingInput(source = failures_data, destination = "/opt/ml/processing/input/failures"),
#         ProcessingInput(source = machines_data, destination = "/opt/ml/processing/input/machines"),
#     ],
#     outputs = [
#         ProcessingOutput(output_name = "final_data", source = "/opt/ml/processing/preprocessed"),
# #         ProcessingOutput(output_name = "test", source = "/opt/ml/processing/test"),
#     ],
    code = "preprocessing.py",
    source_dir = "processing_scripts"
)

preprocess_step = ProcessingStep(name = "PdM-Data-Read-And-PreProcessing", step_args = processor_args)



In [15]:
train_test_args = sklearn_processor.run(
#     inputs = [
#         ProcessingInput(source = input_data, destination = "/opt/ml/processing/processing/preprocessed"),
#     ],
    outputs = [
        ProcessingOutput(output_name = "train", source = "/opt/ml/processing/train"),
        ProcessingOutput(output_name = "test", source = "/opt/ml/processing/test"),
    ],
    code = "train_test_split_data.py",
    source_dir = "processing_scripts"
)

train_test_split_step = ProcessingStep(name = "PdM-Train-Test-Data-Split", step_args = train_test_args, depends_on = [preprocess_step.name])

In [16]:
fs_data = sklearn_processor.run(
    code = "featurestore.py",
    source_dir = "processing_scripts"
)
feature_store_step = ProcessingStep(name = "PdM-FeatureStore-Creation", step_args = fs_data, depends_on = [preprocess_step.name])

### Define a Training Step to Train a Model

In [17]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.inputs import TrainingInput

In [36]:
sklearn_estimator = SKLearn(
    entry_point = "scripts/rf_script.py",
    role = sagemaker_role,
    instance_count = 1,
    instance_type = "ml.m5.xlarge",
    framework_version = "1.2-1",
    base_job_name = "rf-scikit",
    hyperparameters = {
        "n-estimators": 100,
        "min-samples-leaf": 3,
    },
    sagemaker_session = pipeline_session,
)

In [37]:
train_args = sklearn_estimator.fit(
    inputs = {
        "train": TrainingInput(
            s3_data = train_test_split_step.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
#             s3_data = f"s3://{bucket}/{prefix}/data/train-test/train.csv",
            content_type = "text/csv"
        ),
        "test": TrainingInput(
            s3_data = train_test_split_step.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
#             s3_data = f"s3://{bucket}/{prefix}/data/train-test/test.csv",
            content_type = "text/csv",
        )
    }
)

In [38]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

training_step = TrainingStep(
    name = "PdM-ModelTraininig",
    step_args = train_args,
    depends_on = [train_test_split_step.name]
)

### Define a Create Model Step to Create a Model

In [205]:
from sagemaker.sklearn.model import SKLearnModel

model = SKLearnModel(
    entry_point = "scripts/rf_script.py",
    model_data = preprocess_step.properties.ModelArtifacts.S3ModelArtifacts,
    sagemaker_session = pipeline_session,
    framework_version = "1.2-1",
    role = sagemaker_role,
)

In [206]:
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep

step_create_model = ModelStep(
    name = "PredMaintCreateModel",
    step_args = model.create(instance_type = "ml.m5.large"),
    depends_on = [training_step.name]
)

### Define a Transform Step to Perform Batch Transformation

In [207]:
from sagemaker.transformer import Transformer

transformer = Transformer(
    model_name = step_create_model.properties.ModelName,
    instance_type = "ml.m5.xlarge",
    instance_count = 1,
    output_path = f"s3://{bucket}/{prefix}/PredMaintTransform",
)

In [208]:
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep

step_transform = TransformStep(
    name = "PredMaintTransform", transformer = transformer, inputs = TransformInput(data = batch_data)
)

### Define a Register Model Step to Create a Model Package

In [209]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics

register_args = model.register(
    content_types = ["text/csv"],
    response_types = ["text/csv"],
    inference_instances = ["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances = ["ml.m5.xlarge"],
    model_package_group_name = "PredMaintModelPackageGroupName",
    approval_status = model_approval_status,
)
step_register = ModelStep(name = "PredMaintRegisterModel", step_args = register_args)

### Define a Pipeline of Parameters, Steps, and Conditions

In [39]:
from sagemaker.workflow.pipeline import Pipeline


pipeline_name = "PredMaintPipeline"
pipeline = Pipeline(
    name = pipeline_name,
    parameters = [
        processing_instance_count,
        instance_type,
        model_approval_status,
        input_data,
        telemetry_data,
        errors_data,
        maint_data,
        failures_data,
        machines_data,
        batch_data,
        mse_threshold,
    ],
    steps = [preprocess_step, feature_store_step, train_test_split_step, training_step],
#         steps = [step_process, step_train],
)

In [None]:
definition = json.loads(pipeline.definition())
definition

In [None]:
pipeline.upsert(role_arn = sagemaker_role)

In [41]:
execution = pipeline.start()

In [None]:
execution.describe()

In [None]:
execution.list_steps()

In [None]:
import time
from sagemaker.lineage.visualizer import LineageTableVisualizer


viz = LineageTableVisualizer(sagemaker.session.Session())
for execution_step in reversed(execution.list_steps()):
    print(execution_step)
    display(viz.show(pipeline_execution_step=execution_step))
    time.sleep(5)

In [7]:
# Convert to datetime datatype
def datetime_datatype(df):
    print("Converting to type datetime")
    df['datetime'] = pd.to_datetime(df['datetime'], format="%Y-%m-%d %H:%M:%S")
    return df

# Convert to category datatype
def category_datatype(df, column_name):
    print("Converting to type category")
    df[column_name] = df[column_name].astype('category')
    return df

In [8]:
# Lag Features from Telemetry
def telemetry_features(df):
    df = datetime_datatype(df)
    # Calculate mean values for telemetry features -- 3 hours rolling window
    print("Calculate mean values for telemetry features -- 3 hours rolling window")
    temp = []
    fields = ['volt', 'rotate', 'pressure', 'vibration']
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col).resample('3H', closed = 'left', label = 'right').mean().unstack())
    telemetry_mean_3h = pd.concat(temp, axis = 1)
    telemetry_mean_3h.columns = [i + 'mean_3h' for i in fields]
    telemetry_mean_3h.reset_index(inplace = True)

    # repeat for standard deviation
    print("Calculate standard deviation for telemetry features -- 3 hours rolling window")
    temp = []
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col).resample('3H', closed = 'left', label = 'right').std().unstack())
    telemetry_sd_3h = pd.concat(temp, axis = 1)
    telemetry_sd_3h.columns = [i + 'sd_3h' for i in fields]
    telemetry_sd_3h.reset_index(inplace = True)
    
    # Calculate mean values for telemetry features -- 24 hours rolling window
    print("Calculate mean values for telemetry features -- 24 hours rolling window")
    temp = []
    fields = ['volt', 'rotate', 'pressure', 'vibration']
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col)
                    .resample('3H', closed = 'left', label = 'right')
                    .first()
                    .unstack()
                    .rolling(window = 24, center = False).mean())
    telemetry_mean_24h = pd.concat(temp, axis = 1)
    telemetry_mean_24h.columns = [i + 'mean_24h' for i in fields]
    telemetry_mean_24h.reset_index(inplace = True)
    telemetry_mean_24h = telemetry_mean_24h.loc[-telemetry_mean_24h['voltmean_24h'].isnull()]

    # repeat for standard deviation
    print("Calculate standard deviation for telemetry features -- 24 hours rolling window")
    temp = []
    fields = ['volt', 'rotate', 'pressure', 'vibration']
    for col in fields:
        temp.append(pd.pivot_table(df,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col)
                    .resample('3H', closed='left', label='right')
                    .first()
                    .unstack()
                    .rolling(window = 24, center = False).std())
    telemetry_sd_24h = pd.concat(temp, axis = 1)
    telemetry_sd_24h.columns = [i + 'sd_24h' for i in fields]
    telemetry_sd_24h = telemetry_sd_24h.loc[-telemetry_sd_24h['voltsd_24h'].isnull()]
    telemetry_sd_24h.reset_index(inplace = True)
    
    telemetry_feat = pd.concat([telemetry_mean_3h,
                            telemetry_sd_3h.iloc[:, 2:6],
                            telemetry_mean_24h.iloc[:, 2:6],
                            telemetry_sd_24h.iloc[:, 2:6]], axis = 1).dropna()
    
    return telemetry_feat

In [9]:
# Lag Features for Errors
def errors_lag_features(df):
    df = datetime_datatype(df)
    df = category_datatype(df, 'errorID')
    print("Lag features for errors")
    error_count = pd.get_dummies(df.set_index('datetime')).reset_index()
    error_count.columns = ['datetime', 'machineID', 'error1', 'error2', 'error3', 'error4', 'error5']
    error_count = error_count.groupby(['machineID', 'datetime']).sum().reset_index()
    error_count = telemetry[['datetime', 'machineID']].merge(error_count, on = ['machineID', 'datetime'], how = 'left').fillna(0.0)
    temp = []
    fields = ['error%d' % i for i in range(1, 6)]
    for col in fields:
        temp.append(pd.pivot_table(error_count,
                                   index = 'datetime',
                                   columns = 'machineID',
                                   values = col)
                    .resample('3H', closed='left', label='right')
                    .first()
                    .unstack()
                    .rolling(window = 24, center = False).sum())
    error_count = pd.concat(temp, axis = 1)
    error_count.columns = [i + 'count' for i in fields]
    error_count.reset_index(inplace = True)
    error_count = error_count.dropna()
    
    return error_count

In [10]:
# Maintenance Features
def maintenance_features(df):
    df = datetime_datatype(df)
    df = category_datatype(df, 'comp')
    print("Maintenance Features -- Days since last replacement")
    comp_rep = pd.get_dummies(df.set_index('datetime')).reset_index()
    comp_rep.columns = ['datetime', 'machineID', 'comp1', 'comp2', 'comp3', 'comp4']

    # combine repairs for a given machine in a given hour
    comp_rep = comp_rep.groupby(['machineID', 'datetime']).sum().reset_index()

    # add timepoints where no components were replaced
    comp_rep = telemetry[['datetime', 'machineID']].merge(comp_rep,
                                                          on=['datetime', 'machineID'],
                                                          how='outer').fillna(0).sort_values(by=['machineID', 'datetime'])
    components = ['comp1', 'comp2', 'comp3', 'comp4']
    for comp in components:
        comp_rep.loc[comp_rep[comp] < 1, comp] = None
        comp_rep.loc[-comp_rep[comp].isnull(),
                     comp] = comp_rep.loc[-comp_rep[comp].isnull(), 'datetime']
        comp_rep[comp] = comp_rep[comp].fillna(method = 'ffill')

    comp_rep = comp_rep.loc[comp_rep['datetime'] > pd.to_datetime('2015-01-01')]
    for comp in components:
        comp_rep[comp] = (comp_rep["datetime"] - pd.to_datetime(comp_rep[comp])) / np.timedelta64(1, "D")
    
    return comp_rep

In [16]:
# Failures Features
def failure_features(df):
    print("Failure features")
    df = datetime_datatype(df)
    df = category_datatype(df, 'failure')
    return df

In [17]:
# Final Features
def final_features(telemetry_df, errors_df, maint_df, machines_df):
    print("Final features")
    final_feat = telemetry_df.merge(errors_df, on = ['datetime', 'machineID'], how = 'left')
    final_feat = final_feat.merge(maint_df, on = ['datetime', 'machineID'], how = 'left')
    final_feat = final_feat.merge(machines_df, on = ['machineID'], how = 'left')
    return final_feat

In [125]:
telemetry_df = telemetry_features(telemetry)
errors_df = errors_lag_features(errors)
maint_df = maintenance_features(maint)
failures_df = failure_features(failures)
machines_df = category_datatype(machines, 'model')

Converting to type datetime
Calculate mean values for telemetry features -- 3 hours rolling window
Calculate standard deviation for telemetry features -- 3 hours rolling window
Calculate mean values for telemetry features -- 24 hours rolling window
Calculate standard deviation for telemetry features -- 24 hours rolling window
Converting to type datetime
Converting to type category
Lag features for errors
Converting to type datetime
Converting to type category
Maintenance Features -- Days since last replacement
Failure features
Converting to type datetime
Converting to type category
Converting to type category


In [126]:
# Label Construction
def label_construct(tele_df, error_df, maint_df, machine_df, failure_df):
    print("----- Final Features -----")
    final_feat = final_features(tele_df, error_df, maint_df, machine_df)
    display(final_feat.head())
    
    print("----- Label Construction -----")
    labeled_features = pd.DataFrame()
    labeled_features = final_feat.merge(
        failure_df, on = ['datetime', 'machineID'], how = 'left')
    labeled_features['failure'] = labeled_features['failure'].astype(str)
    labeled_features['failure'] = labeled_features['failure'].fillna(method = 'bfill', limit = 7)
    labeled_features['failure'] = labeled_features['failure'].replace('nan', 'none')
    print("----- Preprocessing completed -----")
    
    return labeled_features

labeled_features = label_construct(telemetry_df, errors_df, maint_df, machines_df, failures_df)

----- Final Features -----
Final features


Unnamed: 0,machineID,datetime,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,...,error2count,error3count,error4count,error5count,comp1,comp2,comp3,comp4,model,age
0,1,2015-01-04 06:00:00,186.092896,451.641253,107.989359,55.308074,13.48909,62.185045,5.118176,4.904365,...,0.0,0.0,0.0,0.0,22.0,217.0,157.0,172.0,model3,18
1,1,2015-01-04 09:00:00,166.281848,453.787824,106.187582,51.99008,24.276228,23.621315,11.176731,3.394073,...,0.0,0.0,0.0,1.0,22.125,217.125,157.125,172.125,model3,18
2,1,2015-01-04 12:00:00,175.412103,445.450581,100.887363,54.251534,34.918687,11.001625,10.580336,2.921501,...,0.0,0.0,0.0,1.0,22.25,217.25,157.25,172.25,model3,18
3,1,2015-01-04 15:00:00,157.347716,451.882075,101.28938,48.602686,24.617739,28.950883,9.966729,2.356486,...,0.0,0.0,0.0,1.0,22.375,217.375,157.375,172.375,model3,18
4,1,2015-01-04 18:00:00,176.45055,446.033068,84.521555,47.638836,8.0714,76.511343,2.636879,4.108621,...,0.0,0.0,0.0,1.0,22.5,217.5,157.5,172.5,model3,18


----- Label Construction -----
----- Preprocessing completed -----


In [127]:
labeled_features.head()

Unnamed: 0,machineID,datetime,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,...,error3count,error4count,error5count,comp1,comp2,comp3,comp4,model,age,failure
0,1,2015-01-04 06:00:00,186.092896,451.641253,107.989359,55.308074,13.48909,62.185045,5.118176,4.904365,...,0.0,0.0,0.0,22.0,217.0,157.0,172.0,model3,18,none
1,1,2015-01-04 09:00:00,166.281848,453.787824,106.187582,51.99008,24.276228,23.621315,11.176731,3.394073,...,0.0,0.0,1.0,22.125,217.125,157.125,172.125,model3,18,none
2,1,2015-01-04 12:00:00,175.412103,445.450581,100.887363,54.251534,34.918687,11.001625,10.580336,2.921501,...,0.0,0.0,1.0,22.25,217.25,157.25,172.25,model3,18,none
3,1,2015-01-04 15:00:00,157.347716,451.882075,101.28938,48.602686,24.617739,28.950883,9.966729,2.356486,...,0.0,0.0,1.0,22.375,217.375,157.375,172.375,model3,18,none
4,1,2015-01-04 18:00:00,176.45055,446.033068,84.521555,47.638836,8.0714,76.511343,2.636879,4.108621,...,0.0,0.0,1.0,22.5,217.5,157.5,172.5,model3,18,none


### Feature Store

In [128]:
featurestore_runtime = boto_session.client(
    service_name = "sagemaker-featurestore-runtime", region_name = region
)

feature_store_session = sagemaker.Session(
    boto_session = boto_session,
    sagemaker_client = sagemaker_boto_client,
    sagemaker_featurestore_runtime_client = featurestore_runtime,
)

In [24]:
telemetry_feature_group = FeatureGroup(name = 'telemetry_fg', sagemaker_session = sagemaker_session)
errors_feature_group = FeatureGroup(name = 'errors_fg', sagemaker_session = sagemaker_session)
maintenanace_feature_group = FeatureGroup(name = 'maintenance_fg', sagemaker_session = sagemaker_session)
machines_feature_group = FeatureGroup(name = 'machines_fg', sagemaker_session = sagemaker_session)
failures_feature_group = FeatureGroup(name = 'failures_fg', sagemaker_session = sagemaker_session)

### Feature Deifinitions

In [52]:
telemetry_definitions = {
    'datetime': float,
    'machineID': int,
    'voltmean_3h': float,
    'rotatemean_3h': float,
    'pressuremean_3h': float,
    'vibrationmean_3h': float,
    'voltsd_3h': float,
    'rotatesd_3h': float,
    'pressuresd_3h': float,
    'vibrationsd_3h': float,
    'voltmean_24h': float,
    'rotatemean_24h': float,
    'pressuremean_24h': float,
    'vibrationmean_24h': float,
    'voltsd_24h': float,
    'rotatesd_24h': float,
    'pressuresd_24h': float,
    'vibrationsd_24h': float
}
errors_definitions = {
    "machineID": int,
    "datetime": float,
    "error1count": float,
    "error2count": float,
    "error3count": float,
    "error4count": float,
    "error5count": float
}
maint_definitions = {
    "machineID": int,
    "datetime": float,
    "comp1": int,
    "comp2": int,
    "comp3": int,
    "comp4": int
}
machines_definitions = {
    "machineID": int,
    "datetime": float,
    "model": str,
    "age": int
}
failure_definitions = {
    "machineID": int,
    "datetime": str,
    "failure": str,
}

In [55]:
failures_feature_group.load_feature_definitions(data_frame = fail)
failures_feature_group.create(
        s3_uri=f"s3://{bucket}/{prefix}",
        record_identifier_name="machineID",
        event_time_feature_name="datetime",
        role_arn=sagemaker_role,
        enable_online_store=True,
    )

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:451633145432:feature-group/failures_fg',
 'ResponseMetadata': {'RequestId': '2bcfaaf4-d1c4-48d9-8933-1a21d96a22bb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2bcfaaf4-d1c4-48d9-8933-1a21d96a22bb',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Fri, 21 Apr 2023 08:04:56 GMT'},
  'RetryAttempts': 0}}

### Upload Scripts

In [69]:
s3_client.upload_file(
    Filename = "scripts/rf_script.py", Bucket = bucket, Key = f"{prefix}/scripts/rf_script.py"
)
s3_client.upload_file(
    Filename = "scripts/train_test_split_script.py", Bucket = bucket, Key = f"{prefix}/scripts/train_test_split_script.py"
)

In [70]:
rf_script_uri = f"s3://{bucket}/{prefix}/scripts/rf_script.py"
train_test_script_uri = f"s3://{bucket}/{prefix}/scripts/train_test_split_script.py"

### Train-Test Split Data

In [131]:
from scripts.train_test_split_script import train_test_split_script

In [132]:
train_data, test_data = train_test_split_script(labeled_features)

  'machineID',], 1))
  'machineID'], 1))


In [None]:
from io import StringIO

with StringIO() as csv_buffer:
    train_data.to_csv(csv_buffer, index = False)

    response = s3_client.put_object(
        Bucket = bucket, Key = f"{prefix}/data/train-test/train.csv", Body = csv_buffer.getvalue()
    )

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

In [None]:
from io import StringIO

with StringIO() as csv_buffer:
    test_data.to_csv(csv_buffer, index = False)

    response = s3_client.put_object(
        Bucket = bucket, Key = f"{prefix}/data/train-test/test.csv", Body = csv_buffer.getvalue()
    )

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

In [None]:
train_data_uri = f"s3://{bucket}/{prefix}/data/train-test/train.csv"
test_data_uri = f"s3://{bucket}/{prefix}/data/train-test/test.csv"

### SKLearn Estimator

In [None]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "1.0-1"

sklearn_estimator = SKLearn(
    entry_point = "scripts/rf_script.py",
    role = sagemaker_role,
    instance_count = 1,
    instance_type = "ml.c5.xlarge",
    framework_version = FRAMEWORK_VERSION,
    base_job_name = "rf-scikit",
    output_path = f"s3://{bucket}/{prefix}/estimatoe-output/rf-scikit",
    hyperparameters = {
        "n-estimators": 100,
        "min-samples-leaf": 3,
    },
)

In [None]:
sklearn_estimator.fit(inputs = {"train": train_data_uri, "test": test_data_uri}, wait = True)

In [119]:
sklearn_estimator.latest_training_job.wait(logs = "None")
artifact = sagemaker_boto_client.describe_training_job(
    TrainingJobName = sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

AttributeError: 'NoneType' object has no attribute 'wait'

In [124]:
test_data.columns

Index(['voltmean_3h', 'rotatemean_3h', 'pressuremean_3h', 'vibrationmean_3h',
       'voltsd_3h', 'rotatesd_3h', 'pressuresd_3h', 'vibrationsd_3h',
       'voltmean_24h', 'rotatemean_24h', 'pressuremean_24h',
       'vibrationmean_24h', 'voltsd_24h', 'rotatesd_24h', 'pressuresd_24h',
       'vibrationsd_24h', 'error1count', 'error2count', 'error3count',
       'error4count', 'error5count', 'comp1', 'comp2', 'comp3', 'comp4', 'age',
       'model_model1', 'model_model2', 'model_model3', 'model_model4',
       'failure_comp1', 'failure_comp2', 'failure_comp3', 'failure_comp4',
       'failure_none'],
      dtype='object')