## Predictive Maintenance

In [3]:
import pandas as pd
import numpy as np

## Data Sources  

Common data sources for predictive maintenance problems are :
- Failure history: The failure history of a machine or component within the machine.
- Maintenance history: The repair history of a machine, e.g. error codes, previous maintenance activities or component replacements.
- Machine conditions and usage: The operating conditions of a machine e.g. data collected from sensors.
- Machine features: The features of a machine, e.g. engine size, make and model, location.
- Operator features: The features of the operator, e.g. gender, past experience The data for this example comes from 4 different sources which are real-time telemetry data collected from machines, error messages, historical maintenance records that include failures and machine information such as type and age.

In [4]:
telemetry = pd.read_csv('datasets/PdM_telemetry.csv')
errors = pd.read_csv('datasets/PdM_errors.csv')
maint = pd.read_csv('datasets/PdM_maint.csv')
failures = pd.read_csv('datasets/PdM_failures.csv')
machines = pd.read_csv('datasets/PdM_machines.csv')

In [5]:
telemetry.sample(5)

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
621725,2015-12-19 13:00:00,71,171.431609,445.68262,99.568816,43.52174
71195,2015-02-16 09:00:00,9,162.166099,390.209178,106.828032,30.534673
759248,2015-08-31 00:00:00,87,197.918898,381.510455,100.290983,39.812708
581777,2015-05-29 05:00:00,67,176.097584,480.763771,113.66739,46.800604
197154,2015-07-04 02:00:00,23,195.167628,486.505762,97.492119,42.604918


In [6]:
errors.sample(5)

Unnamed: 0,datetime,machineID,errorID
2731,2015-07-24 06:00:00,71,error5
1300,2015-09-25 08:00:00,34,error2
3909,2015-10-24 23:00:00,100,error1
1991,2015-08-01 06:00:00,52,error2
1328,2015-04-05 06:00:00,35,error4


In [7]:
maint.sample(5)

Unnamed: 0,datetime,machineID,comp
1604,2015-05-15 06:00:00,49,comp2
2262,2015-06-19 06:00:00,69,comp1
1213,2015-06-03 06:00:00,37,comp4
2150,2015-04-04 06:00:00,66,comp4
1902,2015-12-12 06:00:00,58,comp2


In [8]:
failures.sample(5)

Unnamed: 0,datetime,machineID,failure
480,2015-01-16 06:00:00,67,comp2
159,2015-12-04 06:00:00,21,comp2
521,2015-03-24 06:00:00,72,comp1
419,2015-12-25 06:00:00,56,comp1
507,2015-12-09 06:00:00,70,comp2


In [9]:
machines.sample(5)

Unnamed: 0,machineID,model,age
43,44,model4,7
8,9,model4,7
46,47,model2,6
18,19,model3,17
74,75,model3,19


### Telemetry

In [10]:
telemetry.dtypes

datetime      object
machineID      int64
volt         float64
rotate       float64
pressure     float64
vibration    float64
dtype: object

In [5]:
telemetry['datetime'] = pd.to_datetime(telemetry['datetime'], format="%Y-%m-%d %H:%M:%S")
print("Total number of telemetry records: %d" % len(telemetry.index))

Total number of telemetry records: 876100


In [6]:
telemetry.describe()

Unnamed: 0,machineID,volt,rotate,pressure,vibration
count,876100.0,876100.0,876100.0,876100.0,876100.0
mean,50.5,170.777736,446.605119,100.858668,40.385007
std,28.866087,15.509114,52.673886,11.048679,5.370361
min,1.0,97.333604,138.432075,51.237106,14.877054
25%,25.75,160.304927,412.305714,93.498181,36.777299
50%,50.5,170.607338,447.55815,100.425559,40.237247
75%,75.25,181.004493,482.1766,107.555231,43.784938
max,100.0,255.124717,695.020984,185.951998,76.791072


### Errors

In [7]:
errors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919 entries, 0 to 3918
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   datetime   3919 non-null   object
 1   machineID  3919 non-null   int64 
 2   errorID    3919 non-null   object
dtypes: int64(1), object(2)
memory usage: 92.0+ KB


In [8]:
errors['datetime'] = pd.to_datetime(errors['datetime'],format = '%Y-%m-%d %H:%M:%S')
errors['errorID'] = errors['errorID'].astype('category')
print("Total Number of error records: %d" %len(errors.index))
errors.info()

Total Number of error records: 3919
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919 entries, 0 to 3918
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   3919 non-null   datetime64[ns]
 1   machineID  3919 non-null   int64         
 2   errorID    3919 non-null   category      
dtypes: category(1), datetime64[ns](1), int64(1)
memory usage: 65.4 KB


### Maintenance

In [9]:
maint.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3286 entries, 0 to 3285
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   datetime   3286 non-null   object
 1   machineID  3286 non-null   int64 
 2   comp       3286 non-null   object
dtypes: int64(1), object(2)
memory usage: 77.1+ KB


In [10]:
maint['datetime'] = pd.to_datetime(maint['datetime'], format='%Y-%m-%d %H:%M:%S')
maint['comp'] = maint['comp'].astype('category')
print("Total Number of maintenance Records: %d" %len(maint.index))
maint.info()

Total Number of maintenance Records: 3286
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3286 entries, 0 to 3285
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   3286 non-null   datetime64[ns]
 1   machineID  3286 non-null   int64         
 2   comp       3286 non-null   category      
dtypes: category(1), datetime64[ns](1), int64(1)
memory usage: 54.9 KB


### Machines

In [11]:
machines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   machineID  100 non-null    int64 
 1   model      100 non-null    object
 2   age        100 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.5+ KB


In [12]:
machines['model'] = machines['model'].astype('category')

print("Total number of machines: %d" % len(machines.index))
machines.info()

Total number of machines: 100
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   machineID  100 non-null    int64   
 1   model      100 non-null    category
 2   age        100 non-null    int64   
dtypes: category(1), int64(2)
memory usage: 2.0 KB


### Failures

In [13]:
failures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 761 entries, 0 to 760
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   datetime   761 non-null    object
 1   machineID  761 non-null    int64 
 2   failure    761 non-null    object
dtypes: int64(1), object(2)
memory usage: 18.0+ KB


In [14]:
failures['datetime'] = pd.to_datetime(failures['datetime'], format="%Y-%m-%d %H:%M:%S")
failures['failure'] = failures['failure'].astype('category')

print("Total number of failures: %d" % len(failures.index))
failures.info()

Total number of failures: 761
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 761 entries, 0 to 760
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   761 non-null    datetime64[ns]
 1   machineID  761 non-null    int64         
 2   failure    761 non-null    category      
dtypes: category(1), datetime64[ns](1), int64(1)
memory usage: 13.0 KB


### Lag Features from Telemetry

In [15]:
# Calculate mean values for telemetry features -- 3 hours rolling window
temp = []
fields = ['volt', 'rotate', 'pressure', 'vibration']
for col in fields:
    temp.append(pd.pivot_table(telemetry,
                               index = 'datetime',
                               columns = 'machineID',
                               values = col).resample('3H', closed = 'left', label = 'right').mean().unstack())
telemetry_mean_3h = pd.concat(temp, axis = 1)
telemetry_mean_3h.columns = [i + 'mean_3h' for i in fields]
telemetry_mean_3h.reset_index(inplace = True)

# repeat for standard deviation
temp = []
for col in fields:
    temp.append(pd.pivot_table(telemetry,
                               index = 'datetime',
                               columns = 'machineID',
                               values = col).resample('3H', closed = 'left', label = 'right').std().unstack())
telemetry_sd_3h = pd.concat(temp, axis = 1)
telemetry_sd_3h.columns = [i + 'sd_3h' for i in fields]
telemetry_sd_3h.reset_index(inplace = True)

telemetry_mean_3h.head()

Unnamed: 0,machineID,datetime,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h
0,1,2015-01-01 09:00:00,170.028993,449.533798,94.592122,40.893502
1,1,2015-01-01 12:00:00,164.192565,403.949857,105.687417,34.255891
2,1,2015-01-01 15:00:00,168.134445,435.781707,107.793709,41.239405
3,1,2015-01-01 18:00:00,165.514453,430.472823,101.703289,40.373739
4,1,2015-01-01 21:00:00,168.809347,437.11112,90.91106,41.738542


In [16]:
# Calculate mean values for telemetry features -- 24 hours rolling window
temp = []
fields = ['volt', 'rotate', 'pressure', 'vibration']
for col in fields:
    temp.append(pd.pivot_table(telemetry,
                               index = 'datetime',
                               columns = 'machineID',
                               values = col)
                .resample('3H', closed = 'left', label = 'right')
                .first()
                .unstack()
                .rolling(window = 24, center = False).mean())
telemetry_mean_24h = pd.concat(temp, axis = 1)
telemetry_mean_24h.columns = [i + 'mean_24h' for i in fields]
telemetry_mean_24h.reset_index(inplace = True)
telemetry_mean_24h = telemetry_mean_24h.loc[-telemetry_mean_24h['voltmean_24h'].isnull()]

# repeat for standard deviation
temp = []
fields = ['volt', 'rotate', 'pressure', 'vibration']
for col in fields:
    temp.append(pd.pivot_table(telemetry,
                               index = 'datetime',
                               columns = 'machineID',
                               values = col)
                .resample('3H', closed='left', label='right')
                .first()
                .unstack()
                .rolling(window = 24, center = False).std())
telemetry_sd_24h = pd.concat(temp, axis = 1)
telemetry_sd_24h.columns = [i + 'sd_24h' for i in fields]
telemetry_sd_24h = telemetry_sd_24h.loc[-telemetry_sd_24h['voltsd_24h'].isnull()]
telemetry_sd_24h.reset_index(inplace = True)

# Notice that a 24h rolling average is not available at the earliest timepoints
telemetry_mean_24h.head(10)

Unnamed: 0,machineID,datetime,voltmean_24h,rotatemean_24h,pressuremean_24h,vibrationmean_24h
23,1,2015-01-04 06:00:00,171.536044,456.036706,101.652072,44.017022
24,1,2015-01-04 09:00:00,171.069056,457.285237,101.011726,44.148324
25,1,2015-01-04 12:00:00,170.859615,461.116153,101.172241,44.672216
26,1,2015-01-04 15:00:00,171.566669,457.893518,100.708151,44.993232
27,1,2015-01-04 18:00:00,171.536866,457.67211,99.826551,45.16057
28,1,2015-01-04 21:00:00,172.800672,454.497453,100.896227,45.690929
29,1,2015-01-05 00:00:00,171.963248,452.687991,101.312313,45.658369
30,1,2015-01-05 03:00:00,171.206225,448.104961,101.030466,46.457982
31,1,2015-01-05 06:00:00,171.999801,449.729553,101.47285,46.879346
32,1,2015-01-05 09:00:00,171.247302,451.93097,101.368307,47.831655


In [17]:
telemetry_feat = pd.concat([telemetry_mean_3h,
                            telemetry_sd_3h.iloc[:, 2:6],
                            telemetry_mean_24h.iloc[:, 2:6],
                            telemetry_sd_24h.iloc[:, 2:6]], axis = 1).dropna()
telemetry_feat.describe()

Unnamed: 0,machineID,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,voltmean_24h,rotatemean_24h,pressuremean_24h,vibrationmean_24h,voltsd_24h,rotatesd_24h,pressuresd_24h,vibrationsd_24h
count,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0,291955.0
mean,50.50017,170.777502,446.605022,100.858814,40.38473,13.299258,44.455681,8.88607,4.441115,170.738705,446.622565,100.872047,40.38251,15.056106,50.681911,10.330027,5.103398
std,28.861802,9.500999,33.130538,7.414677,3.478377,6.96608,23.216953,4.656127,2.320281,4.178886,15.686776,3.983214,1.764289,2.383735,8.368825,2.129605,0.921546
min,1.0,125.532506,211.811184,72.118639,26.569635,0.025509,0.078991,0.027417,0.015278,156.713608,310.118604,91.162625,35.800869,6.178154,18.363177,4.275651,2.108104
25%,26.0,164.449993,427.55942,96.238853,38.147754,8.027626,26.903326,5.370745,2.684638,168.101158,440.859559,98.730297,39.379242,13.410078,44.994988,8.983969,4.488578
50%,51.0,170.4346,448.382424,100.2344,40.145815,12.49582,41.793246,8.346136,4.173944,170.28595,448.773288,100.196215,40.107313,14.942623,50.157638,10.009403,5.008349
75%,75.0,176.612089,468.447483,104.406778,42.227522,17.688638,59.10351,11.790435,5.899842,172.609456,456.129385,101.780792,40.908788,16.556913,55.65888,11.198901,5.589321
max,100.0,241.420717,586.682904,162.309656,69.311324,58.444332,179.903039,35.659369,18.305595,206.333895,491.081522,138.291979,55.266429,30.806053,117.198342,30.665847,12.757609


In [18]:
telemetry_feat.head()

Unnamed: 0,machineID,datetime,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,voltmean_24h,rotatemean_24h,pressuremean_24h,vibrationmean_24h,voltsd_24h,rotatesd_24h,pressuresd_24h,vibrationsd_24h
23,1,2015-01-04 06:00:00,186.092896,451.641253,107.989359,55.308074,13.48909,62.185045,5.118176,4.904365,171.536044,456.036706,101.652072,44.017022,15.08632,49.157643,11.230126,6.584861
24,1,2015-01-04 09:00:00,166.281848,453.787824,106.187582,51.99008,24.276228,23.621315,11.176731,3.394073,171.069056,457.285237,101.011726,44.148324,14.471995,48.474806,11.145824,6.157739
25,1,2015-01-04 12:00:00,175.412103,445.450581,100.887363,54.251534,34.918687,11.001625,10.580336,2.921501,170.859615,461.116153,101.172241,44.672216,14.563107,48.952343,11.72392,6.293381
26,1,2015-01-04 15:00:00,157.347716,451.882075,101.28938,48.602686,24.617739,28.950883,9.966729,2.356486,171.566669,457.893518,100.708151,44.993232,14.286084,48.829331,11.922544,6.364104
27,1,2015-01-04 18:00:00,176.45055,446.033068,84.521555,47.638836,8.0714,76.511343,2.636879,4.108621,171.536866,457.67211,99.826551,45.16057,14.312823,51.561108,12.031973,6.624033


### Lag Features for Errors

In [19]:
error_count = pd.get_dummies(errors.set_index('datetime')).reset_index()
error_count.columns = ['datetime', 'machineID', 'error1', 'error2', 'error3', 'error4', 'error5']
error_count.head()

Unnamed: 0,datetime,machineID,error1,error2,error3,error4,error5
0,2015-01-03 07:00:00,1,1,0,0,0,0
1,2015-01-03 20:00:00,1,0,0,1,0,0
2,2015-01-04 06:00:00,1,0,0,0,0,1
3,2015-01-10 15:00:00,1,0,0,0,1,0
4,2015-01-22 10:00:00,1,0,0,0,1,0


In [20]:
error_count = error_count.groupby(['machineID', 'datetime']).sum().reset_index()
error_count.head()

Unnamed: 0,machineID,datetime,error1,error2,error3,error4,error5
0,1,2015-01-03 07:00:00,1,0,0,0,0
1,1,2015-01-03 20:00:00,0,0,1,0,0
2,1,2015-01-04 06:00:00,0,0,0,0,1
3,1,2015-01-10 15:00:00,0,0,0,1,0
4,1,2015-01-22 10:00:00,0,0,0,1,0


In [21]:
error_count = telemetry[['datetime', 'machineID']].merge(error_count, on = ['machineID', 'datetime'], how = 'left').fillna(0.0)
print(error_count.info())
error_count.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 876100 entries, 0 to 876099
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   datetime   876100 non-null  datetime64[ns]
 1   machineID  876100 non-null  int64         
 2   error1     876100 non-null  float64       
 3   error2     876100 non-null  float64       
 4   error3     876100 non-null  float64       
 5   error4     876100 non-null  float64       
 6   error5     876100 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 53.5 MB
None


Unnamed: 0,datetime,machineID,error1,error2,error3,error4,error5
0,2015-01-01 06:00:00,1,0.0,0.0,0.0,0.0,0.0
1,2015-01-01 07:00:00,1,0.0,0.0,0.0,0.0,0.0
2,2015-01-01 08:00:00,1,0.0,0.0,0.0,0.0,0.0
3,2015-01-01 09:00:00,1,0.0,0.0,0.0,0.0,0.0
4,2015-01-01 10:00:00,1,0.0,0.0,0.0,0.0,0.0


In [22]:
temp = []
fields = ['error%d' % i for i in range(1, 6)]
for col in fields:
    temp.append(pd.pivot_table(error_count,
                               index = 'datetime',
                               columns = 'machineID',
                               values = col)
                .resample('3H', closed='left', label='right')
                .first()
                .unstack()
                .rolling(window = 24, center = False).sum())
error_count = pd.concat(temp, axis = 1)
error_count.columns = [i + 'count' for i in fields]
error_count.reset_index(inplace = True)
error_count = error_count.dropna()
error_count.describe()

Unnamed: 0,machineID,error1count,error2count,error3count,error4count,error5count
count,292077.0,292077.0,292077.0,292077.0,292077.0,292077.0
mean,50.503898,0.038538,0.040181,0.036073,0.02613,0.019885
std,28.863914,0.196051,0.199343,0.189151,0.161103,0.140315
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,26.0,0.0,0.0,0.0,0.0,0.0
50%,51.0,0.0,0.0,0.0,0.0,0.0
75%,76.0,0.0,0.0,0.0,0.0,0.0
max,100.0,2.0,2.0,2.0,3.0,2.0


In [23]:
error_count.sample(10)

Unnamed: 0,machineID,datetime,error1count,error2count,error3count,error4count,error5count
78228,27,2015-10-13 15:00:00,0.0,0.0,0.0,0.0,0.0
214844,74,2015-07-21 18:00:00,0.0,1.0,1.0,0.0,0.0
147253,51,2015-05-31 18:00:00,0.0,1.0,1.0,0.0,0.0
236323,81,2015-11-27 18:00:00,0.0,0.0,0.0,0.0,0.0
63814,22,2015-11-06 12:00:00,0.0,0.0,0.0,0.0,0.0
76559,27,2015-03-19 00:00:00,0.0,0.0,0.0,0.0,0.0
201947,70,2015-02-20 03:00:00,0.0,0.0,0.0,0.0,0.0
85604,30,2015-04-23 06:00:00,0.0,0.0,0.0,0.0,0.0
98882,34,2015-11-08 12:00:00,0.0,0.0,0.0,0.0,0.0
247666,85,2015-10-16 03:00:00,0.0,0.0,0.0,0.0,0.0


### Days Since Last Replacement from Maintenance

In [24]:
maint.sample(5)

Unnamed: 0,datetime,machineID,comp
1998,2015-12-24 06:00:00,61,comp4
593,2015-12-26 06:00:00,18,comp1
2802,2015-11-11 06:00:00,85,comp1
1482,2015-07-21 06:00:00,45,comp3
1206,2015-03-05 06:00:00,37,comp4


In [25]:
import numpy as np

# create a column for each error type
comp_rep = pd.get_dummies(maint.set_index('datetime')).reset_index()
comp_rep.columns = ['datetime', 'machineID', 'comp1', 'comp2', 'comp3', 'comp4']

# combine repairs for a given machine in a given hour
comp_rep = comp_rep.groupby(['machineID', 'datetime']).sum().reset_index()

# add timepoints where no components were replaced
comp_rep = telemetry[['datetime', 'machineID']].merge(comp_rep,
                                                      on=['datetime', 'machineID'],
                                                      how='outer').fillna(0).sort_values(by=['machineID', 'datetime'])

comp_rep.sample(10)

Unnamed: 0,datetime,machineID,comp1,comp2,comp3,comp4
660479,2015-05-23 02:00:00,76,0.0,0.0,0.0,0.0
465566,2015-02-21 15:00:00,54,0.0,0.0,0.0,0.0
229382,2015-03-08 18:00:00,27,0.0,0.0,0.0,0.0
168439,2015-03-24 18:00:00,20,0.0,0.0,0.0,0.0
717932,2015-12-12 17:00:00,82,0.0,0.0,0.0,0.0
464716,2015-01-17 05:00:00,54,0.0,0.0,0.0,0.0
43574,2015-12-22 16:00:00,5,0.0,0.0,0.0,0.0
778369,2015-11-05 15:00:00,89,0.0,0.0,0.0,0.0
530284,2015-07-12 22:00:00,61,0.0,0.0,0.0,0.0
375433,2015-11-08 13:00:00,43,0.0,0.0,0.0,0.0


In [26]:
components = ['comp1', 'comp2', 'comp3', 'comp4']
for comp in components:
    comp_rep.loc[comp_rep[comp] < 1, comp] = None
    comp_rep.loc[-comp_rep[comp].isnull(),
                 comp] = comp_rep.loc[-comp_rep[comp].isnull(), 'datetime']
    comp_rep[comp] = comp_rep[comp].fillna(method='ffill')

comp_rep = comp_rep.loc[comp_rep['datetime'] > pd.to_datetime('2015-01-01')]

comp_rep.sample(10)

Unnamed: 0,datetime,machineID,comp1,comp2,comp3,comp4
758931,2015-08-17 19:00:00,87,2015-08-10 06:00:00,2015-07-26 06:00:00,2015-06-26 06:00:00,2015-07-26 06:00:00
651148,2015-04-29 08:00:00,75,2015-04-22 06:00:00,2015-03-23 06:00:00,2015-04-07 06:00:00,2015-02-06 06:00:00
634958,2015-06-23 20:00:00,73,2015-05-17 06:00:00,2015-06-01 06:00:00,2015-06-16 06:00:00,2015-04-17 06:00:00
65481,2015-06-23 08:00:00,8,2015-06-05 06:00:00,2015-06-20 06:00:00,2015-06-20 06:00:00,2015-03-22 06:00:00
268489,2015-08-25 01:00:00,31,2015-08-05 06:00:00,2015-08-20 06:00:00,2015-07-06 06:00:00,2015-08-20 06:00:00
529870,2015-06-25 16:00:00,61,2015-05-13 06:00:00,2015-05-28 06:00:00,2015-05-28 06:00:00,2015-03-14 06:00:00
305064,2015-10-27 20:00:00,35,2015-09-18 06:00:00,2015-08-19 06:00:00,2015-07-05 06:00:00,2015-10-03 06:00:00
432429,2015-05-12 02:00:00,50,2015-03-16 06:00:00,2015-04-30 06:00:00,2015-04-30 06:00:00,2015-03-31 06:00:00
516180,2015-12-02 08:00:00,59,2015-12-01 06:00:00,2015-11-01 06:00:00,2015-10-02 06:00:00,2015-10-02 06:00:00
440678,2015-04-20 18:00:00,51,2015-01-16 06:00:00,2015-04-16 06:00:00,2015-03-17 06:00:00,2015-03-02 06:00:00


In [27]:
for comp in components:
    comp_rep[comp] = (comp_rep["datetime"] - pd.to_datetime(comp_rep[comp])) / np.timedelta64(1, "D") 

comp_rep.describe()

Unnamed: 0,machineID,comp1,comp2,comp3,comp4
count,876100.0,876100.0,876100.0,876100.0,876100.0
mean,50.5,53.525185,51.540806,52.725962,53.834191
std,28.866087,62.491679,59.269254,58.873114,59.707978
min,1.0,0.0,0.0,0.0,0.0
25%,25.75,13.291667,12.125,13.125,13.0
50%,50.5,32.791667,29.666667,32.291667,32.5
75%,75.25,68.708333,66.541667,67.333333,70.458333
max,100.0,491.958333,348.958333,370.958333,394.958333


In [28]:
comp_rep.head()

Unnamed: 0,datetime,machineID,comp1,comp2,comp3,comp4
0,2015-01-01 06:00:00,1,19.0,214.0,154.0,169.0
1,2015-01-01 07:00:00,1,19.041667,214.041667,154.041667,169.041667
2,2015-01-01 08:00:00,1,19.083333,214.083333,154.083333,169.083333
3,2015-01-01 09:00:00,1,19.125,214.125,154.125,169.125
4,2015-01-01 10:00:00,1,19.166667,214.166667,154.166667,169.166667


### Machine Features

In [29]:
final_feat = telemetry_feat.merge(error_count, on=['datetime', 'machineID'], how='left')
final_feat = final_feat.merge(comp_rep, on=['datetime', 'machineID'], how='left')
final_feat = final_feat.merge(machines, on=['machineID'], how='left')

final_feat.head()

Unnamed: 0,machineID,datetime,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,...,error2count,error3count,error4count,error5count,comp1,comp2,comp3,comp4,model,age
0,1,2015-01-04 06:00:00,186.092896,451.641253,107.989359,55.308074,13.48909,62.185045,5.118176,4.904365,...,0.0,0.0,0.0,0.0,22.0,217.0,157.0,172.0,model3,18
1,1,2015-01-04 09:00:00,166.281848,453.787824,106.187582,51.99008,24.276228,23.621315,11.176731,3.394073,...,0.0,0.0,0.0,1.0,22.125,217.125,157.125,172.125,model3,18
2,1,2015-01-04 12:00:00,175.412103,445.450581,100.887363,54.251534,34.918687,11.001625,10.580336,2.921501,...,0.0,0.0,0.0,1.0,22.25,217.25,157.25,172.25,model3,18
3,1,2015-01-04 15:00:00,157.347716,451.882075,101.28938,48.602686,24.617739,28.950883,9.966729,2.356486,...,0.0,0.0,0.0,1.0,22.375,217.375,157.375,172.375,model3,18
4,1,2015-01-04 18:00:00,176.45055,446.033068,84.521555,47.638836,8.0714,76.511343,2.636879,4.108621,...,0.0,0.0,0.0,1.0,22.5,217.5,157.5,172.5,model3,18


### Label Construction

In [30]:
labeled_features = pd.DataFrame()
labeled_features = final_feat.merge(
    failures, on = ['datetime', 'machineID'], how = 'left')
labeled_features.head()

Unnamed: 0,machineID,datetime,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,...,error3count,error4count,error5count,comp1,comp2,comp3,comp4,model,age,failure
0,1,2015-01-04 06:00:00,186.092896,451.641253,107.989359,55.308074,13.48909,62.185045,5.118176,4.904365,...,0.0,0.0,0.0,22.0,217.0,157.0,172.0,model3,18,
1,1,2015-01-04 09:00:00,166.281848,453.787824,106.187582,51.99008,24.276228,23.621315,11.176731,3.394073,...,0.0,0.0,1.0,22.125,217.125,157.125,172.125,model3,18,
2,1,2015-01-04 12:00:00,175.412103,445.450581,100.887363,54.251534,34.918687,11.001625,10.580336,2.921501,...,0.0,0.0,1.0,22.25,217.25,157.25,172.25,model3,18,
3,1,2015-01-04 15:00:00,157.347716,451.882075,101.28938,48.602686,24.617739,28.950883,9.966729,2.356486,...,0.0,0.0,1.0,22.375,217.375,157.375,172.375,model3,18,
4,1,2015-01-04 18:00:00,176.45055,446.033068,84.521555,47.638836,8.0714,76.511343,2.636879,4.108621,...,0.0,0.0,1.0,22.5,217.5,157.5,172.5,model3,18,


In [31]:
labeled_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 291997 entries, 0 to 291996
Data columns (total 30 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   machineID          291997 non-null  int64         
 1   datetime           291997 non-null  datetime64[ns]
 2   voltmean_3h        291997 non-null  float64       
 3   rotatemean_3h      291997 non-null  float64       
 4   pressuremean_3h    291997 non-null  float64       
 5   vibrationmean_3h   291997 non-null  float64       
 6   voltsd_3h          291997 non-null  float64       
 7   rotatesd_3h        291997 non-null  float64       
 8   pressuresd_3h      291997 non-null  float64       
 9   vibrationsd_3h     291997 non-null  float64       
 10  voltmean_24h       291997 non-null  float64       
 11  rotatemean_24h     291997 non-null  float64       
 12  pressuremean_24h   291997 non-null  float64       
 13  vibrationmean_24h  291997 non-null  float64 

In [32]:
labeled_features['failure'] = labeled_features['failure'].astype(str)
labeled_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 291997 entries, 0 to 291996
Data columns (total 30 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   machineID          291997 non-null  int64         
 1   datetime           291997 non-null  datetime64[ns]
 2   voltmean_3h        291997 non-null  float64       
 3   rotatemean_3h      291997 non-null  float64       
 4   pressuremean_3h    291997 non-null  float64       
 5   vibrationmean_3h   291997 non-null  float64       
 6   voltsd_3h          291997 non-null  float64       
 7   rotatesd_3h        291997 non-null  float64       
 8   pressuresd_3h      291997 non-null  float64       
 9   vibrationsd_3h     291997 non-null  float64       
 10  voltmean_24h       291997 non-null  float64       
 11  rotatemean_24h     291997 non-null  float64       
 12  pressuremean_24h   291997 non-null  float64       
 13  vibrationmean_24h  291997 non-null  float64 

In [33]:
labeled_features['failure'] = labeled_features['failure'].fillna(method='bfill',limit=7)
labeled_features.head()

Unnamed: 0,machineID,datetime,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,...,error3count,error4count,error5count,comp1,comp2,comp3,comp4,model,age,failure
0,1,2015-01-04 06:00:00,186.092896,451.641253,107.989359,55.308074,13.48909,62.185045,5.118176,4.904365,...,0.0,0.0,0.0,22.0,217.0,157.0,172.0,model3,18,
1,1,2015-01-04 09:00:00,166.281848,453.787824,106.187582,51.99008,24.276228,23.621315,11.176731,3.394073,...,0.0,0.0,1.0,22.125,217.125,157.125,172.125,model3,18,
2,1,2015-01-04 12:00:00,175.412103,445.450581,100.887363,54.251534,34.918687,11.001625,10.580336,2.921501,...,0.0,0.0,1.0,22.25,217.25,157.25,172.25,model3,18,
3,1,2015-01-04 15:00:00,157.347716,451.882075,101.28938,48.602686,24.617739,28.950883,9.966729,2.356486,...,0.0,0.0,1.0,22.375,217.375,157.375,172.375,model3,18,
4,1,2015-01-04 18:00:00,176.45055,446.033068,84.521555,47.638836,8.0714,76.511343,2.636879,4.108621,...,0.0,0.0,1.0,22.5,217.5,157.5,172.5,model3,18,


In [34]:
labeled_features['failure'] = labeled_features['failure'].replace('nan', 'none')
labeled_features.head()

Unnamed: 0,machineID,datetime,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,...,error3count,error4count,error5count,comp1,comp2,comp3,comp4,model,age,failure
0,1,2015-01-04 06:00:00,186.092896,451.641253,107.989359,55.308074,13.48909,62.185045,5.118176,4.904365,...,0.0,0.0,0.0,22.0,217.0,157.0,172.0,model3,18,none
1,1,2015-01-04 09:00:00,166.281848,453.787824,106.187582,51.99008,24.276228,23.621315,11.176731,3.394073,...,0.0,0.0,1.0,22.125,217.125,157.125,172.125,model3,18,none
2,1,2015-01-04 12:00:00,175.412103,445.450581,100.887363,54.251534,34.918687,11.001625,10.580336,2.921501,...,0.0,0.0,1.0,22.25,217.25,157.25,172.25,model3,18,none
3,1,2015-01-04 15:00:00,157.347716,451.882075,101.28938,48.602686,24.617739,28.950883,9.966729,2.356486,...,0.0,0.0,1.0,22.375,217.375,157.375,172.375,model3,18,none
4,1,2015-01-04 18:00:00,176.45055,446.033068,84.521555,47.638836,8.0714,76.511343,2.636879,4.108621,...,0.0,0.0,1.0,22.5,217.5,157.5,172.5,model3,18,none


In [35]:
labeled_features['failure'] = labeled_features.failure.astype('category')
labeled_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 291997 entries, 0 to 291996
Data columns (total 30 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   machineID          291997 non-null  int64         
 1   datetime           291997 non-null  datetime64[ns]
 2   voltmean_3h        291997 non-null  float64       
 3   rotatemean_3h      291997 non-null  float64       
 4   pressuremean_3h    291997 non-null  float64       
 5   vibrationmean_3h   291997 non-null  float64       
 6   voltsd_3h          291997 non-null  float64       
 7   rotatesd_3h        291997 non-null  float64       
 8   pressuresd_3h      291997 non-null  float64       
 9   vibrationsd_3h     291997 non-null  float64       
 10  voltmean_24h       291997 non-null  float64       
 11  rotatemean_24h     291997 non-null  float64       
 12  pressuremean_24h   291997 non-null  float64       
 13  vibrationmean_24h  291997 non-null  float64 

In [49]:
labeled_features.to_csv("datasets/final_data.csv", index = False)

In [52]:
s3_client.upload_file(Filename="datasets/final_data.csv", Bucket=bucket, Key=f"{prefix}/data/final_data.csv")

In [42]:
labeled_features.loc[labeled_features['failure'] == 'comp4'][:16]

Unnamed: 0,machineID,datetime,voltmean_3h,rotatemean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,rotatesd_3h,pressuresd_3h,vibrationsd_3h,...,error3count,error4count,error5count,comp1,comp2,comp3,comp4,model,age,failure
8,1,2015-01-05 06:00:00,185.782709,439.531288,99.41366,51.558082,14.495664,45.663743,4.289212,7.330397,...,0.0,0.0,1.0,0.0,218.0,158.0,0.0,model3,18,comp4
1328,1,2015-06-19 06:00:00,172.059069,463.24261,96.90505,53.701413,14.75788,55.874,3.204981,2.329615,...,0.0,0.0,1.0,0.0,30.0,15.0,0.0,model3,18,comp4
1928,1,2015-09-02 06:00:00,165.530175,413.54713,103.631528,60.142846,9.088618,49.918671,14.438808,5.389723,...,0.0,0.0,1.0,0.0,60.0,90.0,0.0,model3,18,comp4
2768,1,2015-12-16 06:00:00,188.941806,421.756494,86.684047,51.410144,35.295874,7.83911,9.139511,3.5987,...,0.0,0.0,1.0,75.0,30.0,15.0,0.0,model3,18,comp4
17682,7,2015-01-24 06:00:00,177.669082,361.502737,109.447331,46.975392,8.531048,29.773424,4.384169,5.187506,...,1.0,0.0,1.0,207.0,0.0,207.0,0.0,model3,20,comp4
18522,7,2015-05-09 06:00:00,175.911702,467.458682,103.58991,52.610306,12.40014,69.604586,11.16258,1.658487,...,0.0,0.0,1.0,0.0,30.0,45.0,0.0,model3,20,comp4
19602,7,2015-09-21 06:00:00,169.123329,428.763286,99.428189,45.397202,21.723826,100.568119,22.697296,3.416446,...,0.0,0.0,1.0,0.0,30.0,60.0,0.0,model3,20,comp4
21058,8,2015-03-22 06:00:00,173.026438,429.852151,102.408383,54.340943,11.452078,42.822464,11.195405,7.115879,...,0.0,0.0,1.0,60.0,15.0,204.0,0.0,model3,16,comp4
22138,8,2015-08-04 06:00:00,178.611054,425.076676,102.481539,48.617995,12.652412,22.014775,18.517554,2.918507,...,0.0,0.0,1.0,30.0,45.0,15.0,0.0,model3,16,comp4
22738,8,2015-10-18 06:00:00,166.642406,452.842842,104.470047,52.080654,5.920407,46.523492,5.792959,4.005665,...,0.0,0.0,1.0,15.0,60.0,90.0,0.0,model3,16,comp4


## Sagemaker Sessions

In [70]:
!pip install -Uq pip
!pip install -q awswrangler==2.20.1 imbalanced-learn==0.10.1 sagemaker==2.139.0 boto3==1.26.97

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytest-astropy 0.8.0 requires pytest-cov>=2.0, which is not installed.
pytest-astropy 0.8.0 requires pytest-filter-subpackage>=0.1, which is not installed.[0m[31m
[0m

In [43]:
import json
import time
import boto3
import string
import sagemaker
import pandas as pd
import awswrangler as wr

from sagemaker.feature_store.feature_group import FeatureGroup

In [44]:
region = sagemaker.Session().boto_region_name
print("Using AWS Region: {}".format(region))

Using AWS Region: us-east-1


In [45]:
boto3.setup_default_session(region_name = region)
boto_session = boto3.Session(region_name = region)

s3_client = boto3.client("s3", region_name = region)

sagemaker_boto_client = boto_session.client("sagemaker")
sagemaker_session = sagemaker.session.Session(
    boto_session = boto_session, sagemaker_client = sagemaker_boto_client
)

sagemaker_role = sagemaker.get_execution_role()

In [46]:
bucket = "ideaaiml-demo"
prefix = "mlops/predictive-maintenance"

In [75]:
s3_client.upload_file(
    Filename="datasets/PdM_telemetry.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_telemetry.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_errors.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_errors.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_maint.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_maint.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_failures.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_failures.csv"
)
s3_client.upload_file(
    Filename="datasets/PdM_machines.csv", Bucket=bucket, Key=f"{prefix}/data/raw/PdM_machines.csv"
)

### Feature Store

In [46]:
account_id = boto3.client("sts").get_caller_identity()["Account"]

In [80]:
featurestore_runtime = boto_session.client(
    service_name = "sagemaker-featurestore-runtime", region_name = region
)

feature_store_session = sagemaker.Session(
    boto_session = boto_session,
    sagemaker_client = sagemaker_boto_client,
    sagemaker_featurestore_runtime_client = featurestore_runtime,
)

In [128]:
machines_data = pd.read_csv("datasets/PdM_machines.csv")
machines_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   machineID  100 non-null    int64 
 1   model      100 non-null    object
 2   age        100 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.5+ KB


In [131]:
# Configure the feature groups
timestamp = pd.to_datetime("now").timestamp()
machines_fg = "predictive-maintenance-machines"
%store machines_fg
machines_data["event_time"] = timestamp
machines_feature_group = FeatureGroup(name = machines_fg, sagemaker_session = feature_store_session)

machines_feature_group.load_feature_definitions(data_frame = machines_data)

Stored 'machines_fg' (str)


[FeatureDefinition(feature_name='machineID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='model', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='age', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='event_time', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>)]

In [133]:
record_identifier_feature_name = "machineID"
event_time_feature_name = "event_time"

try:
    print(f"\n Using s3://{bucket}/{prefix}")
    machines_feature_group.create(
        s3_uri = f"s3://{bucket}/{prefix}",
        record_identifier_name = record_identifier_feature_name,
        event_time_feature_name = event_time_feature_name,
        role_arn = sagemaker_role,
        enable_online_store = True,
    )
    print(f'Create "claims" feature group: SUCCESS')
except Exception as e:
    code = e.response.get("Error").get("Code")
    if code == "ResourceInUse":
        print(f"Using existing feature group: {machines_fg}")
    else:
        raise (e)


 Using s3://ideaaiml-demo/mlops/predictive-maintenance
Create "claims" feature group: SUCCESS


In [134]:
machines_feature_group.ingest(data_frame = machines_data, max_workers = 3, wait = True)

IngestionManagerPandas(feature_group_name='predictive-maintenance-machines', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7fb2487caf90>, sagemaker_session=<sagemaker.session.Session object at 0x7fb248803ed0>, max_workers=3, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7fb24860e290>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [137]:
machines_data_table = machines_feature_group.describe()["OfflineStoreConfig"]["DataCatalogConfig"][
        "TableName"
    ]
print(machines_data_table)
machines_data_table = machines_data_table.replace("_", "-")

predictive_maintenance_machines_1681900560


In [140]:
machines_feature_group_s3_prefix = (
    f"{prefix}/{account_id}/sagemaker/{region}/offline-store/{machines_data_table}/data"
)
offline_store_contents = None
while offline_store_contents is None:
    objects_in_bucket = s3_client.list_objects(
        Bucket = bucket, Prefix = machines_feature_group_s3_prefix
    )
    if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
    else:
        print("Waiting for data in offline store...")
        time.sleep(60)

print("\nData available.")

Waiting for data in offline store...

Data available.


In [142]:
machines_query = machines_feature_group.athena_query()

machines_table = machines_query.table_name
database_name = machines_query.database
%store machines_table
%store database_name

feature_columns = list(set(machines_data.columns))
feature_columns_string = ", ".join(f'"{c}"' for c in feature_columns)
# feature_columns_string = f'"{machines_table}".machineID as machineID, ' + feature_columns_string

query_string = f"""
SELECT DISTINCT {feature_columns_string}
FROM "{machines_table}"
"""
machines_query.run(query_string = query_string, output_location = f"s3://{bucket}/{prefix}/query_results")
machines_query.wait()
dataset = machines_query.as_dataframe()
dataset

Stored 'machines_table' (str)
Stored 'database_name' (str)


Unnamed: 0,machineID,age,model,event_time
0,17,14,model1,1.681901e+09
1,5,2,model3,1.681901e+09
2,74,4,model4,1.681901e+09
3,23,17,model1,1.681901e+09
4,50,4,model4,1.681901e+09
...,...,...,...,...
95,48,10,model4,1.681901e+09
96,36,5,model4,1.681901e+09
97,16,3,model1,1.681901e+09
98,8,16,model3,1.681901e+09


### Tain-Test Split

In [36]:
threshold_dates = [[pd.to_datetime('2015-07-31 01:00:00'), pd.to_datetime('2015-08-01 01:00:00')],
                   [pd.to_datetime('2015-08-31 01:00:00'), pd.to_datetime('2015-09-01 01:00:00')],
                   [pd.to_datetime('2015-09-30 01:00:00'), pd.to_datetime('2015-10-01 01:00:00')]]
threshold_dates

[[Timestamp('2015-07-31 01:00:00'), Timestamp('2015-08-01 01:00:00')],
 [Timestamp('2015-08-31 01:00:00'), Timestamp('2015-09-01 01:00:00')],
 [Timestamp('2015-09-30 01:00:00'), Timestamp('2015-10-01 01:00:00')]]

In [37]:
test_results = []
anai_models = []
train_dfs = []
for last_train_date, first_test_date in threshold_dates:
    print('Training on %s to %s' % (last_train_date, first_test_date))
    train_y = labeled_features.loc[labeled_features['datetime']
                                   < last_train_date, 'failure']
    train_X = pd.get_dummies(labeled_features.loc[labeled_features['datetime'] < last_train_date].drop(['datetime',
                                                                                                        'machineID',
                                                                                                        'failure'], 1))
    df = pd.concat([train_X, train_y], axis=1)
    train_dfs.append(df)
# train_dfs

Training on 2015-07-31 01:00:00 to 2015-08-01 01:00:00


  # Remove the CWD from sys.path while we load stuff.


Training on 2015-08-31 01:00:00 to 2015-09-01 01:00:00
Training on 2015-09-30 01:00:00 to 2015-10-01 01:00:00


In [None]:
for last_train_date, first_test_date in threshold_dates:
    # split out training and test data
    
    train_y = labeled_features.loc[labeled_features['datetime'] < last_train_date, 'failure']
    train_X = pd.get_dummies(labeled_features.loc[labeled_features['datetime'] < last_train_date].drop(['datetime',
                                                                                                        'machineID',
                                                                                                        'failure'], 1))
    test_y = labeled_features.loc[labeled_features['datetime'] > last_train_date, 'failure']
    test_X = pd.get_dummies(labeled_features.loc[labeled_features['datetime'] > first_test_date].drop(['datetime',
                                                                                                       'machineID',
                                                                                                       'failure'], 1))

  import sys
  # This is added back by InteractiveShellApp.init_path()


In [None]:
train_X.info()

In [None]:
train_X['failure'] = train_y
test_X['failure'] = test_y

### Manually run model prediction

In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

train_df = pd.read_csv("datasets/train-test/train.csv")
test_df = pd.read_csv("datasets/train-test/test.csv")

X_train = train_df.drop("failure", axis = 1)
X_test = test_df.drop("failure", axis = 1)
y_train = train_df["failure"]
y_test = test_df["failure"]

# train
print("training model")
model = RandomForestClassifier(
    n_estimators=10, min_samples_leaf=3, n_jobs=-1
)

model.fit(X_train.values, y_train)
predictions = model.predict(X_test.values)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy Score: {accuracy}")

training model
Accuracy Score: 0.9998102646807703


In [185]:
train_X.to_csv("datasets/train-test/train.csv", index = False)
test_X.to_csv("datasets/train-test/test.csv", index = False)

In [1]:
# Upload to S3
train_path = f"s3://{bucket}/{prefix}/data/train-test/train.csv"
test_path = f"s3://{bucket}/{prefix}/data/train-test/test.csv"
# s3_client.upload_file(
#     Filename = "datasets/train-test/train.csv", Bucket = bucket, Key = f"{prefix}/data/train-test/train.csv"
# )
# s3_client.upload_file(
#     Filename = "datasets/train-test/test.csv", Bucket = bucket, Key = f"{prefix}/data/train-test/test.csv"
# )

NameError: name 'bucket' is not defined

### SageMaker Training

In [4]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "1.0-1"

sklearn_estimator = SKLearn(
    entry_point = "scripts/rf_script-no-featurenames.py",
    role = sagemaker_role,
    instance_count = 1,
    instance_type = "ml.c5.xlarge",
    framework_version = FRAMEWORK_VERSION,
    base_job_name = "rf-scikit",
    hyperparameters = {
        "n-estimators": 100,
        "min-samples-leaf": 3,
    },
)

NameError: name 'sagemaker_role' is not defined

In [58]:
sklearn_estimator.fit(inputs = {"train": train_path, "test": test_path}, wait = True)
# sklearn_estimator.fit({"train": train_path}, wait = True)

INFO:sagemaker:Creating training-job with name: rf-scikit-2023-05-04-09-57-15-131


2023-05-04 09:57:16 Starting - Starting the training job...
2023-05-04 09:57:31 Starting - Preparing the instances for training......
2023-05-04 09:58:30 Downloading - Downloading input data...
2023-05-04 09:59:11 Training - Training image download completed. Training in progress..[34m2023-05-04 09:59:13,229 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-05-04 09:59:13,231 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-04 09:59:13,239 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-05-04 09:59:13,440 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-04 09:59:13,450 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-04 09:59:13,461 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-04 09:59:13,

### Deploy to a real-time endpoint

In [61]:
sklearn_estimator.latest_training_job.wait(logs = "None")
artifact = sagemaker_boto_client.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print(type(artifact))


2023-05-04 09:59:57 Starting - Preparing the instances for training
2023-05-04 09:59:57 Downloading - Downloading input data
2023-05-04 09:59:57 Training - Training image download completed. Training in progress.
2023-05-04 09:59:57 Uploading - Uploading generated training model
2023-05-04 09:59:57 Completed - Training job completed
<class 'str'>


In [62]:
from sagemaker.sklearn.model import SKLearnModel

model = SKLearnModel(
    model_data = artifact,
    role = sagemaker_role,
    entry_point = "scripts/rf_script.py",
    framework_version = FRAMEWORK_VERSION,
)

In [65]:
from sagemaker.model_monitor import DataCaptureConfig
data_capture_config = DataCaptureConfig(
    enable_capture = True, 
    sampling_percentage = 100, 
    destination_s3_uri = f"s3://{bucket}/{prefix}/data-capture-model-monitor"
)

model.deploy(instance_type = "ml.c5.large", initial_instance_count = 1, 
    data_capture_config = data_capture_config,)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2023-05-04-10-44-18-540
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2023-05-04-10-44-19-243
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2023-05-04-10-44-19-243


----!

<sagemaker.sklearn.model.SKLearnPredictor at 0x7f5f05407690>

In [71]:
model.name

'sagemaker-scikit-learn-2023-05-04-10-44-18-540'

In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer, JSONSerializer

new_test_X = test_X.drop("failure", axis = 1)
new_test_Y = test_X["failure"]
predictor = Predictor(
    endpoint_name = "PdM-SKLearn-Model-Pipeline", 
    sagemaker_session = sagemaker_session, 
    serializer = JSONSerializer()
)
results = predictor.predict(new_test_X.values)

In [60]:
type(new_test_X.values[0])

array([170.30101698, 449.03699492,  94.80520453,  40.81679659,
        11.0616672 ,  58.42505515,   4.93130533,   2.42874017,
       176.84437584, 456.59810686, 100.65744001,  39.20591517,
        13.01510513,  53.2529348 ,   9.68170571,   5.9161934 ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,  28.875     ,  13.875     , 118.875     ,
        28.875     ,  18.        ,   0.        ,   0.        ,
         1.        ,   0.        ])