# lstm_regressor_with_unstack

In [1]:
from draco.demo import load_demo

# Load the demo dataset bundled with Draco. The call returns three objects,
# unpacked here as the training target times, the test target times and the
# raw signal readings (all previewed further down in this notebook).
train_target_times, test_target_times, readings = load_demo()

In [None]:
# Name of the pipeline walked through in this notebook; it is handed to
# DracoPipeline in the next code cell.
pipeline_name = 'lstm_regressor_with_unstack'

In [4]:
from draco import DracoPipeline

# Build the pipeline object from its name (pipeline_name is defined in the
# previous code cell).
pipeline = DracoPipeline(pipeline_name)

In [None]:
# Display the 'primitives' entry of the pipeline template.
# NOTE(review): presumably the ordered list of primitives executed one by one
# in the sections below -- confirm against the Draco/MLBlocks docs.
pipeline.template['primitives']

# Step by Step execution

## Input Data

In [2]:
# Preview the raw readings table (columns: turbine_id, timestamp, signal_id, value).
readings.head()

Unnamed: 0,turbine_id,timestamp,signal_id,value
0,1,2013-01-12 00:10:00,operational setting 1,-0.0007
1,1,2013-01-12 00:20:00,operational setting 1,0.0019
2,1,2013-01-12 00:30:00,operational setting 1,-0.0043
3,1,2013-01-12 00:40:00,operational setting 1,0.0007
4,1,2013-01-12 00:50:00,operational setting 1,-0.0019


In [39]:
# Preview the training target times (columns: turbine_id, cutoff_time, target).
train_target_times.head()

Unnamed: 0,turbine_id,cutoff_time,target
0,1,2013-01-12 04:20:00,166
1,1,2013-01-12 04:30:00,165
2,1,2013-01-12 04:40:00,164
3,1,2013-01-12 04:50:00,163
4,1,2013-01-12 05:00:00,162


In [40]:
# Preview the test target times (columns: turbine_id, cutoff_time, target).
test_target_times.head()

Unnamed: 0,turbine_id,cutoff_time,target
0,1,2013-01-13 13:10:00,112.0
1,2,2013-01-14 08:00:00,98.0
2,3,2013-01-14 02:50:00,69.0
3,4,2013-01-14 01:10:00,82.0
4,5,2013-01-14 13:10:00,91.0


In [3]:
# Report the (rows, columns) size of the training and test target-times tables.
print(f"training shape {train_target_times.shape}")
print(f"testing shape {test_target_times.shape}")

training shape (18131, 3)
testing shape (100, 3)


## Data Preparation (part of Draco Pipeline)

* Input: target_times, readings, turbines
* Output: X, y, readings, turbines
* Effect: target_times has been split into X and y

## mlblocks.MLPipeline

### pandas.DataFrame.resample

* Input: readings
* Output: readings (resampled)
* Effect: readings have been resampled according to the configured resample rule, and
  turbine_id, signal_id and timestamp have been set as a multi-index
  
### pandas.DataFrame.unstack

* Input: readings (resampled)
* Output: readings (unstacked)
* Effect: readings have been unstacked, so that each signal_id becomes its own `value_<signal_id>` column

In [5]:
# Start fitting from the raw inputs, passing output_=0 so that the call returns
# the intermediate context (a dict of named variables, inspected in the next
# cells) instead of only fitting the pipeline.
# NOTE(review): per the sections above this covers the data preparation plus
# the resample/unstack primitives -- confirm the exact meaning of output_=0.
context = pipeline.fit(train_target_times, readings, output_=0)

In [6]:
# List the variables held in the context returned by the previous fit call.
context.keys()

dict_keys(['readings', 'turbines', 'X', 'y'])

In [7]:
# Preview the readings stored in the context after the first fit call.
context['readings'].head()

Unnamed: 0,turbine_id,timestamp,value_operational setting 1,value_operational setting 2,value_operational setting 3,value_sensor measurement 1,value_sensor measurement 10,value_sensor measurement 11,value_sensor measurement 12,value_sensor measurement 13,...,value_sensor measurement 2,value_sensor measurement 20,value_sensor measurement 21,value_sensor measurement 3,value_sensor measurement 4,value_sensor measurement 5,value_sensor measurement 6,value_sensor measurement 7,value_sensor measurement 8,value_sensor measurement 9
0,1,2013-01-12 00:10:00,-0.0007,-0.0004,100.0,518.67,1.3,47.47,521.66,2388.02,...,641.82,39.06,23.419,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19
1,1,2013-01-12 00:20:00,0.0019,-0.0003,100.0,518.67,1.3,47.49,522.28,2388.07,...,642.15,39.0,23.4236,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07
2,1,2013-01-12 00:30:00,-0.0043,0.0003,100.0,518.67,1.3,47.27,522.42,2388.03,...,642.35,38.95,23.3442,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94
3,1,2013-01-12 00:40:00,0.0007,0.0,100.0,518.67,1.3,47.13,522.86,2388.08,...,642.35,38.88,23.3739,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48
4,1,2013-01-12 00:50:00,-0.0019,-0.0002,100.0,518.67,1.3,47.28,522.19,2388.04,...,642.37,38.9,23.4044,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15


## pandas.DataFrame.pop

* Input: readings (unstacked)
* Output: readings (without turbine_id), turbine_id
* Effect: turbine_id has been popped from readings

In [8]:
# Run pipeline step 1: start_ and output_ both point at the same step, and the
# previous context is fed back in as keyword arguments. The returned context
# replaces the old one.
# NOTE(review): assumes start_/output_ select a single step -- confirm.
step = 1
context = pipeline.fit(**context, output_=step, start_=step)

In [9]:
# List the context variables after step 1.
context.keys()

dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])

In [10]:
# Preview the turbine_id entry of the context.
context['turbine_id'].head()

0    1
1    1
2    1
3    1
4    1
Name: turbine_id, dtype: int64

In [11]:
# Preview the readings after step 1.
context['readings'].head()

Unnamed: 0,timestamp,value_operational setting 1,value_operational setting 2,value_operational setting 3,value_sensor measurement 1,value_sensor measurement 10,value_sensor measurement 11,value_sensor measurement 12,value_sensor measurement 13,value_sensor measurement 14,...,value_sensor measurement 2,value_sensor measurement 20,value_sensor measurement 21,value_sensor measurement 3,value_sensor measurement 4,value_sensor measurement 5,value_sensor measurement 6,value_sensor measurement 7,value_sensor measurement 8,value_sensor measurement 9
0,2013-01-12 00:10:00,-0.0007,-0.0004,100.0,518.67,1.3,47.47,521.66,2388.02,8138.62,...,641.82,39.06,23.419,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19
1,2013-01-12 00:20:00,0.0019,-0.0003,100.0,518.67,1.3,47.49,522.28,2388.07,8131.49,...,642.15,39.0,23.4236,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07
2,2013-01-12 00:30:00,-0.0043,0.0003,100.0,518.67,1.3,47.27,522.42,2388.03,8133.23,...,642.35,38.95,23.3442,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94
3,2013-01-12 00:40:00,0.0007,0.0,100.0,518.67,1.3,47.13,522.86,2388.08,8133.83,...,642.35,38.88,23.3739,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48
4,2013-01-12 00:50:00,-0.0019,-0.0002,100.0,518.67,1.3,47.28,522.19,2388.04,8133.8,...,642.37,38.9,23.4044,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15


## pandas.DataFrame.pop

* Input: readings (without turbine_id)
* Output: readings (without timestamp), timestamp
* Effect: timestamp has been popped from readings

In [12]:
# Run pipeline step 2, feeding the current context back in and keeping the
# context that the call returns.
# NOTE(review): assumes start_/output_ select a single step -- confirm.
step = 2
context = pipeline.fit(**context, output_=step, start_=step)

In [13]:
# List the context variables after step 2.
context.keys()

dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])

In [14]:
# Preview the timestamp entry of the context.
context['timestamp'].head()

0   2013-01-12 00:10:00
1   2013-01-12 00:20:00
2   2013-01-12 00:30:00
3   2013-01-12 00:40:00
4   2013-01-12 00:50:00
Name: timestamp, dtype: datetime64[ns]

In [15]:
# Preview the readings after step 2.
context['readings'].head()

Unnamed: 0,value_operational setting 1,value_operational setting 2,value_operational setting 3,value_sensor measurement 1,value_sensor measurement 10,value_sensor measurement 11,value_sensor measurement 12,value_sensor measurement 13,value_sensor measurement 14,value_sensor measurement 15,...,value_sensor measurement 2,value_sensor measurement 20,value_sensor measurement 21,value_sensor measurement 3,value_sensor measurement 4,value_sensor measurement 5,value_sensor measurement 6,value_sensor measurement 7,value_sensor measurement 8,value_sensor measurement 9
0,-0.0007,-0.0004,100.0,518.67,1.3,47.47,521.66,2388.02,8138.62,8.4195,...,641.82,39.06,23.419,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19
1,0.0019,-0.0003,100.0,518.67,1.3,47.49,522.28,2388.07,8131.49,8.4318,...,642.15,39.0,23.4236,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07
2,-0.0043,0.0003,100.0,518.67,1.3,47.27,522.42,2388.03,8133.23,8.4178,...,642.35,38.95,23.3442,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94
3,0.0007,0.0,100.0,518.67,1.3,47.13,522.86,2388.08,8133.83,8.3682,...,642.35,38.88,23.3739,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48
4,-0.0019,-0.0002,100.0,518.67,1.3,47.28,522.19,2388.04,8133.8,8.4294,...,642.37,38.9,23.4044,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15


## sklearn.impute.SimpleImputer

* Input: readings (unstacked, no turbine_id, no timestamp)
* Output: readings (imputed, numpy array)
* Effect: readings have been imputed and converted to numpy array

In [16]:
# Run pipeline step 3 (the imputer, per the section heading), feeding the
# current context back in and keeping the context that the call returns.
# NOTE(review): assumes start_/output_ select a single step -- confirm.
step = 3
context = pipeline.fit(**context, output_=step, start_=step)

In [17]:
# List the context variables after step 3.
context.keys()

dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])

In [18]:
# Show the first five rows of readings after step 3.
context['readings'][0:5]

array([[-7.00000e-04, -4.00000e-04,  1.00000e+02,  5.18670e+02,
         1.30000e+00,  4.74700e+01,  5.21660e+02,  2.38802e+03,
         8.13862e+03,  8.41950e+00,  3.00000e-02,  3.92000e+02,
         2.38800e+03,  1.00000e+02,  6.41820e+02,  3.90600e+01,
         2.34190e+01,  1.58970e+03,  1.40060e+03,  1.46200e+01,
         2.16100e+01,  5.54360e+02,  2.38806e+03,  9.04619e+03],
       [ 1.90000e-03, -3.00000e-04,  1.00000e+02,  5.18670e+02,
         1.30000e+00,  4.74900e+01,  5.22280e+02,  2.38807e+03,
         8.13149e+03,  8.43180e+00,  3.00000e-02,  3.92000e+02,
         2.38800e+03,  1.00000e+02,  6.42150e+02,  3.90000e+01,
         2.34236e+01,  1.59182e+03,  1.40314e+03,  1.46200e+01,
         2.16100e+01,  5.53750e+02,  2.38804e+03,  9.04407e+03],
       [-4.30000e-03,  3.00000e-04,  1.00000e+02,  5.18670e+02,
         1.30000e+00,  4.72700e+01,  5.22420e+02,  2.38803e+03,
         8.13323e+03,  8.41780e+00,  3.00000e-02,  3.90000e+02,
         2.38800e+03,  1.00000e+02,  6

## sklearn.preprocessing.MinMaxScaler

* Input: readings (imputed, array)
* Output: readings (scaled, array)
* Effect: readings have been scaled to [-1, 1] range

In [19]:
# Run pipeline step 4 (the scaler, per the section heading), feeding the
# current context back in and keeping the context that the call returns.
# NOTE(review): assumes start_/output_ select a single step -- confirm.
step = 4
context = pipeline.fit(**context, output_=step, start_=step)

In [20]:
# List the context variables after step 4.
context.keys()

dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])

In [21]:
# Show the first five rows of readings after step 4.
context['readings'][0:5]

array([[-0.08045977, -0.69230769, -1.        , -1.        , -1.        ,
        -0.22543353,  0.17159763, -0.58823529, -0.60078439, -0.2720277 ,
        -1.        , -0.33333333, -1.        , -1.        , -0.59411765,
         0.42635659,  0.40377157, -0.13682891, -0.38048616, -1.        ,
         1.        ,  0.45249597, -0.49253731, -0.78048999],
       [ 0.2183908 , -0.53846154, -1.        , -1.        , -1.        ,
        -0.20231214,  0.41617357, -0.44117647, -0.674373  , -0.17737591,
        -1.        , -0.33333333, -1.        , -1.        , -0.4       ,
         0.33333333,  0.41607597, -0.04825569, -0.29473329, -1.        ,
         1.        ,  0.25603865, -0.55223881, -0.79951539],
       [-0.49425287,  0.38461538, -1.        , -1.        , -1.        ,
        -0.4566474 ,  0.47140039, -0.55882353, -0.65641449, -0.28510966,
        -1.        , -0.66666667, -1.        , -1.        , -0.28235294,
         0.25581395,  0.20369132, -0.2082724 , -0.25894666, -1.        ,
  

## pandas.DataFrame

* Input: readings (scaled, array)
* Output: readings (dataframe)
* Effect: readings have been converted into a dataframe

In [22]:
# Run pipeline step 5 (conversion back to a DataFrame, per the section
# heading), feeding the current context back in and keeping the returned one.
# NOTE(review): assumes start_/output_ select a single step -- confirm.
step = 5
context = pipeline.fit(**context, output_=step, start_=step)

In [23]:
# List the context variables after step 5.
context.keys()

dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])

In [24]:
# Preview the readings after step 5.
context['readings'].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,-0.08046,-0.692308,-1.0,-1.0,-1.0,-0.225434,0.171598,-0.588235,-0.600784,-0.272028,...,-0.594118,0.426357,0.403772,-0.136829,-0.380486,-1.0,1.0,0.452496,-0.492537,-0.78049
1,0.218391,-0.538462,-1.0,-1.0,-1.0,-0.202312,0.416174,-0.441176,-0.674373,-0.177376,...,-0.4,0.333333,0.416076,-0.048256,-0.294733,-1.0,1.0,0.256039,-0.552239,-0.799515
2,-0.494253,0.384615,-1.0,-1.0,-1.0,-0.456647,0.4714,-0.558824,-0.656414,-0.28511,...,-0.282353,0.255814,0.203691,-0.208272,-0.258947,-1.0,1.0,0.42029,-0.432836,-0.719914
3,0.08046,-0.076923,-1.0,-1.0,-1.0,-0.618497,0.64497,-0.411765,-0.650222,-0.666795,...,-0.282353,0.147287,0.283135,-0.425527,-0.33761,-1.0,1.0,0.481481,-0.343284,-0.750965
4,-0.218391,-0.384615,-1.0,-1.0,-1.0,-0.445087,0.380671,-0.529412,-0.650532,-0.195845,...,-0.270588,0.178295,0.364718,-0.423021,-0.190749,-1.0,1.0,0.336554,-0.492537,-0.700081


## pandas.DataFrame.set

* Input: readings (dataframe)
* Output: readings (dataframe with turbine_id)
* Effect: turbine_id has been set as a readings column

In [25]:
# Run pipeline step 6 (re-attaching turbine_id, per the section heading),
# feeding the current context back in and keeping the returned one.
# NOTE(review): assumes start_/output_ select a single step -- confirm.
step = 6
context = pipeline.fit(**context, output_=step, start_=step)

In [26]:
# List the context variables after step 6.
context.keys()

dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])

In [27]:
# Preview the readings after step 6.
context['readings'].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,turbine_id
0,-0.08046,-0.692308,-1.0,-1.0,-1.0,-0.225434,0.171598,-0.588235,-0.600784,-0.272028,...,0.426357,0.403772,-0.136829,-0.380486,-1.0,1.0,0.452496,-0.492537,-0.78049,1
1,0.218391,-0.538462,-1.0,-1.0,-1.0,-0.202312,0.416174,-0.441176,-0.674373,-0.177376,...,0.333333,0.416076,-0.048256,-0.294733,-1.0,1.0,0.256039,-0.552239,-0.799515,1
2,-0.494253,0.384615,-1.0,-1.0,-1.0,-0.456647,0.4714,-0.558824,-0.656414,-0.28511,...,0.255814,0.203691,-0.208272,-0.258947,-1.0,1.0,0.42029,-0.432836,-0.719914,1
3,0.08046,-0.076923,-1.0,-1.0,-1.0,-0.618497,0.64497,-0.411765,-0.650222,-0.666795,...,0.147287,0.283135,-0.425527,-0.33761,-1.0,1.0,0.481481,-0.343284,-0.750965,1
4,-0.218391,-0.384615,-1.0,-1.0,-1.0,-0.445087,0.380671,-0.529412,-0.650532,-0.195845,...,0.178295,0.364718,-0.423021,-0.190749,-1.0,1.0,0.336554,-0.492537,-0.700081,1


## pandas.DataFrame.set

* Input: readings (dataframe with turbine_id)
* Output: readings (dataframe with turbine_id and timestamp)
* Effect: timestamp has been set as a readings column

In [28]:
# Run pipeline step 7 (re-attaching timestamp, per the section heading),
# feeding the current context back in and keeping the returned one.
# NOTE(review): assumes start_/output_ select a single step -- confirm.
step = 7
context = pipeline.fit(**context, output_=step, start_=step)

In [29]:
# List the context variables after step 7.
context.keys()

dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])

In [30]:
# Preview the readings after step 7.
context['readings'].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,turbine_id,timestamp
0,-0.08046,-0.692308,-1.0,-1.0,-1.0,-0.225434,0.171598,-0.588235,-0.600784,-0.272028,...,0.403772,-0.136829,-0.380486,-1.0,1.0,0.452496,-0.492537,-0.78049,1,2013-01-12 00:10:00
1,0.218391,-0.538462,-1.0,-1.0,-1.0,-0.202312,0.416174,-0.441176,-0.674373,-0.177376,...,0.416076,-0.048256,-0.294733,-1.0,1.0,0.256039,-0.552239,-0.799515,1,2013-01-12 00:20:00
2,-0.494253,0.384615,-1.0,-1.0,-1.0,-0.456647,0.4714,-0.558824,-0.656414,-0.28511,...,0.203691,-0.208272,-0.258947,-1.0,1.0,0.42029,-0.432836,-0.719914,1,2013-01-12 00:30:00
3,0.08046,-0.076923,-1.0,-1.0,-1.0,-0.618497,0.64497,-0.411765,-0.650222,-0.666795,...,0.283135,-0.425527,-0.33761,-1.0,1.0,0.481481,-0.343284,-0.750965,1,2013-01-12 00:40:00
4,-0.218391,-0.384615,-1.0,-1.0,-1.0,-0.445087,0.380671,-0.529412,-0.650532,-0.195845,...,0.364718,-0.423021,-0.190749,-1.0,1.0,0.336554,-0.492537,-0.700081,1,2013-01-12 00:50:00


## mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences

* Input: X, readings (dataframe with turbine_id and timestamp)
* Output: X
* Effect: X has been converted to a 3D numpy array that contains one matrix of shape
  (window_size x num_signals) for each of the target times.

In [31]:
# Look up the hyperparameters of the cutoff_window_sequences primitive.
# The key is the primitive name followed by '#1'.
# NOTE(review): this reaches into the private `_pipeline` attribute of the
# DracoPipeline; prefer a public accessor if Draco exposes one.
pipeline._pipeline.get_hyperparameters()[
    'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']

{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}

In [32]:
# Run pipeline step 8 (the window-sequence extraction, per the section
# heading), feeding the current context back in and keeping the returned one.
# NOTE(review): assumes start_/output_ select a single step -- confirm.
step = 8
context = pipeline.fit(**context, output_=step, start_=step)

In [33]:
# List the context variables after step 8.
context.keys()

dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])

In [34]:
# Shape of the readings table held in the context after step 8.
context['readings'].shape

(33727, 26)

In [35]:
# Shape of the target array y.
context['y'].shape

(18131,)

In [36]:
# Shape of X after the window extraction.
# NOTE(review): presumably (num_target_times, window_size, num_signals) as
# described in the section notes -- confirm the axis order.
context['X'].shape

(18131, 24, 24)

In [37]:
# Show the first three rows of the first element of X.
context['X'][0][:3]

array([[ 0.2183908 , -0.53846154, -1.        , -1.        , -1.        ,
        -0.20231214,  0.41617357, -0.44117647, -0.674373  , -0.17737591,
        -1.        , -0.33333333, -1.        , -1.        , -0.4       ,
         0.33333333,  0.41607597, -0.04825569, -0.29473329, -1.        ,
         1.        ,  0.25603865, -0.55223881, -0.79951539],
       [-0.49425287,  0.38461538, -1.        , -1.        , -1.        ,
        -0.4566474 ,  0.47140039, -0.55882353, -0.65641449, -0.28510966,
        -1.        , -0.66666667, -1.        , -1.        , -0.28235294,
         0.25581395,  0.20369132, -0.2082724 , -0.25894666, -1.        ,
         1.        ,  0.42028986, -0.43283582, -0.71991385],
       [ 0.08045977, -0.07692308, -1.        , -1.        , -1.        ,
        -0.61849711,  0.64497041, -0.41176471, -0.6502219 , -0.66679492,
        -1.        , -0.33333333, -1.        , -1.        , -0.28235294,
         0.14728682,  0.28313495, -0.42552747, -0.33760972, -1.        ,
  

## keras.Sequential.LSTMTimeSeriesRegressor

* Input: X, y
* Output: none (this step only fits the model)
* Effect: LSTM has been fitted.

In [38]:
# Run pipeline step 9 (the LSTM regressor, per the section heading), feeding
# the current context back in and keeping the returned one. TensorFlow may
# print informational log lines when this cell runs.
# NOTE(review): assumes start_/output_ select a single step -- confirm.
step = 9
context = pipeline.fit(**context, output_=step, start_=step)

2022-02-01 10:08:21.044547: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-01 10:08:21.080727: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f8579596430 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-02-01 10:08:21.080742: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
