# dfs_xgb_with_unstack_normalization

In [1]:
from draco.demo import load_demo

# Load the demo data: a table of target times and a table of signal readings
# (both are previewed in the "Input Data" section below).
target_times, readings = load_demo()

In [2]:
# Name of the pipeline explored in this notebook; used to build the DracoPipeline.
pipeline_name = 'dfs_xgb_with_unstack_normalization'

In [3]:
from draco.pipeline import DracoPipeline

# Build the pipeline object from its name.
pipeline = DracoPipeline(pipeline_name)

  import pandas.util.testing as tm


In [4]:
# Primitives listed in the pipeline template; the step indexes (0-4) used in
# the cells below follow this order.
pipeline.template['primitives']

['mlblocks.MLPipeline',
 'mlblocks.MLPipeline',
 'featuretools.dfs',
 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
 'xgboost.XGBClassifier']

# Step by Step execution

## Input Data

In [5]:
# Preview the raw readings: one value per turbine_id, signal_id and timestamp.
readings.head()

Unnamed: 0,turbine_id,signal_id,timestamp,value
0,T001,S01,2013-01-10,323.0
1,T001,S02,2013-01-10,320.0
2,T001,S03,2013-01-10,284.0
3,T001,S04,2013-01-10,348.0
4,T001,S05,2013-01-10,273.0


In [6]:
# Preview the target times: one target label per turbine_id and cutoff_time.
target_times.head()

Unnamed: 0,turbine_id,cutoff_time,target
0,T001,2013-01-12,0
1,T001,2013-01-13,0
2,T001,2013-01-14,0
3,T001,2013-01-15,1
4,T001,2013-01-16,0


## Data Preparation (part of Draco Pipeline)

* Input: target_times, readings, turbines
* Output: X, y, readings, turbines
* Effect: target_times has been split into X and y

## mlblocks.MLPipeline 1

### pandas.DataFrame.resample

* Input: readings
* Output: readings (resampled)
* Effect: readings have been resampled according to the indicated resample rule, and
  turbine_id, signal_id and timestamp have been set as a multi-index
  
### pandas.DataFrame.unstack

* Input: readings (resampled)
* Output: readings (unstacked)
* Effect: readings have been unstacked

In [7]:
# Run the pipeline only up to the first step (index 0) and keep the returned
# context, which is inspected in the following cells.
step = 0
context = pipeline.fit(target_times, readings, output_=step)

In [8]:
# Variables available in the context after step 0.
context.keys()

dict_keys(['readings', 'turbines', 'X', 'y'])

In [9]:
# Readings after step 0: one row per turbine_id and timestamp, with one
# value_<signal_id> column per signal.
context['readings'].head()

Unnamed: 0,turbine_id,timestamp,value_S01,value_S02,value_S03,value_S04,value_S05,value_S06,value_S07,value_S08,...,value_S17,value_S18,value_S19,value_S20,value_S21,value_S22,value_S23,value_S24,value_S25,value_S26
0,T001,2013-01-10 00:00:00,323.0,320.0,284.0,348.0,273.0,342.0,280.0,3197842.0,...,11.7,3131020.0,55.0,55.0,47.0,58.0,45.0,58.0,47.0,356.0
1,T001,2013-01-10 00:10:00,346.0,384.0,367.0,411.0,331.0,360.0,249.0,3197900.0,...,10.2,3131420.0,58.0,63.0,62.0,67.0,55.0,61.0,42.0,400.0
2,T001,2013-01-10 00:20:00,407.0,363.0,407.0,393.0,275.0,335.0,270.0,3197968.0,...,9.5,3131822.0,68.0,61.0,67.0,66.0,46.0,55.0,45.0,402.0
3,T001,2013-01-10 00:30:00,257.0,307.0,315.0,361.0,317.0,354.0,271.0,3198011.0,...,10.5,3132179.0,43.0,51.0,53.0,62.0,53.0,60.0,45.0,357.0
4,T001,2013-01-10 00:40:00,267.0,309.0,314.0,355.0,262.0,246.0,212.0,3198056.0,...,9.6,3132501.0,45.0,51.0,54.0,59.0,43.0,41.0,36.0,322.0


## mlblocks.MLPipeline 2

### featuretools.EntitySet.entity_from_dataframe

* Input: readings (unstacked)
* Output: entityset
* Effect: Entityset has been generated from readings

### featuretools.EntitySet.normalize_entity

* Input: entityset
* Output: entityset with relationship (readings.turbine_id with turbines.turbine_id)
* Effect: a relationship has been established between readings and turbines

In [10]:
# Run only step 1, passing the previous context back in as keyword arguments
# (start_ and output_ both point at the same step).
step = 1
context = pipeline.fit(**context, output_=step, start_=step)

In [11]:
# Variables available in the context after step 1 ('entityset' is new).
context.keys()

dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])

In [12]:
# Show the entityset summary: its entities and the relationship between them.
context['entityset']

Entityset: entityset
  Entities:
    readings [Rows: 51121, Columns: 29]
    turbines [Rows: 1, Columns: 1]
  Relationships:
    readings.turbine_id -> turbines.turbine_id

## featuretools.dfs

* Input: entityset (unstacked, no turbine_id, no timestamp)
* Output: X (has additional features)
* Effect: features have been built from the relational dataset using Deep Feature Synthesis (DFS)

In [13]:
# Run only step 2 (featuretools.dfs) on top of the previous context.
step = 2
context = pipeline.fit(**context, output_=step, start_=step)

In [14]:
# Variables available in the context after step 2.
context.keys()

dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])

In [15]:
# Preview the feature matrix X after the DFS step.
context['X'].head()

Unnamed: 0_level_0,SUM(readings.value_S14),SUM(readings.value_S11),SUM(readings.value_S25),SUM(readings.value_S23),SUM(readings.value_S17),SUM(readings.value_S19),SUM(readings.value_S04),SUM(readings.value_S05),SUM(readings.value_S21),SUM(readings.value_S16),...,MEAN(readings.value_S20),COUNT(readings),NUM_UNIQUE(readings.YEAR(timestamp)),NUM_UNIQUE(readings.MONTH(timestamp)),NUM_UNIQUE(readings.DAY(timestamp)),NUM_UNIQUE(readings.WEEKDAY(timestamp)),MODE(readings.YEAR(timestamp)),MODE(readings.MONTH(timestamp)),MODE(readings.DAY(timestamp)),MODE(readings.WEEKDAY(timestamp))
turbine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T001,465421817.0,496362516.0,2743.0,2780.0,994.6,3174.0,19412.0,17083.0,3061.0,550.4,...,22.326389,144,1,1,2,2,2013,1,11,4
T001,465897578.0,496952628.0,4237.0,4640.0,1166.7,5112.0,38289.0,34344.0,4919.0,713.7,...,35.166667,144,1,1,2,2,2013,1,12,5
T001,466806830.0,498019072.0,9008.0,9179.0,1581.7,9134.0,86707.0,78749.0,9863.0,916.3,...,53.381944,144,1,1,2,2,2013,1,13,6
T001,468250434.0,499530451.0,10073.0,10310.0,1690.9,10674.0,87907.0,83264.0,10638.0,970.6,...,61.423611,144,1,1,2,2,2013,1,14,0
T001,371675934.0,400196323.0,7381.0,8228.0,1666.0,8831.0,68811.0,64088.0,8629.0,948.8,...,87.575221,144,1,1,2,2,2013,1,15,1


In [16]:
# Number of feature columns in X after the DFS step. turbine_id is the index
# of X, so it is not counted among the columns.
len(context['X'].columns)

165

## mlprimitives.custom.feature_extraction.CategoricalEncoder

* Input: X
* Output: X (categorical features encoded)
* Effect: categorical features have been encoded using OneHotLabelEncoder

In [17]:
# Run only step 3 (CategoricalEncoder) on top of the previous context.
step = 3
context = pipeline.fit(**context, output_=step, start_=step)

In [18]:
# Variables available in the context after step 3.
context.keys()

dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])

In [19]:
# Preview X after the encoder step; the rows and columns displayed below are
# the same as those shown after the DFS step.
context['X'].head()

Unnamed: 0_level_0,SUM(readings.value_S14),SUM(readings.value_S11),SUM(readings.value_S25),SUM(readings.value_S23),SUM(readings.value_S17),SUM(readings.value_S19),SUM(readings.value_S04),SUM(readings.value_S05),SUM(readings.value_S21),SUM(readings.value_S16),...,MEAN(readings.value_S20),COUNT(readings),NUM_UNIQUE(readings.YEAR(timestamp)),NUM_UNIQUE(readings.MONTH(timestamp)),NUM_UNIQUE(readings.DAY(timestamp)),NUM_UNIQUE(readings.WEEKDAY(timestamp)),MODE(readings.YEAR(timestamp)),MODE(readings.MONTH(timestamp)),MODE(readings.DAY(timestamp)),MODE(readings.WEEKDAY(timestamp))
turbine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T001,465421817.0,496362516.0,2743.0,2780.0,994.6,3174.0,19412.0,17083.0,3061.0,550.4,...,22.326389,144,1,1,2,2,2013,1,11,4
T001,465897578.0,496952628.0,4237.0,4640.0,1166.7,5112.0,38289.0,34344.0,4919.0,713.7,...,35.166667,144,1,1,2,2,2013,1,12,5
T001,466806830.0,498019072.0,9008.0,9179.0,1581.7,9134.0,86707.0,78749.0,9863.0,916.3,...,53.381944,144,1,1,2,2,2013,1,13,6
T001,468250434.0,499530451.0,10073.0,10310.0,1690.9,10674.0,87907.0,83264.0,10638.0,970.6,...,61.423611,144,1,1,2,2,2013,1,14,0
T001,371675934.0,400196323.0,7381.0,8228.0,1666.0,8831.0,68811.0,64088.0,8629.0,948.8,...,87.575221,144,1,1,2,2,2013,1,15,1


In [20]:
# Readings as held in the context after step 3; the preview matches the one
# shown after step 0.
context['readings'].head()

Unnamed: 0,turbine_id,timestamp,value_S01,value_S02,value_S03,value_S04,value_S05,value_S06,value_S07,value_S08,...,value_S17,value_S18,value_S19,value_S20,value_S21,value_S22,value_S23,value_S24,value_S25,value_S26
0,T001,2013-01-10 00:00:00,323.0,320.0,284.0,348.0,273.0,342.0,280.0,3197842.0,...,11.7,3131020.0,55.0,55.0,47.0,58.0,45.0,58.0,47.0,356.0
1,T001,2013-01-10 00:10:00,346.0,384.0,367.0,411.0,331.0,360.0,249.0,3197900.0,...,10.2,3131420.0,58.0,63.0,62.0,67.0,55.0,61.0,42.0,400.0
2,T001,2013-01-10 00:20:00,407.0,363.0,407.0,393.0,275.0,335.0,270.0,3197968.0,...,9.5,3131822.0,68.0,61.0,67.0,66.0,46.0,55.0,45.0,402.0
3,T001,2013-01-10 00:30:00,257.0,307.0,315.0,361.0,317.0,354.0,271.0,3198011.0,...,10.5,3132179.0,43.0,51.0,53.0,62.0,53.0,60.0,45.0,357.0
4,T001,2013-01-10 00:40:00,267.0,309.0,314.0,355.0,262.0,246.0,212.0,3198056.0,...,9.6,3132501.0,45.0,51.0,54.0,59.0,43.0,41.0,36.0,322.0


## xgboost.XGBClassifier

* Input: X (featurized and encoded)
* Output: None
* Effect: the model has been trained

In [21]:
# Run only step 4 (xgboost.XGBClassifier) on top of the previous context.
step = 4
context = pipeline.fit(**context, output_=step, start_=step)

In [22]:
# Variables available in the context after step 4; the keys are the same as
# after step 3.
context.keys()

dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])