# normalize_dfs_xgb_classifier

In [1]:
from greenguard.demo import load_demo

# Load the demo dataset: `target_times` holds one labelled cutoff time per row
# and `readings` holds the raw signal values (both are previewed below).
target_times, readings = load_demo()

Using TensorFlow backend.


In [2]:
pipeline_name = 'normalize_dfs_xgb_classifier'

In [3]:
from greenguard.pipeline import GreenGuardPipeline

# Instantiate the GreenGuard pipeline identified by `pipeline_name`.
pipeline = GreenGuardPipeline(pipeline_name)

In [4]:
# Show the primitives that make up the pipeline; the position of each entry in
# this list is the step index used in the step-by-step cells of this notebook.
pipeline.template['primitives']

['pandas.DataFrame.resample',
 'featuretools.EntitySet.entity_from_dataframe',
 'featuretools.EntitySet.normalize_entity',
 'featuretools.EntitySet.normalize_entity',
 'featuretools.dfs',
 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
 'xgboost.XGBClassifier']

# Step-by-step execution

## Input Data

In [5]:
readings.head()

Unnamed: 0,turbine_id,signal_id,timestamp,value
0,T001,S01,2013-01-10,323.0
1,T001,S02,2013-01-10,320.0
2,T001,S03,2013-01-10,284.0
3,T001,S04,2013-01-10,348.0
4,T001,S05,2013-01-10,273.0


In [6]:
target_times.head()

Unnamed: 0,turbine_id,cutoff_time,target
0,T001,2013-01-12,0
1,T001,2013-01-13,0
2,T001,2013-01-14,0
3,T001,2013-01-15,1
4,T001,2013-01-16,0


## Data Preparation (part of GreenGuard Pipeline)

* Input: target_times, readings, turbines
* Output: X, y, readings, turbines
* Effect: target_times has been split into X and y

## pandas.DataFrame.resample

* Input: readings
* Output: readings (resampled)
* Effect: readings have been resampled to the indicated resample rule and turbine_id,
  signal_id and timestamp have been set as a multi-index

In [7]:
# Step 0 -> pandas.DataFrame.resample.
# Passing `output_=step` makes `fit` return the intermediate context (a dict of
# the variables produced so far) for this step rather than only fitting the
# whole pipeline.
step = 0
context = pipeline.fit(target_times, readings, output_=step)

In [8]:
context.keys()

dict_keys(['readings', 'turbines', 'X', 'y'])

In [9]:
context['readings'].head()

Unnamed: 0,turbine_id,signal_id,timestamp,value
0,T001,S01,2013-01-10 00:00:00,323.0
1,T001,S01,2013-01-10 00:10:00,346.0
2,T001,S01,2013-01-10 00:20:00,407.0
3,T001,S01,2013-01-10 00:30:00,257.0
4,T001,S01,2013-01-10 00:40:00,267.0


## featuretools.EntitySet.entity_from_dataframe

* Input: readings (resampled)
* Output: entityset
* Effect: an EntitySet has been generated from the resampled readings

In [10]:
# Step 1 -> featuretools.EntitySet.entity_from_dataframe.
# The previous context is unpacked as keyword arguments; `start_=step` and
# `output_=step` restrict execution to this single step and return the
# updated context.
step = 1
context = pipeline.fit(**context, output_=step, start_=step)

In [11]:
context.keys()

dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])

In [12]:
context['entityset']

Entityset: entityset
  Entities:
    readings [Rows: 1329146, Columns: 5]
  Relationships:
    No relationships

In [13]:
context['readings'].head()

Unnamed: 0,turbine_id,signal_id,timestamp,value
0,T001,S01,2013-01-10 00:00:00,323.0
1,T001,S01,2013-01-10 00:10:00,346.0
2,T001,S01,2013-01-10 00:20:00,407.0
3,T001,S01,2013-01-10 00:30:00,257.0
4,T001,S01,2013-01-10 00:40:00,267.0


## featuretools.EntitySet.normalize_entity

* Input: entityset
* Output: entityset with relationship (readings.turbine_id with turbines.turbine_id)
* Effect: establish relationship between readings and turbines

In [14]:
# Step 2 -> first featuretools.EntitySet.normalize_entity (turbines).
# The previous context is unpacked as keyword arguments; `start_=step` and
# `output_=step` restrict execution to this single step and return the
# updated context.
step = 2
context = pipeline.fit(**context, output_=step, start_=step)

In [15]:
context.keys()

dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])

In [16]:
context['entityset']

Entityset: entityset
  Entities:
    readings [Rows: 1329146, Columns: 5]
    turbines [Rows: 1, Columns: 1]
  Relationships:
    readings.turbine_id -> turbines.turbine_id

## featuretools.EntitySet.normalize_entity

* Input: entityset
* Output: entityset with relationship (readings.signal_id with signals.signal_id)
* Effect: establish relationship between readings and signals

In [17]:
# Step 3 -> second featuretools.EntitySet.normalize_entity (signals).
# The previous context is unpacked as keyword arguments; `start_=step` and
# `output_=step` restrict execution to this single step and return the
# updated context.
step = 3
context = pipeline.fit(**context, output_=step, start_=step)

In [18]:
context.keys()

dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])

In [19]:
context['entityset']

Entityset: entityset
  Entities:
    readings [Rows: 1329146, Columns: 5]
    turbines [Rows: 1, Columns: 1]
    signals [Rows: 26, Columns: 1]
  Relationships:
    readings.turbine_id -> turbines.turbine_id
    readings.signal_id -> signals.signal_id

## featuretools.dfs

* Input: entityset (unstacked, no turbine_id, no timestamp)
* Output: X (has additional features)
* Effect: build features for relational dataset using DFS

In [None]:
# Step 4 -> featuretools.dfs.
# The previous context is unpacked as keyword arguments; `start_=step` and
# `output_=step` restrict execution to this single step and return the
# updated context.
# NOTE(review): this cell has no recorded execution count, so the outputs of
# this step were not captured in the saved notebook.
step = 4
context = pipeline.fit(**context, output_=step, start_=step)

In [None]:
context.keys()

In [None]:
context['X'].head()

In [None]:
# Count the feature columns in X (turbine_id is the index, so it is not
# included in the count).
context['X'].shape[1]

In [None]:
context['readings'].head()

## mlprimitives.custom.feature_extraction.CategoricalEncoder

* Input: X
* Output: X (label encoded)
* Effect: encodes categorical features using OneHotLabelEncoder

In [None]:
# Step 5 -> mlprimitives.custom.feature_extraction.CategoricalEncoder.
# The previous context is unpacked as keyword arguments; `start_=step` and
# `output_=step` restrict execution to this single step and return the
# updated context.
step = 5
context = pipeline.fit(**context, output_=step, start_=step)

In [None]:
context.keys()

In [None]:
context['X'].head()

In [None]:
context['readings'].head()

## xgboost.XGBClassifier

* Input: X (label encoded and featurized)
* Output: None
* Effect: trained model

In [None]:
# Step 6 -> xgboost.XGBClassifier, the last primitive in the pipeline.
# The previous context is unpacked as keyword arguments; `start_=step` and
# `output_=step` restrict execution to this single step.
step = 6
context = pipeline.fit(**context, output_=step, start_=step)

In [None]:
context.keys()