In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (such as the local `measurements` module) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import importlib
import os
import tempfile

import pandas as pd
import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import dataset_metadata
import apache_beam as beam
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from apache_beam.options.pipeline_options import PipelineOptions
import tensorflow_transform.beam.impl as beam_impl

from measurements import measure

tf.__version__

  'Running the Apache Beam SDK on Python 3 is not yet fully supported. '


'1.13.1'

## Preprocessing measurement data for Machine Learning

A little trick: record the path of the temporary directory that holds the data in a small Python module (`temp_dir.py`), so that other notebooks can import it and re-use the same data.

In [3]:
# Create a fresh scratch directory and record its path in a tiny module,
# `temp_dir.py`, so that other notebooks can `from temp_dir import temp_dir`.
scratch_dir = tempfile.mkdtemp()
with open('temp_dir.py', 'w') as module_file:
    # %r writes a valid Python string literal even when the path contains
    # backslashes or quotes (e.g. Windows temp paths); a hand-quoted '%s'
    # would produce a module that fails to import.
    module_file.write("temp_dir = %r\n" % scratch_dir)

# The module file was created after the interpreter started, so make sure the
# import system's directory caches notice it before importing.
importlib.invalidate_caches()
from temp_dir import temp_dir

In [36]:
# From here on, work in a fixed, project-local directory instead of the
# temporary directory chosen above. Create it if necessary: the cells below
# write files into it and would fail if it did not exist.
temp_dir = './tmp'
os.makedirs(temp_dir, exist_ok=True)
temp_dir

'./tmp'

In [37]:

# Raw "signature" data, in the layout in which it will arrive at prediction time.
signature_csv_train, signature_csv_eval, signature_csv_test = [
    os.path.join(temp_dir, file_name)
    for file_name in ("signature_train.csv", "signature_eval.csv", "signature_test.csv")
]

# Training / evaluation data after (possible) scaling or other pre-processing.
training_csv, eval_csv = [
    os.path.join(temp_dir, file_name)
    for file_name in ("training.csv", "eval.csv")
]

# The same data as TFRecord files, for high-performance input into
# computational graphs.
training_tfr, eval_tfr = [
    os.path.join(temp_dir, file_name)
    for file_name in ("training.tfr", "eval.tfr")
]

print("You can find the files at:")
print(signature_csv_train, training_csv, training_tfr)

You can find the files at:
./tmp/signature_train.csv ./tmp/training.csv ./tmp/training.tfr


## The original data

In [38]:
# Write a small sample to the signature CSV file and read it back, so that the
# frame displayed below is exactly what a reader of that file gets.
# `index=False`: `to_csv` documents `index` as a boolean; the row index must
# not be written as an extra column, because the file is parsed by column order
# later on.
measure(5).to_csv(signature_csv_train, index=False)
data = pd.read_csv(signature_csv_train)
data.head()

Unnamed: 0,beta1,beta2,hour,humidity,weekday
0,3.601713,4.718712,15,23.2554,0
1,1.777682,-4.6178,21,24.903056,4
2,3.559258,-2.819708,6,26.00116,1
3,3.988322,4.907078,23,24.06136,1
4,-2.560512,3.170083,2,14.927021,0


### Specify the input and output formats

In [39]:
# Column order of the signature CSV files; the CSV coder is constructed with
# this list.
ORDERED_SIGNATURE_COLUMNS = ["beta1", "beta2", "hour", "humidity", "weekday"]

# The CSV header line as UTF-8 encoded bytes (handed to the text sink as `header`).
header = ",".join(ORDERED_SIGNATURE_COLUMNS).encode('UTF-8')

In [40]:
# Parsing specification for one record: every feature is a fixed-length
# feature of shape [1]. The betas and humidity are float32, weekday and hour
# are int64.
feature_spec = {
    feature_name: tf.io.FixedLenFeature([1], dtype)
    for feature_name, dtype in (
        ('beta1', tf.float32),
        ('beta2', tf.float32),
        ('weekday', tf.int64),
        ('hour', tf.int64),
        ('humidity', tf.float32),
    )
}

# Derive the tf.Transform schema from the feature spec.
schema = dataset_schema.from_feature_spec(feature_spec)

### Create an encoder and test it

In [41]:
# Round-trip one hand-written CSV line through the coder: decode it into a
# feature dict, show that dict, then encode it back into a CSV line (displayed
# as the cell result).
sample_line = "10.201, 10.101, 3,1.234,4"
csv_encoder = tft.coders.CsvCoder(ORDERED_SIGNATURE_COLUMNS, schema)
records = csv_encoder.decode(sample_line)
print(records)
csv_encoder.encode(records)

{'humidity': array([1.234], dtype=float32), 'beta1': array([10.201], dtype=float32), 'beta2': array([10.101], dtype=float32), 'weekday': array([4]), 'hour': array([3])}


b'10.201,10.101,3,1.234,4'

---
# The Apache Beam pipeline 

In [42]:
def process_data(row):
    """Echo a record to stdout and hand it back untouched.

    Serves as a pass-through processing step for the dry run of the pipeline,
    making every record that flows through visible.

    Args:
        row: the record to show.

    Returns:
        The very same `row` object.
    """
    print(row)
    return row

### Dry run - Everything working?

In [43]:
# Dry run: decode every signature record, pass it through `process_data`
# and write it back out as CSV.
csv_encoder = tft.coders.CsvCoder(ORDERED_SIGNATURE_COLUMNS, schema)

# The three stages, built up front so the pipeline itself reads as one chain.
read_signature_csv = beam.io.ReadFromText(
    file_pattern=signature_csv_train, coder=csv_encoder, skip_header_lines=1)
apply_processing = beam.Map(process_data)
write_training_csv = beam.io.WriteToText(
    file_path_prefix=training_csv, coder=csv_encoder, header=header)

with beam.Pipeline('DirectRunner', PipelineOptions()) as p:
    _ = (p
         | 'read_from_csv' >> read_signature_csv
         | 'process_records' >> apply_processing
         | 'write_to_csv' >> write_training_csv)


{'humidity': array([23.2554], dtype=float32), 'beta1': array([3.6017134], dtype=float32), 'beta2': array([4.718712], dtype=float32), 'weekday': array([0]), 'hour': array([15])}
{'humidity': array([24.903057], dtype=float32), 'beta1': array([1.7776823], dtype=float32), 'beta2': array([-4.6177998], dtype=float32), 'weekday': array([4]), 'hour': array([21])}
{'humidity': array([26.00116], dtype=float32), 'beta1': array([3.5592577], dtype=float32), 'beta2': array([-2.8197079], dtype=float32), 'weekday': array([1]), 'hour': array([6])}
{'humidity': array([24.061361], dtype=float32), 'beta1': array([3.988322], dtype=float32), 'beta2': array([4.907078], dtype=float32), 'weekday': array([1]), 'hour': array([23])}
{'humidity': array([14.927021], dtype=float32), 'beta1': array([-2.5605125], dtype=float32), 'beta2': array([3.1700833], dtype=float32), 'weekday': array([0]), 'hour': array([2])}


In [44]:
# Show which output file(s) match the training CSV prefix, then the last
# lines of their content.
!echo "Reading from: " {training_csv}*
!cat {training_csv}* | tail -10

Reading from:  ./tmp/training.csv-00000-of-00001
beta1,beta2,hour,humidity,weekday
3.6017134,4.718712,15,23.2554,0
1.7776823,-4.6177998,21,24.903057,4
3.5592577,-2.8197079,6,26.00116,1
3.988322,4.907078,23,24.061361,1
-2.5605125,3.1700833,2,14.927021,0


### Serious transformation: Scale $\beta_1$ and $\beta_2$

In [45]:
# Number passed to `measure` for each phase; judging by the `measure(5)` sample
# shown earlier this is the number of rows generated — TODO confirm.
ROWS_PER_PHASE = 20000

# Call `measure` once per phase (train / eval / test) and store each result as
# a signature CSV file.
# `index=False`: `to_csv` documents `index` as a boolean; the row index must
# not be written as an extra column, because the files are parsed by column
# order.
for phase_csv in (signature_csv_train, signature_csv_eval, signature_csv_test):
    measure(ROWS_PER_PHASE).to_csv(phase_csv, index=False)

In [46]:
def process_data(row):
    """Preprocessing function: scale 'beta1' and 'beta2' with `tft.scale_to_0_1`.

    Args:
        row: dict mapping feature names to feature values; must contain the
            keys 'beta1' and 'beta2'.

    Returns:
        A new dict with the same keys as `row`, in which 'beta1' and 'beta2'
        are replaced by the result of `tft.scale_to_0_1`. All other entries
        are passed through unchanged. The input dict itself is not modified.
    """
    # Work on a shallow copy so the caller's dict is left intact.
    scaled = dict(row)
    for column in ('beta1', 'beta2'):
        scaled[column] = tft.scale_to_0_1(row[column])
    return scaled

In [47]:
# Wrap the schema in a DatasetMetadata object; it is paired with the raw data
# further down as input to the analyze/transform steps.
signature_metadata = dataset_metadata.DatasetMetadata(schema)

In [48]:
# Coders for the two file formats used below: CSV text lines and
# tf.Example protos (for the TFRecord files).
csv_encoder = tft.coders.CsvCoder(ORDERED_SIGNATURE_COLUMNS, schema)
tfr_encoder = tft.coders.ExampleProtoCoder(schema)

# Directory that receives the saved transform function.
metadata_dir = os.path.join(temp_dir, "metadata")

with beam.Pipeline('DirectRunner', PipelineOptions()) as p:

    #
    # The context is provided for the AnalyzeAndTransform step:
    # it tells that step which directory to use for its temporary files.
    #
    with tft_beam.Context(temp_dir=temp_dir):

        #
        # Read from csv, skip headers. Note that we use ordered columns in the encoder
        #
        signature_data = (
            p | 'read_from_csv' >> beam.io.ReadFromText(
                file_pattern=signature_csv_train,
                coder=csv_encoder, skip_header_lines=1))

        #
        # Attach the metadata: AnalyzeAndTransformDataset takes a
        # (data, metadata) pair as its input.
        #
        signature_dataset = (signature_data, signature_metadata)

        #
        # Analyze and transform in one go; the transform function is returned
        # as well so that it can be re-used below.
        # Accessed through `tft_beam`, like Context and TransformDataset,
        # rather than through the `beam_impl` module.
        #
        data_and_metadata, transform_fn = (
            signature_dataset
            | "AnalyzeAndTransform"
            >> tft_beam.AnalyzeAndTransformDataset(process_data))

        #
        # Split data and metadata
        #
        training_data, training_metadata = data_and_metadata

        #
        # Write the resulting data to a csv file
        #
        _ = (training_data
             | 'write_to_csv' >> beam.io.WriteToText(
                 file_path_prefix=training_csv, coder=csv_encoder, header=header))

        #
        # For production purposes, we use the TFRecord format
        #
        _ = (training_data
             | 'write_to_tfr' >> beam.io.WriteToTFRecord(
                 file_path_prefix=training_tfr, coder=tfr_encoder))

        #
        # Process evaluation data with the obtained transform_fn
        #
        signature_data_eval = (
            p | 'read_from_csv_eval' >> beam.io.ReadFromText(
                file_pattern=signature_csv_eval,
                coder=csv_encoder, skip_header_lines=1))

        signature_dataset_eval = (signature_data_eval, signature_metadata)

        # Use the transform_fn of the previous step here, so that the
        # evaluation data gets the same transformation as the training data.
        eval_data, _ = (
            (signature_dataset_eval, transform_fn)
            | "TransformEval" >> tft_beam.TransformDataset())

        _ = (eval_data
             | 'write_to_tfr_eval' >> beam.io.WriteToTFRecord(
                 file_path_prefix=eval_tfr, coder=tfr_encoder))

        #
        # Finally, save the transform function for re-use at prediction time.
        #
        _ = (transform_fn
             | 'WriteTransformFn'
             >> transform_fn_io.WriteTransformFn(metadata_dir))


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: ./tmp/tftransform_tmp/5841c14891624509a576983de1444949/saved_model.pb


INFO:tensorflow:SavedModel written to: ./tmp/tftransform_tmp/5841c14891624509a576983de1444949/saved_model.pb


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: ./tmp/tftransform_tmp/965f1fb86e0d4e518be6d55f6aff6e5a/saved_model.pb


INFO:tensorflow:SavedModel written to: ./tmp/tftransform_tmp/965f1fb86e0d4e518be6d55f6aff6e5a/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: ./tmp/tftransform_tmp/6f82da2732644be889fc912f463b56e4/saved_model.pb


INFO:tensorflow:SavedModel written to: ./tmp/tftransform_tmp/6f82da2732644be889fc912f463b56e4/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [17]:
# Inspect the results: the tail of the transformed CSV output, the contents of
# the metadata directory, and the TFRecord output file(s).
!echo "Reading from: " {training_csv}*
!echo
!cat {training_csv}* | tail -5
!echo
!echo metadata is here:
!ls {metadata_dir}
!echo
!echo "TFRecords are here: " {training_tfr}*

Reading from:  /tmp/tmpqmz6u8ab/training.csv-00000-of-00001

0.28593916,0.44638932,12,14.948654,6
0.08173424,0.4983325,15,10.790115,2
0.31320256,0.51273495,2,13.007976,3
0.07270126,0.94722366,17,9.778644,3
0.62228477,0.73450196,3,21.455841,5

metadata is here:
transformed_metadata  transform_fn

TFRecords are here:  /tmp/tmpqmz6u8ab/training.tfr-00000-of-00001


### Wrap up

We're now able to analyze and process data of any size and to scale particular features to the interval $[0, 1]$, while saving the function that did the scaling for later. We'll need that function to apply exactly the same scaling to the incoming data at prediction time. 
Note that by simply replacing ```'DirectRunner'``` in the Apache Beam pipeline with ```'DataflowRunner'``` in an adequately configured GCP environment, we could have the pipeline executed on an arbitrarily large cluster.