In [15]:
import tempfile

import apache_beam as beam
import pandas as pd
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as beam_impl
from apache_beam.options.pipeline_options import PipelineOptions
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

from measurements import measure

In [16]:
# Show which TensorFlow release the kernel has loaded, so the outputs below can
# be tied to a concrete version.
tf.__version__

'1.13.1'

In [3]:
# Reserve a unique temporary file path.
# tempfile.mktemp() is deprecated: it only returns a name, so another process
# can claim the path before it is used. NamedTemporaryFile creates the file
# atomically; delete=False keeps it on disk after the handle is closed so the
# path stays valid for later cells.
# NOTE(review): the path now refers to an existing empty file rather than a
# not-yet-created one - confirm no later cell expects it to be absent.
with tempfile.NamedTemporaryFile(delete=False) as _tmp_handle:
    filename = _tmp_handle.name

In [4]:
from measurements import measure

In [5]:
# Generate sample measurements (presumably 5 rows - confirm against
# measurements.measure), persist them without the index column, then reload
# from disk so the frame shown below is exactly what the Beam pipeline will
# read from signature.csv.
data = measure(5)
data.to_csv("signature.csv", index=False)
data = pd.read_csv("signature.csv")
data.head()

Unnamed: 0,beta1,beta2,weekday,hour,humidity
0,2.228445,2.956259,5,23,20.439816
1,4.155775,0.999272,2,13,27.018319
2,-1.913392,1.461341,5,2,14.403221
3,-2.427776,-4.657069,4,9,14.52531
4,-2.47451,-4.080336,1,2,17.129479


### Specify the input and output formats

In [6]:
# Column order shared by the CSV coder and the header row of the output file.
ORDERED_SIGNATURE_COLUMNS = [
    "beta1",
    "beta2",
    "weekday",
    "hour",
    "humidity",
]

# Header line as UTF-8 bytes; it is passed to WriteToText(header=...) below.
header = ",".join(ORDERED_SIGNATURE_COLUMNS).encode('UTF-8')

In [7]:
# One fixed-length, single-element feature per CSV column: the two betas and
# humidity are parsed as float32, weekday and hour as int64.
feature_spec = dict(
    beta1=tf.io.FixedLenFeature([1], tf.float32),
    beta2=tf.io.FixedLenFeature([1], tf.float32),
    weekday=tf.io.FixedLenFeature([1], tf.int64),
    hour=tf.io.FixedLenFeature([1], tf.int64),
    humidity=tf.io.FixedLenFeature([1], tf.float32),
)

# Schema object derived from the feature spec; the CSV coder and the dataset
# metadata below are both built from it.
schema = dataset_schema.from_feature_spec(feature_spec)

### Create an encoder and test it

In [8]:
# Build a CSV coder for the signature columns and round-trip one hand-written
# line through it: decode() yields a dict of per-column arrays (printed below),
# and encode() turns that dict back into a CSV byte string, which is displayed
# as the cell result.
csv_encoder = tft.coders.CsvCoder(ORDERED_SIGNATURE_COLUMNS, schema)
records = csv_encoder.decode("10.0, 10.0, 3,4,1.0")
print(records)
csv_encoder.encode(records)

{'beta1': array([10.], dtype=float32), 'beta2': array([10.], dtype=float32), 'hour': array([4], dtype=int64), 'humidity': array([1.], dtype=float32), 'weekday': array([3], dtype=int64)}


b'10.0,10.0,3,4,1.0'

### The Apache Beam pipeline 

In [9]:
# Remove leftovers from earlier runs before executing the pipeline again:
# anything matching beam-temp-* (presumably Beam's scratch output directories)
# and previously written training.csv shards.
!rm -rf beam-temp-*
!rm -f training.csv-00000-of-* 

In [27]:
# Plain Beam job: read signature.csv, pass every decoded record through
# process_data, and write the records back out as training.csv shards.
# NOTE: process_data must already be defined when this cell runs; in this
# notebook its definition sits in a later cell.
with beam.Pipeline('DirectRunner', PipelineOptions()) as p:

    # The same coder decodes the input lines and encodes the output lines.
    csv_encoder = tft.coders.CsvCoder(ORDERED_SIGNATURE_COLUMNS, schema)

    # Skip the header row so that only data lines reach the coder.
    raw_rows = p | 'read_from_csv' >> beam.io.ReadFromText(
        file_pattern='signature.csv', coder=csv_encoder, skip_header_lines=1)

    processed_rows = raw_rows | 'process_records' >> beam.Map(process_data)

    # The sink's result is not needed; the pipeline runs when the `with` exits.
    _ = processed_rows | 'write_to_csv' >> beam.io.WriteToText(
        file_path_prefix='training.csv', coder=csv_encoder, header=header)


{'beta1': array([0.22094154], dtype=float32), 'beta2': array([1.0594496], dtype=float32), 'hour': array([9], dtype=int64), 'humidity': array([16.77154], dtype=float32), 'weekday': array([5], dtype=int64)}
{'beta1': array([2.2204974], dtype=float32), 'beta2': array([4.0277033], dtype=float32), 'hour': array([3], dtype=int64), 'humidity': array([19.62251], dtype=float32), 'weekday': array([6], dtype=int64)}
{'beta1': array([-0.09811807], dtype=float32), 'beta2': array([0.11373159], dtype=float32), 'hour': array([7], dtype=int64), 'humidity': array([15.606735], dtype=float32), 'weekday': array([0], dtype=int64)}
{'beta1': array([4.593776], dtype=float32), 'beta2': array([3.3665817], dtype=float32), 'hour': array([0], dtype=int64), 'humidity': array([25.40757], dtype=float32), 'weekday': array([2], dtype=int64)}
{'beta1': array([-0.00353715], dtype=float32), 'beta2': array([-2.6694198], dtype=float32), 'hour': array([18], dtype=int64), 'humidity': array([29.535917], dtype=float32), 'weekda

In [21]:
# Print the single output shard produced by the 'write_to_csv' step to check
# the header line and the encoded rows.
!cat training.csv-00000-of-00001

beta1,beta2,weekday,hour,humidity
0.22094154,1.0594496,5,9,16.77154
2.2204974,4.0277033,6,3,19.62251
-0.098118074,0.11373159,0,7,15.606735
4.593776,3.3665817,2,0,25.40757
-0.003537154,-2.6694198,2,18,29.535917


In [26]:
def process_data(inp):
    """Echo ``inp`` to stdout and hand it back unchanged.

    This notebook uses it both as the callable for ``beam.Map`` and as the
    preprocessing function given to ``AnalyzeAndTransformDataset``; in either
    role it only makes the flowing values visible.

    Args:
        inp: The value to inspect (a decoded record dict in the Beam map step).

    Returns:
        The very same object that was passed in.
    """
    print(inp)
    return inp

In [14]:
# Wrap the schema in a DatasetMetadata object; the transform pipeline below
# pairs it with the raw PCollection as the input to AnalyzeAndTransformDataset.
signature_metadata = dataset_metadata.DatasetMetadata(schema)

In [None]:
# tf.Transform job: read signature.csv, run process_data as the preprocessing
# function through AnalyzeAndTransformDataset, and write the transformed
# records to training.csv shards.
with beam.Pipeline('DirectRunner', PipelineOptions()) as p:
    # AnalyzeAndTransformDataset needs a scratch directory, which is supplied
    # through the beam_impl.Context context manager.
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):

        csv_encoder = tft.coders.CsvCoder(ORDERED_SIGNATURE_COLUMNS, schema)

        # Parenthesised so the multi-line expression parses; the header row is
        # skipped so only data lines reach the coder.
        signature_data = (
            p
            | 'read_from_csv' >> beam.io.ReadFromText(
                file_pattern='signature.csv', coder=csv_encoder, skip_header_lines=1)
        )

        # The transform takes a (data, metadata) pair and returns
        # ((transformed_data, transformed_metadata), transform_fn); unpack the
        # inner pair so training_data is the PCollection itself.
        (training_data, training_metadata), transform_fn = (
            (signature_data, signature_metadata)
            | "AnalyzeAndTransform"
            >> beam_impl.AnalyzeAndTransformDataset(process_data)
        )

        # process_data returns its input unchanged, so the input coder is
        # reused for the output rows, as in the plain Beam pipeline above.
        _ = (
            training_data
            | 'write_to_csv' >> beam.io.WriteToText(
                file_path_prefix='training.csv', coder=csv_encoder, header=header)
        )


In [10]:
from tensorflow_transform.tf_metadata import dataset_metadata

In [12]:
# Display the metadata captured for the training data.
# NOTE(review): TRAINING_METADATA is not assigned in any cell visible in this
# part of the notebook - confirm where it is defined before relying on a
# top-to-bottom run.
TRAINING_METADATA

{'_schema': Schema(feature {
  name: "beta1"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "beta2"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "hour"
  type: INT
  presence {
    min_fraction: 1.0
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "humidity"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "weekday"
  type: INT
  presence {
    min_fraction: 1.0
  }
  shape {
    dim {
      size: 1
    }
  }
}
)}