In [1]:
import sys

# Confirm that we're using Python 3.
# Compare with `==`: `is` tests object identity, which only happens to work for
# small ints in CPython and triggers a SyntaxWarning on Python 3.8+.
assert sys.version_info.major == 3, 'Oops, not running Python 3. Use Runtime > Change runtime type'

In [2]:
import argparse
import os
import pprint
import tempfile
import urllib.request
import zipfile

# print("Installing dependencies for Colab environment")
# !pip install -q -Uq grpcio==1.26.0

import tensorflow as tf

# print('Installing Apache Beam')
# !pip install -q -Uq apache_beam==2.16.0
import apache_beam as beam

# print('Installing TensorFlow Transform')
# !pip install -q -Uq tensorflow-transform==0.15.0
import tensorflow_transform as tft

import apache_beam.io.iobase
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

In [3]:
# Three in-memory example rows; each row carries two numeric columns ('x', 'y')
# and one string column ('s').
raw_data = [
    dict(x=1, y=1, s='hello'),
    dict(x=2, y=2, s='world'),
    dict(x=3, y=3, s='hello'),
]

In [7]:
# Feature spec for the raw rows: 'x' and 'y' are parsed as scalar float32,
# 's' as a scalar string.
_raw_feature_spec = {
    'y': tf.io.FixedLenFeature([], tf.float32),
    'x': tf.io.FixedLenFeature([], tf.float32),
    's': tf.io.FixedLenFeature([], tf.string),
}

# Wrap the schema derived from the feature spec in the metadata object that is
# passed to the transform pipeline alongside the raw data.
raw_data_metadata = dataset_metadata.DatasetMetadata(
    tft.tf_metadata.schema_utils.schema_from_feature_spec(_raw_feature_spec))

In [8]:
def preprocessing_fn(inputs):
    """Build the transformed feature columns from the raw input columns.

    Args:
        inputs: Mapping with the raw columns under the keys 'x', 'y' and 's'.

    Returns:
        A dict with four entries:
          * 'x_centered': 'x' minus ``tft.mean`` of 'x'.
          * 'y_normalized': 'y' passed through ``tft.scale_to_0_1``.
          * 's_integerized': 's' passed through
            ``tft.compute_and_apply_vocabulary``.
          * 'x_centered_times_y_normalized': product of the first two.
    """
    # Look up all three columns first so a missing key fails before any
    # tf.Transform analyzer is invoked.
    raw_x, raw_y, raw_s = inputs['x'], inputs['y'], inputs['s']

    centered = raw_x - tft.mean(raw_x)
    scaled = tft.scale_to_0_1(raw_y)
    vocab_ids = tft.compute_and_apply_vocabulary(raw_s)

    outputs = {}
    outputs['x_centered'] = centered
    outputs['y_normalized'] = scaled
    outputs['s_integerized'] = vocab_ids
    outputs['x_centered_times_y_normalized'] = centered * scaled
    return outputs

In [22]:
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

In [59]:
def main():
  """Analyze and transform `raw_data`, persist the transform, and print the results.

  Runs `preprocessing_fn` over `(raw_data, raw_data_metadata)` with
  `tft_beam.AnalyzeAndTransformDataset`, writes the resulting transform function
  with `transform_fn_io.WriteTransformFn`, then prints the raw rows, the
  transformed rows and the Python type of the first row's 's_integerized' value.

  Returns:
    The path of the freshly created temporary directory that was used both as
    the Beam context's temp dir and as the destination for the written
    transform artefacts. The directory is intentionally NOT deleted so that it
    can be inspected afterwards; the caller is responsible for removing it.
  """
  # mkdtemp() picks a new random name on every run, so the path is reported and
  # returned below instead of having to be recovered from log lines.
  tmp_dir = tempfile.mkdtemp()
  # The pipeline emits a lot of log noise while it runs (see the cell output);
  # those messages can be ignored.
  with tft_beam.Context(temp_dir=tmp_dir):
    transformed_dataset, transform_fn = (
        (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
            preprocessing_fn))

    # Persist the transform function under tmp_dir.
    transform_fn | 'Write Transform Artefacts' >> transform_fn_io.WriteTransformFn(tmp_dir)

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  print('\nRaw data:\n{}\n'.format(pprint.pformat(raw_data)))
  print('Transformed data:\n{}'.format(pprint.pformat(transformed_data)))
  print(type(transformed_data[0]['s_integerized']))
  print('\nTransform artefacts written to: {}'.format(tmp_dir))
  return tmp_dir

# Run the demo when this cell/script is executed directly (not when imported).
if __name__ == '__main__':
  main()









INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


INFO:tensorflow:SavedModel written to: /tmp/tmpgbmbt3c1/tftransform_tmp/0336adc6ec9b420497756deb6ac44b1a/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpgbmbt3c1/tftransform_tmp/0336adc6ec9b420497756deb6ac44b1a/saved_model.pb


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


INFO:tensorflow:SavedModel written to: /tmp/tmpgbmbt3c1/tftransform_tmp/227f4b719b3b466eb0242344105ca280/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpgbmbt3c1/tftransform_tmp/227f4b719b3b466eb0242344105ca280/saved_model.pb






INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets written to: /tmp/tmpgbmbt3c1/tftransform_tmp/72d750f5721b41a0be03f07a01b542c2/assets


INFO:tensorflow:Assets written to: /tmp/tmpgbmbt3c1/tftransform_tmp/72d750f5721b41a0be03f07a01b542c2/assets


INFO:tensorflow:SavedModel written to: /tmp/tmpgbmbt3c1/tftransform_tmp/72d750f5721b41a0be03f07a01b542c2/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpgbmbt3c1/tftransform_tmp/72d750f5721b41a0be03f07a01b542c2/saved_model.pb


value: "\n\013\n\tConst_5:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_5:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


value: "\n\013\n\tConst_5:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_5:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore



Raw data:
[{'s': 'hello', 'x': 1, 'y': 1},
 {'s': 'world', 'x': 2, 'y': 2},
 {'s': 'hello', 'x': 3, 'y': 3}]

Transformed data:
[{'s_integerized': 0,
  'x_centered': -1.0,
  'x_centered_times_y_normalized': -0.0,
  'y_normalized': 0.0},
 {'s_integerized': 1,
  'x_centered': 0.0,
  'x_centered_times_y_normalized': 0.0,
  'y_normalized': 0.5},
 {'s_integerized': 0,
  'x_centered': 1.0,
  'x_centered_times_y_normalized': 1.0,
  'y_normalized': 1.0}]
<class 'numpy.int64'>


In [30]:
# Show the vocabulary asset file written during the transform run.
# NOTE(review): this path is hard-coded to one particular run; main() creates a
# new random temp directory each time, so substitute the path from that run's log.
%cat /tmp/tmp9j56nfm6/tftransform_tmp/b199519c7ef04ca795796fcdcf1d8430/assets/vocab_compute_and_apply_vocabulary_vocabulary

hello
world


In [36]:
%ls /tmp/tmp9j56nfm6/tran

[0m[01;34mtftransform_tmp[0m/  [01;34mtransform_fn[0m/  [01;34mtransformed_metadata[0m/


In [39]:
# List the contents of the written transform_fn directory.
# NOTE(review): hard-coded, run-specific temp path; substitute the directory of the run being inspected.
%ls /tmp/tmp9j56nfm6/transform_fn/

[0m[01;34massets[0m/  saved_model.pb  [01;34mvariables[0m/


In [40]:
# List the contents of the written transformed_metadata directory.
# NOTE(review): hard-coded, run-specific temp path; substitute the directory of the run being inspected.
%ls /tmp/tmp9j56nfm6/transformed_metadata

schema.pbtxt


In [42]:
# Print the schema of the transformed features (text-format protobuf).
# NOTE(review): hard-coded, run-specific temp path; substitute the directory of the run being inspected.
%cat /tmp/tmp9j56nfm6/transformed_metadata/schema.pbtxt

feature {
  name: "s_integerized"
  type: INT
  int_domain {
    min: -1
    max: 1
    is_categorical: true
  }
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "x_centered"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "x_centered_times_y_normalized"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "y_normalized"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
