In [63]:
import tempfile
import math
from io import BytesIO
from google.cloud import storage
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
import tensorflow_transform.beam as tft_beam
import tensorflow_transform as tft
import apache_beam as beam

In [2]:
# Pull the training CSV out of Google Cloud Storage and parse it with pandas.
# Client() is constructed without arguments, so credentials/project come from
# the environment.
client = storage.Client()
bucket = client.get_bucket('ames-house-dataset')
blob = bucket.blob('train.csv')

# The object is fetched fully into memory as bytes before parsing.
content = blob.download_as_string()

# index_col=0: the first CSV column becomes the DataFrame index.
with BytesIO(content) as csv_buffer:
    data = pd.read_csv(csv_buffer, index_col=0)

In [170]:
# Columns declared as scalar strings in the raw feature spec.
CAT_FEATURES = [
    'GarageFinish', 'Neighborhood', 'BsmtQual', 'KitchenQual',
    'GarageQual', 'CentralAir', 'GarageCond', 'LandContour',
    'BsmtCond', 'GarageType', 'MSZoning',
]

# Columns declared as scalar float32 values in the raw feature spec.
NUM_FEATURES = [
    'OverallQual', 'GarageArea', 'Fireplaces', '2ndFlrSF',
    'TotRmsAbvGrd', 'GarageCars', 'BsmtFinSF1', 'TotalBsmtSF',
    'GarageYrBlt', '1stFlrSF', 'GrLivArea', 'LotArea',
]

# Name of the label column.
TARGET = 'SalePrice'

# Raw feature spec, in insertion order: categorical columns, then numeric
# columns, then the target.
# NOTE(review): the target is the only VarLenFeature -- presumably so the label
# may be absent from some records; confirm this is intentional.
RAW_DATA_FEATURE_SPEC = {
    **{name: tf.io.FixedLenFeature([], tf.string) for name in CAT_FEATURES},
    **{name: tf.io.FixedLenFeature([], tf.float32) for name in NUM_FEATURES},
    TARGET: tf.io.VarLenFeature(tf.float32),
}

In [171]:
# Convert the feature spec into a schema and wrap it in DatasetMetadata; this
# object is paired with the raw records when they are handed to tf.Transform.
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)
)

In [172]:
# Display the assembled feature spec to check the type declared for each column.
RAW_DATA_FEATURE_SPEC

{'GarageFinish': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'Neighborhood': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'BsmtQual': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'KitchenQual': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'GarageQual': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'CentralAir': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'GarageCond': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'LandContour': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'BsmtCond': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'GarageType': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'MSZoning': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'OverallQual': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'GarageArea': FixedLenFeature(shape=[], dtype=tf.

In [255]:
def preprocessing_fn(inputs):
    """Log-transform the three area features for tf.Transform.

    Args:
        inputs: Mapping from raw feature name to its tensor. Only
            '1stFlrSF', 'GrLivArea' and 'LotArea' are read; the mapping
            itself is not modified.

    Returns:
        A dict with exactly those three keys, each holding the natural
        logarithm (``tf.math.log``) of the corresponding input. Every other
        input feature is dropped from the output.
    """
    # Single source of truth for the transformed columns, so the keys that are
    # logged and the keys that are returned cannot drift apart.
    log_features = ('1stFlrSF', 'GrLivArea', 'LotArea')

    # NOTE(review): log() gives -inf for 0 and NaN for negative values; this
    # assumes all three columns are strictly positive -- confirm against the data.
    return {key: tf.math.log(inputs[key]) for key in log_features}

In [256]:
# Run tf.Transform's analyze + transform phases over the in-memory records.
# NOTE(review): `pipeline` is never referenced inside this cell -- the records
# are passed as a plain Python list -- so confirm whether the outer
# beam.Pipeline() is actually needed.
with beam.Pipeline() as pipeline:
    # tf.Transform writes its intermediate artifacts under temp_dir. The
    # directory created by mkdtemp() is not removed by this cell.
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        # Raw dataset = (list of per-row dicts, metadata describing them).
        raw_dataset = (data.to_dict(orient='records'), RAW_DATA_METADATA)
        analyze_and_transform = tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)

        transformed_dataset, transform_fn = raw_dataset | analyze_and_transform

        # The transformed dataset is itself a (records, metadata) pair.
        transformed_data, transformed_metadata = transformed_dataset









INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpeht2wkwg/tftransform_tmp/a981f8deaef74708b459673b8164e77b/saved_model.pb


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpeht2wkwg/tftransform_tmp/a981f8deaef74708b459673b8164e77b/saved_model.pb






INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [257]:
# Show the first five transformed records as a sanity check of preprocessing_fn's output.
transformed_data[:5]

[{'1stFlrSF': 6.75227, 'GrLivArea': 7.4442487, 'LotArea': 9.041922},
 {'1stFlrSF': 7.140453, 'GrLivArea': 7.140453, 'LotArea': 9.169518},
 {'1stFlrSF': 6.8243737, 'GrLivArea': 7.487734, 'LotArea': 9.328123},
 {'1stFlrSF': 6.8679743, 'GrLivArea': 7.4483337, 'LotArea': 9.164296},
 {'1stFlrSF': 7.04316, 'GrLivArea': 7.695303, 'LotArea': 9.565214}]