In [1]:
import tempfile
import math
from io import BytesIO
from google.cloud import storage
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
import tensorflow_transform.beam as tft_beam
import tensorflow_transform as tft
import apache_beam as beam
from sklearn import model_selection

In [2]:
# Download the Ames training CSV from Google Cloud Storage and parse it,
# using the first CSV column as the DataFrame index.
# NOTE(review): newer google-cloud-storage releases deprecate
# `download_as_string` in favour of `download_as_bytes` -- confirm the
# installed client version before switching.
client = storage.Client()
bucket = client.get_bucket('ames-house-dataset')
blob = bucket.blob('train.csv')
content = blob.download_as_string()
data = pd.read_csv(BytesIO(content), index_col=0)

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
train, test = model_selection.train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [4]:
# String-valued input columns.
CAT_FEATURES = [
    'GarageFinish', 'Neighborhood', 'BsmtQual',
    'KitchenQual', 'GarageQual', 'CentralAir',
    'GarageCond', 'LandContour', 'BsmtCond',
    'GarageType', 'MSZoning',
]

# Float-valued input columns.
NUM_FEATURES = [
    'OverallQual', 'GarageArea', 'Fireplaces',
    '2ndFlrSF', 'TotRmsAbvGrd', 'GarageCars',
    'BsmtFinSF1', 'TotalBsmtSF', 'GarageYrBlt',
    '1stFlrSF', 'GrLivArea', 'LotArea',
]

# Subsets of the lists above.
# NOTE(review): presumably the columns that may be missing in the raw data;
# neither OPT_* list is referenced by any cell visible here -- confirm whether
# they were meant to drive the feature spec below.
OPT_CAT_FEATURES = [
    'GarageFinish', 'GarageQual', 'GarageCond',
    'GarageType', 'BsmtQual', 'BsmtCond',
]

OPT_NUM_FEATURES = ['GarageYrBlt']

# Label column.
TARGET = 'SalePrice'

# Raw feature spec: scalar string features for categoricals, scalar float32
# features for numerics, and a variable-length float32 feature for the target.
# Key order is categoricals, then numerics, then the target.
# NOTE(review): the target is the only VarLenFeature while the OPT_* columns
# are FixedLenFeature -- confirm this is the intended assignment.
RAW_DATA_FEATURE_SPEC = {
    **{name: tf.io.FixedLenFeature([], tf.string) for name in CAT_FEATURES},
    **{name: tf.io.FixedLenFeature([], tf.float32) for name in NUM_FEATURES},
    TARGET: tf.io.VarLenFeature(tf.float32),
}

In [5]:
# Convert the raw feature spec into a schema and wrap it in the metadata
# object that is paired with the raw records when running tf.Transform.
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)
)

In [6]:
# Display the raw feature spec to check the feature kind and dtype per column.
RAW_DATA_FEATURE_SPEC

{'GarageFinish': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'Neighborhood': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'BsmtQual': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'KitchenQual': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'GarageQual': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'CentralAir': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'GarageCond': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'LandContour': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'BsmtCond': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'GarageType': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'MSZoning': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'OverallQual': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'GarageArea': FixedLenFeature(shape=[], dtype=tf.

In [7]:
def preprocessing_fn(inputs):
    """Build the transformed features for tf.Transform.

    Applies ``tf.math.log`` to the three area columns and replaces
    ``Neighborhood`` with the result of
    ``tft.compute_and_apply_vocabulary``.

    Args:
        inputs: dict mapping raw feature names to tensors; must contain
            '1stFlrSF', 'GrLivArea', 'LotArea' and 'Neighborhood'.

    Returns:
        A new dict holding exactly those four keys with their transformed
        values. All other input features are dropped, and ``inputs`` itself
        is left unmodified.
    """
    log_scaled_keys = ('1stFlrSF', 'GrLivArea', 'LotArea')

    # Log-transform the area columns first, then build the vocabulary lookup.
    transformed = {name: tf.math.log(inputs[name]) for name in log_scaled_keys}
    transformed['Neighborhood'] = tft.compute_and_apply_vocabulary(
        inputs['Neighborhood'])
    return transformed

In [8]:
# Analyze the training records and apply `preprocessing_fn` to them.
# NOTE(review): `pipeline` is never referenced inside the block and the input
# is an in-memory list of record dicts -- confirm whether the surrounding
# beam.Pipeline context is actually needed.
# NOTE(review): the directory created by `tempfile.mkdtemp()` is not removed
# anywhere in the visible cells.
with beam.Pipeline() as pipeline, tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    raw_train_dataset = (train.to_dict(orient='records'), RAW_DATA_METADATA)
    transformed_dataset, transform_fn = (
        raw_train_dataset
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
    )
    # The transformed dataset is a (records, metadata) pair.
    transformed_train, transformed_metadata = transformed_dataset









Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpt1yjgg92/tftransform_tmp/250e557d20e7489bad54f1b972f3ad46/saved_model.pb


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpt1yjgg92/tftransform_tmp/250e557d20e7489bad54f1b972f3ad46/saved_model.pb


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpt1yjgg92/tftransform_tmp/9aa1bbe5bf79442ba749067b7e8ab96d/saved_model.pb


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpt1yjgg92/tftransform_tmp/9aa1bbe5bf79442ba749067b7e8ab96d/saved_model.pb






INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpt1yjgg92/tftransform_tmp/8acaf3622d4e44b7b55408486576803e/assets


INFO:tensorflow:Assets written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpt1yjgg92/tftransform_tmp/8acaf3622d4e44b7b55408486576803e/assets


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpt1yjgg92/tftransform_tmp/8acaf3622d4e44b7b55408486576803e/saved_model.pb


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpt1yjgg92/tftransform_tmp/8acaf3622d4e44b7b55408486576803e/saved_model.pb


value: "\n\013\n\tConst_1:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_1:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


value: "\n\013\n\tConst_1:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_1:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [9]:
# Peek at the first five transformed training records.
transformed_train[:5]

[{'1stFlrSF': 7.1808314,
  'GrLivArea': 7.1808314,
  'LotArea': 9.035987,
  'Neighborhood': 0},
 {'1stFlrSF': 6.683361,
  'GrLivArea': 7.3594675,
  'LotArea': 8.966612,
  'Neighborhood': 6},
 {'1stFlrSF': 6.6795993,
  'GrLivArea': 6.6795993,
  'LotArea': 9.07989,
  'Neighborhood': 3},
 {'1stFlrSF': 6.8885727,
  'GrLivArea': 7.4776044,
  'LotArea': 8.881836,
  'Neighborhood': 16},
 {'1stFlrSF': 6.933423,
  'GrLivArea': 7.4330754,
  'LotArea': 8.517193,
  'Neighborhood': 16}]