In [1]:
import tempfile
import math
from io import BytesIO
from google.cloud import storage
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow_transform import coders
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
import tensorflow_transform.beam as tft_beam
import tensorflow_transform as tft
import apache_beam as beam
from sklearn import model_selection

In [2]:
# Download train.csv from the 'ames-house-dataset' GCS bucket into memory
# and parse the raw bytes into a pandas DataFrame.
client = storage.Client()
bucket = client.get_bucket('ames-house-dataset')
# Blob handle created through the bucket's factory method.
blob = bucket.blob('train.csv')
content = blob.download_as_string()
data = pd.read_csv(BytesIO(content))

In [3]:
# Split `data` into `train` and `test`, holding out 20% of the rows
# (test_size=0.2) with a fixed random_state of 42.
train, test = model_selection.train_test_split(data, test_size=0.2, random_state=42)

In [4]:
# Column names handed to CsvCoder together with the schema (see CONV_INPUT).
# Note: 'Id' is listed here but appears in none of the feature lists below.
INPUT_COLS = ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
              'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
              'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
              'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
              'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
              'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation',
              'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
              'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
              'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
              'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
              'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
              'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
              'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
              'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
              '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature',
              'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice']


# String columns declared as tf.io.VarLenFeature(tf.string) in
# RAW_DATA_FEATURE_SPEC.
OPT_CAT_FEATURES = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                    'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish',
                    'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
                    'MiscFeature']


# Numeric columns declared as tf.io.VarLenFeature(tf.float32) in
# RAW_DATA_FEATURE_SPEC.
OPT_NUM_FEATURES = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


# String columns declared as scalar tf.io.FixedLenFeature([], tf.string) in
# RAW_DATA_FEATURE_SPEC.
CAT_FEATURES = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
                'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
                'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
                'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual',
                'ExterCond', 'Foundation', 'Heating', 'HeatingQC',
                'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive',
                'SaleType', 'SaleCondition']


# Numeric columns declared as scalar tf.io.FixedLenFeature([], tf.float32) in
# RAW_DATA_FEATURE_SPEC.
NUM_FEATURES = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
                'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2',
                'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
                'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
                'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
                'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea',
                'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


# Label column; added to RAW_DATA_FEATURE_SPEC as a scalar float32 feature.
TARGET = 'SalePrice'


# Feature spec for one raw CSV row, keyed by column name. Groups are added in
# the order: required categorical, required numeric, optional categorical,
# optional numeric, then the target.
RAW_DATA_FEATURE_SPEC = {}
RAW_DATA_FEATURE_SPEC.update(
    {name: tf.io.FixedLenFeature([], tf.string) for name in CAT_FEATURES})
RAW_DATA_FEATURE_SPEC.update(
    {name: tf.io.FixedLenFeature([], tf.float32) for name in NUM_FEATURES})
RAW_DATA_FEATURE_SPEC.update(
    {name: tf.io.VarLenFeature(tf.string) for name in OPT_CAT_FEATURES})
RAW_DATA_FEATURE_SPEC.update(
    {name: tf.io.VarLenFeature(tf.float32) for name in OPT_NUM_FEATURES})
RAW_DATA_FEATURE_SPEC[TARGET] = tf.io.FixedLenFeature([], tf.float32)

# Schema derived from the spec; the metadata wraps its own, separately built
# schema object.
INPUT_SCHEMA = schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC))
# CSV coder built from the column names in INPUT_COLS and the schema above.
CONV_INPUT = coders.CsvCoder(INPUT_COLS, INPUT_SCHEMA)

In [5]:
class MapAndFilterErrors(beam.PTransform):
    """Apply `fn` to every element, dropping elements for which `fn` raises.

    Behaves like `beam.Map(fn)` except that an exception raised by `fn` does
    not propagate: the offending element is skipped and a Beam counter is
    incremented instead.
    """

    class _MapAndFilterErrorsDoFn(beam.DoFn):
        """DoFn that maps elements with `fn` and counts the failures."""

        def __init__(self, fn):
            self._fn = fn
            # Beam metric ('dataset' namespace, 'bad_elements' name) that
            # records how many elements were dropped.
            self._error_counter = beam.metrics.Metrics.counter('dataset', 'bad_elements')

        def process(self, element):
            """Yield `fn(element)`, or yield nothing and count the failure."""
            try:
                yield self._fn(element)
            except Exception:  # pylint: disable=broad-except
                # Deliberately broad: any failure marks the element as bad.
                self._error_counter.inc(1)

    def __init__(self, fn):
        self._fn = fn

    def expand(self, pcoll):
        """Run the error-filtering DoFn over `pcoll` via `beam.ParDo`."""
        error_filtering_dofn = self._MapAndFilterErrorsDoFn(self._fn)
        return pcoll | beam.ParDo(error_filtering_dofn)

In [6]:
def preprocessing_fn(inputs):
    """Build the transformed feature dict from the raw `inputs` dict.

    A shallow copy of `inputs` is returned in which:
      * '1stFlrSF', 'GrLivArea' and 'LotArea' are replaced by `tf.math.log`
        of the original value;
      * 'Neighborhood' is replaced by the result of
        `tft.compute_and_apply_vocabulary` on the original value.
    Every other entry is passed through unchanged, and `inputs` itself is
    not modified.
    """
    log_scaled = ('1stFlrSF', 'GrLivArea', 'LotArea')
    vocab_encoded = ('Neighborhood',)

    outputs = inputs.copy()
    outputs.update({name: tf.math.log(inputs[name]) for name in log_scaled})
    outputs.update(
        {name: tft.compute_and_apply_vocabulary(inputs[name]) for name in vocab_encoded})
    return outputs

In [7]:
with beam.Pipeline() as p:
    # The tf.Transform context is given a scratch directory created by
    # tempfile.mkdtemp(); that directory is not removed in this cell.
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        converter = tft.coders.CsvCoder(INPUT_COLS, RAW_DATA_METADATA.schema)

        # NOTE(review): this reads the full train.csv from GCS rather than the
        # `train` split created earlier in the notebook - confirm intended.
        read_csv = beam.io.ReadFromText('gs://ames-house-dataset/train.csv', skip_header_lines=1)
        # Rows for which converter.decode raises are dropped and counted.
        decode_rows = MapAndFilterErrors(converter.decode)
        lines = p | 'read' >> read_csv | 'decode' >> decode_rows

        # Pair the decoded rows with their metadata, then analyze and
        # transform them with preprocessing_fn.
        dataset = (lines, RAW_DATA_METADATA)
        analyze_and_transform = tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
        transformed_dataset, transform_fn = dataset | analyze_and_transform
        transformed_data, transformed_metadata = transformed_dataset









Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpvreydj9s/tftransform_tmp/5cfebb4e7d344a7ca4711ed0ab1f20aa/saved_model.pb


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpvreydj9s/tftransform_tmp/5cfebb4e7d344a7ca4711ed0ab1f20aa/saved_model.pb


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpvreydj9s/tftransform_tmp/53d26000af4344deba41c07709d675ba/saved_model.pb


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpvreydj9s/tftransform_tmp/53d26000af4344deba41c07709d675ba/saved_model.pb






INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpvreydj9s/tftransform_tmp/2a501345a9894955820af50033932eb0/assets


INFO:tensorflow:Assets written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpvreydj9s/tftransform_tmp/2a501345a9894955820af50033932eb0/assets


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpvreydj9s/tftransform_tmp/2a501345a9894955820af50033932eb0/saved_model.pb


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpvreydj9s/tftransform_tmp/2a501345a9894955820af50033932eb0/saved_model.pb


value: "\n\013\n\tConst_1:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_1:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


value: "\n\013\n\tConst_1:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_1:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore
