In [63]:
import tempfile
import math
from io import BytesIO
from google.cloud import storage
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
import tensorflow_transform.beam as tft_beam
import tensorflow_transform as tft
import apache_beam as beam

In [2]:
# Fetch the training CSV from Cloud Storage and parse it in memory,
# without writing an intermediate file to local disk.
client = storage.Client()
bucket = client.get_bucket('ames-house-dataset')
blob = bucket.blob('train.csv')
content = blob.download_as_string()

# The first CSV column is used as the DataFrame index.
data = pd.read_csv(BytesIO(content), index_col=0)

In [3]:
# Column groups used to build the raw-data parsing spec.
CAT_FEATURES = ['OverallQual']
NUM_FEATURES = ['LotArea', 'LotFrontage', '1stFlrSF', 'GrLivArea']
TARGET = 'SalePrice'

# Maps every raw column to the rule used to parse it:
#   * categorical columns -> scalar strings
#   * numeric columns     -> scalar float32
#   * target              -> variable-length float32
# NOTE(review): the target is the only column declared variable-length while
# all other columns are fixed scalars -- confirm this is intentional.
RAW_DATA_FEATURE_SPEC = {
    **{name: tf.io.FixedLenFeature([], tf.string) for name in CAT_FEATURES},
    **{name: tf.io.FixedLenFeature([], tf.float32) for name in NUM_FEATURES},
    TARGET: tf.io.VarLenFeature(tf.float32),
}

In [5]:
# Convert the feature spec into a schema, then wrap it in the metadata object
# that is passed to the transform pipeline together with the raw rows.
raw_data_schema = schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(raw_data_schema)

In [7]:
# Display the assembled feature spec to check each column's parsing rule.
RAW_DATA_FEATURE_SPEC

{'OverallQual': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'LotArea': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'LotFrontage': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 '1stFlrSF': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'GrLivArea': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'SalePrice': VarLenFeature(dtype=tf.float32)}

In [134]:
def preprocessing_fn(inputs):
    """Add one to the ``LotArea`` feature and drop every other column.

    Args:
        inputs: Mapping from raw feature name to its value. Only the
            ``'LotArea'`` entry is read, and the mapping is not modified.
            Presumably the values are batched tensors when tf.Transform
            calls this function -- verify against the caller.

    Returns:
        A dict with the single key ``'LotArea'`` holding
        ``inputs['LotArea'] + 1``.

    Raises:
        KeyError: If ``inputs`` has no ``'LotArea'`` entry.
    """
    # `tft.apply_function(fn, *args)` is deprecated and equivalent to
    # `fn(*args)`, so the increment is applied directly to the feature.
    return {'LotArea': inputs['LotArea'] + 1}

In [135]:
# Three hand-written rows for exercising the transform pipeline. Only the
# quality label, lot area and sale price vary; the other columns are constant.
temp_data = [
    {
        'OverallQual': quality,
        'LotArea': lot_area,
        'LotFrontage': 5,
        '1stFlrSF': 0,
        'GrLivArea': 20,
        'SalePrice': sale_price,
    }
    for quality, lot_area, sale_price in (
        ('A', 10, 10000),
        ('B', 50, 20000),
        ('C', 20, 30000),
    )
]

In [136]:
# Analyze the sample rows and apply `preprocessing_fn` to them in one pass.
# The tf.Transform context is given a freshly created temporary directory.
# NOTE(review): `pipeline` is never referenced inside the block -- confirm
# whether the explicit Beam pipeline is needed here.
with beam.Pipeline() as pipeline, tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    raw_dataset = (temp_data, RAW_DATA_METADATA)
    analyze_and_transform = tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
    transformed_dataset, transform_fn = raw_dataset | analyze_and_transform
    # The transformed dataset is itself a (data, metadata) pair.
    transformed_data, transformed_metadata = transformed_dataset









Instructions for updating:
apply_function is no longer needed.  `apply_function(fn, *args)` is now equvalent to `fn(*args)`


Instructions for updating:
apply_function is no longer needed.  `apply_function(fn, *args)` is now equvalent to `fn(*args)`


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpmndxvub4/tftransform_tmp/16abc163a5da4d91a23d69a6ae661667/saved_model.pb


INFO:tensorflow:SavedModel written to: /var/folders/kv/kjpm8x653fz2vnzbl7q4bwhmbf_6tj/T/tmpmndxvub4/tftransform_tmp/16abc163a5da4d91a23d69a6ae661667/saved_model.pb






INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [137]:
# Inspect the rows produced by the transform step.
transformed_data

[{'LotArea': 11.0}, {'LotArea': 51.0}, {'LotArea': 21.0}]