In [27]:
import tempfile
import math
from io import BytesIO
from google.cloud import storage
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow_transform import coders
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
import tensorflow_transform.beam as tft_beam
import tensorflow_transform as tft
import apache_beam as beam
from sklearn import model_selection

In [3]:
# Download train.csv from the 'ames-house-dataset' GCS bucket and parse the
# bytes into a DataFrame, using the first CSV column as the index.
client = storage.Client()
bucket = client.get_bucket('ames-house-dataset')
blob = storage.Blob(name='train.csv', bucket=bucket)
content = blob.download_as_string()
data = pd.read_csv(
    BytesIO(content),
    index_col=0,
)

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
train, test = model_selection.train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Column names handed to the CSV coder (CONV_INPUT) to label the fields of
# each parsed row. The list is the union of the five feature groups below
# plus TARGET.
# NOTE(review): the pandas load uses index_col=0, which suggests the CSV has a
# leading index column that is not listed here — confirm before decoding raw
# CSV lines with these names.
INPUT_COLS = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
              'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
              'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
              'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
              'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
              'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
              'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
              'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
              'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
              'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
              '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
              'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
              'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
              'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
              'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
              'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF',
              'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
              'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',
              'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice']


# String features declared as tf.io.VarLenFeature in RAW_DATA_FEATURE_SPEC
# (presumably because values may be missing — confirm against the data).
OPT_CAT_FEATURES = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                    'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish',
                    'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
                    'MiscFeature']


# float32 features declared as tf.io.VarLenFeature in RAW_DATA_FEATURE_SPEC
# (presumably because values may be missing — confirm against the data).
OPT_NUM_FEATURES = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


# String features declared as scalar tf.io.FixedLenFeature in
# RAW_DATA_FEATURE_SPEC.
CAT_FEATURES = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
                'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
                'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
                'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual',
                'ExterCond', 'Foundation', 'Heating', 'HeatingQC',
                'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive',
                'SaleType', 'SaleCondition']


# float32 features declared as scalar tf.io.FixedLenFeature in
# RAW_DATA_FEATURE_SPEC.
NUM_FEATURES = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
                'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2',
                'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
                'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
                'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
                'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea',
                'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


# Name of the target column; declared as a scalar float32 FixedLenFeature in
# RAW_DATA_FEATURE_SPEC.
TARGET = 'SalePrice'


# Feature spec for the raw rows, keyed by column name. Keys are inserted in
# the order: CAT_FEATURES, NUM_FEATURES, OPT_CAT_FEATURES, OPT_NUM_FEATURES,
# then TARGET. The OPT_* groups are variable-length; all others are scalars.
RAW_DATA_FEATURE_SPEC = {
    **{col: tf.io.FixedLenFeature([], tf.string) for col in CAT_FEATURES},
    **{col: tf.io.FixedLenFeature([], tf.float32) for col in NUM_FEATURES},
    **{col: tf.io.VarLenFeature(tf.string) for col in OPT_CAT_FEATURES},
    **{col: tf.io.VarLenFeature(tf.float32) for col in OPT_NUM_FEATURES},
    TARGET: tf.io.FixedLenFeature([], tf.float32),
}

In [19]:
# Wrap the schema derived from the raw feature spec in a DatasetMetadata
# object.
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)
)

In [25]:
# Schema derived from the raw feature spec; passed to the CSV coder below.
INPUT_SCHEMA = schema_utils.schema_from_feature_spec(
    RAW_DATA_FEATURE_SPEC
)

In [28]:
# CSV coder built from the column names (INPUT_COLS) and the raw input schema;
# its `decode` method is what the Beam pipeline maps over each text line.
CONV_INPUT = coders.CsvCoder(
    INPUT_COLS,
    INPUT_SCHEMA,
)

In [8]:
def preprocessing_fn(inputs):
    """Build the transformed features from the raw input columns.

    Args:
        inputs: mapping from raw column name to its value; must contain
            '1stFlrSF', 'GrLivArea', 'LotArea' and 'Neighborhood'.

    Returns:
        A dict with exactly four keys: the natural log of '1stFlrSF',
        'GrLivArea' and 'LotArea', and 'Neighborhood' passed through
        `tft.compute_and_apply_vocabulary`. `inputs` is not modified.
    """
    # Log-scale the three area columns (same order as the returned keys).
    transformed = {
        column: tf.math.log(inputs[column])
        for column in ('1stFlrSF', 'GrLivArea', 'LotArea')
    }

    # Replace the neighborhood value with its vocabulary lookup result.
    transformed['Neighborhood'] = tft.compute_and_apply_vocabulary(
        inputs['Neighborhood'])

    return transformed

In [21]:
# with beam.Pipeline() as pipeline:
#     with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
#         transformed_dataset, transform_fn = ((train.to_dict(orient='records'), RAW_DATA_METADATA) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
#         transformed_train, transformed_metadata = transformed_dataset

In [34]:
# Bind the pipeline to a name: the transform chain below starts from
# `pipeline`, which a bare `with beam.Pipeline():` never defines (NameError on
# a fresh kernel).
with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        # Read the raw CSV lines (header row skipped) and decode each line
        # with the CSV coder.
        # NOTE(review): the pandas load of the same file uses index_col=0; if
        # the CSV has a leading index column, it is not listed in INPUT_COLS
        # and CONV_INPUT.decode may reject the rows — confirm.
        raw_data_input = (
            pipeline
            | 'ReadInputData' >> beam.io.ReadFromText('gs://ames-house-dataset/train.csv', skip_header_lines=1)
            | 'ParseInputCSV' >> beam.Map(CONV_INPUT.decode))