![ga4](https://www.google-analytics.com/collect?v=2&tid=G-6VDTYWLKX6&cid=1&en=page_view&sid=1&dl=statmike%2Fvertex-ai-mlops%2FDev%2Fnew&dt=Autoencoders+-+Data.ipynb)

# Autoencoders - Data

How to retrieve data for training, and using, an autoencoder.



---
## Colab Setup

To run this notebook in Colab click [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/statmike/vertex-ai-mlops/blob/main/Applied%20Autoencoders/Autoencoders%20-%20Data.ipynb) and run the cells in this section.  Otherwise, skip this section.

This cell will authenticate to GCP (follow prompts in the popup).

In [43]:
PROJECT_ID = 'statmike-mlops-349915' # replace with project ID

In [44]:
try:
    import google.colab
    from google.colab import auth
    auth.authenticate_user()
    !gcloud config set project {PROJECT_ID}
except Exception:
    pass

---
## Installs

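The list `packages` contains tuples of package import names and install names, with an optional third element giving a minimum version.  If the import name is not found then the install name is used to install quietly for the current user; if a minimum version is given and the installed version is older, the package is upgraded the same way.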

In [45]:
# tuples of (import name, install name)
packages = [
    ('google.cloud.bigquery', 'google-cloud-bigquery'),
    ('google.cloud.bigquery_storage', 'google-cloud-bigquery-storage'),
    ('bigframes', 'bigframes'),
    ('pandas_gbq', 'pandas-gbq'),
    ('tensorflow', 'tensorflow', '2.10'),
    ('tensorflow_io', '--no-deps tensorflow-io'),
    ('graphviz', 'graphviz'),
    ('pydot', 'pydot')
]

import importlib
install = False
for package in packages:
    if not importlib.util.find_spec(package[0]):
        print(f'installing package {package[1]}')
        install = True
        !pip install {package[1]} -U -q --user
    elif len(package) == 3:
        if importlib.metadata.version(package[0]) < package[2]:
            print(f'updating package {package[1]}')
            install = True
            !pip install {package[1]} -U -q --user

In [46]:
#!sudo apt-get -qq install graphviz

### Restart Kernel (If Installs Occurred)

After a kernel restart the code submission can start with the next cell after this one.

In [47]:
# Nothing to do unless the install cell set `install` to True.
if install:
    import IPython
    # Shut the running kernel down; the True argument is passed straight to
    # do_shutdown (presumably "restart afterwards", matching this section's title).
    IPython.Application.instance().kernel.do_shutdown(True)

---
## Setup

inputs:

In [48]:
# Ask gcloud for the configured project; the `!` capture returns the command's output lines as a list
project = !gcloud config get-value project
# the first output line is used as the project ID
PROJECT_ID = project[0]
PROJECT_ID

'statmike-mlops-349915'

In [49]:
REGION = 'us-central1'
EXPERIMENT = 'data'
SERIES = 'applied-autoencoders'

# source data: the BigQuery table read by each retrieval method in this notebook
BQ_PROJECT = PROJECT_ID
BQ_DATASET = 'fraud'
BQ_TABLE = 'fraud_prepped'

# specify a GCS Bucket (here given the same name as the project ID)
GCS_BUCKET = PROJECT_ID

# Model Training
VAR_TARGET = 'Class' # target column, separated from the features before training
VAR_OMIT = 'transaction_id,splits' # columns excluded from the retrieved data; add more variables to the string with comma delimiters

packages:

In [72]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import concurrent.futures

import bigframes.pandas as bpd
import pandas as pd
import pandas_gbq
from google.cloud import bigquery
from google.cloud import bigquery_storage

from tensorflow.python.framework import dtypes
from tensorflow_io.bigquery import BigQueryClient
import tensorflow as tf

#from datetime import datetime

#from google.protobuf import json_format
#from google.protobuf.struct_pb2 import Value
#import json
#import numpy as np


clients:

In [51]:
# BigQuery client: used in this notebook to run SQL queries
bq = bigquery.Client(project = PROJECT_ID)
# BigQuery Storage read client: used to create read sessions and read their row streams
bqstorage = bigquery_storage.BigQueryReadClient()
# point BigFrames at the same project
bpd.options.bigquery.project = PROJECT_ID

---
## Methods to Retrieve Training Data From BigQuery

### Common Query

In [108]:
# Columns named in VAR_OMIT are dropped from the result.  Spaces are stripped so
# the comma-delimited string can sit directly inside EXCEPT(...).
omit_columns = VAR_OMIT.replace(' ', '')
source_table = f'{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}'

# Assembled line by line; the empty first and last entries reproduce the leading
# and trailing newline of the statement.
query = '\n'.join([
    '',
    f'SELECT * EXCEPT({omit_columns})',
    f'FROM `{source_table}`',
    "WHERE splits = 'TRAIN'",
    '',
])
print(query)


SELECT * EXCEPT(transaction_id,splits)
FROM `statmike-mlops-349915.fraud.fraud_prepped`
WHERE splits = 'TRAIN'



### BigQuery Cell Magic

https://cloud.google.com/python/docs/reference/bigquery/latest/magics

In [109]:
%%bigquery bq_data_magic
SELECT * EXCEPT(transaction_id,splits)
FROM `statmike-mlops-349915.fraud.fraud_prepped`
WHERE splits = 'TRAIN'

Query is running:   0%|          |

Downloading:   0%|          |

In [110]:
bq_data_magic.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,2812,-0.633403,0.963616,2.494946,2.099051,-0.404331,0.235862,-0.007932,0.211442,-0.209817,...,0.014676,0.016278,-0.061462,0.355196,-0.179086,-0.106947,-0.215039,0.050698,0.0,0
1,3150,1.313281,-0.257923,0.118463,-0.735557,-0.569308,-0.733577,-0.138659,-0.141641,1.708019,...,-0.082467,0.126066,-0.223157,-0.074977,0.92194,-0.528283,0.064476,0.013132,0.0,0
2,16676,1.15848,0.168947,0.536345,1.187908,-0.265547,-0.076325,-0.355844,0.144615,1.462346,...,0.016492,0.263518,-0.076711,-0.079402,0.502827,-0.270819,-0.004966,-0.003372,0.0,0
3,17701,-1.279231,-0.153303,3.29631,3.320441,1.139018,0.542343,-0.729928,-0.051774,0.922712,...,-0.409746,-0.342575,-0.493297,-0.017046,-0.107404,0.101164,-0.19794,-0.435654,0.0,0
4,28131,1.069507,-0.000362,1.448936,2.874498,-0.736266,0.831932,-0.762267,0.406772,0.626473,...,0.035393,0.444433,-0.085413,0.09909,0.506438,0.246418,0.057864,0.021133,0.0,0


In [111]:
type(bq_data_magic)

pandas.core.frame.DataFrame

In [112]:
bq_data_magic.shape

(228061, 31)

### BigQuery Python Client

https://cloud.google.com/python/docs/reference/bigquery/latest

In [113]:
# Submit the common query, then download the job's result as a pandas DataFrame
query_job = bq.query(query = query)
bq_data_client = query_job.to_dataframe()
bq_data_client.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,2812,-0.633403,0.963616,2.494946,2.099051,-0.404331,0.235862,-0.007932,0.211442,-0.209817,...,0.014676,0.016278,-0.061462,0.355196,-0.179086,-0.106947,-0.215039,0.050698,0.0,0
1,3150,1.313281,-0.257923,0.118463,-0.735557,-0.569308,-0.733577,-0.138659,-0.141641,1.708019,...,-0.082467,0.126066,-0.223157,-0.074977,0.92194,-0.528283,0.064476,0.013132,0.0,0
2,16676,1.15848,0.168947,0.536345,1.187908,-0.265547,-0.076325,-0.355844,0.144615,1.462346,...,0.016492,0.263518,-0.076711,-0.079402,0.502827,-0.270819,-0.004966,-0.003372,0.0,0
3,17701,-1.279231,-0.153303,3.29631,3.320441,1.139018,0.542343,-0.729928,-0.051774,0.922712,...,-0.409746,-0.342575,-0.493297,-0.017046,-0.107404,0.101164,-0.19794,-0.435654,0.0,0
4,28131,1.069507,-0.000362,1.448936,2.874498,-0.736266,0.831932,-0.762267,0.406772,0.626473,...,0.035393,0.444433,-0.085413,0.09909,0.506438,0.246418,0.057864,0.021133,0.0,0


In [114]:
type(bq_data_client)

pandas.core.frame.DataFrame

In [115]:
bq_data_client.shape

(228061, 31)

### BigQuery BigFrames Client

https://cloud.google.com/python/docs/reference/bigframes/latest

In [116]:
# Run the common query through BigFrames; the result is a bigframes DataFrame
# (not a pandas one) until it is converted with to_pandas() further down
bq_data_bigframes = bpd.read_gbq(query)
bq_data_bigframes.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,72890,-1.22258,-0.017622,2.317581,-1.547722,-0.958068,-0.370571,-0.583838,0.384328,-0.72238,...,0.430025,1.217131,-0.463494,0.456253,0.385304,-0.104713,-0.303068,-0.300302,5.9,0
1,131206,1.967597,-1.009301,-1.970656,-0.406056,1.614598,3.92548,-1.209586,0.952736,-0.429297,...,-0.288566,-0.420307,0.258054,0.632264,-0.148758,-0.656398,0.077885,-0.027551,59.0,0
2,122831,2.290614,-1.288035,-1.091499,-1.591945,-0.983697,-0.58711,-0.952236,-0.272064,-1.392405,...,-0.161871,0.04143,0.225622,0.672485,-0.105101,-0.194599,0.007488,-0.040725,30.0,0
3,68397,1.258859,0.440981,0.331167,0.681581,-0.267935,-1.046229,0.163925,-0.269223,-0.142249,...,-0.27486,-0.734847,0.116306,0.376938,0.25547,0.090629,-0.015355,0.033149,0.89,0
4,152137,2.023988,-0.351874,-0.494781,0.36047,-0.400929,-0.202362,-0.544039,-0.078031,1.364484,...,0.160192,0.774027,0.021697,-0.601828,0.029147,-0.175735,0.04743,-0.041086,9.99,0


In [117]:
type(bq_data_bigframes)

bigframes.dataframe.DataFrame

In [118]:
bq_data_bigframes.shape

(228061, 31)

In [119]:
# Convert to pandas only once.  After the first run the name already holds a
# pandas DataFrame, which has no `to_pandas`, so re-running the cell must be a
# no-op rather than an AttributeError.
if not isinstance(bq_data_bigframes, pd.DataFrame):
    bq_data_bigframes = bq_data_bigframes.to_pandas()
type(bq_data_bigframes)

pandas.core.frame.DataFrame

In [120]:
bq_data_bigframes.shape

(228061, 31)

### BigQuery Storage Client

https://cloud.google.com/python/docs/reference/bigquerystorage/latest

In [204]:
# Columns to read: every column of the source table except those listed in
# VAR_OMIT.  They are taken from the table schema so this section does not depend
# on a DataFrame downloaded by another section of the notebook.
omitted_fields = set(VAR_OMIT.replace(' ', '').split(','))
storage_fields = [
    field.name
    for field in bq.get_table(f'{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}').schema
    if field.name not in omitted_fields
]

read_session = bqstorage.create_read_session(
    request = dict(
        parent = f'projects/{PROJECT_ID}',
        read_session = dict(
            table = f"projects/{BQ_PROJECT}/datasets/{BQ_DATASET}/tables/{BQ_TABLE}",
            data_format = bigquery_storage.types.DataFormat.ARROW,
            read_options = dict(
                # same row filter as the WHERE clause of the common query
                row_restriction = "splits = 'TRAIN'",
                selected_fields = storage_fields
            )
        ),
        # no explicit stream count is requested; the recorded run returned one stream
        max_stream_count = 0
    )
)

In [205]:
len(read_session.streams)

1

In [206]:
def read_stream(stream):
    """Read every row of one read-session stream into a DataFrame.

    `stream` is one entry of `read_session.streams`; only its `name` is used.
    Two operations happen here: a reader is opened on the stream, then its
    rows are read and converted to a DataFrame.
    """
    return bqstorage.read_rows(name = stream.name).to_dataframe()


# Read all streams of the session in parallel, one worker thread per stream.
streams = list(read_session.streams)
# ThreadPoolExecutor rejects max_workers = 0, so keep at least one worker even
# when the session comes back with no streams.
with concurrent.futures.ThreadPoolExecutor(max_workers = max(1, len(streams))) as executor:
    # map() returns results in stream order, so the list of DataFrames is the
    # same on every run regardless of which stream finishes first.
    bq_data_storage = list(executor.map(read_stream, streams))

In [207]:
len(bq_data_storage)

1

In [208]:
bq_data_storage[0].shape

(228061, 31)

In [209]:
# Combine the per-stream frames once.  After the first run the name already holds
# the combined DataFrame, so re-running the cell leaves it untouched instead of
# raising.
if not isinstance(bq_data_storage, pd.DataFrame):
    # ignore_index gives one continuous row index rather than repeating the
    # index of each stream's frame
    bq_data_storage = pd.concat(bq_data_storage, ignore_index = True)
bq_data_storage.shape

(228061, 31)

In [210]:
type(bq_data_storage)

pandas.core.frame.DataFrame

In [211]:
bq_data_storage.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,2812,-0.633403,0.963616,2.494946,2.099051,-0.404331,0.235862,-0.007932,0.211442,-0.209817,...,0.014676,0.016278,-0.061462,0.355196,-0.179086,-0.106947,-0.215039,0.050698,0.0,0
1,3150,1.313281,-0.257923,0.118463,-0.735557,-0.569308,-0.733577,-0.138659,-0.141641,1.708019,...,-0.082467,0.126066,-0.223157,-0.074977,0.92194,-0.528283,0.064476,0.013132,0.0,0
2,16676,1.15848,0.168947,0.536345,1.187908,-0.265547,-0.076325,-0.355844,0.144615,1.462346,...,0.016492,0.263518,-0.076711,-0.079402,0.502827,-0.270819,-0.004966,-0.003372,0.0,0
3,17701,-1.279231,-0.153303,3.29631,3.320441,1.139018,0.542343,-0.729928,-0.051774,0.922712,...,-0.409746,-0.342575,-0.493297,-0.017046,-0.107404,0.101164,-0.19794,-0.435654,0.0,0
4,28131,1.069507,-0.000362,1.448936,2.874498,-0.736266,0.831932,-0.762267,0.406772,0.626473,...,0.035393,0.444433,-0.085413,0.09909,0.506438,0.246418,0.057864,0.021133,0.0,0


### Indirect BigQuery with `pandas-gbq`

When working with [Pandas](https://pandas.pydata.org/docs/user_guide/index.html#user-guide) the methods above show the client returning data to pandas dataframes.  This section shows a pandas module, [pandas-gbq](https://pandas-gbq.readthedocs.io/en/latest/), that wraps the BigQuery client so that pandas can retrieve BigQuery data to dataframes.

References:
- [Comparison of BigQuery Client with pandas-gbq](https://cloud.google.com/bigquery/docs/pandas-gbq-migration)

In [133]:
# Call pandas-gbq directly: `pd.read_gbq` is a deprecated pass-through to this
# same function in recent pandas releases.
bq_data_pandasgbq = pandas_gbq.read_gbq(query, project_id = PROJECT_ID)
bq_data_pandasgbq.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,2812,-0.633403,0.963616,2.494946,2.099051,-0.404331,0.235862,-0.007932,0.211442,-0.209817,...,0.014676,0.016278,-0.061462,0.355196,-0.179086,-0.106947,-0.215039,0.050698,0.0,0
1,3150,1.313281,-0.257923,0.118463,-0.735557,-0.569308,-0.733577,-0.138659,-0.141641,1.708019,...,-0.082467,0.126066,-0.223157,-0.074977,0.92194,-0.528283,0.064476,0.013132,0.0,0
2,16676,1.15848,0.168947,0.536345,1.187908,-0.265547,-0.076325,-0.355844,0.144615,1.462346,...,0.016492,0.263518,-0.076711,-0.079402,0.502827,-0.270819,-0.004966,-0.003372,0.0,0
3,17701,-1.279231,-0.153303,3.29631,3.320441,1.139018,0.542343,-0.729928,-0.051774,0.922712,...,-0.409746,-0.342575,-0.493297,-0.017046,-0.107404,0.101164,-0.19794,-0.435654,0.0,0
4,28131,1.069507,-0.000362,1.448936,2.874498,-0.736266,0.831932,-0.762267,0.406772,0.626473,...,0.035393,0.444433,-0.085413,0.09909,0.506438,0.246418,0.057864,0.021133,0.0,0


In [134]:
type(bq_data_pandasgbq)

pandas.core.frame.DataFrame

In [135]:
bq_data_pandasgbq.shape

(228061, 31)

### TensorFlow I/O

https://www.tensorflow.org/io

In [177]:
# Look up the name and data type of every column that will be read.  The excluded
# columns come from VAR_OMIT, the same setting the training-data query uses.
omit_list = ', '.join(f"'{name}'" for name in VAR_OMIT.replace(' ', '').split(','))

# NOTE: this reassigns `query`, replacing the training-data query defined earlier.
# The table path is backtick-quoted like the training query, and the rows are
# ordered by column position so the schema comes back in a stable order.
query = f'''
SELECT *
FROM `{BQ_PROJECT}.{BQ_DATASET}.INFORMATION_SCHEMA.COLUMNS`
WHERE TABLE_NAME = '{BQ_TABLE}'
    AND COLUMN_NAME NOT IN ({omit_list})
ORDER BY ORDINAL_POSITION
'''
schema = bq.query(query).to_dataframe()

In [178]:
schema.data_type.unique().tolist()

['INT64', 'FLOAT64']

In [179]:
# BigQuery column type name -> TensorFlow dtype, for the two types found in the schema
types = dict(
    FLOAT64 = dtypes.float64,
    INT64 = dtypes.int64,
)

In [180]:
# Create the TensorFlow I/O BigQuery client.
# NOTE(review): `bq_data_tfio` is rebound by the next two cells (client -> read
# session -> dataset), so these cells only work when run in order, once each.
bq_data_tfio = BigQueryClient()
type(bq_data_tfio)

tensorflow_io.python.ops.bigquery_dataset_ops.BigQueryClient

In [181]:
# Column names and their TensorFlow dtypes, both taken from the schema lookup so
# the two lists line up position by position.
column_names = schema.column_name.tolist()
column_dtypes = [types[bq_type] for bq_type in schema.data_type.tolist()]

# Replace the client held in `bq_data_tfio` with a read session on the source table
bq_data_tfio = bq_data_tfio.read_session(
    parent = f"projects/{PROJECT_ID}",
    project_id = BQ_PROJECT,
    table_id = BQ_TABLE,
    dataset_id = BQ_DATASET,
    selected_fields = column_names,
    output_types = column_dtypes,
    row_restriction = "splits='TRAIN'",
    requested_streams = 3
)
type(bq_data_tfio)

tensorflow_io.python.ops.bigquery_dataset_ops.BigQueryReadSession

In [182]:
# Turn the read session into a dataset that reads its streams in parallel.
# `tf.data.AUTOTUNE` is the same constant as `tf.data.experimental.AUTOTUNE`.
bq_data_tfio = bq_data_tfio.parallel_read_rows(
    sloppy = True,  # presumably lets rows arrive in non-deterministic order - confirm
    num_parallel_calls = tf.data.AUTOTUNE,
)
type(bq_data_tfio)

tensorflow.python.data.ops.interleave_op._ParallelInterleaveDataset

In [188]:
# Pull a single batch of 5 rows to inspect what the dataset yields:
# the container type, its keys, then each (key, values) pair.
for batch in bq_data_tfio.batch(5).take(1):
    print(type(batch))
    print(list(batch.keys()))
    for column_and_values in batch.items():
        print(column_and_values)

<class 'collections.OrderedDict'>
['Amount', 'Class', 'Time', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']
('Amount', <tf.Tensor: shape=(5,), dtype=float64, numpy=array([0., 0., 0., 0., 0.])>)
('Class', <tf.Tensor: shape=(5,), dtype=int64, numpy=array([0, 0, 0, 0, 0])>)
('Time', <tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 2812,  3150, 16676, 17701, 28131])>)
('V1', <tf.Tensor: shape=(5,), dtype=float64, numpy=array([-0.63340299,  1.31328087,  1.15847976, -1.27923083,  1.06950736])>)
('V10', <tf.Tensor: shape=(5,), dtype=float64, numpy=array([ 0.3082976 , -1.10329377, -0.17276001,  0.84594969,  0.37324618])>)
('V11', <tf.Tensor: shape=(5,), dtype=float64, numpy=array([-1.20499231, -1.08782009,  2.05305928,  1.38923569, -1.32944263])>)
('V12', <tf.Tensor: shape=(5,), dtype=float64, numpy=array([-0.47470781,  0.64467588, -2.73649895, -2.44018

In [187]:
type(bq_data_tfio.batch(5).take(1))

tensorflow.python.data.ops.take_op._TakeDataset

## Data To Training

The methods above result in either a Pandas dataframe or a `tf.data` object in the case of TensorFlow I/O.  This section shows how to use each input type to train a model with TensorFlow.

### From Pandas To TensorFlow

In [212]:
# Work on a copy so the DataFrame retrieved with the Storage client is left unchanged
features = bq_data_storage.copy()
# pop() removes the target column from `features` and returns it
target = features.pop(VAR_TARGET)

### From TensorFlow I/O to TensorFlow