In [None]:
import os

import tensorflow as tf
# Fixed: the package is `tfx` (not `tdx`) and the statistics component is
# named `StatisticsGen`.
from tfx.components import (
    CsvExampleGen,
    ExampleValidator,
    SchemaGen,
    StatisticsGen,
    Trainer,
    Transform,
    Tuner,
)
# Fixed: the sub-package is `interactive` (was misspelled `interacrive`).
from tfx.orchestration.experimental.interactive.interactive_context import (
    InteractiveContext,
)
from tfx.proto import example_gen_pb2

## Set Variable

In [None]:
# Names used to identify the pipeline and its TFDV schema pipeline.
PIPELINE_NAME = "sarcasm-pipeline"
SCHAME_PIPELINE_NAME = "sarcasm-tfdv-schema"

# Directory in which the artifacts produced by the pipeline are stored.
PIPELINE_ROOT = os.path.join("pipeline", PIPELINE_NAME)

# SQLite database file used as the ML Metadata (MLMD) store.
METADATA_PATH = os.path.join("metadata", PIPELINE_NAME, "metadata.db")

# Directory to which models created by the pipeline are exported.
SERVING_MODEL_DIR = os.path.join("serving_model", PIPELINE_NAME)

# Optional: uncomment to turn on verbose logging.
# from absl import logging
# logging.set_verbosity(logging.INFO)

In [None]:
# Directory holding the raw input data; passed to CsvExampleGen as `input_base`.
DATA_ROOT = "data"

In [None]:
# Interactive context rooted at PIPELINE_ROOT; the cells below use it to run
# components and to show their outputs.
interactive_context = InteractiveContext(pipeline_root=PIPELINE_ROOT)

## Membuat Tahapan Data Ingestion

In [None]:
# Split the ingested data into "train" and "valid" with an 8:2 ratio.
# Fixed: the proto field is `hash_buckets` (plural); `hash_bucket` is not a
# field of SplitConfig.Split and is rejected when the message is constructed.
output = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=8),
            example_gen_pb2.SplitConfig.Split(name="valid", hash_buckets=2),
        ]
    )
)

# Ingest the CSV files under DATA_ROOT using the split configuration above.
example_gen = CsvExampleGen(input_base=DATA_ROOT, output_config=output)

In [None]:
# Run the ExampleGen component interactively to inspect it.
interactive_context.run(example_gen)

## Membuat Tahapan Data Validation

1. Membuat summary statistics

In [None]:
# Imported here so this cell does not depend on the spelling used in the
# notebook's first import cell.
from tfx.components import StatisticsGen

# Compute summary statistics over the examples produced by ExampleGen.
# Fixed: the component is `StatisticsGen` and its input keyword is `examples`.
statistics_gen = StatisticsGen(
    examples=example_gen.outputs["examples"]
)

interactive_context.run(statistics_gen)

2. Menampilkan summary statistics yang sudah dibuat

In [None]:
# Visualize the generated statistics.
# Fixed: components expose their output channels via `.outputs`, not `.output`.
interactive_context.show(statistics_gen.outputs["statistics"])

3. Menampilkan data schema

In [None]:
# Build the SchemaGen component from the StatisticsGen "statistics" output
# and run it.
schema_gen = SchemaGen(
    statistics=statistics_gen.outputs["statistics"],
)
interactive_context.run(schema_gen)

In [None]:
# Display the data schema that was generated.
# Fixed: pass the "schema" output channel; passing the component object only
# displays the component, not the schema artifact it produced.
interactive_context.show(schema_gen.outputs["schema"])

4. Mengidentifikasi anomali pada dataset

In [None]:
# Validate the dataset statistics against the schema.
# Fixed: `statistics_gen.output` -> `statistics_gen.outputs`.
example_validator = ExampleValidator(
    statistics=statistics_gen.outputs["statistics"],
    schema=schema_gen.outputs["schema"],
)
interactive_context.run(example_validator)

In [None]:
# Display the validation result of the component.
# Fixed: `example_validator.output` -> `example_validator.outputs`.
interactive_context.show(example_validator.outputs["anomalies"])

## Membuat Tahapan Data Preprocessing
menggunakan TFT dan komponen Transform

In [None]:
# definisikan modul dulu
TRANSFORM_MODULE_FILE = "sarcasm_transform.py"

In [None]:
# magic coammand untuk membuat modul, ini khusus hanya di jupyter
%%writefile {TRANSFORM_MODULE_FILE}

import tensorflow as td
LABEL_KEY = "is_sarcastic"
FEATURE_KEY = "headline"

# Used to rename a feature once it has been transformed.
def transformed_name(key):
    """Return `key` with the "_xf" suffix that marks a transformed feature."""
    suffix = "_xf"
    return key + suffix

# Simple preprocessing: lowercase "headline" and cast "is_sarcastic" to an
# integer.
def preprocessing_fn(inputs):
    """Preprocess input features into transformed features.

    Args:
        inputs: map from feature keys to raw features.

    Returns:
        outputs: map from transformed feature keys to transformed features.
    """
    # Fixed: the dict was created as `output` but filled as `outpus`/`ouputs`
    # and returned as `outputs`; a single name is used throughout now.
    outputs = {}

    outputs[transformed_name(FEATURE_KEY)] = tf.strings.lower(inputs[FEATURE_KEY])

    # Fixed: `TF.cast` -> `tf.cast` (Python names are case-sensitive).
    outputs[transformed_name(LABEL_KEY)] = tf.cast(inputs[LABEL_KEY], tf.int64)

    return outputs

In [None]:
# Define the Transform component and run it.
# Fixed: the input keyword is `examples`; `example_gen` is not a parameter of
# Transform.
transform = Transform(
    examples=example_gen.outputs["examples"],
    schema=schema_gen.outputs["schema"],
    # The module file is passed as an absolute path.
    module_file=os.path.abspath(TRANSFORM_MODULE_FILE),
)
interactive_context.run(transform)

## Penggunaan ExampleGen di GCP

1. TFRecord Files

In [None]:
from tfx.components import ImportExampleGen
# Ingest existing TFRecord files; "tfrecord_dir" looks like a placeholder for
# the real directory — replace before running.
example_gen = ImportExampleGen(input_base = "tfrecord_dir")

2. Cloud storage Google Cloud

In [None]:
from tfx.components import CsvExampleGen

# Read CSV input from a Google Cloud Storage location; "gs://bucket_path"
# looks like a placeholder for the real bucket path.
example_gen = CsvExampleGen(input_base = "gs://bucket_path")

3. Google Cloud BigQuery: platform DWH dari GCP

In [None]:
import os
# NOTE(review): in recent TFX releases BigQueryExampleGen appears to live in
# tfx.extensions.google_cloud_big_query.example_gen.component rather than
# tfx.components — confirm against the installed TFX version.
from tfx.components import BigQueryExampleGen

# Set the Google Cloud credential ("/path.json" looks like a placeholder).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path.json"

# The <project_id>, <database> and <table_name> parts are placeholders.
query = "SELECT * FROM <project_id>.<database>.<table_name>"

example_gen = BigQueryExampleGen(query=query)

## Melakukan data Splitting menggunakan Subdirectory

In [None]:
"""
└── data
    ├── train
    │   └─ iris-training.csv
    ├── eval
    │   └─ iris-eval.csv
    └── test
        └─ iris-test.csv
"""

In [None]:
# Map each split to its own sub-directory of the input base.
# Renamed from `input` so the builtin `input()` is not shadowed.
input_config = example_gen_pb2.Input(
    splits=[
        example_gen_pb2.Input.Split(name="train", pattern="train/*"),
        example_gen_pb2.Input.Split(name="eval", pattern="eval/*"),
        example_gen_pb2.Input.Split(name="test", pattern="test/*"),
    ]
)

example_gen = CsvExampleGen(input_base="data", input_config=input_config)
# Fixed: the interactive context in this notebook is `interactive_context`;
# `context` is never defined.
interactive_context.run(example_gen)

## Spanning datasets

In [None]:
"""
└── data
    ├── export-0
    │   ├── train
    │   │   └─ 20k-iris-training.csv
    │   └── eval
    │       └─ 2k-iris-eval.csv
    ├── export-1
    │   ├── train
    │   │   └─ 24k-iris-training.csv
    │   └── eval
    │       └─ 3k-iris-eval.csv
    └── export-2
        ├── train
        │   └─ 26k-iris-training.csv
        └── eval
            └─ 4k-iris-eval.csv
"""

In [None]:
# Use the {SPAN} placeholder so each split is read from an `export-<n>`
# directory.
# Fixed: the train pattern said `{SPAM}`, which is not a recognised
# placeholder. Renamed from `input` so the builtin `input()` is not shadowed.
input_config = example_gen_pb2.Input(
    splits=[
        example_gen_pb2.Input.Split(name="train", pattern="export-{SPAN}/train/*"),
        example_gen_pb2.Input.Split(name="eval", pattern="export-{SPAN}/eval/*"),
    ]
)

example_gen = CsvExampleGen(input_base="data", input_config=input_config)
# Fixed: the interactive context in this notebook is `interactive_context`;
# `context` is never defined.
interactive_context.run(example_gen)

## Mengolah text data
- tft.compute_and_apply_vocabulary(): fungsi ini membuat sebuah vocabulary yang menghubungkan sebuah string dengan suatu integer.

- tft.ngrams(): fungsi ini menerima input berupa token dengan tipe data SparseTensor. Selanjutnya fungsi ini akan menghasilkan sebuah SparseTensor yang berisi n-grams.

- tft.bag_of_words(): fungsi ini akan menghasilkan sebuah vektor bag-of-words berdasarkan n-grams.

- tft.tfidf(): fungsi ini akan melakukan proses TFIDF.


In [None]:
def preprocessing_fn(inputs):
    """Apply `tft.compute_and_apply_vocabulary` to the 's' feature.

    Args:
        inputs: mapping that contains the feature under the key 's'.

    Returns:
        Dict with the single key 's_integerized' holding the result.
    """
    return {
        's_integerized': tft.compute_and_apply_vocabulary(inputs['s']),
    }

## Mengolah image data
- tf.image menyediakan beberapa fungsi yang dapat digunakan untuk memanipulasi image seperti resize, convert color, image transformation

- tf.io digunakan untuk melakukan dekode gambar menjadi bentuk tensor

In [None]:
def preprocessing_fn(new_image):
    """Decode a JPEG image, convert it to grayscale and resize it to 150x150.

    Args:
        new_image: string tensor holding the JPEG-encoded image bytes
            (assumed to contain a single encoded image — TODO confirm).

    Returns:
        The processed image reshaped to [-1, 150, 150, 1], with float32 values.
    """
    # Fixed: the body read `raw_image` before assignment; the parameter is
    # `new_image`.
    raw_image = tf.reshape(new_image, [-1])

    # Fixed: decode_jpeg expects a scalar (0-D) string, so decode the first
    # element of the flattened tensor rather than the 1-D tensor itself.
    image_rgb = tf.io.decode_jpeg(raw_image[0], channels=3)
    # Fixed: the function is `rgb_to_grayscale` (was `rgb_to_greyscale`).
    image_gray = tf.image.rgb_to_grayscale(image_rgb)

    # Fixed: `tf.image.convert_image_dtype` (was `tf.image_convert_image_dtype`).
    image = tf.image.convert_image_dtype(image_gray, tf.float32)
    resized_image = tf.image.resize_with_pad(
        image,
        target_height=150,
        target_width=150,
    )

    return tf.reshape(resized_image, [-1, 150, 150, 1])