In [4]:
import os

from tfx import v1 as tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

# use this to skip the warnings
%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip

In [13]:
import re
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

input_file = "/home/avisaha/machine-learning-pipelines/data/kinglear.txt"
output_file = "/home/avisaha/machine-learning-pipelines/data/kinglear_ouput.txt"
# Define pipeline options object.
pipeline_options = PipelineOptions([])

try:
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file or file pattern into a PCollection.
        lines = p | ReadFromText(input_file)

        # Check if lines is None
        if lines is None:
            raise ValueError("Failed to read input file")

        # Count the occurrences of each word.
        counts = (
        lines
        | 'Split' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return "{}: {}".format(word, count)
        
        output = counts | 'Format' >> beam.Map(format_result)
        # Write the output using a "Write" transform that has side effects.
        output | WriteToText(output_file)

except Exception as e:
    print("An error occurred in the pipeline: ", e)

# Data Ingestion

In [16]:
import tensorflow as tf
with tf.io.TFRecordWriter("../data/test.tfrecord") as w:
    w.write(b"First record")
    w.write(b"Second record")

for record in tf.data.TFRecordDataset("../data/test.tfrecord"):
    print(record)

tf.Tensor(b'First record', shape=(), dtype=string)
tf.Tensor(b'Second record', shape=(), dtype=string)


## Converting comma-separated data to tf.Example

In [22]:
import os
from tfx.components import CsvExampleGen
from tfx.utils.dsl_utils import external_input

base_dir = os.getcwd()
data_dir = os.path.join(os.pardir, "data")
examples = external_input(os.path.join(base_dir, data_dir))

example_gen = CsvExampleGen(input=examples)

context = InteractiveContext()
context.run(example_gen)

ModuleNotFoundError: No module named 'tfx.utils.dsl_utils'