## This notebook demonstrates the bundle-to-seqex example: converting FHIR bundles into TensorFlow SequenceExamples

Import the packages required for the pipeline (Apache Beam and protobuf must already be installed)

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
# Raise the root logger to INFO so that INFO-level log records (such as the
# Beam runner progress messages) are emitted in the cell output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from absl import app
from absl import flags
import apache_beam as beam
from google.protobuf import text_format
from proto.stu3 import google_extensions_pb2
from proto.stu3 import resources_pb2
from proto.stu3 import version_config_pb2
from tensorflow.core.example import example_pb2
from py.google.fhir.seqex import bundle_to_seqex
import time

  from ._conv import register_converters as _register_converters
  from ._conv import register_converters as _register_converters
  from .. import h5g, h5i, h5o, h5r, h5t, h5l, h5p
  from . import _ni_label


In [3]:
def _get_version_config(version_config_path):
  """Reads a text-format VersionConfig proto from a file.

  Args:
    version_config_path: Path to a file holding a VersionConfig message in
      protobuf text format.

  Returns:
    A version_config_pb2.VersionConfig populated from the file contents.
  """
  config = version_config_pb2.VersionConfig()
  with open(version_config_path) as config_file:
    serialized_text = config_file.read()
    # Parse fills `config` in place and hands back the same message object.
    return text_format.Parse(serialized_text, config)

## Initialize pipeline options and paths

In [2]:
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import PipelineOptions

from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

import apache_beam as beam
import re

# Pipeline options shared by the whole notebook; a later cell passes `options`
# to beam.Pipeline.
options = PipelineOptions()
# View the same options object through the Google Cloud option set in order to
# fill in the project, the job name, and the GCS staging/temp locations.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'dp-workspace'
google_cloud_options.job_name = 'generate_seqex'
google_cloud_options.staging_location = 'gs://hdlcluster3/staging'
google_cloud_options.temp_location = 'gs://hdlcluster3/temp'
# Select the DirectRunner for this run.
options.view_as(StandardOptions).runner = 'DirectRunner'

In [7]:
# Glob of TFRecord files that the pipeline reads as Bundle protos.
input_path = 'gs://hdlcluster3/synthea/bundles/bundles*'
# TFRecord file that the pipeline reads as EventLabel protos.
label_path = 'gs://hdlcluster3/synthea/labels/label-00000-of-00001.tfrecords'
# Output file prefix handed to WriteToTFRecord (which is configured with a
# '.tfrecords' suffix and two shards).
output_path = 'gs://hdlcluster3/synthea/seqex/seqex'

In [8]:
%%bash
# Remove objects left under the seqex output prefix by an earlier run. When
# nothing matches, gsutil reports "CommandException: No URLs matched".
gsutil rm gs://hdlcluster3/synthea/seqex/*
# List what remains under the output prefix.
gsutil ls gs://hdlcluster3/synthea/seqex

CommandException: No URLs matched: gs://hdlcluster3/synthea/seqex/*


## Define the transform

In [9]:
# Build the pipeline graph. It is executed in a later cell via p.run(), but
# constructing the reads already contacts GCS (the captured log shows the input
# size estimation and file listing happening here).
p = beam.Pipeline(options=options)
# NOTE(review): absolute local path to the version config -- confirm that it
# exists on the machine running this notebook.
version_config = _get_version_config("/usr/local/fhir/proto/stu3/version_config.textproto")

# Read Bundle protos from the TFRecord files matching input_path, then apply
# KeyBundleByPatientIdFn (presumably emits (patient id, bundle) pairs --
# confirm in bundle_to_seqex).
keyed_bundles = ( 
    p 
    | 'readBundles' >> beam.io.ReadFromTFRecord(
        input_path, coder=beam.coders.ProtoCoder(resources_pb2.Bundle))
    | 'KeyBundlesByPatientId' >> beam.ParDo(
        bundle_to_seqex.KeyBundleByPatientIdFn()))
# Read EventLabel protos from the label TFRecord file.
event_labels = ( 
    p | 'readEventLabels' >> beam.io.ReadFromTFRecord(
        label_path,
        coder=beam.coders.ProtoCoder(google_extensions_pb2.EventLabel)))
# Both helpers come from bundle_to_seqex; judging by their names they key the
# labels and then join them with the keyed bundles -- confirm in that module.
keyed_event_labels = bundle_to_seqex.CreateTriggerLabelsPairLists(
    event_labels)
bundles_and_labels = bundle_to_seqex.CreateBundleAndLabels(
    keyed_bundles, keyed_event_labels)
# Run the conversion DoFn with a Reshuffle on either side (presumably to
# rebalance work between steps -- confirm), then write the results as
# SequenceExample protos to two '.tfrecords' shards under output_path.
# The write result is not needed afterwards, hence the assignment to `_`.
_ = ( 
    bundles_and_labels
    | 'Reshuffle1' >> beam.Reshuffle()
    | 'GenerateSeqex' >> beam.ParDo(
        bundle_to_seqex.BundleAndLabelsToSeqexDoFn(
            version_config=version_config,
            enable_attribution=False,
            generate_sequence_label=False))
    | 'Reshuffle2' >> beam.Reshuffle()
    | 'WriteSeqex' >> beam.io.WriteToTFRecord(
        output_path,
        coder=beam.coders.ProtoCoder(example_pb2.SequenceExample),
        file_name_suffix='.tfrecords',
        num_shards=2))

I0125 14:12:08.349047 140523616286464 gcsio.py:446] Starting the size estimation of the input
I0125 14:12:08.351886 140523616286464 client.py:614] Attempting refresh to obtain initial access_token
I0125 14:12:08.442349 140523616286464 gcsio.py:460] Finished listing 9 files in 0.0933110713959 seconds.
I0125 14:12:08.453818 140523616286464 client.py:614] Attempting refresh to obtain initial access_token
I0125 14:12:08.523930 140523616286464 client.py:614] Attempting refresh to obtain initial access_token


## Run the transform

In [10]:
# Time the whole pipeline run with wall-clock timestamps.
start = time.time()
# Start the pipeline and wait for it to finish before taking the end time.
p.run().wait_until_finish()
end = time.time()
# Elapsed wall-clock time in seconds.
print(end-start)

I0125 14:12:11.657558 140523616286464 fn_api_runner.py:912] Running (ref_AppliedPTransform_readEventLabels/Read_6)+((ref_AppliedPTransform_KeyEventLabelsByPatientId_7)+(GroupEventLabelsByPatientId/Write))
I0125 14:12:11.668648 140523616286464 bundle_processor.py:291] start <DataOutputOperation GroupEventLabelsByPatientId/Write >
I0125 14:12:11.671686 140523616286464 bundle_processor.py:291] start <DoOperation KeyEventLabelsByPatientId output_tags=['out']>
I0125 14:12:11.676134 140523616286464 bundle_processor.py:291] start <ReadOperation readEventLabels/Read source=SourceBundle(weight=1.0, source=<apache_beam.io.tfrecordio._TFRecordSource object at 0x7fcde1e2b390>, start_position=None, stop_position=None)>
I0125 14:12:11.682984 140523616286464 client.py:614] Attempting refresh to obtain initial access_token
I0125 14:12:11.751384 140523616286464 client.py:614] Attempting refresh to obtain initial access_token
I0125 14:12:11.818739 140523616286464 client.py:614] Attempting refresh to obt

I0125 14:12:51.306859 140523616286464 bundle_processor.py:303] finish <DoOperation KeyBundlesByPatientId output_tags=['out'], receivers=[ConsumerSet[KeyBundlesByPatientId.out0, coder=WindowedValueCoder[TupleCoder[BytesCoder, ProtoCoder]], len(consumers)=1]]>
I0125 14:12:51.309212 140523616286464 bundle_processor.py:303] finish <DoOperation GroupBundleAndTriggers/pair_with_bundle output_tags=['out'], receivers=[ConsumerSet[GroupBundleAndTriggers/pair_with_bundle.out0, coder=WindowedValueCoder[TupleCoder[LengthPrefixCoder[FastPrimitivesCoder], LengthPrefixCoder[FastPrimitivesCoder]]], len(consumers)=1]]>
I0125 14:12:51.311158 140523616286464 bundle_processor.py:303] finish <FlattenOperation GroupBundleAndTriggers/Flatten/Transcode/1 receivers=[ConsumerSet[GroupBundleAndTriggers/Flatten/Transcode/1.out0, coder=WindowedValueCoder[TupleCoder[LengthPrefixCoder[FastPrimitivesCoder], LengthPrefixCoder[FastPrimitivesCoder]]], len(consumers)=1]]>
I0125 14:12:51.313194 140523616286464 bundle_proc

I0125 14:13:49.000247 140523616286464 bundle_processor.py:291] start <DoOperation Reshuffle2/AddRandomKeys output_tags=['out']>
I0125 14:13:49.002923 140523616286464 bundle_processor.py:291] start <DoOperation GenerateSeqex output_tags=['out']>
I0125 14:13:49.010076 140523616286464 bundle_processor.py:291] start <DoOperation Reshuffle1/RemoveRandomKeys output_tags=['out']>
I0125 14:13:49.012942 140523616286464 bundle_processor.py:291] start <DoOperation Reshuffle1/ReshufflePerKey/FlatMap(restore_timestamps) output_tags=['out']>
I0125 14:13:49.015887 140523616286464 bundle_processor.py:291] start <DataInputOperation Reshuffle1/ReshufflePerKey/GroupByKey/Read receivers=[ConsumerSet[Reshuffle1/ReshufflePerKey/GroupByKey/Read.out0, coder=WindowedValueCoder[TupleCoder[LengthPrefixCoder[FastPrimitivesCoder], IterableCoder[TupleCoder[TupleCoder[BytesCoder, TupleCoder[LengthPrefixCoder[ProtoCoder], LengthPrefixCoder[FastPrimitivesCoder]]], LengthPrefixCoder[FastPrimitivesCoder]]]]], len(consum

I0125 14:17:39.928787 140523616286464 fn_api_runner.py:912] Running ((ref_PCollection_PCollection_32/Read)+(ref_AppliedPTransform_WriteSeqex/Write/WriteImpl/PreFinalize_57))+(ref_PCollection_PCollection_40/Write)
I0125 14:17:39.944227 140523616286464 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_40/Write >
I0125 14:17:39.946661 140523616286464 bundle_processor.py:291] start <DoOperation WriteSeqex/Write/WriteImpl/PreFinalize output_tags=['out']>
I0125 14:17:39.950160 140523616286464 bundle_processor.py:291] start <DataInputOperation ref_PCollection_PCollection_32/Read receivers=[ConsumerSet[ref_PCollection_PCollection_32/Read.out0, coder=WindowedValueCoder[FastPrimitivesCoder], len(consumers)=1]]>
I0125 14:17:39.955734 140523616286464 gcsio.py:446] Starting the size estimation of the input
I0125 14:17:39.958194 140523616286464 client.py:614] Attempting refresh to obtain initial access_token
I0125 14:17:40.041744 140523616286464 gcsio.py:460] Finished l

329.681813002


YAY!!