## This notebook demonstrates converting FHIR bundles to TensorFlow SequenceExamples (seqex)

Prerequisite: install the packages required for Apache Beam and protobuf before running the cells below.

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
# Set the root logger (getLogger() with no name) to INFO, presumably so that
# the INFO-level progress messages from the libraries show up in cell output.
# NOTE(review): this runs before the remaining imports below; keep that order
# unless it is confirmed that none of them reconfigure logging on import.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from absl import app
from absl import flags
import apache_beam as beam
from google.protobuf import text_format
from proto.stu3 import google_extensions_pb2
from proto.stu3 import resources_pb2
from proto.stu3 import version_config_pb2
from tensorflow.core.example import example_pb2
from py.google.fhir.seqex import bundle_to_seqex
import time

  from ._conv import register_converters as _register_converters
  from ._conv import register_converters as _register_converters
  from .. import h5g, h5i, h5o, h5r, h5t, h5l, h5p
  from . import _ni_label


In [1]:
def _get_version_config(version_config_path):
  """Loads a VersionConfig proto from a text-format protobuf file.

  Args:
    version_config_path: Path, readable with the built-in open(), of a file
      containing a text-format `VersionConfig` message.

  Returns:
    The `version_config_pb2.VersionConfig` message populated from the file.
  """
  config = version_config_pb2.VersionConfig()
  with open(version_config_path) as config_file:
    # Parse() fills in `config` in place and hands the same message back.
    text_format.Parse(config_file.read(), config)
  return config

## Initialize variables

In [3]:
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import PipelineOptions

from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

import apache_beam as beam
import re

# Pipeline options used when the pipeline object is created later on.
options = PipelineOptions()

# Google Cloud settings: the project plus the GCS locations for staged and
# temporary files.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'dp-workspace'
# Dataflow job names may only contain lowercase letters, digits and hyphens
# (starting with a letter), so use a hyphen instead of an underscore. The name
# is not validated by the DirectRunner, but an underscore would be rejected as
# soon as the runner below is switched to Dataflow.
google_cloud_options.job_name = 'generate-seqex'
google_cloud_options.staging_location = 'gs://cluster-data/staging'
google_cloud_options.temp_location = 'gs://cluster-data/temp'

# DirectRunner executes the pipeline locally rather than on a remote service.
options.view_as(StandardOptions).runner = 'DirectRunner'

In [4]:
# Common GCS prefix for all demo data; change this one value to point the
# notebook at a different bucket or folder.
data_root = 'gs://cluster-data/demo/data'

# Glob matching the input bundle files.
input_path = data_root + '/bundles/bundles*'
# TFRecord file holding the event labels.
label_path = data_root + '/output/labels-00000-of-00001.tfrecords'
# File-name prefix for the generated output shards.
output_path = data_root + '/output/seqex'

## Transform definition

In [5]:
# Assemble the pipeline graph. The pipeline is only executed later, when
# p.run() is called.
p = beam.Pipeline(options=options)
version_config = _get_version_config(
    '/usr/local/fhir/proto/stu3/version_config.textproto')

# One ProtoCoder per record type read from or written to TFRecord files.
bundle_coder = beam.coders.ProtoCoder(resources_pb2.Bundle)
event_label_coder = beam.coders.ProtoCoder(google_extensions_pb2.EventLabel)
seqex_coder = beam.coders.ProtoCoder(example_pb2.SequenceExample)

# Bundles: read from input_path, then passed through KeyBundleByPatientIdFn.
keyed_bundles = (
    p
    | 'readBundles' >> beam.io.ReadFromTFRecord(input_path, coder=bundle_coder)
    | 'KeyBundlesByPatientId' >> beam.ParDo(
        bundle_to_seqex.KeyBundleByPatientIdFn())
)

# Event labels: read from label_path, then turned into trigger/label pair
# lists by the bundle_to_seqex helper.
event_labels = (
    p
    | 'readEventLabels' >> beam.io.ReadFromTFRecord(
        label_path, coder=event_label_coder)
)
keyed_event_labels = bundle_to_seqex.CreateTriggerLabelsPairLists(event_labels)

# Combine the keyed bundles with their labels.
bundles_and_labels = bundle_to_seqex.CreateBundleAndLabels(
    keyed_bundles, keyed_event_labels)

# Convert each bundle-and-labels element to a SequenceExample and write the
# results as two '.tfrecords' shards under output_path. A Reshuffle step sits
# on either side of the conversion. The result is bound to `_` because only
# the side effect of extending the pipeline graph is needed.
_ = (
    bundles_and_labels
    | 'Reshuffle1' >> beam.Reshuffle()
    | 'GenerateSeqex' >> beam.ParDo(
        bundle_to_seqex.BundleAndLabelsToSeqexDoFn(
            version_config=version_config,
            enable_attribution=False,
            generate_sequence_label=False))
    | 'Reshuffle2' >> beam.Reshuffle()
    | 'WriteSeqex' >> beam.io.WriteToTFRecord(
        output_path,
        coder=seqex_coder,
        file_name_suffix='.tfrecords',
        num_shards=2)
)

Using this argument will have no effect on the actual scopes for tokens
requested. These scopes are set at VM instance creation time and
can't be overridden in the request.

I0124 19:58:37.658966 140304687015680 gcsio.py:446] Starting the size estimation of the input
I0124 19:58:37.662934 140304687015680 client.py:614] Attempting refresh to obtain initial access_token
I0124 19:58:37.783057 140304687015680 gcsio.py:460] Finished listing 10 files in 0.12409901619 seconds.
I0124 19:58:37.802076 140304687015680 client.py:614] Attempting refresh to obtain initial access_token
I0124 19:58:37.871395 140304687015680 client.py:614] Attempting refresh to obtain initial access_token


## Run the transform

In [None]:
# Execute the pipeline, block until it has finished, and print the elapsed
# wall-clock time in seconds.
start = time.time()
pipeline_result = p.run()
pipeline_result.wait_until_finish()
end = time.time()
print(end - start)

I0124 19:58:46.972425 140304687015680 fn_api_runner.py:912] Running ((ref_AppliedPTransform_WriteSeqex/Write/WriteImpl/DoOnce/Read_48)+((ref_AppliedPTransform_WriteSeqex/Write/WriteImpl/InitializeWrite_49)+(ref_PCollection_PCollection_33/Write)))+(ref_PCollection_PCollection_32/Write)
I0124 19:58:46.984545 140304687015680 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_32/Write >
I0124 19:58:46.987873 140304687015680 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_33/Write >
I0124 19:58:46.989535 140304687015680 bundle_processor.py:291] start <DoOperation WriteSeqex/Write/WriteImpl/InitializeWrite output_tags=['out']>
I0124 19:58:46.993908 140304687015680 bundle_processor.py:291] start <ReadOperation WriteSeqex/Write/WriteImpl/DoOnce/Read source=SourceBundle(weight=1.0, source=<apache_beam.transforms.create_source._CreateSource object at 0x7f9ae8b49550>, start_position=None, stop_position=None)>
I0124 19:58:46.998186 14030

I0124 19:59:36.106637 140304687015680 bundle_processor.py:303] finish <DoOperation KeyEventLabelsByPatientId output_tags=['out'], receivers=[ConsumerSet[KeyEventLabelsByPatientId.out0, coder=WindowedValueCoder[TupleCoder[BytesCoder, LengthPrefixCoder[ProtoCoder]]], len(consumers)=1]]>
I0124 19:59:36.108788 140304687015680 bundle_processor.py:303] finish <DataOutputOperation GroupEventLabelsByPatientId/Write >
I0124 19:59:36.121429 140304687015680 fn_api_runner.py:912] Running (((GroupEventLabelsByPatientId/Read)+(ref_AppliedPTransform_CreateTriggerLabelsPairLists_12))+((ref_AppliedPTransform_GroupBundleAndTriggers/pair_with_trigger_labels_pair_lists_14)+(GroupBundleAndTriggers/Flatten/Transcode/0)))+(GroupBundleAndTriggers/Flatten/Write/0)
I0124 19:59:36.136590 140304687015680 bundle_processor.py:291] start <DataOutputOperation GroupBundleAndTriggers/Flatten/Write/0 >
I0124 19:59:36.138300 140304687015680 bundle_processor.py:291] start <FlattenOperation GroupBundleAndTriggers/Flatten/T

YAY!!