## This notebook demonstrates converting FHIR bundles to TensorFlow SequenceExamples (seqex)

Import the required packages: Apache Beam, protobuf, and the FHIR proto modules (all of these must already be installed in the environment).

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from absl import app
from absl import flags
import apache_beam as beam
from google.protobuf import text_format
from proto.stu3 import google_extensions_pb2
from proto.stu3 import resources_pb2
from proto.stu3 import version_config_pb2
from tensorflow.core.example import example_pb2
from py.google.fhir.seqex import bundle_to_seqex

In [3]:
def _get_version_config(version_config_path):
  """Load a VersionConfig proto from a text-format file.

  Args:
    version_config_path: Path handed to the builtin open(); the file is
      expected to hold a text-format VersionConfig message.

  Returns:
    The value returned by text_format.Parse for a fresh
    version_config_pb2.VersionConfig populated from the file contents.
  """
  with open(version_config_path) as config_file:
    config_text = config_file.read()
  return text_format.Parse(config_text, version_config_pb2.VersionConfig())

## Initialize pipeline options

In [4]:
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import PipelineOptions

from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

import apache_beam as beam
import re

# Create the options object; the view_as() calls below set fields on typed
# views of it.
options = PipelineOptions()

# Runner selection.
options.view_as(StandardOptions).runner = 'DirectRunner'

# Google Cloud settings: GCS temp/staging locations, job name, and project.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.temp_location = 'gs://healthedatalab/temp'
google_cloud_options.staging_location = 'gs://healthedatalab/staging'
google_cloud_options.job_name = 'job1'
google_cloud_options.project = 'de-test-224618'


## Transform definition

In [5]:
# Assemble the pipeline graph. This cell only wires transforms together; the
# pipeline is executed by a later cell via p.run().
p = beam.Pipeline(options=options)

# The version config is read through _get_version_config from a local path.
#version_config = _get_version_config("gs://seqex/testdata/version_config.textproto")
version_config = _get_version_config(
    "/usr/local/fhir/proto/stu3/version_config.textproto")

# Input 1: Bundle protos read from a TFRecord file, then passed through
# KeyBundleByPatientIdFn.
raw_bundles = p | 'readBundles' >> beam.io.ReadFromTFRecord(
    "gs://healthedatalab/bundle/test_bundle.tfrecord-00000-of-00001",
    coder=beam.coders.ProtoCoder(resources_pb2.Bundle))
keyed_bundles = raw_bundles | 'KeyBundlesByPatientId' >> beam.ParDo(
    bundle_to_seqex.KeyBundleByPatientIdFn())

# Input 2: EventLabel protos read from a TFRecord file.
event_labels = p | 'readEventLabels' >> beam.io.ReadFromTFRecord(
    "gs://healthedatalab/labeldata/test-label-00000-of-00001",
    coder=beam.coders.ProtoCoder(google_extensions_pb2.EventLabel))

# Combine the two inputs with the bundle_to_seqex helpers.
keyed_event_labels = bundle_to_seqex.CreateTriggerLabelsPairLists(event_labels)
bundles_and_labels = bundle_to_seqex.CreateBundleAndLabels(
    keyed_bundles, keyed_event_labels)

# Generate SequenceExamples, with a Reshuffle before and after the DoFn.
seqex_examples = (
    bundles_and_labels
    | 'Reshuffle1' >> beam.Reshuffle()
    | 'GenerateSeqex' >> beam.ParDo(
        bundle_to_seqex.BundleAndLabelsToSeqexDoFn(
            version_config=version_config,
            enable_attribution=False,
            generate_sequence_label=False)))

# Sink: write the SequenceExample protos as TFRecords under the output prefix.
_ = (
    seqex_examples
    | 'Reshuffle2' >> beam.Reshuffle()
    | 'WriteSeqex' >> beam.io.WriteToTFRecord(
        "gs://healthedatalab/output/output",
        coder=beam.coders.ProtoCoder(example_pb2.SequenceExample)))


Using this argument will have no effect on the actual scopes for tokens
requested. These scopes are set at VM instance creation time and
can't be overridden in the request.

I0109 04:08:45.372737 139776353011456 client.py:614] Attempting refresh to obtain initial access_token
I0109 04:08:45.462080 139776353011456 client.py:614] Attempting refresh to obtain initial access_token
I0109 04:08:45.557154 139776353011456 client.py:614] Attempting refresh to obtain initial access_token
I0109 04:08:45.627382 139776353011456 client.py:614] Attempting refresh to obtain initial access_token


Run the transform

In [6]:
   # Start the pipeline, then block until it finishes. The value returned by
   # wait_until_finish() is the cell's last expression, so it is displayed.
   pipeline_result = p.run()
   pipeline_result.wait_until_finish()

I0109 04:09:15.914968 139776353011456 fn_api_runner.py:912] Running ((ref_AppliedPTransform_WriteSeqex/Write/WriteImpl/DoOnce/Read_48)+((ref_AppliedPTransform_WriteSeqex/Write/WriteImpl/InitializeWrite_49)+(ref_PCollection_PCollection_33/Write)))+(ref_PCollection_PCollection_32/Write)
I0109 04:09:15.926978 139776353011456 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_33/Write >
I0109 04:09:15.929882 139776353011456 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_32/Write >
I0109 04:09:15.932205 139776353011456 bundle_processor.py:291] start <DoOperation WriteSeqex/Write/WriteImpl/InitializeWrite output_tags=['out']>
I0109 04:09:15.936156 139776353011456 bundle_processor.py:291] start <ReadOperation WriteSeqex/Write/WriteImpl/DoOnce/Read source=SourceBundle(weight=1.0, source=<apache_beam.transforms.create_source._CreateSource object at 0x7f1fe9751e50>, start_position=None, stop_position=None)>
I0109 04:09:15.940578 13977

I0109 04:09:16.442567 139776353011456 bundle_processor.py:291] start <DoOperation GroupBundleAndTriggers/pair_with_bundle output_tags=['out']>
I0109 04:09:16.446636 139776353011456 bundle_processor.py:291] start <DoOperation KeyBundlesByPatientId output_tags=['out']>
I0109 04:09:16.449968 139776353011456 bundle_processor.py:291] start <ReadOperation readBundles/Read source=SourceBundle(weight=1.0, source=<apache_beam.io.tfrecordio._TFRecordSource object at 0x7f1fe897fd90>, start_position=None, stop_position=None)>
I0109 04:09:16.465684 139776353011456 client.py:614] Attempting refresh to obtain initial access_token
I0109 04:09:16.554418 139776353011456 client.py:614] Attempting refresh to obtain initial access_token
I0109 04:09:16.763782 139776353011456 client.py:614] Attempting refresh to obtain initial access_token
I0109 04:09:16.885014 139776353011456 bundle_processor.py:303] finish <ReadOperation readBundles/Read source=SourceBundle(weight=1.0, source=<apache_beam.io.tfrecordio._TF

I0109 04:09:17.450575 139776353011456 bundle_processor.py:303] finish <DataInputOperation Reshuffle1/ReshufflePerKey/GroupByKey/Read receivers=[ConsumerSet[Reshuffle1/ReshufflePerKey/GroupByKey/Read.out0, coder=WindowedValueCoder[TupleCoder[LengthPrefixCoder[FastPrimitivesCoder], IterableCoder[TupleCoder[TupleCoder[BytesCoder, TupleCoder[LengthPrefixCoder[ProtoCoder], LengthPrefixCoder[FastPrimitivesCoder]]], LengthPrefixCoder[FastPrimitivesCoder]]]]], len(consumers)=1]]>
I0109 04:09:17.452595 139776353011456 bundle_processor.py:303] finish <DoOperation Reshuffle1/ReshufflePerKey/FlatMap(restore_timestamps) output_tags=['out'], receivers=[ConsumerSet[Reshuffle1/ReshufflePerKey/FlatMap(restore_timestamps).out0, coder=WindowedValueCoder[FastPrimitivesCoder], len(consumers)=1]]>
I0109 04:09:17.455130 139776353011456 bundle_processor.py:303] finish <DoOperation Reshuffle1/RemoveRandomKeys output_tags=['out'], receivers=[ConsumerSet[Reshuffle1/RemoveRandomKeys.out0, coder=WindowedValueCoder

I0109 04:09:18.274686 139776353011456 gcsio.py:446] Starting the size estimation of the input
I0109 04:09:18.277992 139776353011456 client.py:614] Attempting refresh to obtain initial access_token
I0109 04:09:18.368139 139776353011456 gcsio.py:460] Finished listing 0 files in 0.0934460163116 seconds.
I0109 04:09:18.369970 139776353011456 bundle_processor.py:303] finish <DataInputOperation ref_PCollection_PCollection_32/Read receivers=[ConsumerSet[ref_PCollection_PCollection_32/Read.out0, coder=WindowedValueCoder[FastPrimitivesCoder], len(consumers)=1]]>
I0109 04:09:18.372355 139776353011456 bundle_processor.py:303] finish <DoOperation WriteSeqex/Write/WriteImpl/PreFinalize output_tags=['out'], receivers=[ConsumerSet[WriteSeqex/Write/WriteImpl/PreFinalize.out0, coder=WindowedValueCoder[LengthPrefixCoder[FastPrimitivesCoder]], len(consumers)=1]]>
I0109 04:09:18.374728 139776353011456 bundle_processor.py:303] finish <DataOutputOperation ref_PCollection_PCollection_41/Write >
I0109 04:09:1

'DONE'

YAY!!