## This notebook demonstrates the bundle-to-seqex example

Install the required packages, including Apache Beam and protobuf.

In [1]:
!pip install protobuf==3.6
import google.protobuf as proto
!pip install apache-beam==2.7.0
!pip install apache-beam[gcp]
!pip install psutil
!conda remove python-snappy -y
!conda install psutil==5.4.8 -y

Collecting protobuf==3.6
[?25l  Downloading https://files.pythonhosted.org/packages/27/e7/bf96130ebe633b08a3913da4bb25e50dac5779f1f68e51c99485423f7443/protobuf-3.6.0-cp27-cp27mu-manylinux1_x86_64.whl (7.1MB)
[K    100% |████████████████████████████████| 7.1MB 3.7MB/s eta 0:00:01
[31mtensorboard 1.8.0 has requirement bleach==1.5.0, but you'll have bleach 2.1.2 which is incompatible.[0m
[31mtensorboard 1.8.0 has requirement html5lib==0.9999999, but you'll have html5lib 1.0.1 which is incompatible.[0m
[31mgoogle-cloud-dataflow 2.0.0 has requirement httplib2<0.10,>=0.8, but you'll have httplib2 0.12.0 which is incompatible.[0m
[31mgoogle-cloud-dataflow 2.0.0 has requirement protobuf==3.2.0, but you'll have protobuf 3.6.0 which is incompatible.[0m
Installing collected packages: protobuf
  Found existing installation: protobuf 3.6.1
    Uninstalling protobuf-3.6.1:
      Successfully uninstalled protobuf-3.6.1
Successfully installed protobuf-3.6.0
Collecting pip==9.0.3
[?25l  Down

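## Reset the kernel

Restart the kernel after the installs above so that the newly installed package versions are loaded, then continue with the cells below.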

In [20]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
# Grab the root logger (getLogger() with no name) and lower its threshold to
# INFO so that INFO-level messages are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from absl import app
from absl import flags
import apache_beam as beam
from google.protobuf import text_format
from proto.stu3 import google_extensions_pb2
from proto.stu3 import resources_pb2
from proto.stu3 import version_config_pb2
from tensorflow.core.example import example_pb2


In [21]:
from py.google.fhir.seqex import bundle_to_seqex

In [22]:
def _get_version_config(version_config_path):
  """Loads a text-format VersionConfig proto from a file.

  Args:
    version_config_path: Path handed to the builtin open(); the file is
      expected to contain a VersionConfig message in protobuf text format.

  Returns:
    A version_config_pb2.VersionConfig populated from the file contents.
  """
  with open(version_config_path) as config_file:
    config_text = config_file.read()
  # text_format.Parse fills in the message it is given and returns it.
  return text_format.Parse(config_text, version_config_pb2.VersionConfig())

## Initialize variables

In [23]:
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import PipelineOptions

from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

import apache_beam as beam
import re

# Pipeline options object that is passed to beam.Pipeline in a later cell.
options = PipelineOptions()
# View the same options through the Google Cloud option set and fill in the
# GCP project, the job name, and the GCS staging / temp locations.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'de-test-224618'
google_cloud_options.job_name = 'job1'
google_cloud_options.staging_location = 'gs://healthedatalab/staging'
google_cloud_options.temp_location = 'gs://healthedatalab/temp'
# Select the DirectRunner as the runner for this pipeline.
options.view_as(StandardOptions).runner = 'DirectRunner'


## Transform definition

In [28]:
# Construct the pipeline using the options configured above. This cell only
# builds the pipeline; it is run by a separate p.run() call in a later cell.
p = beam.Pipeline(options=options)
# _get_version_config reads its argument with the builtin open(), so a local
# file path is used; the gs:// variant is left commented out.
#version_config = _get_version_config("gs://seqex/testdata/version_config.textproto")
version_config = _get_version_config("/usr/local/fhir/proto/stu3/version_config.textproto")

# Read Bundle protos from a TFRecord file on GCS and pass each one through
# KeyBundleByPatientIdFn (presumably keys bundles by patient id -- see
# bundle_to_seqex to confirm the exact output).
keyed_bundles = ( 
    p 
    | 'readBundles' >> beam.io.ReadFromTFRecord(
        "gs://healthedatalab/bundle/test_bundle.tfrecord-00000-of-00001", coder=beam.coders.ProtoCoder(resources_pb2.Bundle))
    | 'KeyBundlesByPatientId' >> beam.ParDo(
        bundle_to_seqex.KeyBundleByPatientIdFn()))
# Read EventLabel protos from a TFRecord file on GCS.
event_labels = ( 
    p | 'readEventLabels' >> beam.io.ReadFromTFRecord(
        "gs://healthedatalab/labeldata/test-label-00000-of-00001",
        coder=beam.coders.ProtoCoder(google_extensions_pb2.EventLabel)))
# Combine the labels with the keyed bundles using helpers from
# bundle_to_seqex; the shape of their outputs is not visible in this notebook.
keyed_event_labels = bundle_to_seqex.CreateTriggerLabelsPairLists(
    event_labels)
bundles_and_labels = bundle_to_seqex.CreateBundleAndLabels(
    keyed_bundles, keyed_event_labels)
# Run BundleAndLabelsToSeqexDoFn over the combined data (with a Reshuffle
# before and after it) and write the results as SequenceExample protos to
# TFRecord files under the GCS output prefix. The result is bound to `_`
# because it is not used afterwards.
_ = ( 
    bundles_and_labels
    | 'Reshuffle1' >> beam.Reshuffle()
    | 'GenerateSeqex' >> beam.ParDo(
        bundle_to_seqex.BundleAndLabelsToSeqexDoFn(
            version_config=version_config,
            enable_attribution=False,
            generate_sequence_label=False))
    | 'Reshuffle2' >> beam.Reshuffle()
    | 'WriteSeqex' >> beam.io.WriteToTFRecord(
        "gs://healthedatalab/output/output",
        coder=beam.coders.ProtoCoder(example_pb2.SequenceExample)))


I1222 05:33:05.470644 139993450239744 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:33:05.538533 139993450239744 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:33:05.616277 139993450239744 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:33:05.689532 139993450239744 client.py:614] Attempting refresh to obtain initial access_token


Run the transform pipeline defined above and wait for it to finish.

In [29]:
   # Start the pipeline, then block until it completes; the final state is
   # the value of the last expression and is shown as the cell output.
   pipeline_result = p.run()
   pipeline_result.wait_until_finish()

I1222 05:33:08.443044 139993450239744 fn_api_runner.py:912] Running (ref_AppliedPTransform_readEventLabels/Read_6)+((ref_AppliedPTransform_KeyEventLabelsByPatientId_7)+(GroupEventLabelsByPatientId/Write))
I1222 05:33:08.452006 139993450239744 bundle_processor.py:291] start <DataOutputOperation GroupEventLabelsByPatientId/Write >
I1222 05:33:08.454138 139993450239744 bundle_processor.py:291] start <DoOperation KeyEventLabelsByPatientId output_tags=['out']>
I1222 05:33:08.456631 139993450239744 bundle_processor.py:291] start <ReadOperation readEventLabels/Read source=SourceBundle(weight=1.0, source=<apache_beam.io.tfrecordio._TFRecordSource object at 0x7f5280ee5890>, start_position=None, stop_position=None)>
I1222 05:33:08.463313 139993450239744 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:33:08.532279 139993450239744 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:33:08.601000 139993450239744 client.py:614] Attempting refresh to obt

I1222 05:33:08.867156 139993450239744 bundle_processor.py:291] start <ReadOperation readBundles/Read source=SourceBundle(weight=1.0, source=<apache_beam.io.tfrecordio._TFRecordSource object at 0x7f5280cc6ad0>, start_position=None, stop_position=None)>
I1222 05:33:08.873121 139993450239744 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:33:09.035702 139993450239744 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:33:09.132199 139993450239744 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:33:09.323936 139993450239744 bundle_processor.py:303] finish <ReadOperation readBundles/Read source=SourceBundle(weight=1.0, source=<apache_beam.io.tfrecordio._TFRecordSource object at 0x7f5280cc6ad0>, start_position=None, stop_position=None), receivers=[ConsumerSet[readBundles/Read.out0, coder=WindowedValueCoder[FastPrimitivesCoder], len(consumers)=1]]>
I1222 05:33:09.325764 139993450239744 bundle_processor.py:303] finish <Do

I1222 05:33:09.536113 139993450239744 bundle_processor.py:303] finish <DataInputOperation Reshuffle1/ReshufflePerKey/GroupByKey/Read receivers=[ConsumerSet[Reshuffle1/ReshufflePerKey/GroupByKey/Read.out0, coder=WindowedValueCoder[TupleCoder[LengthPrefixCoder[FastPrimitivesCoder], IterableCoder[TupleCoder[TupleCoder[BytesCoder, TupleCoder[LengthPrefixCoder[ProtoCoder], LengthPrefixCoder[FastPrimitivesCoder]]], LengthPrefixCoder[FastPrimitivesCoder]]]]], len(consumers)=1]]>
I1222 05:33:09.538016 139993450239744 bundle_processor.py:303] finish <DoOperation Reshuffle1/ReshufflePerKey/FlatMap(restore_timestamps) output_tags=['out'], receivers=[ConsumerSet[Reshuffle1/ReshufflePerKey/FlatMap(restore_timestamps).out0, coder=WindowedValueCoder[FastPrimitivesCoder], len(consumers)=1]]>
I1222 05:33:09.540751 139993450239744 bundle_processor.py:303] finish <DoOperation Reshuffle1/RemoveRandomKeys output_tags=['out'], receivers=[ConsumerSet[Reshuffle1/RemoveRandomKeys.out0, coder=WindowedValueCoder

I1222 05:33:10.175589 139993450239744 gcsio.py:446] Starting the size estimation of the input
I1222 05:33:10.177778 139993450239744 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:33:10.259891 139993450239744 gcsio.py:460] Finished listing 0 files in 0.0842850208282 seconds.
I1222 05:33:10.263046 139993450239744 bundle_processor.py:303] finish <DataInputOperation ref_PCollection_PCollection_32/Read receivers=[ConsumerSet[ref_PCollection_PCollection_32/Read.out0, coder=WindowedValueCoder[FastPrimitivesCoder], len(consumers)=1]]>
I1222 05:33:10.266056 139993450239744 bundle_processor.py:303] finish <DoOperation WriteSeqex/Write/WriteImpl/PreFinalize output_tags=['out'], receivers=[ConsumerSet[WriteSeqex/Write/WriteImpl/PreFinalize.out0, coder=WindowedValueCoder[LengthPrefixCoder[FastPrimitivesCoder]], len(consumers)=1]]>
I1222 05:33:10.268115 139993450239744 bundle_processor.py:303] finish <DataOutputOperation ref_PCollection_PCollection_41/Write >
I1222 05:33:1

'DONE'

YAY!!