Install packages required for apache-beam and protobuf

In [11]:
!pip install protobuf==3.6
import google.protobuf as proto
!pip install apache-beam==2.7.0
!pip install apache-beam[gcp]
!pip install psutil
!conda remove python-snappy -y
!conda install psutil==5.4.8 -y

3.6.0


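## RESET KERNEL

Restart the kernel before running the cells below so that the packages installed above are the ones that get imported.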

In [11]:
from absl import app
from absl import flags
import apache_beam as beam
from proto.stu3 import google_extensions_pb2
from proto.stu3 import resources_pb2
from py.google.fhir.labels import encounter
from py.google.fhir.labels import label


In [12]:
@beam.typehints.with_input_types(resources_pb2.Bundle)
@beam.typehints.with_output_types(google_extensions_pb2.EventLabel)
class LengthOfStayRangeLabelAt24HoursFn(beam.DoFn):
  """DoFn that turns a Bundle into length-of-stay range labels at 24 hours.

  Cohort: inpatient encounters that last longer than 24 hours.
  Trigger point: 24 hours after admission.
  Label: multi-label over length-of-stay ranges; see label.py for detail.
  """

  def process(self, bundle):
    """Emit every label produced for the qualifying encounters of a bundle.

    Args:
      bundle: input stu3.Bundle proto.

    Yields:
      stu3.EventLabel proto, one for each label returned per encounter.
    """
    patient = encounter.GetPatient(bundle)
    if patient is None:
      # Without a patient there is nothing to label for this bundle.
      return

    # Cohort: inpatient encounter > 24 hours.
    for stay in encounter.Inpatient24HrEncounters(bundle):
      for event_label in label.LengthOfStayRangeAt24Hours(patient, stay):
        yield event_label



## Initialize pipeline variables

In [13]:
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import PipelineOptions

from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

import apache_beam as beam
import re

# Tunable settings, gathered in one place so the project, bucket, job name and
# runner can be changed without touching the option wiring below.
GCP_PROJECT = 'de-test-224618'
JOB_NAME = 'job1'
GCS_BUCKET = 'gs://healthedatalab'
RUNNER = 'DirectRunner'

# `options` is the object handed to beam.Pipeline; the typed views below write
# their settings into it.
options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = GCP_PROJECT
google_cloud_options.job_name = JOB_NAME
# Staging and temp locations both live under the same bucket.
google_cloud_options.staging_location = GCS_BUCKET + '/staging'
google_cloud_options.temp_location = GCS_BUCKET + '/temp'
options.view_as(StandardOptions).runner = RUNNER


## Initialize the beam job

In [17]:

import logging

# Set the root logger to INFO so INFO-level messages are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

p = beam.Pipeline(options=options)

# Source: a TFRecord file whose records are decoded as stu3 Bundle protos.
bundles = (
    p
    | 'read' >> beam.io.ReadFromTFRecord(
        'gs://healthedatalab/bundle/test_bundle.tfrecord-00000-of-00001',
        coder=beam.coders.ProtoCoder(resources_pb2.Bundle)))

# Transform: apply the length-of-stay labelling DoFn to every bundle.
labels = (
    bundles
    | 'BundleToLabel' >> beam.ParDo(LengthOfStayRangeLabelAt24HoursFn()))

# Sink: write the EventLabel protos back out as TFRecords.
_ = (
    labels
    | beam.io.WriteToTFRecord(
        'gs://healthedatalab/labeldata/test-label',
        coder=beam.coders.ProtoCoder(google_extensions_pb2.EventLabel)))



I1222 05:29:09.472229 140090207852288 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:29:09.540534 140090207852288 client.py:614] Attempting refresh to obtain initial access_token


## Run the transform

In [18]:
   # Start the pipeline, then block until it finishes; the final expression's
   # value is what the notebook displays as the cell output.
   pipeline_result = p.run()
   pipeline_result.wait_until_finish()

I1222 05:29:11.368493 140090207852288 fn_api_runner.py:912] Running ((ref_AppliedPTransform_WriteToTFRecord/Write/WriteImpl/DoOnce/Read_9)+((ref_AppliedPTransform_WriteToTFRecord/Write/WriteImpl/InitializeWrite_10)+(ref_PCollection_PCollection_4/Write)))+(ref_PCollection_PCollection_3/Write)
I1222 05:29:11.377844 140090207852288 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_4/Write >
I1222 05:29:11.379923 140090207852288 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_3/Write >
I1222 05:29:11.381635 140090207852288 bundle_processor.py:291] start <DoOperation WriteToTFRecord/Write/WriteImpl/InitializeWrite output_tags=['out']>
I1222 05:29:11.384872 140090207852288 bundle_processor.py:291] start <ReadOperation WriteToTFRecord/Write/WriteImpl/DoOnce/Read source=SourceBundle(weight=1.0, source=<apache_beam.transforms.create_source._CreateSource object at 0x7f6910cb6210>, start_position=None, stop_position=None)>
I1222 05:29:

I1222 05:29:12.412421 140090207852288 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_12/Write >
I1222 05:29:12.414108 140090207852288 bundle_processor.py:291] start <DoOperation WriteToTFRecord/Write/WriteImpl/PreFinalize output_tags=['out']>
I1222 05:29:12.418554 140090207852288 bundle_processor.py:291] start <DataInputOperation ref_PCollection_PCollection_3/Read receivers=[ConsumerSet[ref_PCollection_PCollection_3/Read.out0, coder=WindowedValueCoder[FastPrimitivesCoder], len(consumers)=1]]>
I1222 05:29:12.423790 140090207852288 gcsio.py:446] Starting the size estimation of the input
I1222 05:29:12.428107 140090207852288 client.py:614] Attempting refresh to obtain initial access_token
I1222 05:29:12.507543 140090207852288 gcsio.py:460] Finished listing 0 files in 0.0837459564209 seconds.
I1222 05:29:12.509641 140090207852288 bundle_processor.py:303] finish <DataInputOperation ref_PCollection_PCollection_3/Read receivers=[ConsumerSet[ref_PCollection_PCo

'DONE'

YAY!!