Install packages required for apache-beam and protobuf

In [11]:
!pip install protobuf==3.6
import google.protobuf as proto
print(proto.__version__)

3.6.0


In [12]:
!pip install apache-beam==2.7.0
!pip install apache-beam[gcp]





In [10]:
from absl import app
from absl import flags
import apache_beam as beam
from proto.stu3 import google_extensions_pb2
from proto.stu3 import resources_pb2
from py.google.fhir.labels import encounter
from py.google.fhir.labels import label


Initialize variables

In [None]:
flags.DEFINE_string('output_path', 'gs://labelsdata/', 'The output file path')
flags.DEFINE_string('input_path', 'gs://labelsdata/', 'The input file path')
FLAGS = flags.FLAGS

In [13]:
@beam.typehints.with_input_types(resources_pb2.Bundle)
@beam.typehints.with_output_types(google_extensions_pb2.EventLabel)
class LengthOfStayRangeLabelAt24HoursFn(beam.DoFn):
  """Converts Bundle into length of stay range at 24 hours label.

    Cohort: inpatient encounter that is longer than 24 hours
    Trigger point: 24 hours after admission
    Label: multi-label for length of stay ranges, see label.py for detail
  """

  def process(self, bundle):
    """Iterate through bundle and yield label.

    Args:
      bundle: input stu3.Bundle proto
    Yields:
      stu3.EventLabel proto.
    """
    patient = encounter.GetPatient(bundle)
    if patient is not None:
      # Cohort: inpatient encounter > 24 hours.
      for enc in encounter.Inpatient24HrEncounters(bundle):
        for one_label in label.LengthOfStayRangeAt24Hours(patient, enc):
          yield one_label



In [8]:
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import PipelineOptions

from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

import apache_beam as beam
import re


options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'grand-magpie-222719'
google_cloud_options.job_name = 'job1'
google_cloud_options.staging_location = 'gs://bunsenbeam/staging'
google_cloud_options.temp_location = 'gs://bunsenbeam/temp'
#options.view_as(StandardOptions).runner = 'DataflowRunner'
options.view_as(StandardOptions).runner = 'DirectRunner'


Initalize the beam job

In [6]:
!pip install psutil
#installing psutil==5.4.8
!conda remove python-snappy -y
!conda install psutil==5.4.8 -y


2.7.0
Solving environment: failed

PackagesNotFoundError: The following packages are missing from the target environment:
  - python-snappy


Solving environment: done

## Package Plan ##

  environment location: /usr/local/envs/py2env

  added / updated specs: 
    - psutil==5.4.8


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    psutil-5.4.8               |   py27h7b6447c_0         307 KB  defaults

The following packages will be UPDATED:

    psutil: 4.3.0-py27_0 defaults --> 5.4.8-py27h7b6447c_0 defaults


Downloading and Extracting Packages
psutil-5.4.8         | 307 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


ImportError: cannot import name HolonomyGroup

In [14]:

import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
p = beam.Pipeline(options=options)

bundles = p | 'read' >> beam.io.ReadFromTFRecord(
    'gs://labelsdata/data/test_bundle.tfrecord-00000-of-00001', coder=beam.coders.ProtoCoder(resources_pb2.Bundle))
    
labels = bundles | 'BundleToLabel' >> beam.ParDo(
    LengthOfStayRangeLabelAt24HoursFn())
_ = labels | beam.io.WriteToTFRecord(
    'gs://labelsdata/data',
    coder=beam.coders.ProtoCoder(google_extensions_pb2.EventLabel))

p.run()


Using this argument will have no effect on the actual scopes for tokens
requested. These scopes are set at VM instance creation time and
can't be overridden in the request.

I1205 01:35:53.952888 140080787760896 client.py:614] Attempting refresh to obtain initial access_token
I1205 01:35:54.040196 140080787760896 client.py:614] Attempting refresh to obtain initial access_token
I1205 01:35:54.361263 140080787760896 fn_api_runner.py:912] Running ((ref_AppliedPTransform_WriteToTFRecord/Write/WriteImpl/DoOnce/Read_9)+((ref_AppliedPTransform_WriteToTFRecord/Write/WriteImpl/InitializeWrite_10)+(ref_PCollection_PCollection_4/Write)))+(ref_PCollection_PCollection_3/Write)
I1205 01:35:54.372519 140080787760896 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_4/Write >
I1205 01:35:54.375844 140080787760896 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_3/Write >
I1205 01:35:54.379164 140080787760896 bundle_processor.py:291] start <D

I1205 01:35:55.394455 140080787760896 bundle_processor.py:303] finish <DoOperation WriteToTFRecord/Write/WriteImpl/Extract output_tags=['out'], receivers=[ConsumerSet[WriteToTFRecord/Write/WriteImpl/Extract.out0, coder=WindowedValueCoder[LengthPrefixCoder[FastPrimitivesCoder]], len(consumers)=1]]>
I1205 01:35:55.396230 140080787760896 bundle_processor.py:303] finish <DataOutputOperation ref_PCollection_PCollection_11/Write >
I1205 01:35:55.400696 140080787760896 fn_api_runner.py:912] Running ((ref_PCollection_PCollection_3/Read)+(ref_AppliedPTransform_WriteToTFRecord/Write/WriteImpl/PreFinalize_19))+(ref_PCollection_PCollection_12/Write)
I1205 01:35:55.409799 140080787760896 bundle_processor.py:291] start <DataOutputOperation ref_PCollection_PCollection_12/Write >
I1205 01:35:55.411510 140080787760896 bundle_processor.py:291] start <DoOperation WriteToTFRecord/Write/WriteImpl/PreFinalize output_tags=['out']>
I1205 01:35:55.414489 140080787760896 bundle_processor.py:291] start <DataInpu

<apache_beam.runners.portability.fn_api_runner.RunnerResult at 0x7f66ddd57590>

Transform definition

Run the transform

In [None]:
   p.run().wait_until_finish()

YAY!!