<h1> HIMSS Demo - HealtheDatalab </h1>

<h2> Structured Machine Learning using TensorFlow </h2>
<hr />
This notebook demonstrates a process to train, evaluate, and deploy an ML model to CloudML. It leverages a pre-built machine learning model to predict length of stay in emergency department (ED) and inpatient care settings.
<h3>
<br />
<ol>
<li> Access, Analyze & Visualize Data using HealtheDataLab </li> <br />
<li> Label generation - Generate Labels in TFRecord format </li> <br />
<li> Generate TFSequenceExamples with context = patient + time series data = encounters </li> <br />
<li> Train and Evaluate Machine Learning Model </li> <br />
<li> Deploy ML Model to CloudML </li> <br />
</ol>
</h3>
<hr />


In [4]:
%%bash
# Placeholder shell cell: every command below is commented out, so running
# this cell does nothing. `%%bash` (cell magic) must be the first line so the
# whole cell body is handed to bash.
#python demo_utils.py
#import demo_utils
#print(demo_utils)

<h2> 1. Access, Analyze & Visualize Data using HealtheDataLab </h2>

In [6]:
from pyspark.sql import SparkSession
from bunsen.stu3.bundles import load_from_directory, extract_entry
from demo_utils import age

# Build (or reuse) a Spark session with Hive support enabled so resources can
# be saved as Hive tables.
session_builder = SparkSession.builder.config(
    'hive.exec.dynamic.partition.mode', 'nonstrict')
spark = session_builder.enableHiveSupport().getOrCreate()

# Read the bundles once and cache them so later cells do not reload them.
bundles = load_from_directory(spark, 'gs://cluster-data/demo/data/synthea/fhir/').cache()

# Pull the patient entries out of the bundles.
patients = extract_entry(spark, bundles, 'patient')

# Keep only the identifier and demographic columns shown in the preview.
patient_columns = ['id', 'gender', 'birthDate',
                   'address.city', 'address.state', 'address.country']
pats = patients.select(*patient_columns)

# Bring the first 10 rows into pandas and derive an `age` column by applying
# demo_utils.age to each birthDate value.
patsDF = pats.limit(10).toPandas()
patsDF['age'] = patsDF['birthDate'].apply(age)
display(patsDF)

Unnamed: 0,id,gender,birthDate,city,state,country,age
0,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,male,1980-11-07,[Pittsfield],[Massachusetts],[US],38
1,urn:uuid:345efce8-d11d-429d-9984-6b67e31a7269,male,1952-06-04,[Harwich],[Massachusetts],[US],66
2,urn:uuid:44810270-bafe-42a4-8fc8-c229368b0058,male,1966-02-17,[Hubbardston],[Massachusetts],[US],52
3,urn:uuid:d6be5e17-7733-4096-b3a7-32c2a80582af,female,2018-12-29,[Worcester],[Massachusetts],[US],0
4,urn:uuid:5c6ad3ff-99b1-47b3-92c1-a37d82a5a559,male,1961-03-13,[Methuen Town],[Massachusetts],[US],57
5,urn:uuid:e3952c11-3fa2-4492-899c-bbbb8c7b6db0,male,1956-07-01,[Wareham],[Massachusetts],[US],62
6,urn:uuid:665b7d87-1e8a-46f5-a2fb-6e8200f6662e,male,1952-08-05,[Hudson],[Massachusetts],[US],66
7,urn:uuid:08e56bf9-7034-4b6e-8345-c61a0d910c6e,female,1963-03-29,[Brockton],[Massachusetts],[US],55
8,urn:uuid:e272d8a3-73c9-4887-a457-f0d1d7cc1e44,female,2003-11-19,[Weymouth Town],[Massachusetts],[US],15
9,urn:uuid:1d9e528b-18b4-4cfa-bfd4-d2eb85e9ce1b,female,1984-11-14,[Lowell],[Massachusetts],[US],34


In [7]:
from pyspark.sql.functions import col
from demo_utils import los

# Pull the encounter entries out of the cached bundles.
encounters = extract_entry(spark, bundles, 'encounter')

# Keep the subject reference, class code and period boundaries, restricted to
# inpatient and emergency encounters.
encounter_columns = ['subject.reference', 'class.code', 'period.start', 'period.end']
care_classes = ["inpatient", "emergency"]
encs = (encounters
        .select(*encounter_columns)
        .where(col('class.code').isin(*care_classes)))

# Bring the first 10 rows into pandas and add a `los` column by applying
# demo_utils.los to each row.
encsDF = encs.limit(10).toPandas()
encsDF['los'] = encsDF.apply(los, axis=1)
display(encsDF)

Unnamed: 0,reference,code,start,end,los
0,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1994-12-11T11:05:54-08:00,1994-12-12T11:20:54-08:00,"1 day, 0:15:00"
1,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1995-04-06T12:05:54-07:00,1995-04-07T12:20:54-07:00,"1 day, 0:15:00"
2,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1995-06-19T12:05:54-07:00,1995-06-20T12:05:54-07:00,"1 day, 0:00:00"
3,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1995-08-25T12:05:54-07:00,1995-08-26T12:05:54-07:00,"1 day, 0:00:00"
4,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1995-11-28T11:05:54-08:00,1995-11-29T11:05:54-08:00,"1 day, 0:00:00"
5,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1996-01-18T11:05:54-08:00,1996-01-19T11:05:54-08:00,"1 day, 0:00:00"
6,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1996-03-02T11:05:54-08:00,1996-03-03T11:20:54-08:00,"1 day, 0:15:00"
7,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1996-04-22T12:05:54-07:00,1996-04-23T12:20:54-07:00,"1 day, 0:15:00"
8,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1996-10-01T12:05:54-07:00,1996-10-02T12:05:54-07:00,"1 day, 0:00:00"
9,urn:uuid:c127185e-9f14-462a-9817-c90963fb7354,inpatient,1996-12-17T11:05:54-08:00,1996-12-18T11:20:54-08:00,"1 day, 0:15:00"


In [None]:
#CODE(WIP)
import os

# Google Cloud settings used by the rest of the notebook.
PROJECT = 'dp-workspace'
REGION = 'us-west1'
BUCKET = 'cluster-data'

# Export the settings as environment variables so shell cells can read them
# as $PROJECT, $REGION and $BUCKET.
os.environ.update({
    'PROJECT': PROJECT,
    'REGION': REGION,
    'BUCKET': BUCKET,
})

In [None]:
%%bash
# Point gcloud at the project and compute region exported to the environment
# by the configuration cell. `%%bash` (cell magic) must be the first line so
# the whole cell body is handed to bash.
gcloud config set project "$PROJECT"
gcloud config set compute/region "$REGION"

<h2> 2. Preparation of data - Create input data bundles in TFRecord format</h2>
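This cell loads and caches the FHIR bundles built from raw synthetic data; converting them to TFRecords is still a TODO, so a pre-generated sample TFRecord is used for now.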

In [None]:
from pyspark.sql import SparkSession
from bunsen.stu3.bundles import load_from_directory, extract_entry, write_to_database

# Get a Spark session with Hive support enabled so resources can be saved as
# Hive tables.
spark = (SparkSession.builder
         .config('hive.exec.dynamic.partition.mode', 'nonstrict')
         .enableHiveSupport()
         .getOrCreate())

# Read the raw data (FHIR bundles) from the Google Cloud Storage bucket and
# cache it so it is not reloaded every time.
bundles = load_from_directory(spark, 'gs://bunsen/data/bundles').cache()

# TODO: create TFRecords from the raw FHIR bundles (intended to be one line).
# For now a sample TFRecord has been generated and stored in Cloud Storage at:
#   gs://cluster-data/demo/data/test_bundle.tfrecord-00000-of-00001
# A text version of that TFRecord is in the file bundle_1.pbtxt.

In [None]:
%%bash
# List the demo data objects in the bucket exported as $BUCKET by the
# configuration cell. `%%bash` (cell magic) must be the first line so the
# whole cell body is handed to bash.
gsutil ls -l "gs://${BUCKET}/demo/data/"

<h2> 3. Label generation - Generate Labels in TFRecord format</h2>
Input: FHIR bundles
Output: Labels

In [None]:
from absl import app
from absl import flags
import apache_beam as beam
from proto.stu3 import google_extensions_pb2
from proto.stu3 import resources_pb2
from py.google.fhir.labels import encounter
from py.google.fhir.labels import label

@beam.typehints.with_input_types(resources_pb2.Bundle)
@beam.typehints.with_output_types(google_extensions_pb2.EventLabel)
class LengthOfStayRangeLabelAt24HoursFn(beam.DoFn):
  """Beam DoFn that turns a Bundle into length-of-stay range labels at 24 hours.

    Cohort: inpatient encounters that are longer than 24 hours.
    Trigger point: 24 hours after admission.
    Label: multi-label for length of stay ranges; see label.py for details.
  """

  def process(self, bundle):
    """Walk the bundle's cohort encounters and emit their labels.

    Args:
      bundle: input stu3.Bundle proto.
    Yields:
      stu3.EventLabel proto, one for every label generated per encounter.
    """
    patient = encounter.GetPatient(bundle)
    if patient is None:
      # Without a patient there is nothing to label for this bundle.
      return
    # Cohort: inpatient encounter > 24 hours.
    for inpatient_enc in encounter.Inpatient24HrEncounters(bundle):
      for event_label in label.LengthOfStayRangeAt24Hours(patient, inpatient_enc):
        yield event_label
          
          
          
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import PipelineOptions

from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

import apache_beam as beam
import re


# Cloud Storage root for the demo, derived from the BUCKET constant set in the
# configuration cell (same source as PROJECT below) instead of repeating the
# bucket name in every path.
demo_root = 'gs://{}/demo'.format(BUCKET)

# Pipeline options: Google Cloud project/staging settings, executed locally
# with the DirectRunner.
options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT
google_cloud_options.job_name = 'job1'
google_cloud_options.staging_location = demo_root + '/staging'
google_cloud_options.temp_location = demo_root + '/temp'
options.view_as(StandardOptions).runner = 'DirectRunner'

p = beam.Pipeline(options=options)
input_bundle = demo_root + '/data/test_bundle.tfrecord-00000-of-00001'
output_file_prefix = demo_root + '/data/output/label'

# Read the Bundle protos from the sample TFRecord. The PCollection is named
# `bundle_protos` (not `bundles`) so it does not overwrite the Spark DataFrame
# `bundles` that the earlier cells rely on when they are re-run.
bundle_protos = p | 'read' >> beam.io.ReadFromTFRecord(
    input_bundle, coder=beam.coders.ProtoCoder(resources_pb2.Bundle))

# Convert every bundle into EventLabel protos and write them as TFRecords
# under output_file_prefix.
labels = bundle_protos | 'BundleToLabel' >> beam.ParDo(
    LengthOfStayRangeLabelAt24HoursFn())
_ = labels | beam.io.WriteToTFRecord(
    output_file_prefix,
    coder=beam.coders.ProtoCoder(google_extensions_pb2.EventLabel))

# Run the pipeline and block until it has finished.
p.run().wait_until_finish()

In [None]:
%%bash
# The cell above generates a label TFRecord and stores it in the Cloud Storage
# bucket; list the output location. The `%%bash` cell magic has to be the
# first line of the cell, so this comment comes after it.
gsutil ls -l "gs://${BUCKET}/demo/data/output"

<h2> 4. Generate TFSequenceExamples with context = patient + time series data = encounters</h2>
Input: FHIR bundles
Output: Features

In [None]:
#CODE(WIP)

<h2> 5. Train and Evaluate Machine Learning Model </h2>
Input: Training and Evaluation Dataset
Output: Model

In [None]:
#CODE(WIP)

<h2> 6. Deploy ML Model to CloudML </h2>
Input: New Data set
Output: Length Of Stay Prediction

In [None]:
#CODE(WIP)