# Section Extraction Notebook (BigQuery)

This notebook extracts sections from MIMIC based on the trigger text file you've unpacked in `../data` in the clone of this repo. You need to create a project in your Google Cloud  / BigQuery account as describe over here.

Please note down your **project id** and the **table name** you would like to save the extracted sections to in BigQuery

## Variables

In [18]:
GOOGLE_PROJECT_ID = "hst-953-2019"
GOOGLE_CLIENT_SECRET_FILE = "/Users/reneahlsdorf/Downloads/client_secret_255053072421-h3okuh4a17ruuuffjriv09l6e3n6lslo.apps.googleusercontent.com.json" # "<PATH TO CLIENT FILE>"
BIGQUERY_TARGET_TABLE = "non_adherence.all_sectioned_new_trigger_v10_2022_restore_2"
DATASETNAME = "mimiciii_noteevents"
TRIGGERS_PATH = "../data/triggers.txt"
JOBS = 2

## Extraction

In [25]:
import multiprocessing as mp
import sys, os, math
import pandas as pd
from google_auth_oauthlib import flow
from google.cloud import bigquery
import warnings

from db import mimic
from sectioning import sectioning
from data import triggers

In [None]:
# Setup stuff
mp.set_start_method('fork')
sys.path.insert(0, "../src")
warnings.filterwarnings('ignore')

### MIMIC Extraction

In [None]:
appflow = flow.InstalledAppFlow.from_client_secrets_file(
    GOOGLE_CLIENT_SECRET_FILE, scopes=["https://www.googleapis.com/auth/bigquery"]
)
appflow.run_console()
credentials = appflow.credentials

In [7]:
client = bigquery.Client(GOOGLE_PROJECT_ID, credentials=credentials)

In [14]:
note_ids = mimic.get_row_ids(client)

### Fetching & Processing

In [19]:
trigger_filepath = TRIGGERS_PATH
triggers, phrase_to_group, allphrases = triggers.process_trigger_file(trigger_filepath)

In [33]:
import math
chunk = math.ceil(note_ids.shape[0] / JOBS)

super_block_size = 1000
super_block_cnt = math.ceil(len(note_ids) / super_block_size)
chunk = math.ceil(super_block_size / JOBS)
range_list = range(0, super_block_size, chunk)

In [34]:
def splitting_section_thread(client, idx_start, idx_end):
      returned_sections = mimic.get_note_texts(client, note_ids[idx_start:idx_end])
      missed = []
      
      ret_val = sectioning.section_notes(list(returned_sections.text.values), list(returned_sections.row_id.values), missed, allphrases, phrase_to_group)
      print("Missed (%d - %d): %d" % (idx_start, idx_end, len(missed)))
      return returned_sections, ret_val

In [45]:
def run_thread(index, offset):
  import warnings
  warnings.filterwarnings('ignore')

  client = bigquery.Client(GOOGLE_PROJECT_ID, credentials=credentials)
  indexes = range_list[index]
  
  if offset + indexes >= len(note_ids):
      cols = ['row_id', 'extracted_section_title', 'section_group', 'section_text', 'start', 'end', 'index', 'section_id']
      section_df = pd.DataFrame([], columns=[_ for _ in cols])
      return [section_df, []]

  sections, missed = splitting_section_thread(client, offset + indexes, min(len(note_ids), offset + min(indexes + chunk, super_block_size)))

  return sections, missed

In [49]:
# row_ids.shape[0]

pool = mp.Pool(JOBS)
all_returns = []
for _s in range(super_block_cnt):
  if _s > 0:
    break
  
  print('Now processing superbloc {}/{}'.format(_s+1, (super_block_cnt)))
  returns = pool.starmap(run_thread, [(_i,_s * super_block_size) for _i in range(JOBS)])
  all_returns.append([_s, returns])
  df = pd.concat([_[0] for _ in returns], axis=0)
  print('Exporting {} sections to GBQ...'.format(df.shape[0]))

  df.to_gbq(BIGQUERY_TARGET_TABLE, project_id=GOOGLE_PROJECT_ID, if_exists='append',)
  

Now processing superbloc 1/2084
Missed (500 - 1000): 416
Missed (0 - 500): 38
Exporting 1000 sections to GBQ...
Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=DRv9iwdtFtVfQFaA4ECSF24WxR9BwK&prompt=consent&access_type=offline
