### Import packages

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import re
import math
from datetime import datetime
from collections import Counter
import itertools

import warnings
warnings.filterwarnings('ignore')

### Functions

In [2]:
def find_most_recent(path, prefix):
    """Return the name of the CSV file for ``prefix`` that carries the latest date.

    A file in ``path`` qualifies when its name starts with ``prefix``, ends
    with ``.csv`` and contains a ``YYYY-MM-DD`` stamp after the prefix
    (for example ``demographics_2022-05-26.csv``). Undated files such as
    ``demographics.csv`` and non-CSV files are skipped.

    The date in the file name decides the winner, not the file's modification
    time: every file downloaded in one batch gets nearly the same mtime, so
    mtime says nothing about which export is newest.

    Parameters
    ----------
    path : str
        Directory to scan.
    prefix : str
        Leading part of the file name, e.g. ``'demographics'``.

    Returns
    -------
    str
        Bare file name (not joined with ``path``).

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist (raised by ``os.listdir``) or if no dated
        CSV file starting with ``prefix`` is found.
    """
    date_rx = re.compile(r'\d{4}-\d{2}-\d{2}')

    dated_files = []
    for fname in os.listdir(path):
        if not (fname.startswith(prefix) and fname.endswith('.csv')):
            continue
        # Search only after the prefix so digits inside the prefix cannot be
        # mistaken for the date stamp.
        stamp = date_rx.search(fname, len(prefix))
        if stamp:
            dated_files.append((stamp.group(0), fname))

    if not dated_files:
        raise FileNotFoundError(
            "No dated CSV file starting with {!r} found in {!r}".format(prefix, path)
        )

    # ISO dates (YYYY-MM-DD) sort chronologically as plain strings; ties on the
    # date fall back to the file name so the result stays deterministic.
    return max(dated_files)[1]

### Download Data - Harmonized Tables (Joey)

In [3]:
# Remote folder with the harmonized tables, written as "project:/folder".
# %env exports it so the %%bash download cell can read it as $SET.
%env SET = joey-mhh-sandbox:/harmonized_output/
    
# Local mirror path: ':' becomes '/', giving "data/<project>//<folder>/".
# The trailing '/' comes from the SET value itself.
datadir = "data/" + os.environ['SET'].replace(':','/')
print("Python variable datadir=" + datadir)

env: SET=joey-mhh-sandbox:/harmonized_output/
Python variable datadir=data/joey-mhh-sandbox//harmonized_output/


In [4]:
%%bash
# Mirror the remote folder named by $SET (exported by the %env cell) under ./data.
# ${SET/:/\/} replaces the first ':' with '/', so "project:/folder" becomes "project//folder".
DIR=data/${SET/:/\/}
echo local directory: $DIR
mkdir -p $DIR
dx download -rf "$SET/*" --output $DIR    # "$SET/*" selects every entry in the remote folder; -r recurses, -f overwrites existing local copies
ls -lh $DIR

local directory: data/joey-mhh-sandbox//harmonized_output/
total 16G
-rw-r--r-- 1 root root 285M Aug 27 19:08 core
-rw-r--r-- 1 root root  56M Aug 27 19:09 demographics.csv
-rw-r--r-- 1 root root  22M Aug 27 19:09 demographics_2022-05-24.csv
-rw-r--r-- 1 root root  22M Aug 27 19:08 demographics_2022-05-26.csv
-rw-r--r-- 1 root root 998M Aug 27 19:09 diagnoses.csv
-rw-r--r-- 1 root root 998M Aug 27 19:09 diagnoses_2022-05-24.csv
-rw-r--r-- 1 root root 998M Aug 27 19:08 diagnoses_2022-05-26.csv
-rw-r--r-- 1 root root  44M Aug 27 19:09 encounters.csv
-rw-r--r-- 1 root root  45M Aug 27 19:09 encounters_2022-05-24.csv
-rw-r--r-- 1 root root  50M Aug 27 19:08 encounters_2022-05-26.csv
-rw-r--r-- 1 root root 1.8G Aug 27 19:09 measurements.csv
-rw-r--r-- 1 root root 4.0G Aug 27 19:09 measurements_2022-05-24.csv
-rw-r--r-- 1 root root 4.2G Aug 27 19:08 measurements_2022-05-26.csv
-rw-r--r-- 1 root root 1.1G Aug 27 19:09 medication_2022-05-24.csv
-rw-r--r-- 1 root root 1.1G Aug 27 19:08 medicati

### Download Data - Feature Tables - Dmitrii
To mimc structure

In [5]:
# Remote folder with the feature tables, written as "project:/folder".
# %env exports it so the %%bash download cell can read it as $SET1.
%env SET1 = dx-project-SOWS:/data/MHH_features_table
    
# Local mirror path: ':' becomes '/', giving "data/<project>//<folder>" with no trailing '/'.
datadir1 = "data/" + os.environ['SET1'].replace(':','/')
#print("Python variable datadir=" + datadir1)

env: SET1=dx-project-SOWS:/data/MHH_features_table


In [6]:
%%bash
# Mirror the remote feature-table folder ($SET1, exported by the %env cell) under ./data.
# ${SET1/:/\/} replaces the first ':' with '/', so "project:/folder" becomes "project//folder".
DIR=data/${SET1/:/\/}
echo local directory: $DIR
mkdir -p $DIR
# -r recurses into sub-folders, -f overwrites local files that already exist.
dx download -rf "$SET1/*" --output $DIR
#ls -lh $DIR

local directory: data/dx-project-SOWS//data/MHH_features_table


### Download Data - Lookup Tables

In [7]:
# Remote folder with the lookup tables, written as "project:/folder".
# %env exports it so the %%bash download cell can read it as $SET2.
%env SET2 = bec-data-MetroHealth-Dascena:/HarmonizedT-FeatureT/harmonized_Lookup
    
# Local mirror path: ':' becomes '/', giving "data/<project>//<folder>" with no trailing '/'.
datadir2 = "data/" + os.environ['SET2'].replace(':','/')
#print("Python variable datadir=" + datadir2)

env: SET2=bec-data-MetroHealth-Dascena:/HarmonizedT-FeatureT/harmonized_Lookup


In [17]:
# Remote folder with the harmonized input tables, written as "project:/folder".
# %env exports it so the %%bash download cell can read it as $SET22.
%env SET22 = bec-data-MetroHealth-Dascena:/HarmonizedT-FeatureT/harmonized_Input
    
# Local path derived from SET22 (':' becomes '/').
# NOTE(review): the %%bash cell that downloads SET22 passes the SET2 directory to --output,
# so nothing visible in this notebook creates this directory - confirm before reading from it.
datadir22 = "data/" + os.environ['SET22'].replace(':','/')
#print("Python variable datadir=" + datadir22)

env: SET22=bec-data-MetroHealth-Dascena:/HarmonizedT-FeatureT/harmonized_Input


In [30]:
%%bash
# Download the lookup tables ($SET2) and the harmonized input tables ($SET22).
# ${SET2/:/\/} replaces the first ':' with '/', so "project:/folder" becomes "project//folder".
DIR=data/${SET2/:/\/}
echo local directory: $DIR
mkdir -p $DIR
# -r recurses into sub-folders, -f overwrites local files that already exist.
dx download -rf "$SET2/*" --output $DIR
# NOTE(review): SET22 is written into the same directory as SET2 (DIR is derived from SET2 only),
# so lookup files and harmonized input files end up mixed together - confirm this is intended.
dx download -rf "$SET22/*" --output $DIR

ls -lh $DIR

local directory: data/bec-data-MetroHealth-Dascena//HarmonizedT-FeatureT/harmonized_Lookup
total 6.0G
-rw-r--r-- 1 root root  40K Aug 27 19:46 ABX_guide.docx
-rw-r--r-- 1 root root 3.4K Aug 27 19:46 Antibiotics_V2.csv
-rw-r--r-- 1 root root 121K Aug 27 19:46 Infections_V2.csv
-rw-r--r-- 1 root root  22M Aug 27 19:46 demographics_2022-05-24.csv
-rw-r--r-- 1 root root 998M Aug 27 19:46 diagnoses_2022-05-24.csv
-rw-r--r-- 1 root root  45M Aug 27 19:46 encounters_2022-05-24.csv
-rw-r--r-- 1 root root  19K Aug 27 19:46 iv_meds_sepsis.csv
-rw-r--r-- 1 root root  23K Aug 27 19:46 iv_meds_sepsis.yaml
-rw-r--r-- 1 root root 4.0G Aug 27 19:46 measurements_2022-05-24.csv
-rw-r--r-- 1 root root 1.1G Aug 27 19:46 medication_2022-05-24.csv
-rw-r--r-- 1 root root 3.5K Aug 27 19:46 meds_lookup-Final.csv
-rw-r--r-- 1 root root 3.1K Aug 27 19:46 sepsis_V2.csv
-rw-r--r-- 1 root root  109 Aug 27 19:46 vasopressors_V2.csv


### Download Data - Intermediate Diagnosis and Medication DF Tables (Revital)

In [47]:
# Remote folder with the intermediate diagnosis/medication tables, written as "project:/folder".
# %env exports it so the %%bash download cell can read it as $SET3.
%env SET3 = bec-data-MetroHealth-Dascena:/HarmonizedT-FeatureT/harmonized_Intermediate
    
# Local mirror path: ':' becomes '/', giving "data/<project>//<folder>" with no trailing '/'.
datadir3 = "data/" + os.environ['SET3'].replace(':','/')
print("Python variable datadir=" + datadir3)

env: SET3=bec-data-MetroHealth-Dascena:/HarmonizedT-FeatureT/harmonized_Intermediate
Python variable datadir=data/bec-data-MetroHealth-Dascena//HarmonizedT-FeatureT/harmonized_Intermediate


In [48]:
%%bash
# Mirror the remote intermediate-table folder ($SET3, exported by the %env cell) under ./data.
# ${SET3/:/\/} replaces the first ':' with '/', so "project:/folder" becomes "project//folder".
DIR=data/${SET3/:/\/}
echo local directory: $DIR
mkdir -p $DIR
# -r recurses into sub-folders, -f overwrites local files that already exist.
dx download -rf "$SET3/*" --output $DIR
ls -lh $DIR

local directory: data/bec-data-MetroHealth-Dascena//HarmonizedT-FeatureT/harmonized_Intermediate
total 821M
-rw-r--r-- 1 root root 4.8M Aug 27 21:53 diagnoses_df_intermediate.csv
-rw-r--r-- 1 root root 816M Aug 27 21:53 medications_df_intermediate.csv


### Read Lookup Tables

#### Read Infection ICD10 Codes

In [22]:
# The lookup file is read without a header row (header=None), so its single column is labelled 0.
# Each row holds an ICD-10 code followed by its description, e.g. "A00 Cholera".
infection_ICD_df = pd.read_csv(datadir2 + "/Infections_V2.csv", header=None)

#print(infection_ICD_df.columns)

# Quick look at the raw table before the code/description split.
print(infection_ICD_df.shape)
infection_ICD_df.head(3)

(2508, 1)


Unnamed: 0,0
0,A00 Cholera
1,"A00.0 Cholera due to Vibrio cholerae 01, biova..."
2,"A00.1 Cholera due to Vibrio cholerae 01, biova..."


In [23]:
# ICD-10 code such as "A00" or "A00.1": a letter, a digit, a letter or digit,
# then an optional "." followed by 1-4 letters/digits.
icd10_rx_No = re.compile(r'[A-Z][0-9][A-Z0-9](?:\.[A-Z0-9]{1,4})?')
# The same code plus the whitespace that follows it (kept defined for any cell that refers to it).
icd10_rx_Name = re.compile(r"[A-Z][0-9][A-Z0-9](?:\.[A-Z0-9]{1,4})?\s*")

# Split every "<code> <description>" line of column 0 in one vectorised pass.
# The pattern is anchored at the start of the text, so only the leading code is
# removed from the description; code-like tokens further into the text
# (for example "O157") are left intact.
icd10_parts = infection_ICD_df[0].str.extract(
    r'^(' + icd10_rx_No.pattern + r')\s*(.*)', flags=re.DOTALL
)
infection_ICD_df['icd10_code'] = icd10_parts[0]
infection_ICD_df['icd10_name'] = icd10_parts[1]

# Rows that do not start with an ICD-10 code get NaN in both new columns
# instead of aborting the cell; report them so they are not silently lost.
icd10_unparsed = infection_ICD_df['icd10_code'].isna()
if icd10_unparsed.any():
    print("WARNING: {} row(s) do not start with an ICD-10 code:".format(int(icd10_unparsed.sum())))
    print(infection_ICD_df.loc[icd10_unparsed, 0].head())

In [24]:
# Check the split: icd10_code and icd10_name now sit next to the raw text column 0.
print(infection_ICD_df.shape)
infection_ICD_df.head()

(2508, 3)


Unnamed: 0,0,icd10_code,icd10_name
0,A00 Cholera,A00,Cholera
1,"A00.0 Cholera due to Vibrio cholerae 01, biova...",A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
2,"A00.1 Cholera due to Vibrio cholerae 01, biova...",A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
3,"A00.9 Cholera, unspecified",A00.9,"Cholera, unspecified"
4,A01 Typhoid and paratyphoid fevers,A01,Typhoid and paratyphoid fevers


#### Read Meds: Antibiotics and Vasopressors 

In [33]:
# Medication lookup (ingredient -> med_class); the first CSV column is used as the index.
# Author's intent for later steps: filter by medication route to keep IV antibiotics (ABX-IV)
# for sepsis, for comparison with Dmitrii's list. No filtering happens in this cell.
meds_df_lookup = pd.read_csv(datadir2 + "/meds_lookup-Final.csv", index_col=0)
#print(meds_df_lookup.columns)

print('meds_df_lookup :' )
print(meds_df_lookup.shape)
meds_df_lookup.head(3)

(117, 2)


Unnamed: 0,ingredient,med_class
0,amikacin,antibacterial
1,ampicillin,antibacterial
2,ampicillin/sulbacta,antibacterial


### Read (Dmitrii) Feature Tables CSV to Pandas
To mimc structure

In [34]:
# Load the feature-table export with default dtypes.
# The recorded output below reports about 11.3 million rows and 16 columns, so this read is slow and memory-heavy.
df_MHH_features_df = pd.read_csv(datadir1 + "/metrohealth_2021_08_08-features-100p-1643529047.csv")

In [35]:
# Summarise the loaded feature table: source file -> variable name, shape, column names.
print('metrohealth_2021_08_08-features-100p-1643529047.csv', ' -> ', 'df_MHH_features_df')
print(df_MHH_features_df.shape)
print(df_MHH_features_df.columns.to_list())
print() 

metrohealth_2021_08_08-features-100p-1643529047.csv  ->  df_MHH_features_df
(11309694, 16)
['Measurement', 'Value', 'Patient_ID', 'Timestamp', 'Encounter', 'admission_time', 'discharge_time', 'age', 'visit_type', 'med_class', 'med_name', 'med_route', 'med_dose', 'med_dose_unit', 'med_iv_duration_seconds', 'location']



In [36]:
# Preview the first rows of the feature table.
print(df_MHH_features_df.shape)
df_MHH_features_df.head(5)

(11309694, 16)


Unnamed: 0,Measurement,Value,Patient_ID,Timestamp,Encounter,admission_time,discharge_time,age,visit_type,med_class,med_name,med_route,med_dose,med_dose_unit,med_iv_duration_seconds,location
0,Medication,doxycycline hyclate 100 mg oral tabs,21008896c7e9bbd6fc3f648e1f0d6639c4b68c74,2020-10-30T14:00:00.000Z,000287ab9835ceecdd12a16e5e790a93715c8d85,2020-10-30T13:17:00.000Z,2020-10-30T15:32:00.000Z,77.127586,emergency,antibacterial,doxycycline hyclate 100 mg oral tabs,Oral,100.0,mg,0.0,EMERGENCY
1,Medication,oseltamivir phosphate 75 mg oral caps,e8f44355cf1c8a606f3ba8b0d81ba529b843ecdd,2020-01-02T02:00:00.000Z,001235d117f28c6706b554196e9707218738b48a,2019-12-30T17:47:00.000Z,2020-01-02T19:01:00.000Z,63.095409,inpatient,antiviral,oseltamivir phosphate 75 mg oral caps,Oral,75.0,mg,0.0,INPATIENT FLOOR
2,Medication,oseltamivir phosphate 75 mg oral caps,e8f44355cf1c8a606f3ba8b0d81ba529b843ecdd,2020-01-01T14:34:00.000Z,001235d117f28c6706b554196e9707218738b48a,2019-12-30T17:47:00.000Z,2020-01-02T19:01:00.000Z,63.095409,inpatient,antiviral,oseltamivir phosphate 75 mg oral caps,Oral,75.0,mg,0.0,INPATIENT FLOOR
3,Medication,oseltamivir phosphate 75 mg oral caps,e8f44355cf1c8a606f3ba8b0d81ba529b843ecdd,2020-01-02T15:29:00.000Z,001235d117f28c6706b554196e9707218738b48a,2019-12-30T17:47:00.000Z,2020-01-02T19:01:00.000Z,63.095409,inpatient,antiviral,oseltamivir phosphate 75 mg oral caps,Oral,75.0,mg,0.0,INPATIENT FLOOR
4,Medication,oseltamivir phosphate 75 mg oral caps,e8f44355cf1c8a606f3ba8b0d81ba529b843ecdd,2019-12-31T14:33:00.000Z,001235d117f28c6706b554196e9707218738b48a,2019-12-30T17:47:00.000Z,2020-01-02T19:01:00.000Z,63.095409,inpatient,antiviral,oseltamivir phosphate 75 mg oral caps,Oral,75.0,mg,0.0,INPATIENT FLOOR


### Reading Harmonized Tables 
Input: Joey's Harmonized CSV tables (pandas DFs)


In [37]:
path = datadir
# 'diagnoses' and 'medications' are not listed: they are loaded from the intermediate tables instead.
prefix_list = ['demographics', 'encounters', 'measurements', 'procedures']
most_recent_file_list = []

for prefix in prefix_list:

    # Pick the file to load for this prefix and remember the choice.
    most_recent_file = find_most_recent(path, prefix)
    most_recent_file_list.append(most_recent_file)
    tbl_name = prefix + "_df"

    # os.path.join works whether or not `path` ends with a separator.
    harmonized_table = pd.read_csv(os.path.join(path, most_recent_file))
    # Drop the index column written by a previous to_csv(), if the file has one.
    harmonized_table = harmonized_table.drop(columns=['Unnamed: 0'], errors='ignore')

    # Publish the frame as a notebook-level variable named "<prefix>_df".
    # globals() is the documented writable namespace; writing through locals()
    # only works at module level and fails silently inside a function.
    globals()[tbl_name] = harmonized_table

    # file info:
    print(most_recent_file, ' -> ', tbl_name)
    print(harmonized_table.shape)
    print(harmonized_table.columns.to_list())
    print()

demographics_2022-05-26.csv  ->  demographics_df
(188126, 7)
['patient_id', 'encounter_id', 'sex', 'race', 'ethnicity', 'birth_year', 'death_time']

diagnoses_2022-05-26.csv  ->  diagnoses_df
(9586705, 6)
['patient_id', 'encounter_id', 'name', 'icd10_code', 'is_primary', 'is_history']

encounters_2022-05-26.csv  ->  encounters_df
(188126, 14)
['patient_id', 'encounter_id', 'checkin_time', 'admission_time', 'discharge_time', 'days_from_index', 'los', 'age', 'visit_type', 'location', 'discharge_disposition', 'post_covid', 'year', 'month']

measurements_2022-05-26.csv  ->  measurements_df
(23305127, 12)
['patient_id', 'encounter_id', 'name', 'description', 'loinc_code', 'type', 'result_value', 'result_value_numeric', 'result_unit', 'order_time', 'collection_time', 'result_time']

medications_2022-05-26.csv  ->  medications_df
(4903243, 11)
['patient_id', 'encounter_id', 'name', 'rxnorm_codes', 'route', 'dose', 'units', 'infusion_rate', 'action', 'order_time', 'action_time']

procedures_20

In [38]:
# Show which file was selected for each prefix by the loading loop.
most_recent_file_list

['demographics_2022-05-26.csv',
 'diagnoses_2022-05-26.csv',
 'encounters_2022-05-26.csv',
 'measurements_2022-05-26.csv',
 'medications_2022-05-26.csv',
 'procedures_2022-06-06.csv']

### Reading Intermediate Tables 
Input: Revital's Intermediate CSV tables

In [52]:
# Medications and diagnoses are read from the intermediate CSVs in datadir3.
# NOTE(review): medications uses the first CSV column as the index (index_col=0) while
# diagnoses does not - confirm this asymmetry matches how the two files were written.
medications_df = pd.read_csv(datadir3 + "/medications_df_intermediate.csv", index_col=0)
diagnoses_df = pd.read_csv(datadir3 + "/diagnoses_df_intermediate.csv")

In [28]:
# Rename the date column to "Timestamp" (a no-op if this cell is re-run).
procedures_df = procedures_df.rename(columns={"procedure_date":"Timestamp"})

# Creating a ventilator column in procedures_df:
# a row is flagged when the first word of proc_name contains "ventilator"
# AND its CPT code is 94660.
# NOTE(review): the name match is case-sensitive - confirm proc_name is lower-case.
procedures_df["proc_name_first"] = procedures_df["proc_name"].str.split(' ').str[0]
ventilator_in_name = procedures_df["proc_name_first"].str.contains("ventilator", regex=False, na=False)

# Compare the CPT code numerically: read_csv may parse the column (or parts of it)
# as int/float, and a numeric 94660 never equals the string '94660', which would
# silently reset every row to 0. Non-numeric codes become NaN and do not match.
cpt_is_94660 = pd.to_numeric(procedures_df['cpt_code'], errors='coerce').eq(94660)

procedures_df['ventilation'] = (ventilator_in_name & cpt_is_94660).astype(int)   # needs to be n = 11490

In [None]:
# Interactive check: list the DataFrame variables currently defined in the kernel.
%whos DataFrame

In [39]:
# Marker printed once the download and table-loading cells above have run.
print("File Upload Ended")

File Upload Ended
