## Generate the Data Check Documents for the Proper Data Control Insurance Checks

In [1]:
# Note: output files will be placed in the working dir
#
# Both paths must end with a path separator:
#   - later cells build input paths as f"{database_dir}<folder>/<file>.csv"
#   - later cells strip the last character of working_dir (working_dir[:-1])
#     before passing it to os.path.join

#PC: 
database_dir = r"E:\TriNetX\\"   # Location where the database files are stored 
working_dir = r"C:\Users\reblo\Box\Residency Personal Files\Scholarly Work\Locke Research Projects\TriNetX Code\Hypercapnia TriNetX CSV Processing\Working\\" # location to read from and write to (faster = better if space allows)

#Mac 
#database_dir = r"/Volumes/LOCKE STUDY/TriNetX/"   # Location where the database files are stored (trailing slash required)
#working_dir = r"/Users/blocke/TriNetX Working/"

In [2]:
import time
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datetime import datetime
import gc
import dask.dataframe as dd
import dask
import logging
from dask.distributed import Client, LocalCluster
from dask import config
import h5py

# Make sure the "data_checks" output folder exists inside the working dir
# (exist_ok=True makes this a no-op when the folder is already there)
data_checks_dir = os.path.join(working_dir[:-1], "data_checks")
os.makedirs(data_checks_dir, exist_ok=True)

### Make HDF5 Files with each type of data element

#### Vital Signs

In [3]:
#Vital Signs
# Builds vitals_unique_encounters.h5: a single 'unique_encounters' table holding the
# encounter_id values found in the vital-signs CSVs. IDs are de-duplicated within
# each CSV only, so the table can still contain the same ID from different CSVs.
start_time = time.time()
store_path = os.path.join(working_dir[:-1], 'vitals_unique_encounters.h5')

# Ensure a fresh start: append() below would otherwise add to a previous run's table
if os.path.exists(store_path):
    os.remove(store_path)

#num_spreadsheets = 10
num_spreadsheets = 853

# NOTE(review): this list is identical to the one in the Diagnoses cell. Only
# "encounter_id" (second position) is read, so the other names do not affect the
# output, but confirm the column count/order against the vital-signs file layout.
columns = ["patient_id","encounter_id","code_system","code","principal_diagnosis_indicator","admitting_diagnosis","reason_for_visit","date","derived_by_TriNetX","source_id"]

# The context manager closes the store even when a CSV fails part-way through,
# and never touches a stale/undefined `store` if opening the file itself fails.
with pd.HDFStore(store_path) as store:
    # Process each CSV and store directly to HDF5
    for i in range(1, num_spreadsheets + 1):
        print(f'{i:04}')
        # os.path.join works whether or not database_dir ends with a separator
        file_path = os.path.join(database_dir, "Vital Signs", f"vital_signs{i:04}.csv")
        chunk = pd.read_csv(file_path,
            names=columns,          # Override column names
            usecols=["encounter_id"],  # Only read the "encounter_id" column
            dtype={"encounter_id": str},  # Ensure "encounter_id" is read as a string
            skiprows=1 if i == 1 else 0   # Skip the first row only for the first file
        )
        # A row without an encounter_id does not identify an encounter, and pandas
        # merges match null keys to each other, so such rows would cross-join in the
        # later merge cells. Drop them, then de-duplicate within this CSV.
        chunk = chunk.dropna(subset=["encounter_id"]).drop_duplicates(subset=["encounter_id"])
        store.append('unique_encounters', chunk, format='table', data_columns=True, index=False, min_itemsize={'encounter_id': 12})

# Report wall-clock time for the whole cell
end_time = time.time()
execution_time = end_time - start_time
hours = int(execution_time // 3600)
minutes = int((execution_time % 3600) // 60)
seconds = execution_time % 60
print(f"Executed in {hours} hours, {minutes} minutes, and {seconds:.2f} seconds.")
gc.collect()

0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200


401

#### Diagnoses

In [4]:
#Diagnoses 
# Builds diag_unique_encounters.h5: a single 'unique_encounters' table holding the
# encounter_id values found in the diagnosis CSVs. IDs are de-duplicated within
# each CSV only, so the table can still contain the same ID from different CSVs.
start_time = time.time()

store_path = os.path.join(working_dir[:-1], 'diag_unique_encounters.h5')
# Ensure a fresh start: append() below would otherwise add to a previous run's table
if os.path.exists(store_path):
    os.remove(store_path)

num_spreadsheets = 1273

columns = ["patient_id","encounter_id","code_system","code","principal_diagnosis_indicator","admitting_diagnosis","reason_for_visit","date","derived_by_TriNetX","source_id"]

# The context manager closes the store even when a CSV fails part-way through,
# and never touches a stale/undefined `store` if opening the file itself fails.
with pd.HDFStore(store_path) as store:
    # Process each CSV and store directly to HDF5
    for i in range(1, num_spreadsheets + 1):
        print(f'{i:04}')
        # os.path.join works whether or not database_dir ends with a separator
        file_path = os.path.join(database_dir, "Diagnosis", f"diagnosis{i:04}.csv")
        chunk = pd.read_csv(file_path,
            names=columns,          # Override column names
            usecols=["encounter_id"],  # Only read the "encounter_id" column
            dtype={"encounter_id": str},  # Ensure "encounter_id" is read as a string
            skiprows=1 if i == 1 else 0   # Skip the first row only for the first file
        )
        # A row without an encounter_id does not identify an encounter, and pandas
        # merges match null keys to each other, so such rows would cross-join in the
        # later merge cells. Drop them, then de-duplicate within this CSV.
        chunk = chunk.dropna(subset=["encounter_id"]).drop_duplicates(subset=["encounter_id"])
        store.append('unique_encounters', chunk, format='table', data_columns=True, index=False, min_itemsize={'encounter_id': 12})

# Report wall-clock time for the whole cell
end_time = time.time()
execution_time = end_time - start_time
hours = int(execution_time // 3600)
minutes = int((execution_time % 3600) // 60)
seconds = execution_time % 60
print(f"Executed in {hours} hours, {minutes} minutes, and {seconds:.2f} seconds.")
gc.collect()

0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200


397

#### Labs

In [5]:
# Labs
# Builds lab_unique_encounters.h5: a single 'unique_encounters' table holding the
# encounter_id values found in the lab-result CSVs. IDs are de-duplicated within
# each CSV only, so the table can still contain the same ID from different CSVs.
start_time = time.time()

store_path = os.path.join(working_dir[:-1], 'lab_unique_encounters.h5')
# Ensure a fresh start: append() below would otherwise add to a previous run's table
if os.path.exists(store_path):
    os.remove(store_path)

#num_spreadsheets = 10
num_spreadsheets = 2334

columns = ["patient_id","encounter_id","code_system","code","date","value","text_value","units_of_measure","derived_by_TriNetX","source_id"]

# The context manager closes the store even when a CSV fails part-way through,
# and never touches a stale/undefined `store` if opening the file itself fails.
with pd.HDFStore(store_path) as store:
    # Process each CSV and store directly to HDF5
    for i in range(1, num_spreadsheets + 1):
        print(f'{i:04}')
        # os.path.join works whether or not database_dir ends with a separator
        file_path = os.path.join(database_dir, "Lab Results", f"lab_results{i:04}.csv")
        chunk = pd.read_csv(file_path,
            names=columns,          # Override column names
            usecols=["encounter_id"],  # Only read the "encounter_id" column
            dtype={"encounter_id": str},  # Ensure "encounter_id" is read as a string
            skiprows=1 if i == 1 else 0   # Skip the first row only for the first file
        )
        # A row without an encounter_id does not identify an encounter, and pandas
        # merges match null keys to each other, so such rows would cross-join in the
        # later merge cells. Drop them, then de-duplicate within this CSV.
        chunk = chunk.dropna(subset=["encounter_id"]).drop_duplicates(subset=["encounter_id"])
        store.append('unique_encounters', chunk, format='table', data_columns=True, index=False, min_itemsize={'encounter_id': 12})

# Report wall-clock time for the whole cell
end_time = time.time()
execution_time = end_time - start_time
hours = int(execution_time // 3600)
minutes = int((execution_time % 3600) // 60)
seconds = execution_time % 60
print(f"Executed in {hours} hours, {minutes} minutes, and {seconds:.2f} seconds.")
gc.collect()

0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200


205

#### Procedures

In [6]:
# Procedures
# Builds proc_unique_encounters.h5: a single 'unique_encounters' table holding the
# encounter_id values found in the procedure CSVs. IDs are de-duplicated within
# each CSV only, so the table can still contain the same ID from different CSVs.
start_time = time.time()

store_path = os.path.join(working_dir[:-1], 'proc_unique_encounters.h5')
# Ensure a fresh start: append() below would otherwise add to a previous run's table
if os.path.exists(store_path):
    os.remove(store_path)

#num_spreadsheets = 10
num_spreadsheets = 714

columns = ["patient_id","encounter_id","code_system","code","principal_procedure_indicator","date","derived_by_TriNetX","source_id"]

# The context manager closes the store even when a CSV fails part-way through,
# and never touches a stale/undefined `store` if opening the file itself fails.
with pd.HDFStore(store_path) as store:
    # Process each CSV and store directly to HDF5
    for i in range(1, num_spreadsheets + 1):
        print(f'{i:04}')
        # os.path.join works whether or not database_dir ends with a separator
        file_path = os.path.join(database_dir, "Procedure", f"procedure{i:04}.csv")
        chunk = pd.read_csv(file_path,
            names=columns,          # Override column names
            usecols=["encounter_id"],  # Only read the "encounter_id" column
            dtype={"encounter_id": str},  # Ensure "encounter_id" is read as a string
            skiprows=1 if i == 1 else 0   # Skip the first row only for the first file
        )
        # A row without an encounter_id does not identify an encounter, and pandas
        # merges match null keys to each other, so such rows would cross-join in the
        # later merge cells. Drop them, then de-duplicate within this CSV.
        chunk = chunk.dropna(subset=["encounter_id"]).drop_duplicates(subset=["encounter_id"])
        store.append('unique_encounters', chunk, format='table', data_columns=True, index=False, min_itemsize={'encounter_id': 12})

# Report wall-clock time for the whole cell
end_time = time.time()
execution_time = end_time - start_time
hours = int(execution_time // 3600)
minutes = int((execution_time % 3600) // 60)
seconds = execution_time % 60
print(f"Executed in {hours} hours, {minutes} minutes, and {seconds:.2f} seconds.")
gc.collect()

0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200


493

#### Meds

In [7]:
# Meds
# Builds med_unique_encounters.h5: a single 'unique_encounters' table holding the
# encounter_id values found in the medication CSVs. IDs are de-duplicated within
# each CSV only, so the table can still contain the same ID from different CSVs.
start_time = time.time()

store_path = os.path.join(working_dir[:-1], 'med_unique_encounters.h5')
# Ensure a fresh start: append() below would otherwise add to a previous run's table
if os.path.exists(store_path):
    os.remove(store_path)

#num_spreadsheets = 10
num_spreadsheets = 2991

columns = ["patient_id","encounter_id","unique_id","code_system","code","start_date","route","brand","strength","derived_by_TriNetX","source_id"]

# The context manager closes the store even when a CSV fails part-way through,
# and never touches a stale/undefined `store` if opening the file itself fails.
with pd.HDFStore(store_path) as store:
    # Process each CSV and store directly to HDF5
    for i in range(1, num_spreadsheets + 1):
        print(f'{i:04}')
        # os.path.join works whether or not database_dir ends with a separator
        file_path = os.path.join(database_dir, "Medications", f"medication{i:04}.csv")
        chunk = pd.read_csv(file_path,
            names=columns,          # Override column names
            usecols=["encounter_id"],  # Only read the "encounter_id" column
            dtype={"encounter_id": str},  # Ensure "encounter_id" is read as a string
            skiprows=1 if i == 1 else 0   # Skip the first row only for the first file
        )
        # A row without an encounter_id does not identify an encounter, and pandas
        # merges match null keys to each other, so such rows would cross-join in the
        # later merge cells. Drop them, then de-duplicate within this CSV.
        chunk = chunk.dropna(subset=["encounter_id"]).drop_duplicates(subset=["encounter_id"])
        store.append('unique_encounters', chunk, format='table', data_columns=True, index=False, min_itemsize={'encounter_id': 12})

# Report wall-clock time for the whole cell
end_time = time.time()
execution_time = end_time - start_time
hours = int(execution_time // 3600)
minutes = int((execution_time % 3600) // 60)
seconds = execution_time % 60
print(f"Executed in {hours} hours, {minutes} minutes, and {seconds:.2f} seconds.")
gc.collect()


0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200


61

### Merge All Encounters of Ambulatory and Emerg/Inp, then deduplicate and make screen

#### H5 Structure Check Code

In [8]:
def print_structure_and_count(name, obj):
    """Print one line describing an HDF5 node.

    Intended as the callback for ``h5py``'s ``visititems``: datasets are
    printed as ``"<name>: <len> entries"``, anything else as its bare name.
    Returns None.
    """
    is_dataset = isinstance(obj, h5py.Dataset)
    print(f"{name}: {len(obj)} entries" if is_dataset else name)

def check_and_print_structure(file_path):
    """Print the node layout (and dataset lengths) of the HDF5 file at ``file_path``.

    Prints a header with the file's base name, then opens the file read-only
    and visits every item with ``print_structure_and_count``.
    """
    file_name = os.path.basename(file_path)
    print(f"Structure of {file_name}:")
    with h5py.File(file_path, 'r') as h5_file:
        h5_file.visititems(print_structure_and_count)

# Locations of the five per-table encounter-ID stores inside the working dir
diag_path, vitals_path, lab_path, med_path, proc_path = (
    os.path.join(working_dir[:-1], h5_name)
    for h5_name in (
        'diag_unique_encounters.h5',
        'vitals_unique_encounters.h5',
        'lab_unique_encounters.h5',
        'med_unique_encounters.h5',
        'proc_unique_encounters.h5',
    )
)

# List of all HDF5 paths
paths = [diag_path, vitals_path, lab_path, med_path, proc_path]

# Print each file's structure and entry counts in turn
for path in paths:
    check_and_print_structure(path)

Structure of diag_unique_encounters.h5:
unique_encounters
unique_encounters/table: 249836036 entries
Structure of vitals_unique_encounters.h5:
unique_encounters
unique_encounters/table: 101603643 entries
Structure of lab_unique_encounters.h5:
unique_encounters
unique_encounters/table: 100800073 entries
Structure of med_unique_encounters.h5:
unique_encounters
unique_encounters/table: 159229970 entries
Structure of proc_unique_encounters.h5:
unique_encounters
unique_encounters/table: 166304142 entries


#### Merge All Ambulatory Screen Encounters

In [9]:
# Ambulatory screen: encounter_ids that appear in the diagnosis, vital-sign AND lab stores.
# The stores are only de-duplicated per source CSV, so the inner joins below can emit
# the same encounter_id more than once; the output CSV is de-duplicated in a later cell.

# Define paths
diag_path = os.path.join(working_dir[:-1], 'diag_unique_encounters.h5')
vitals_path = os.path.join(working_dir[:-1], 'vitals_unique_encounters.h5')
lab_path = os.path.join(working_dir[:-1], 'lab_unique_encounters.h5')
output_path = os.path.join(working_dir[:-1], 'amb_screen_all_encounters.csv')

# Adjust memory spilling settings.
# This must happen BEFORE the cluster is created: the scheduler and workers take
# their configuration when they start, so values set afterwards are not applied
# to an already-running LocalCluster.
dask.config.set({
    'distributed.worker.memory.target': 0.70,
    'distributed.worker.memory.spill': 0.80,
    'distributed.worker.memory.pause': 0.80,
    'distributed.worker.memory.terminate': 0.95,
    'distributed.scheduler.allowed-failures': 10,
})

# Configure Dask LocalCluster
memory_per_worker = '6GB'

# The context managers close the client and cluster even if a step fails.
# Errors are deliberately NOT caught and printed per step: every step depends on the
# previous one, and a swallowed failure would let later cells read a stale CSV.
with LocalCluster(
    n_workers=4,               # Number of worker processes
    threads_per_worker=1,      # Number of threads per worker
    memory_limit=memory_per_worker,  # Memory limit per worker
    processes=True,            # Use separate processes for each worker
    dashboard_address=':8787'  # Dashboard address for monitoring
) as cluster, Client(cluster) as client:

    # Read the data using Dask
    diag_ddf = dd.read_hdf(diag_path, 'unique_encounters/table')
    vitals_ddf = dd.read_hdf(vitals_path, 'unique_encounters/table')
    print("Successfully read H5 files into Dask DataFrames.")

    # Keep only the encounter_id column
    diag_ddf = diag_ddf[['encounter_id']]
    vitals_ddf = vitals_ddf[['encounter_id']]

    # Convert 'encounter_id' to string and trim whitespace
    diag_ddf['encounter_id'] = diag_ddf['encounter_id'].astype(str).str.strip()
    vitals_ddf['encounter_id'] = vitals_ddf['encounter_id'].astype(str).str.strip()
    print("Successfully converted 'encounter_id' to strings and trimmed whitespace.")

    # Perform the merge operation to keep only common "encounter_id"
    merged_ddf = diag_ddf.merge(vitals_ddf, on='encounter_id', how='inner')
    del diag_ddf, vitals_ddf
    print("Successfully merged DataFrames.")

    # Persist the intermediate result to avoid recomputation.
    # With a distributed client persist() returns right away and computes in the
    # background, so a failure in this stage may only surface at to_csv() below.
    merged_ddf = merged_ddf.persist()
    print("Successfully persisted merged DataFrame.")

    #-----------------------------
    # Now add labs
    lab_ddf = dd.read_hdf(lab_path, 'unique_encounters/table')
    print("Successfully read lab H5 files into Dask DataFrames.")

    # Keep only the encounter_id column
    lab_ddf = lab_ddf[['encounter_id']]

    # Convert 'encounter_id' to string and trim whitespace
    lab_ddf['encounter_id'] = lab_ddf['encounter_id'].astype(str).str.strip()
    print("Successfully converted lab 'encounter_id' to strings and trimmed whitespace.")

    # Perform the merge operation to keep only common "encounter_id"
    merged_ddf = merged_ddf.merge(lab_ddf, on='encounter_id', how='inner')
    del lab_ddf
    print("Successfully merged lab DataFrames.")

    # Persist the intermediate result to avoid recomputation
    merged_ddf = merged_ddf.persist()
    print("Successfully persisted merged DataFrame.")

    # Write the result to a single CSV file
    merged_ddf.to_csv(output_path, single_file=True, index=False)
    print(f"Merged unique encounters saved to: {output_path}")


Successfully read H5 files into Dask DataFrames.
Successfully converted 'encounter_id' to strings and trimmed whitespace.
Successfully merged DataFrames.
Successfully persisted merged DataFrame.
Successfully read lab H5 files into Dask DataFrames.
Successfully converted lab 'encounter_id' to strings and trimmed whitespace.
Successfully merged lab DataFrames.
Successfully persisted merged DataFrame.




Merged unique encounters saved to: C:\Users\reblo\Box\Residency Personal Files\Scholarly Work\Locke Research Projects\TriNetX Code\Hypercapnia TriNetX CSV Processing\Working\amb_screen_all_encounters.csv


#### Check Resulting Size to Ensure It's Plausible

In [10]:
# Data check: re-read the merged ambulatory CSV and report its (rows, columns)
output_csv_path = os.path.join(working_dir[:-1], 'amb_screen_all_encounters.csv')
try:
    output_csv = pd.read_csv(output_csv_path, usecols=['encounter_id'])
    output_shape = output_csv.shape
    print(f"Shape of output: {output_shape}")
except Exception as e:
    # Best-effort check: report the problem instead of stopping the notebook
    print(f"Error reading output CSV file: {e}")


Shape of output: (21739566, 1)


#### Deduplicate and make screen

In [11]:
# Small enough to just use pandas
start_time = time.time()

# CSV locations: the merged list (may repeat IDs) and the de-duplicated screen
input_csv_path  = os.path.join(working_dir[:-1], 'amb_screen_all_encounters.csv')
output_csv_path = os.path.join(working_dir[:-1], "data_checks", "amb_enc_screen.csv")

# Load only "encounter_id" (as text) and collapse repeated IDs
pdf = pd.read_csv(
    input_csv_path,
    usecols=["encounter_id"],
    dtype={"encounter_id": str},
).drop_duplicates()

# Save the screen without the pandas index
pdf.to_csv(output_csv_path, index=False)

# Report wall-clock time for the cell
end_time = time.time()
execution_time = end_time - start_time
hours = int(execution_time // 3600)
minutes = int(execution_time % 3600 // 60)
seconds = execution_time % 60
print(f"Executed in {hours} hours, {minutes} minutes, and {seconds:.2f} seconds.")
gc.collect()

Executed in 0 hours, 0 minutes, and 21.91 seconds.


38928

In [12]:
# Data check: re-read the de-duplicated ambulatory screen and report its (rows, columns)
output_csv_path = os.path.join(working_dir[:-1], "data_checks", "amb_enc_screen.csv")
try:
    output_csv = pd.read_csv(output_csv_path, usecols=['encounter_id'])
    output_shape = output_csv.shape
    print(f"Shape of output: {output_shape}")
except Exception as e:
    # Best-effort check: report the problem instead of stopping the notebook
    print(f"Error reading output CSV file: {e}")

Shape of output: (20663452, 1)


#### Merge All Inpatient and Emergency Screen Encounters

In [13]:
# Inpatient/emergency screen: encounter_ids that appear in the diagnosis, vital-sign,
# lab, procedure AND medication stores.
# The stores are only de-duplicated per source CSV, so the inner joins below can emit
# the same encounter_id more than once; the output CSV is de-duplicated in a later cell.

# Define paths
diag_path = os.path.join(working_dir[:-1], 'diag_unique_encounters.h5')
vitals_path = os.path.join(working_dir[:-1], 'vitals_unique_encounters.h5')
lab_path = os.path.join(working_dir[:-1], 'lab_unique_encounters.h5')
proc_path = os.path.join(working_dir[:-1], 'proc_unique_encounters.h5')
med_path = os.path.join(working_dir[:-1], 'med_unique_encounters.h5')
output_path = os.path.join(working_dir[:-1], 'inp_screen_all_encounters.csv')

# Adjust memory spilling settings.
# This must happen BEFORE the cluster is created: the scheduler and workers take
# their configuration when they start, so values set afterwards are not applied
# to an already-running LocalCluster.
dask.config.set({
    'distributed.worker.memory.target': 0.70,
    'distributed.worker.memory.spill': 0.80,
    'distributed.worker.memory.pause': 0.80,
    'distributed.worker.memory.terminate': 0.95,
    'distributed.scheduler.allowed-failures': 10,
})

# Configure Dask LocalCluster
memory_per_worker = '6GB'

# The context managers close the client and cluster even if a step fails.
# Errors are deliberately NOT caught and printed per step: every step depends on the
# previous one, and a swallowed failure would let later cells read a stale CSV.
with LocalCluster(
    n_workers=4,               # Number of worker processes
    threads_per_worker=1,      # Number of threads per worker
    memory_limit=memory_per_worker,  # Memory limit per worker
    processes=True,            # Use separate processes for each worker
    dashboard_address=':8787'  # Dashboard address for monitoring
) as cluster, Client(cluster) as client:

    # Read the data using Dask
    diag_ddf = dd.read_hdf(diag_path, 'unique_encounters/table')
    vitals_ddf = dd.read_hdf(vitals_path, 'unique_encounters/table')
    print("Successfully read diag+vitals H5 files into Dask DataFrames.")

    # Keep only the encounter_id column
    diag_ddf = diag_ddf[['encounter_id']]
    vitals_ddf = vitals_ddf[['encounter_id']]

    # Convert 'encounter_id' to string and trim whitespace
    diag_ddf['encounter_id'] = diag_ddf['encounter_id'].astype(str).str.strip()
    vitals_ddf['encounter_id'] = vitals_ddf['encounter_id'].astype(str).str.strip()
    print("Successfully converted diag+vitals 'encounter_id' to strings and trimmed whitespace.")

    # Perform the merge operation to keep only common "encounter_id"
    merged_ddf = diag_ddf.merge(vitals_ddf, on='encounter_id', how='inner')
    del diag_ddf
    del vitals_ddf
    print("Successfully merged diag+vitals DataFrames.")

    # Persist the intermediate result to avoid recomputation.
    # With a distributed client persist() returns right away and computes in the
    # background, so a failure in this stage may only surface at to_csv() below.
    merged_ddf = merged_ddf.persist()
    print("Successfully persisted merged DataFrame.")

    #-----------------------------
    # Now add labs
    lab_ddf = dd.read_hdf(lab_path, 'unique_encounters/table')
    print("Successfully read lab H5 files into Dask DataFrames.")

    # Keep only the encounter_id column
    lab_ddf = lab_ddf[['encounter_id']]

    # Convert 'encounter_id' to string and trim whitespace
    lab_ddf['encounter_id'] = lab_ddf['encounter_id'].astype(str).str.strip()
    print("Successfully converted lab 'encounter_id' to strings and trimmed whitespace.")

    # Perform the merge operation to keep only common "encounter_id"
    merged_ddf = merged_ddf.merge(lab_ddf, on='encounter_id', how='inner')
    del lab_ddf
    print("Successfully merged lab DataFrames.")

    # Persist the intermediate result to avoid recomputation
    merged_ddf = merged_ddf.persist()
    print("Successfully persisted merged DataFrame.")

    #-----------------------------
    # Now add procedures
    proc_ddf = dd.read_hdf(proc_path, 'unique_encounters/table')
    print("Successfully read proc H5 files into Dask DataFrames.")

    # Keep only the encounter_id column
    proc_ddf = proc_ddf[['encounter_id']]

    # Convert 'encounter_id' to string and trim whitespace
    proc_ddf['encounter_id'] = proc_ddf['encounter_id'].astype(str).str.strip()
    print("Successfully converted proc 'encounter_id' to strings and trimmed whitespace.")

    # Perform the merge operation to keep only common "encounter_id"
    merged_ddf = merged_ddf.merge(proc_ddf, on='encounter_id', how='inner')
    del proc_ddf
    print("Successfully merged proc DataFrames.")

    # Persist the intermediate result to avoid recomputation
    merged_ddf = merged_ddf.persist()
    print("Successfully persisted merged DataFrame.")

    #-----------------------------
    # Now add meds
    med_ddf = dd.read_hdf(med_path, 'unique_encounters/table')
    print("Successfully read med H5 files into Dask DataFrames.")

    # Keep only the encounter_id column
    med_ddf = med_ddf[['encounter_id']]

    # Convert 'encounter_id' to string and trim whitespace
    med_ddf['encounter_id'] = med_ddf['encounter_id'].astype(str).str.strip()
    print("Successfully converted med 'encounter_id' to strings and trimmed whitespace.")

    # Perform the merge operation to keep only common "encounter_id"
    merged_ddf = merged_ddf.merge(med_ddf, on='encounter_id', how='inner')
    del med_ddf
    print("Successfully merged med DataFrames.")

    # Persist the intermediate result to avoid recomputation
    merged_ddf = merged_ddf.persist()
    print("Successfully persisted merged DataFrame.")

    # Write the result to a single CSV file
    merged_ddf.to_csv(output_path, single_file=True, index=False)
    print(f"Merged unique encounters saved to: {output_path}")

Successfully read diag+vitals H5 files into Dask DataFrames.
Successfully converted diag+vitals 'encounter_id' to strings and trimmed whitespace.
Successfully merged diag+vitals DataFrames.
Successfully persisted merged DataFrame.
Successfully read lab H5 files into Dask DataFrames.
Successfully converted lab 'encounter_id' to strings and trimmed whitespace.
Successfully merged lab DataFrames.
Successfully persisted merged DataFrame.
Successfully read proc H5 files into Dask DataFrames.
Successfully converted proc 'encounter_id' to strings and trimmed whitespace.
Successfully merged proc DataFrames.
Successfully persisted merged DataFrame.
Successfully read med H5 files into Dask DataFrames.
Successfully converted med 'encounter_id' to strings and trimmed whitespace.
Successfully merged med DataFrames.
Successfully persisted merged DataFrame.




Merged unique encounters saved to: C:\Users\reblo\Box\Residency Personal Files\Scholarly Work\Locke Research Projects\TriNetX Code\Hypercapnia TriNetX CSV Processing\Working\inp_screen_all_encounters.csv


#### Check to see if plausible size

In [14]:
# Data check: re-read the merged inpatient/emergency CSV and report its (rows, columns)
output_csv_path = os.path.join(working_dir[:-1], 'inp_screen_all_encounters.csv')
try:
    output_csv = pd.read_csv(output_csv_path, usecols=['encounter_id'])
    output_shape = output_csv.shape
    print(f"Shape of output: {output_shape}")
except Exception as e:
    # Best-effort check: report the problem instead of stopping the notebook
    print(f"Error reading output CSV file: {e}")


Shape of output: (16419594, 1)


#### Deduplicate and make screen

In [15]:
# Small enough to just use pandas
start_time = time.time()

# CSV locations: the merged list (may repeat IDs) and the de-duplicated screen
input_csv_path  = os.path.join(working_dir[:-1], 'inp_screen_all_encounters.csv')
output_csv_path = os.path.join(working_dir[:-1], "data_checks", "inp_enc_screen.csv")

# Load only "encounter_id" (as text) and collapse repeated IDs
pdf = pd.read_csv(
    input_csv_path,
    usecols=["encounter_id"],
    dtype={"encounter_id": str},
).drop_duplicates()

# Save the screen without the pandas index
pdf.to_csv(output_csv_path, index=False)

# Report wall-clock time for the cell
end_time = time.time()
execution_time = end_time - start_time
hours = int(execution_time // 3600)
minutes = int(execution_time % 3600 // 60)
seconds = execution_time % 60
print(f"Executed in {hours} hours, {minutes} minutes, and {seconds:.2f} seconds.")
gc.collect()

Executed in 0 hours, 0 minutes, and 14.79 seconds.


40980

In [16]:
# Data check: re-read the de-duplicated inpatient/emergency screen and report its (rows, columns)
output_csv_path = os.path.join(working_dir[:-1], "data_checks", "inp_enc_screen.csv")
try:
    output_csv = pd.read_csv(output_csv_path, usecols=['encounter_id'])
    output_shape = output_csv.shape
    print(f"Shape of output: {output_shape}")
except Exception as e:
    # Best-effort check: report the problem instead of stopping the notebook
    print(f"Error reading output CSV file: {e}")


Shape of output: (12564033, 1)
