# Encounter Preprocessing (NEW DATA)

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datetime import datetime
import glob

# Directory the raw TriNetX "encounterNNNN.csv" exports are read from.
database_dir = "D:\\TriNetX\\Encounter\\"
# Alternative working directory (same drive as the raw exports):
#working_dir = "D:\\TriNetX\\Encounter\\"
# Directory the intermediate and output CSVs are read from and written to
# (faster = better if space allows).
# A raw string cannot end in a single backslash, so the trailing separator is
# appended as a normal string; writing r"...\\" would leave two backslashes.
working_dir = r"C:\Users\reblo\Box\Residency Personal Files\Scholarly Work\Locke Research Projects\TriNetX Code\Hypercapnia TriNetX CSV Processing\Working" + "\\"
# Number of raw encounter files; they are addressed as 0001 .. num_spreadsheets.
num_spreadsheets = 593

### Reformatting of CSVs

In [None]:
# Reformat every raw encounter export into a slimmer intermediate CSV that
# keeps only the columns and care settings used by the later cells.
#working_dir = "D:\\TriNetX\\Encounter\\"
#num_spreadsheets = 100

# Column names assigned to the raw files, in file order.
columns = ["encounter_id","patient_id","start_date","end_date","type","start_date_derived_by_TriNetX","end_date_derived_by_TriNetX","derived_by_TriNetX","source_id"]
# Columns carried into the intermediate files, in output order.
kept_columns = ["patient_id", "encounter_id", "start_date", "end_date", "type"]
# Care settings of interest: ambulatory, emergency, inpatient.
care_settings = ["AMB", "EMER", "IMP"]

for i in range(1, num_spreadsheets + 1):
    file_tag = f'{i:04}'
    print(file_tag)
    # NOTE(review): names= is passed without header=, so pandas reads the first
    # line of each file as data. If the exports carry a header row, it is only
    # removed by the care-setting filter below -- confirm against the raw files.
    encounter = pd.read_csv(database_dir + "encounter" + file_tag + ".csv",
                            names = columns,
                            dtype = {"encounter_id":str,"patient_id":str,"type":str,"start_date_derived_by_TriNetX":str,"end_date_derived_by_TriNetX":str,"derived_by_TriNetX":str,"source_id":str},
                            parse_dates = ["start_date","end_date"])
    # Selecting the kept columns both drops the unused ones and reorders.
    encounter = encounter[kept_columns]
    # isin() treats a missing "type" as "no match"; a str.match() mask would
    # contain NaN for such rows and boolean indexing with it raises.
    # .copy() makes the subset independent so the assignments below do not
    # trigger SettingWithCopyWarning.
    encounter = encounter[encounter["type"].isin(care_settings)].copy()
    # Ensure both date columns are datetime even if read_csv left them as text.
    encounter["start_date"] = pd.to_datetime(encounter["start_date"])
    encounter["end_date"] = pd.to_datetime(encounter["end_date"])
    encounter.to_csv(working_dir + "encounter_NEW_" + file_tag + ".csv", index = False)
    del encounter

### Ambulatory (AMB) Encounters

In [None]:
# NEW
# Operationalization: We are interested in care settings:
# Ambulatory (AMB)
#
# Builds "AMB_encounters.csv" in working_dir from the intermediate
# "encounter_NEW_NNNN.csv" files: AMB encounters starting on or after
# 01/01/2022, de-duplicated by encounter_id, with a length-of-stay column.

sub_blocks = []
# Read each intermediate file and collect its qualifying rows
for i in range(1, num_spreadsheets + 1):
    print(f'{i:04}')
    encounter = pd.read_csv(working_dir + "encounter_NEW_" + f'{i:04}' + ".csv",
                            parse_dates = ["start_date","end_date"],
                            dtype = {"encounter_id":str,"patient_id":str,"type":str})
    # Keep rows that start on or after 01/01/2022 AND are Ambulatory (AMB).
    # .copy() makes the subset independent so the column assignment below does
    # not trigger SettingWithCopyWarning.
    starts_in_window = encounter["start_date"] >= pd.Timestamp("2022-01-01")
    is_ambulatory = encounter["type"] == "AMB"
    encounter = encounter[starts_in_window & is_ambulatory].copy()
    # Fill missing "end_date" values with 12/31/2022, directly as a timestamp
    # (no string round-trip). NOTE: an encounter that starts after 12/31/2022
    # and has no end date therefore gets LOS <= 0 and is removed further down.
    encounter["end_date"] = pd.to_datetime(encounter["end_date"]).fillna(pd.Timestamp("2022-12-31"))
    # Reorder the columns as: "patient_id", "encounter_id", "start_date", "end_date", "type"
    encounter = encounter[["patient_id", "encounter_id", "start_date", "end_date", "type"]]
    sub_blocks.append(encounter)
    del encounter

# Process the overall dataset
AMB_encounters = pd.concat(sub_blocks, ignore_index = True)
# concat copied the rows; release the per-file frames to lower peak memory
sub_blocks.clear()
# Sort by "start_date" ascending (ties: "encounter_id" descending) so that the
# de-duplication below keeps the earliest-starting row of each encounter
AMB_encounters.sort_values(by = ["start_date","encounter_id"], ascending = [True,False], inplace = True)
# Drop duplicate encounters
AMB_encounters.drop_duplicates(subset = ["encounter_id"], keep = "first", inplace = True)
# "Length of Stay" (LOS) in whole days, +1 so that an encounter whose
# "start_date" equals its "end_date" counts as 1 day
AMB_encounters["LOS"] = (AMB_encounters["end_date"] - AMB_encounters["start_date"]).dt.days + 1
# Some encounters have "end_dates" that come before their associated "start_date"; Removing these rows
AMB_encounters = AMB_encounters[AMB_encounters["LOS"] > 0]
# Print the new dataframe shape
print("AMB_encounters Shape:", AMB_encounters.shape)
# Write new "AMB_encounters" dataframes to CSV
AMB_encounters.to_csv(working_dir + "AMB_encounters.csv", index = False)
# Doublecheck to make sure the CSV was created correctly (read it back from
# working_dir, where it was just written)
# AMB_encounters = pd.read_csv(working_dir + "AMB_encounters.csv", nrows = 1000)
# display(AMB_encounters.head())
del AMB_encounters

### Emergency (EMER) Encounters

In [None]:
# NEW
# Operationalization: We are interested in care settings:
# Emergency (EMER)
#
# Builds "EMER_encounters.csv" in working_dir from the intermediate
# "encounter_NEW_NNNN.csv" files: EMER encounters starting on or after
# 01/01/2022, de-duplicated by encounter_id, with a length-of-stay column.

sub_blocks = []
# Read each intermediate file and collect its qualifying rows
for i in range(1, num_spreadsheets + 1):
    print(f'{i:04}')
    encounter = pd.read_csv(working_dir + "encounter_NEW_" + f'{i:04}' + ".csv",
                            parse_dates = ["start_date","end_date"],
                            dtype = {"encounter_id":str,"patient_id":str,"type":str})
    # Keep rows that start on or after 01/01/2022 AND are Emergency (EMER).
    # .copy() makes the subset independent so the column assignment below does
    # not trigger SettingWithCopyWarning.
    starts_in_window = encounter["start_date"] >= pd.Timestamp("2022-01-01")
    is_emergency = encounter["type"] == "EMER"
    encounter = encounter[starts_in_window & is_emergency].copy()
    # Fill missing "end_date" values with 12/31/2022, directly as a timestamp
    # (no string round-trip). NOTE: an encounter that starts after 12/31/2022
    # and has no end date therefore gets LOS <= 0 and is removed further down.
    encounter["end_date"] = pd.to_datetime(encounter["end_date"]).fillna(pd.Timestamp("2022-12-31"))
    # Reorder the columns as: "patient_id", "encounter_id", "start_date", "end_date", "type"
    encounter = encounter[["patient_id", "encounter_id", "start_date", "end_date", "type"]]
    sub_blocks.append(encounter)
    del encounter

# Process the overall dataset
EMER_encounters = pd.concat(sub_blocks, ignore_index = True)
# concat copied the rows; release the per-file frames to lower peak memory
sub_blocks.clear()
# Sort by "start_date" ascending (ties: "encounter_id" descending) so that the
# de-duplication below keeps the earliest-starting row of each encounter
EMER_encounters.sort_values(by = ["start_date","encounter_id"], ascending = [True,False], inplace = True)
# Drop duplicate encounters
EMER_encounters.drop_duplicates(subset = ["encounter_id"], keep = "first", inplace = True)
# "Length of Stay" (LOS) in whole days, +1 so that an encounter whose
# "start_date" equals its "end_date" counts as 1 day
EMER_encounters["LOS"] = (EMER_encounters["end_date"] - EMER_encounters["start_date"]).dt.days + 1
# Some encounters have "end_dates" that come before their associated "start_date"; Removing these rows
EMER_encounters = EMER_encounters[EMER_encounters["LOS"] > 0]
# Print the new dataframe shape
print("EMER_encounters Shape:", EMER_encounters.shape)
# Write new "EMER_encounters" dataframes to CSV
EMER_encounters.to_csv(working_dir + "EMER_encounters.csv", index = False)
# Doublecheck to make sure the CSV was created correctly (read it back from
# working_dir, where it was just written)
# EMER_encounters = pd.read_csv(working_dir + "EMER_encounters.csv", nrows = 1000)
# display(EMER_encounters.head())
del EMER_encounters

### Inpatient (IMP) Encounters

In [None]:
# NEW
# Operationalization: We are interested in care settings:
# Inpatient Encounter (IMP)
#
# Builds "INPAT_encounters.csv" in working_dir from the intermediate
# "encounter_NEW_NNNN.csv" files: IMP encounters starting on or after
# 01/01/2022, de-duplicated by encounter_id, with a length-of-stay column.

sub_blocks = []
# Read each intermediate file and collect its qualifying rows
for i in range(1, num_spreadsheets + 1):
    print(f'{i:04}')
    encounter = pd.read_csv(working_dir + "encounter_NEW_" + f'{i:04}' + ".csv",
                            parse_dates = ["start_date","end_date"],
                            dtype = {"encounter_id":str,"patient_id":str,"type":str})
    # Keep rows that start on or after 01/01/2022 AND are Inpatient (IMP).
    # .copy() makes the subset independent so the column assignment below does
    # not trigger SettingWithCopyWarning.
    starts_in_window = encounter["start_date"] >= pd.Timestamp("2022-01-01")
    is_inpatient = encounter["type"] == "IMP"
    encounter = encounter[starts_in_window & is_inpatient].copy()
    # Fill missing "end_date" values with 12/31/2022, directly as a timestamp
    # (no string round-trip). NOTE: an encounter that starts after 12/31/2022
    # and has no end date therefore gets LOS <= 0 and is removed further down.
    encounter["end_date"] = pd.to_datetime(encounter["end_date"]).fillna(pd.Timestamp("2022-12-31"))
    # Reorder the columns as: "patient_id", "encounter_id", "start_date", "end_date", "type"
    encounter = encounter[["patient_id", "encounter_id", "start_date", "end_date", "type"]]
    sub_blocks.append(encounter)
    del encounter

# Process the overall dataset
IMP_encounters = pd.concat(sub_blocks, ignore_index = True)
# concat copied the rows; release the per-file frames to lower peak memory
sub_blocks.clear()
# Sort by "start_date" ascending (ties: "encounter_id" descending) so that the
# de-duplication below keeps the earliest-starting row of each encounter
IMP_encounters.sort_values(by = ["start_date","encounter_id"], ascending = [True,False], inplace = True)
# Drop duplicate encounters
IMP_encounters.drop_duplicates(subset = ["encounter_id"], keep = "first", inplace = True)
# "Length of Stay" (LOS) in whole days, +1 so that an encounter whose
# "start_date" equals its "end_date" counts as 1 day
IMP_encounters["LOS"] = (IMP_encounters["end_date"] - IMP_encounters["start_date"]).dt.days + 1
# Some encounters have "end_dates" that come before their associated "start_date"; Removing these rows
IMP_encounters = IMP_encounters[IMP_encounters["LOS"] > 0]
# Print the new dataframe shape
print("IMP_encounters Shape:", IMP_encounters.shape)
# Write new "IMP_encounters" dataframes to CSV
IMP_encounters.to_csv(working_dir + "INPAT_encounters.csv", index = False)
# Doublecheck to make sure the CSV was created correctly (read it back from
# working_dir, where it was just written)
#IMP_encounters = pd.read_csv(working_dir + "INPAT_encounters.csv", nrows = 1000)
# Show the first rows of the in-memory table (display is the notebook helper)
display(IMP_encounters.head())
del IMP_encounters

In [None]:
# Sanity check: reload each per-setting output file and report the number of
# distinct patients and the table shape.
# The ID columns are pinned to str, as in the reads that produced these files,
# so IDs are compared as text instead of being inferred as numbers.
id_dtypes = {"patient_id": str, "encounter_id": str}

IMP_encounters = pd.read_csv(working_dir + "INPAT_encounters.csv", dtype = id_dtypes)
# nunique(dropna=False) gives the same count as len(unique()): a missing ID,
# if any, is counted once.
print("Unique IMP Patients:", IMP_encounters["patient_id"].nunique(dropna = False))
print("IMP_encounters:",IMP_encounters.shape)
del IMP_encounters

AMB_encounters = pd.read_csv(working_dir + "AMB_encounters.csv", dtype = id_dtypes)
print("Unique AMB Patients", AMB_encounters["patient_id"].nunique(dropna = False))
print("AMB_encounters:",AMB_encounters.shape)
del AMB_encounters

EMER_encounters = pd.read_csv(working_dir + "EMER_encounters.csv", dtype = id_dtypes)
print("Unique EMER Patients:", EMER_encounters["patient_id"].nunique(dropna = False))
print("EMER_encounters:",EMER_encounters.shape)
del EMER_encounters

### Clean Up Intermediate Files

In [None]:
# Remove the intermediate "encounter_NEW_NNNN.csv" files from working_dir.
# Deletion is opt-in: leave this False for a dry run that only lists the files
# that would be removed; set it to True to actually delete them.
delete_intermediate_files = False

# Construct the pattern to match the files: "encounter_NEW_" followed by
# exactly 4 digits and ".csv". "[0-9]" restricts each position to a digit,
# whereas "?" would match any single character.
pattern = os.path.join(working_dir, "encounter_NEW_" + "[0-9]" * 4 + ".csv")

# Use glob.glob to find all files that match the pattern
matching_files = glob.glob(pattern)

# Iterate over the list of matching files and delete (or just list) each one
for file_path in matching_files:
    if not delete_intermediate_files:
        print(f"Would delete file: {file_path}")
        continue
    try:
        os.remove(file_path)
    except OSError as e:
        # Report the failure and carry on with the remaining files
        print(f"Error deleting file {file_path}: {e}")
    else:
        # Only claim success when os.remove actually returned without error
        print(f"Deleted file: {file_path}")