In [71]:
# 02_log_prefix

In [72]:
# Force to reload extrernal modules every new cell execution
%reload_ext autoreload
%autoreload 2

In [73]:
### IMPORT ###
from pathlib import Path
from datetime import datetime
import pandas as pd

In [74]:
### LOCAL IMPORT ###
from config import config_reader

In [75]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
log_dir = str(yaml_config["LOG_DIR"])
prefix_dir = str(yaml_config["LOG_PREFIX_DIR"])

std_suffix = str(yaml_config["STD_SUFFIX"])
enr_suffix = str(yaml_config["ENR_SUFFIX"])

log_in = f"EVENT-LOG_ED_duration_{enr_suffix}.csv" # <-- INPUT: event log standard (std_suffix) or enriched (enr_suffix)

dic_types = dict(yaml_config["EVENT_LOG_TYPES"]) 

# Prefix
prefix_hour = 3 # <-- INPUT prefix length in hours (1, 2, 3)

# Event log main columns
caseid_col = str(yaml_config["CASEID_COL"])
activity_col = str(yaml_config["ACTIVITY_COL"])
timestamp_col = str(yaml_config["TIMESTAMP_COL"])

# FUNCTIONS

In [76]:
def filter_events_after_triage(event_log: pd.DataFrame, hours: int) -> pd.DataFrame:
    """
    Filters the event log to include only events that occurred within a specified number of hours
    after the first TRIAGE activity for each CaseID, including the TRIAGE activity itself.
    
    Parameters:
    event_log (pd.DataFrame): The original event log dataframe with columns including
                              'CaseID', 'ACTIVITY', 'TIMESTAMP', and others.
    hours (int): The number of hours within which events after the first TRIAGE activity are included.
    
    Returns:
    pd.DataFrame: A filtered dataframe containing the rows where the TRIAGE activity occurs
                  and subsequent rows where the event timestamp is within the specified number of
                  hours after the first TRIAGE activity for each CaseID.
    """
    # Convert TIMESTAMP column to datetime type
    event_log['TIMESTAMP'] = pd.to_datetime(event_log['TIMESTAMP'])

    # Define the function to filter rows for each CaseID
    def filter_within_hour_after_triage(df: pd.DataFrame) -> pd.DataFrame:
        # Find the minimum timestamp for TRIAGE activity
        triage_time = df.loc[df['ACTIVITY'] == 'TRIAGE', 'TIMESTAMP'].min()
        
        # If there is no TRIAGE activity, return an empty dataframe
        if pd.isna(triage_time):
            return pd.DataFrame()
        
        # Filter rows where the timestamp is within the specified number of hours after the TRIAGE time
        is_within_threshold = df['TIMESTAMP'] <= triage_time + pd.Timedelta(hours=hours)
        
        return df[is_within_threshold]

    # Apply the filter function to each group of CaseID
    filtered_event_log = event_log.groupby('CaseID').apply(lambda x: filter_within_hour_after_triage(x)).reset_index(drop=True)
    
    return filtered_event_log

# MAIN

In [77]:
### MAIN ###
print()
print("*** PROGRAM START ***")
print()

start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()

# print(yaml_config) # debug


*** PROGRAM START ***

Start process: 2024-08-22 20:15:50



In [78]:
print(">> Creating output directories")
directory_path = Path(prefix_dir) # <-- Specify the directory to be created
# Create the directory
directory_path.mkdir(parents=True, exist_ok=True)
print(f"Directory '{directory_path}' created successfully.")

>> Creating output directories
Directory 'data_prefix' created successfully.


In [79]:
print(">> Reading event log")
path_data = Path(log_dir) / log_in
print("File:", path_data)
df_log = pd.read_csv(path_data, sep=";", dtype=dic_types)
print("Event log shape:", df_log.shape)
total_cases = df_log["CaseID"].nunique()
print("Event log cases:", total_cases)
print("Event log columns:", df_log.columns)

>> Reading event log
File: data_log/EVENT-LOG_ED_duration_enr.csv
Event log shape: (20624, 16)
Event log cases: 3478
Event log columns: Index(['CaseID', 'ACTIVITY', 'TIMESTAMP', 'RESOURCE', 'ESI', 'OUTCOME',
       'INPAT-HOSP-DEP', 'REMAINING_TIME_sec', 'ACTIVE_CaseID', 'TIMESTAMP_HH',
       'COUNT-ESI-1', 'COUNT-ESI-2', 'COUNT-ESI-3', 'COUNT-ESI-4',
       'COUNT-ESI-5', 'CLUSTER'],
      dtype='object')


In [80]:
df_log

Unnamed: 0,CaseID,ACTIVITY,TIMESTAMP,RESOURCE,ESI,OUTCOME,INPAT-HOSP-DEP,REMAINING_TIME_sec,ACTIVE_CaseID,TIMESTAMP_HH,COUNT-ESI-1,COUNT-ESI-2,COUNT-ESI-3,COUNT-ESI-4,COUNT-ESI-5,CLUSTER
0,2022090001,TRIAGE,2022-09-01 00:03:55,NURS_0,3,A domicilio,-,41105.0,0,0,0,0,0,0,0,0
1,2022090001,PRESA IN CARICO,2022-09-01 00:22:00,DOCT_0,3,A domicilio,-,40020.0,2,0,0,0,1,1,0,0
2,2022090001,LABORATORIO,2022-09-01 00:28:21,-,3,A domicilio,-,39639.0,2,0,0,0,1,1,0,0
3,2022090001,TC CRANIO,2022-09-01 00:38:00,-,3,A domicilio,-,39060.0,2,0,0,0,1,1,0,0
4,2022090001,TAC,2022-09-01 00:38:00,-,3,A domicilio,-,39060.0,2,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20619,2022093478,DIMISSIONE,2022-10-01 00:21:00,-,3,A domicilio,-,0.0,27,0,5,8,7,7,0,0
20620,2022093479,TRIAGE,2022-09-30 23:58:38,NURS_8,4,A domicilio,-,27322.0,31,23,5,8,8,9,1,1
20621,2022093479,PRESA IN CARICO,2022-10-01 00:51:00,DOCT_1,4,A domicilio,-,24180.0,27,0,5,8,7,7,0,1
20622,2022093479,LABORATORIO,2022-10-01 01:00:12,-,4,A domicilio,-,23628.0,27,1,5,8,7,7,0,1


### PARTIAL TRACES

In [81]:
print(">> Filtering after n hours")
df_log_filtered = filter_events_after_triage(df_log, prefix_hour)
print("New event log shape:", df_log_filtered.shape)

>> Filtering after n hours
New event log shape: (14087, 16)


  filtered_event_log = event_log.groupby('CaseID').apply(lambda x: filter_within_hour_after_triage(x)).reset_index(drop=True)


In [82]:
df_log_filtered

Unnamed: 0,CaseID,ACTIVITY,TIMESTAMP,RESOURCE,ESI,OUTCOME,INPAT-HOSP-DEP,REMAINING_TIME_sec,ACTIVE_CaseID,TIMESTAMP_HH,COUNT-ESI-1,COUNT-ESI-2,COUNT-ESI-3,COUNT-ESI-4,COUNT-ESI-5,CLUSTER
0,2022090001,TRIAGE,2022-09-01 00:03:55,NURS_0,3,A domicilio,-,41105.0,0,0,0,0,0,0,0,0
1,2022090001,PRESA IN CARICO,2022-09-01 00:22:00,DOCT_0,3,A domicilio,-,40020.0,2,0,0,0,1,1,0,0
2,2022090001,LABORATORIO,2022-09-01 00:28:21,-,3,A domicilio,-,39639.0,2,0,0,0,1,1,0,0
3,2022090001,TC CRANIO,2022-09-01 00:38:00,-,3,A domicilio,-,39060.0,2,0,0,0,1,1,0,0
4,2022090001,TAC,2022-09-01 00:38:00,-,3,A domicilio,-,39060.0,2,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14082,2022093478,ELETTROCARDIOGRAMMA,2022-10-01 00:15:34,-,3,A domicilio,-,326.0,30,0,5,8,8,9,0,0
14083,2022093478,DIMISSIONE,2022-10-01 00:21:00,-,3,A domicilio,-,0.0,27,0,5,8,7,7,0,0
14084,2022093479,TRIAGE,2022-09-30 23:58:38,NURS_8,4,A domicilio,-,27322.0,31,23,5,8,8,9,1,1
14085,2022093479,PRESA IN CARICO,2022-10-01 00:51:00,DOCT_1,4,A domicilio,-,24180.0,27,0,5,8,7,7,0,1


In [83]:
print(">> Saving event log prefixes")
log_out = f"{Path(log_in).stem}_prefix_{prefix_hour}h.csv"
path_out = Path(prefix_dir) / log_out
print("File:", path_out)
df_log_filtered.to_csv(path_out, sep=";", index=False)

>> Saving event log prefixes
File: data_prefix/EVENT-LOG_ED_duration_enr_prefix_2h.csv


In [84]:
# program end
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print("End process:", end_time)
print("Time to finish:", delta_time)

print()
print("*** PROGRAM END ***")
print()

End process: 2024-08-22 20:15:52
Time to finish: 0:00:02

*** PROGRAM END ***

