In [8]:
# 02_data_prefix.ipynb
# Given an event log, extracts its prefixes: the first n events of each case, for every configured prefix length n

In [9]:
### IMPORT ###
from pathlib import Path
from datetime import datetime
import pandas as pd

In [10]:
### LOCAL IMPORT ###
from config import config_reader

In [11]:
### GLOBALS ###
# Project settings are read once from the YAML configuration file
yaml_config = config_reader.config_read_yaml("config.yml", "config")

# Directory and file name of the input event log
data_log_dir = str(yaml_config['DATA_LOG_DIR'])
data_file = "EVENT-LOG_ED_filtered.csv"

# Names of the event log columns
log_key_col = str(yaml_config['LOG_KEY_COL'])
log_timestamp_col = str(yaml_config['LOG_TIMESTAMP_COL'])

# Prefix lengths to extract: every integer from prefix_min to prefix_max (both included)
prefix_min = int(yaml_config['PREF_MIN_VALUE'])
prefix_max = int(yaml_config['PREF_MAX_VALUE'])
prefix_list = [*range(prefix_min, prefix_max + 1)]

### FUNCTIONS ###

In [12]:
# Function to get the top n rows per case
def get_top_n_rows_per_caseid(df:pd.DataFrame, n:int, timestamp_col:str, key_col:str = 'CaseID') -> pd.DataFrame:
    """
    Select the first n events of every case, in chronological order.

    The rows are sorted by case identifier and timestamp, then the first n rows
    of each case are kept. The input DataFrame is never modified: when the
    timestamp column is not already of a datetime dtype, the conversion is
    applied to a copy.

    Parameters:
        df (pd.DataFrame): The event log; must contain the `key_col` and `timestamp_col` columns.
        n (int): The number of rows to keep for each case.
        timestamp_col (str): The name of the column holding the event timestamps.
        key_col (str): The name of the column identifying the case. Defaults to 'CaseID'.

    Returns:
        pd.DataFrame: The first n rows of each case, sorted by `key_col` and `timestamp_col`.
        The original index labels are kept and the timestamp column has a datetime dtype.

    Raises:
        KeyError: If `key_col` or `timestamp_col` is not a column of `df`.
    """
    # Fail early with an explicit message instead of a bare pandas KeyError
    missing = [col for col in (key_col, timestamp_col) if col not in df.columns]
    if missing:
        raise KeyError(f"Column(s) not found in the event log: {missing}")

    # Ensure the timestamp column is in datetime format.
    # `assign` returns a new DataFrame, so the caller's object keeps its original dtype.
    if not pd.api.types.is_datetime64_any_dtype(df[timestamp_col]):
        df = df.assign(**{timestamp_col: pd.to_datetime(df[timestamp_col])})

    # Sort by case and timestamp so that `head` picks the earliest events of each case
    df_sorted = df.sort_values(by=[key_col, timestamp_col])

    # Keep the first n rows of each case
    top_n_df = df_sorted.groupby(key_col).head(n)

    return top_n_df

### MAIN ###

In [13]:
# Banner: blank line, title, blank line
print("", "*** PROGRAM START ***", "", sep="\n")

# Start time, truncated to whole seconds; read again at the end to compute the elapsed time
start_time = datetime.now().replace(microsecond=0)
print("Start process:", start_time, end="\n\n")


*** PROGRAM START ***

Start process: 2024-06-12 10:35:10



In [14]:
print(">> Reading event data")
# The event log is a semicolon-separated CSV stored in the data log directory
path_data = Path(data_log_dir).joinpath(data_file)
df_log = pd.read_csv(path_data, sep=";")
print("Initial data shape:", df_log.shape) # rows x cols
print("Event log columns (features):", df_log.columns)

>> Reading event data
Initial data shape: (5731, 9)
Event log columns (features): Index(['CaseID', 'ACTIVITY', 'TIMESTAMP', 'RESOURCE', 'ESI', 'OUTCOME',
       'INPAT-HOSP-DEP', 'CumulativeTimeHours', 'CumulativeTimeDays'],
      dtype='object')


In [15]:
# Preview of the loaded event log (long frames are shown truncated in the rendered output)
df_log

Unnamed: 0,CaseID,ACTIVITY,TIMESTAMP,RESOURCE,ESI,OUTCOME,INPAT-HOSP-DEP,CumulativeTimeHours,CumulativeTimeDays
0,2022090010,TRIAGE,2022-09-01 05:39:15,NURS_2,4,A domicilio,-,0,0
1,2022090010,PRESA IN CARICO,2022-09-01 05:51:00,DOCT_4,4,A domicilio,-,0,0
2,2022090010,LABORATORIO,2022-09-01 05:53:20,-,4,A domicilio,-,0,0
3,2022090010,LABORATORIO,2022-09-01 05:54:39,-,4,A domicilio,-,0,0
4,2022090010,DIMISSIONE,2022-09-01 13:15:00,-,4,A domicilio,-,7,0
...,...,...,...,...,...,...,...,...,...
5726,2022093474,RX,2022-09-30 23:04:32,-,4,A domicilio,-,0,0
5727,2022093474,RX,2022-09-30 23:04:32,-,4,A domicilio,-,0,0
5728,2022093474,RX,2022-09-30 23:04:32,-,4,A domicilio,-,0,0
5729,2022093474,RX,2022-09-30 23:04:32,-,4,A domicilio,-,0,0


In [16]:
print(">> Information on event log", end="\n\n")
# Count the distinct values of the case identifier column
cases_num = df_log[log_key_col].nunique()
print(f"Cases (distinct '{log_key_col}' values): {cases_num}", end="\n\n")

>> Information on event log

Cases (distinct 'CaseID' values): 695



In [17]:
print(">> Prefix extraction", end="\n\n")
print("Prefix list:", prefix_list, end="\n\n")

# Loop-invariant pieces of the output path
out_dir = Path(data_log_dir)
base_name = Path(data_file).stem # input file name without extension

# One output file per prefix length, named "<input name>_<prefix_size>.csv"
for prefix_size in prefix_list:
    print("> Prefix size:", prefix_size)
    df_log_prefix = get_top_n_rows_per_caseid(df_log, prefix_size, log_timestamp_col)
    print("Saving the event log prefix")
    file_out = f"{base_name}_{prefix_size}.csv"
    path_out = out_dir / file_out
    print("Path:", path_out)
    df_log_prefix.to_csv(path_out, sep=";", index=False)
    print()

>> Prefix extraction

Prefix list: [2, 3, 4, 5]

> Prefix size: 2
Saving the event log prefix
Path: data_log/EVENT-LOG_ED_filtered_2.csv

> Prefix size: 3
Saving the event log prefix
Path: data_log/EVENT-LOG_ED_filtered_3.csv

> Prefix size: 4
Saving the event log prefix
Path: data_log/EVENT-LOG_ED_filtered_4.csv

> Prefix size: 5
Saving the event log prefix
Path: data_log/EVENT-LOG_ED_filtered_5.csv



In [18]:
# program END
# Elapsed time is measured against the start time, both truncated to whole seconds
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time, end="\n\n")

print("*** PROGRAM END ***", end="\n\n")


End process: 2024-06-12 10:35:10
Time to finish: 0:00:00

*** PROGRAM END ***

