In [1]:
# 03_data_encoding.ipynb
# Encodes the event log

In [2]:
### IMPORT ###
from pathlib import Path
from datetime import datetime
import pandas as pd

In [3]:
### LOCAL IMPORT ###
from config import config_reader

In [4]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
data_log_dir = str(yaml_config['DATA_LOG_DIR'])
data_log_encoded_dir = str(yaml_config['DATA_LOG_ENCODED_DIR'])
data_file = "EVENT-LOG_ED_filtered_DIM.csv" # input (DIM vary by prefix_size)
# event log columns
log_key_col = str(yaml_config['LOG_KEY_COL'])
log_activity_col = str(yaml_config['LOG_ACTIVITY_COL'])
log_timestamp_col = str(yaml_config['LOG_TIMESTAMP_COL'])
log_outcome_col = str(yaml_config['LOG_OUTCOME_COL'])
log_outcome_col_values = list(yaml_config['LOG_OUTCOME_COL_VALUES'])
# prefix length
prefix_min = int(yaml_config['PREF_MIN_VALUE'])
prefix_max = int(yaml_config['PREF_MAX_VALUE'])
prefix_list = list(range(prefix_min, prefix_max + 1)) 
prefix_size = 2 # input that must be inside prefix_list
# encoding
col_enc = [log_key_col, 'ACTIVITY', 'TIMESTAMP', log_outcome_col] # input -> colums to be used for encoding and prediction [case-id, activity column, timestamp column, outcome column]
encoding = "F" # input -> B = binary, F = Frequency, I = simple Index

### FUNCTIONS ###

In [5]:
def binary_encoding(df: pd.DataFrame, activity_col:str) -> pd.DataFrame:
    """
    Perform binary encoding on the given DataFrame.

    Parameters:
        df (pd.DataFrame): The input dataframe containing the event log to be encoded.
        activity_col (str): The name of the column representing activities.

    Returns:
        pd.DataFrame: A dataframe with the 'activity_col' column in binary encoding.
    """
    
    df_encoded = pd.get_dummies(df, columns=[activity_col], prefix=[activity_col], dtype=int)
    return df_encoded

In [6]:
def frequency_encoding(df: pd.DataFrame, case_col:str, activity_col:str, timestamp_col:str) -> pd.DataFrame:
    """
    Perform frequency encoding on the given DataFrame.

    Parameters:
        df (pd.DataFrame): The input dataframe containing the event log to be encoded.
        case_col (str): The name of the column representing case-id.
        activity_col (str): The name of the column representing activities.
        timestamp_col (str): The name of the column representing timestamps.

    Returns:
        pd.DataFrame: A dataframe with the original columns plus the cumulative frequency of activities for each CaseID up to the current TIMESTAMP.
    """
    
    # Ensure TIMESTAMP is in datetime format
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

    # Sort the dataframe by CaseID and TIMESTAMP
    df = df.sort_values(by=[case_col, timestamp_col])

    # Calculate cumulative counts of activities
    cumulative_counts = df.groupby([case_col, activity_col]).cumcount() + 1

    # Create a pivot table to get cumulative counts as columns
    pivot_df = df.assign(count=cumulative_counts).pivot_table(index=[case_col, timestamp_col], columns=activity_col, values='count', fill_value=0).reset_index()

    # Add prefix ACTIVITY_ to activity columns
    pivot_df = pivot_df.rename(columns=lambda x: f'ACTIVITY_{x}' if x not in [case_col, timestamp_col] else x)

    # Merge the cumulative counts with the original dataframe
    merged_df = df.merge(pivot_df, on=[case_col, timestamp_col], how='left')

    # Ensure the frequency counts are all integers
    activity_columns = [col for col in merged_df.columns if col.startswith('ACTIVITY_')]
    merged_df[activity_columns] = merged_df[activity_columns].astype(int)

    return merged_df

In [7]:
def index_encoding(df:pd.DataFrame, case_col:str, activity_col:str, timestamp_col:str) -> pd.DataFrame:
    """
    Perform simple index encoding on the given DataFrame.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the event log to be encoded. 
        case_col (str): The name of the column representing case IDs.
        activity_col (str): The name of the column representing activities.
        timestamp_col (str): The name of the column representing timestamps.

    Returns:
        pd.DataFrame: A DataFrame with index encoding where each activity is a column with values indicating the order of appearance in each case.
    """
    
    # Sort the DataFrame by case ID and timestamp
    df_sorted = df.sort_values(by=[case_col, timestamp_col])

    # Group by case ID and apply a counter for each activity
    df_sorted['ACTIVITY_ORDER'] = df_sorted.groupby(case_col).cumcount() + 1

    # Create a column for each activity with the index of its occurrence
    for activity in df[activity_col].unique():
        activity_mask = df_sorted[activity_col] == activity
        df_sorted[f'ACTIVITY_{activity}'] = activity_mask.astype(int) * df_sorted['ACTIVITY_ORDER']

    # Fill NaN with 0 and convert to integer
    activity_columns = [col for col in df_sorted.columns if col.startswith('ACTIVITY_')]
    df_sorted[activity_columns] = df_sorted[activity_columns].fillna(0).astype(int)
    
    # Drop the temporary 'ACTIVITY_ORDER' column and the original activity_col
    df_final = df_sorted.drop(columns=['ACTIVITY_ORDER',activity_col])
    
    return df_final

In [8]:
def df_reorder_columns(df:pd.DataFrame, case_col:str, timestamp_col:str, outcome_col:str) -> pd.DataFrame:
    """
    Reorders the columns of the DataFrame with the following order: CaseID, TIMESTAMP, all other columns, OUTCOME.
    
    Parameters:
        df (pd.DataFrame): The input DataFrame to reorder.
        case_col (str): The name of the column representing case IDs.
        activity_col (str): The name of the column representing activities.
        timestamp_col (str): The name of the column representing timestamps.
    
    Returns:
        pd.DataFrame: The DataFrame with reordered columns.
    """
    # List of all columns in the DataFrame
    columns = list(df.columns)
    
    # Remove CaseID, TIMESTAMP, and OUTCOME from the list of columns
    columns.remove(case_col)
    columns.remove(timestamp_col)
    columns.remove(outcome_col)
    
    # Create the new columns order list
    new_columns_order = [case_col, timestamp_col] + columns + [outcome_col]
    
    # Reorder the DataFrame columns
    df_reordered = df[new_columns_order]
    
    return df_reordered

### MAIN ###

In [9]:
print()
print("*** PROGRAM START ***")
print()

start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()


*** PROGRAM START ***

Start process: 2024-06-12 10:35:17



In [10]:
print(">> Reading event log prefixes")
print()
print("Prefix list:", prefix_list)
print()
print("> Prefix size:", prefix_size)
if prefix_size in prefix_list:
    data_file_prefix = data_file.replace("DIM", str(prefix_size))
    path_data = Path(data_log_dir) / data_file_prefix
    print("> Reading:", path_data)
    df_log = pd.read_csv(path_data, sep = ";")
    print("Initial data shape:", df_log.shape) # rows x cols
    print("Event log columns (features):", df_log.columns)
    print("> Information on event log")
    # Number of distinct cases
    cases_num = df_log[log_key_col].nunique()
    print(f"Cases (distinct '{log_key_col}' values): {cases_num}")
    print()
else:
    print(f"ERROR! The prefix size {prefix_size} it's outside the interval {prefix_list}")
    quit()

>> Reading event log prefixes

Prefix list: [2, 3, 4, 5]

> Prefix size: 2
> Reading: data_log/EVENT-LOG_ED_filtered_2.csv
Initial data shape: (1390, 9)
Event log columns (features): Index(['CaseID', 'ACTIVITY', 'TIMESTAMP', 'RESOURCE', 'ESI', 'OUTCOME',
       'INPAT-HOSP-DEP', 'CumulativeTimeHours', 'CumulativeTimeDays'],
      dtype='object')
> Information on event log
Cases (distinct 'CaseID' values): 695



In [11]:
df_log

Unnamed: 0,CaseID,ACTIVITY,TIMESTAMP,RESOURCE,ESI,OUTCOME,INPAT-HOSP-DEP,CumulativeTimeHours,CumulativeTimeDays
0,2022090010,TRIAGE,2022-09-01 05:39:15,NURS_2,4,A domicilio,-,0,0
1,2022090010,PRESA IN CARICO,2022-09-01 05:51:00,DOCT_4,4,A domicilio,-,0,0
2,2022090014,TRIAGE,2022-09-01 06:42:25,NURS_3,4,A domicilio,-,0,0
3,2022090014,PRESA IN CARICO,2022-09-01 07:06:00,DOCT_0,4,A domicilio,-,0,0
4,2022090016,TRIAGE,2022-09-01 07:27:34,NURS_4,4,Ricoverato,MED. INTERNA AD IND. EMATOLOGICO RO,0,0
...,...,...,...,...,...,...,...,...,...
1385,2022093461,PRESA IN CARICO,2022-09-30 19:36:00,DOCT_14,4,A domicilio,-,0,0
1386,2022093462,TRIAGE,2022-09-30 20:00:53,NURS_4,3,A domicilio,-,0,0
1387,2022093462,PRESA IN CARICO,2022-09-30 20:37:00,DOCT_89',3,A domicilio,-,0,0
1388,2022093474,TRIAGE,2022-09-30 22:40:52,NURS_8,4,A domicilio,-,0,0


In [12]:
# Columns to be used in the encoding and prediction phases
print(">> Selecting columns (features) for the encoding and prediction phases")
df_log = df_log[col_enc]
print("Columns:", df_log.columns)

>> Selecting columns (features) for the encoding and prediction phases
Columns: Index(['CaseID', 'ACTIVITY', 'TIMESTAMP', 'OUTCOME'], dtype='object')


In [13]:
df_log

Unnamed: 0,CaseID,ACTIVITY,TIMESTAMP,OUTCOME
0,2022090010,TRIAGE,2022-09-01 05:39:15,A domicilio
1,2022090010,PRESA IN CARICO,2022-09-01 05:51:00,A domicilio
2,2022090014,TRIAGE,2022-09-01 06:42:25,A domicilio
3,2022090014,PRESA IN CARICO,2022-09-01 07:06:00,A domicilio
4,2022090016,TRIAGE,2022-09-01 07:27:34,Ricoverato
...,...,...,...,...
1385,2022093461,PRESA IN CARICO,2022-09-30 19:36:00,A domicilio
1386,2022093462,TRIAGE,2022-09-30 20:00:53,A domicilio
1387,2022093462,PRESA IN CARICO,2022-09-30 20:37:00,A domicilio
1388,2022093474,TRIAGE,2022-09-30 22:40:52,A domicilio


In [14]:
print(">> Encoding")
print("> Encoding the outcome")
outcome_mapping = {log_outcome_col_values[0]: 0, log_outcome_col_values[1]: 1}  
df_log = df_log.copy() 
df_log["OUTCOME_BIN"] = df_log[log_outcome_col].map(outcome_mapping)
df_log = df_log.drop(columns=[log_outcome_col]) # drop the old outcome in str format
df_log = df_log.rename(columns={"OUTCOME_BIN": log_outcome_col})

>> Encoding
> Encoding the outcome


In [15]:
df_log

Unnamed: 0,CaseID,ACTIVITY,TIMESTAMP,OUTCOME
0,2022090010,TRIAGE,2022-09-01 05:39:15,0
1,2022090010,PRESA IN CARICO,2022-09-01 05:51:00,0
2,2022090014,TRIAGE,2022-09-01 06:42:25,0
3,2022090014,PRESA IN CARICO,2022-09-01 07:06:00,0
4,2022090016,TRIAGE,2022-09-01 07:27:34,1
...,...,...,...,...
1385,2022093461,PRESA IN CARICO,2022-09-30 19:36:00,0
1386,2022093462,TRIAGE,2022-09-30 20:00:53,0
1387,2022093462,PRESA IN CARICO,2022-09-30 20:37:00,0
1388,2022093474,TRIAGE,2022-09-30 22:40:52,0


In [16]:
print("> Encoding the activity column")
print("Encoding type:", encoding)
if encoding == "B":
    df_encoded = binary_encoding(df_log, log_activity_col)
if encoding == "F":
    df_encoded = frequency_encoding(df_log, log_key_col, log_activity_col, log_timestamp_col)
if encoding == "I":
    df_encoded = index_encoding(df_log, log_key_col, log_activity_col, log_timestamp_col)

# Reorder the columns
df_encoded = df_reorder_columns(df_encoded, log_key_col, log_timestamp_col, log_outcome_col)

> Encoding the activity column
Encoding type: F


In [17]:
df_encoded

Unnamed: 0,CaseID,TIMESTAMP,ACTIVITY,ACTIVITY_ELETTROCARDIOGRAMMA,ACTIVITY_PRESA IN CARICO,ACTIVITY_TRIAGE,OUTCOME
0,2022090010,2022-09-01 05:39:15,TRIAGE,0,0,1,0
1,2022090010,2022-09-01 05:51:00,PRESA IN CARICO,0,1,0,0
2,2022090014,2022-09-01 06:42:25,TRIAGE,0,0,1,0
3,2022090014,2022-09-01 07:06:00,PRESA IN CARICO,0,1,0,0
4,2022090016,2022-09-01 07:27:34,TRIAGE,0,0,1,1
...,...,...,...,...,...,...,...
1385,2022093461,2022-09-30 19:36:00,PRESA IN CARICO,0,1,0,0
1386,2022093462,2022-09-30 20:00:53,TRIAGE,0,0,1,0
1387,2022093462,2022-09-30 20:37:00,PRESA IN CARICO,0,1,0,0
1388,2022093474,2022-09-30 22:40:52,TRIAGE,0,0,1,0


In [18]:
print(">> Saving the event log encoded")
# ordering
final_sampled_df = df_encoded.sort_values(by = [log_key_col, log_timestamp_col])
file_out = f"{Path(data_file_prefix).stem}_{encoding}.csv" # get the input file name and add "_{encoding}"
path_out = Path(data_log_encoded_dir) / file_out
print("Path:", path_out)
final_sampled_df.to_csv(path_out, sep=";", index=False)

>> Saving the event log encoded
Path: data_log_encoded/EVENT-LOG_ED_filtered_2_F.csv


In [19]:
# program END
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time)

print()
print("*** PROGRAM END ***")
print()


End process: 2024-06-12 10:35:17
Time to finish: 0:00:00

*** PROGRAM END ***

