In [123]:
# 03_log_encoding

In [124]:
# Automatically reload external modules before every cell execution
%reload_ext autoreload
%autoreload 2

In [125]:
### IMPORT ###
from pathlib import Path
from datetime import datetime
import pandas as pd

In [126]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import extract_files

In [127]:
### GLOBALS ###
# Notebook-wide settings. Directories and column dtypes come from the YAML
# configuration; the values marked "<-- INPUT" are meant to be edited by hand.
yaml_config = config_reader.config_read_yaml("config.yml", "config")

# Directory settings, coerced to plain strings.
# NOTE(review): log_dir is not used by any cell visible in this notebook.
log_dir = str(yaml_config["LOG_DIR"])
prefix_dir = str(yaml_config["LOG_PREFIX_DIR"]) 
encoding_dir = str(yaml_config["LOG_ENCODING_DIR"])

# Column -> dtype mapping, passed as `dtype` to pd.read_csv when the logs are loaded.
dic_types = dict(yaml_config["EVENT_LOG_TYPES"]) 

col_clean_list = ['ACTIVITY', 'RESOURCE', 'OUTCOME', 'INPAT-HOSP-DEP'] # <-- INPUT: columns to be cleaned as string

col_activity = "ACTIVITY" # <-- INPUT: the activity column to be encoded

encoding_type = "I" # <-- INPUT encoding type of the ACTIVITY column (B = Binary, F = Frequency, I = Index)

# FUNCTIONS

In [128]:
def preprocess_column_values(df: pd.DataFrame, columns_to_preprocess: list) -> pd.DataFrame:
    """
    Normalise the string values of the given columns.

    Every value is lowercased and each of the characters ``[ ] < ( ) .`` as well
    as the space is replaced by an underscore.

    Parameters:
        df (pd.DataFrame): The DataFrame containing columns to preprocess.
        columns_to_preprocess (List[str]): A list of column names to preprocess.

    Returns:
        pd.DataFrame: A copy of ``df`` with the listed columns normalised. The
        input DataFrame is left unmodified.

    Raises:
        ValueError: If any listed column is missing from ``df``. The check runs
        before any column is transformed.
    """
    invalid_chars = ['[', ']', '<', ' ', '(', ')', '.']

    # Validate everything up front so a missing column never yields a half-cleaned frame.
    for column in columns_to_preprocess:
        if column not in df.columns:
            raise ValueError(f"The provided DataFrame does not contain a '{column}' column.")

    # A translation table replaces all characters literally in a single pass,
    # independent of the pandas-version-specific `regex` default of str.replace.
    translation_table = str.maketrans({char: '_' for char in invalid_chars})

    df_clean = df.copy()
    for column in columns_to_preprocess:
        df_clean[column] = df_clean[column].str.lower().str.translate(translation_table)
    return df_clean

In [129]:
def encode_feature_column_binary(df: pd.DataFrame, column_name: str, prefix: str) -> pd.DataFrame:
    """
    One-hot encode a single column of the DataFrame.

    The column is expanded into one 0/1 integer column per distinct value
    (named ``<prefix>_<value>``). These indicator columns are appended after the
    remaining original columns, and the source column itself is dropped.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to be encoded.
        column_name (str): The name of the column to one-hot encode.
        prefix (str): The prefix for the one-hot encoded columns.

    Returns:
        pd.DataFrame: A new DataFrame holding the other original columns followed
        by the indicator columns.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    # Guard clause: fail fast when the requested column is absent.
    if column_name not in df.columns:
        raise ValueError(f"The provided DataFrame does not contain a '{column_name}' column.")

    # Indicator (dummy) columns, cast to integers.
    indicator_columns = pd.get_dummies(df[column_name], prefix=prefix).astype(int)

    # Everything except the encoded column, followed by its indicators.
    remaining_columns = df.drop(columns=[column_name])
    return pd.concat([remaining_columns, indicator_columns], axis=1)

In [130]:
def encode_feature_column_frequency(df: pd.DataFrame, id_column: str, activity_column: str, prefix: str) -> pd.DataFrame:
    """
    Add cumulative per-case activity counts to the event log.

    For every distinct (non-missing) value of the activity column a new column
    ``<prefix>_<activity>`` is added. On each row it holds how many times that
    activity has occurred so far in the row's case, counting the row itself.
    Rows are counted in the order they appear in ``df``; no sorting is applied.
    Rows whose case ID is missing receive 0 in every frequency column. The
    activity column is removed from the result.

    Parameters:
        df (pd.DataFrame): The original DataFrame.
        id_column (str): The name of the column containing case IDs.
        activity_column (str): The name of the column containing activities.
        prefix (str): The prefix to use for the new frequency columns.

    Returns:
        pd.DataFrame: A new DataFrame with the frequency columns added (in order
        of first appearance of each activity) and the activity column removed.
        The input DataFrame is not modified.
    """
    # drop() returns a new frame, so the caller's DataFrame is never touched.
    result = df.drop(columns=[activity_column])

    activity_values = df[activity_column]
    # unique() keeps first-appearance order; missing activities get no column.
    unique_activities = activity_values.dropna().unique()
    if len(unique_activities) == 0:
        return result

    # Rows without a case ID must not contribute to any count.
    has_case_id = df[id_column].notna().to_numpy()

    # One 0/1 indicator column per activity, built positionally (plain arrays)
    # so that a non-unique DataFrame index cannot cause misalignment.
    indicators = pd.DataFrame(
        {
            f"{prefix}_{activity}": (
                activity_values.eq(activity).fillna(False).to_numpy(dtype=bool) & has_case_id
            ).astype("int64")
            for activity in unique_activities
        }
    )

    # A running sum of the indicators within each case gives the cumulative frequency.
    # dropna=False keeps rows with a missing case ID in the output; their
    # indicators are all zero, so their counts stay 0.
    case_keys = df[id_column].to_numpy()
    cumulative = indicators.groupby(case_keys, dropna=False).cumsum()

    # Assign column by column so an existing column with the same name is overwritten.
    for column in cumulative.columns:
        result[column] = cumulative[column].to_numpy()

    return result

In [131]:
def encode_feature_column_index(df:pd.DataFrame, id_column:str, activity_col:str, timestamp_col:str, prefix:str) -> pd.DataFrame:
    """
    Perform simple index encoding on the given DataFrame.

    Events are sorted by case ID and timestamp and numbered 1, 2, 3, ... within
    each case. For every distinct activity a column ``<prefix>_<activity>`` is
    added: on each row, the column of the row's own activity holds the row's
    position within its case, and every other activity column holds 0.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the event log to be encoded.
        id_column (str): The name of the column representing case IDs.
        activity_col (str): The name of the column representing activities.
        timestamp_col (str): The name of the column representing timestamps.
        prefix (str): The prefix to use for the new index-encoded columns.

    Returns:
        pd.DataFrame: A new DataFrame, sorted by case ID and timestamp, with the
        index-encoded columns added and the activity column removed.
    """
    # Sort by case ID and timestamp; sort_values returns a new frame, so the
    # caller's DataFrame is not modified.
    df_sorted = df.sort_values(by=[id_column, timestamp_col])

    # 1-based position of each event within its case. It is kept as a standalone
    # Series rather than a temporary column, so it cannot clash with an encoded
    # column name (e.g. activity "ORDER" with prefix "ACTIVITY").
    event_position = df_sorted.groupby(id_column).cumcount() + 1

    # Create one column per activity and remember exactly which ones were created.
    encoded_columns = []
    for activity in df[activity_col].unique():
        column = f'{prefix}_{activity}'
        activity_mask = df_sorted[activity_col] == activity
        df_sorted[column] = activity_mask.astype(int) * event_position
        encoded_columns.append(column)

    # Normalise only the columns created above to plain integers. This uses the
    # tracked names, not a hard-coded 'ACTIVITY_' prefix.
    if encoded_columns:
        df_sorted[encoded_columns] = df_sorted[encoded_columns].fillna(0).astype(int)

    # Drop the original activity column
    return df_sorted.drop(columns=[activity_col])

In [132]:
def move_columns_to_end(df: pd.DataFrame, columns_to_move: list) -> pd.DataFrame:
    """
    Move specified columns to the end of the DataFrame.

    Requested columns that do not exist in ``df`` are reported with a warning
    and skipped. A column requested more than once is moved only once. The
    moved columns keep the order in which they were requested.

    Parameters:
        df (pd.DataFrame): The DataFrame from which columns are to be moved.
        columns_to_move (list): A list of column names to move to the end.

    Returns:
        pd.DataFrame: A DataFrame with the specified columns moved to the end.
    """
    # Keep only the requested columns that actually exist, without duplicates.
    trailing_columns = []
    seen = set()
    for column in columns_to_move:
        if column not in df.columns:
            print(f"WARNING! The provided DataFrame does not contain a '{column}' column.")
            continue
        if column in seen:
            continue
        seen.add(column)
        trailing_columns.append(column)

    # Columns that stay at the start, in their original order
    leading_columns = [col for col in df.columns if col not in seen]

    # Reorder the DataFrame columns
    return df[leading_columns + trailing_columns]

# MAIN

In [133]:
### MAIN ###
# Print the start banner and record when the run began.
print()
print("*** PROGRAM START ***")
print()

# Start time truncated to whole seconds; the final cell subtracts it from the end time.
start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()

# print(yaml_config) # debug


*** PROGRAM START ***

Start process: 2024-08-22 20:16:27



In [134]:
# Make sure the output directory for the encoded logs exists.
print(">> Creating output directories")
directory_path = Path(encoding_dir) # <-- Specify the directory to be created
# Create the directory together with any missing parents; exist_ok=True means
# no error is raised when it already exists.
directory_path.mkdir(parents=True, exist_ok=True)
# NOTE: this message is printed even when the directory already existed.
print(f"Directory '{directory_path}' created successfully.")

>> Creating output directories
Directory 'data_encoding' created successfully.


In [135]:
# Collect the prefix event-log files to encode.
print(">> Listing event log prefixes files")
# extract_files is a project helper from `utilities`; presumably it returns the
# names of the "csv" files found in prefix_dir — confirm against its definition.
list_files = extract_files(prefix_dir, "csv")
list_files_len = len(list_files)
print("Files found:", list_files_len)
print(list_files)

>> Listing event log prefixes files
Files found: 6
['EVENT-LOG_ED_duration_enr_prefix_1h.csv', 'EVENT-LOG_ED_duration_enr_prefix_2h.csv', 'EVENT-LOG_ED_duration_enr_prefix_3h.csv', 'EVENT-LOG_ED_duration_std_prefix_1h.csv', 'EVENT-LOG_ED_duration_std_prefix_2h.csv', 'EVENT-LOG_ED_duration_std_prefix_3h.csv']


In [136]:
# Encode every prefix event log: read it, clean the string columns, encode the
# activity column with the configured strategy, one-hot encode the categorical
# features and save the result to the encoding directory.
print(">> Encoding event log prefixes files")

# Fail early, before any file is read, when the configured encoding is unknown.
if encoding_type not in ("B", "F", "I"):
    raise ValueError(f"Unsupported encoding_type '{encoding_type}': expected 'B', 'F' or 'I'.")

for file_name in list_files:
    path_data = Path(prefix_dir) / file_name
    print("File:", path_data)
    df_log = pd.read_csv(path_data, sep=";", dtype=dic_types)
    print("Event log shape:", df_log.shape)
    print("Event log cases:", df_log["CaseID"].nunique())
    print(df_log.columns)
    print("> Cleaning the event log")
    df_log = preprocess_column_values(df_log, col_clean_list)
    for col_name in col_clean_list:
        print(col_name)
        print(df_log[col_name].unique().tolist())
        print("-")
    print(">> Encoding activity and features")
    # Exactly one activity encoding is applied (B = Binary, F = Frequency, I = Index).
    if encoding_type == "B":
        df_log_encoded = encode_feature_column_binary(df_log, col_activity, col_activity)
    elif encoding_type == "F":
        # NOTE(review): the frequency encoder counts events in file order and does
        # not sort by timestamp — confirm the prefix logs are already chronological.
        df_log_encoded = encode_feature_column_frequency(df_log, "CaseID", col_activity, col_activity)
    else:  # encoding_type == "I"
        df_log_encoded = encode_feature_column_index(df_log, "CaseID", col_activity, "TIMESTAMP", col_activity)
    # One-hot encode the categorical features; each column name doubles as its prefix.
    for col_feature in ("RESOURCE", "OUTCOME", "INPAT-HOSP-DEP"):
        df_log_encoded = encode_feature_column_binary(df_log_encoded, col_feature, col_feature)
    # 'CASE_DURATION_sec' is deliberately not moved: it does not appear in the
    # columns of every prefix log.
    df_log_encoded = move_columns_to_end(df_log_encoded, ['ESI', 'REMAINING_TIME_sec'])
    print("Encoded event log columns")
    print(df_log_encoded.columns)
    print(">> Saving event log")
    # Output name = input stem + encoding type suffix, e.g. "<stem>_I.csv"
    log_out = f"{Path(file_name).stem}_{encoding_type}.csv"
    path_out = Path(encoding_dir) / log_out
    print("File:", path_out)
    df_log_encoded.to_csv(path_out, sep=";", index=False)

>> Listing event log prefixes files
File: data_prefix/EVENT-LOG_ED_duration_enr_prefix_1h.csv
Event log shape: (11289, 16)
Event log cases: 3478
Index(['CaseID', 'ACTIVITY', 'TIMESTAMP', 'RESOURCE', 'ESI', 'OUTCOME',
       'INPAT-HOSP-DEP', 'REMAINING_TIME_sec', 'ACTIVE_CaseID', 'TIMESTAMP_HH',
       'COUNT-ESI-1', 'COUNT-ESI-2', 'COUNT-ESI-3', 'COUNT-ESI-4',
       'COUNT-ESI-5', 'CLUSTER'],
      dtype='object')
> Cleaning the event log
ACTIVITY
['triage', 'presa_in_carico', 'laboratorio', 'tc_cranio', 'tac', 'consulenza', 'rx', 'dimissione', 'elettrocardiogramma', 'rx_thorax_/_th+lat', 'ecograf_renovescicale', 'eco-doppler_tronchi_sovraortici', 'rx_thorax_bed', 'ecografia_addome_sup_', 'ecograf_addome_completo', 'controlli_pretrasf_', 'obi', 'ecografia_addome_inferiore', 'esofagogastroduodenoscopia__egd_', 'eco-doppler_arterioso_arti_inf_', 'ecografia_scrotale/testicolare', 'ecografia_muscolo_tendinea', 'broncoscopia_con_fibre_ottiche', 'ecografia_cute_e_tessuto', 'controllo_pm-ic

In [137]:
# Last event log created
# Displays the DataFrame left in `df_log_encoded` by the final iteration of the
# encoding loop; this cell only works after that loop has run.
df_log_encoded

Unnamed: 0,CaseID,TIMESTAMP,CASE_DURATION_sec,ACTIVITY_triage,ACTIVITY_presa_in_carico,ACTIVITY_laboratorio,ACTIVITY_tc_cranio,ACTIVITY_tac,ACTIVITY_dimissione,ACTIVITY_consulenza,...,INPAT-HOSP-DEP_oncologia_polmonare_ro,INPAT-HOSP-DEP_ortopedia_e_traumatologia_ro,INPAT-HOSP-DEP_pneumologia_ro__no_covid__,INPAT-HOSP-DEP_psichiatria_ro,INPAT-HOSP-DEP_rianimazione_covid,INPAT-HOSP-DEP_s_s__u_t_i_c__ro,INPAT-HOSP-DEP_terapia_onco_ematologica_intensiva_trapianto_ro,INPAT-HOSP-DEP_urologia_ro,ESI,REMAINING_TIME_sec
0,2022090001,2022-09-01 00:03:55,41105.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,41105.0
1,2022090001,2022-09-01 00:22:00,41105.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,40020.0
2,2022090001,2022-09-01 00:28:21,41105.0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,3,39639.0
3,2022090001,2022-09-01 00:38:00,41105.0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,3,39060.0
4,2022090001,2022-09-01 00:38:00,41105.0,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,3,39060.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15948,2022093478,2022-10-01 00:15:34,2562.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,326.0
15949,2022093478,2022-10-01 00:21:00,2562.0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,3,0.0
15950,2022093479,2022-09-30 23:58:38,27322.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,27322.0
15951,2022093479,2022-10-01 00:51:00,27322.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,24180.0


In [138]:
# program end
# Report the end time and the total elapsed time since `start_time` was
# recorded in the program-start cell.
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print("End process:", end_time)
print("Time to finish:", delta_time)

print()
print("*** PROGRAM END ***")
print()

End process: 2024-08-22 20:16:50
Time to finish: 0:00:23

*** PROGRAM END ***

