In [256]:
# 03_log_encoding

In [257]:
# Force external modules to be reloaded on every cell execution
%reload_ext autoreload
%autoreload 2

In [258]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

In [259]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import extract_files

In [260]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")

# Directory settings, looked up by key in the loaded configuration and coerced to str
log_dir = str(yaml_config["LOG_DIR"])
prefix_dir = str(yaml_config["LOG_PREFIX_DIR"])
encoding_dir = str(yaml_config["LOG_ENCODING_DIR"])

# <-- INPUT: columns types (passed as `dtype` when the prefix CSV files are read)
dic_types = {
    "CaseID": object,
    "ACTIVITY": object,
    "TIMESTAMP": object,
    "RESOURCE": object,
    "ESI": int,
    "OUTCOME": object,
    "INPAT-HOSP-DEP": object,
    "CASE_DURATION_sec": float,
    "REMAINING_TIME_sec": float,
}

# <-- INPUT: columns to be cleaned as string
col_clean_list = ['ACTIVITY', 'RESOURCE', 'OUTCOME', 'INPAT-HOSP-DEP']

# <-- INPUT: the activity column to be encoded
col_activity = "ACTIVITY"

# <-- INPUT encoding type of the ACTIVITY column (B = Binary, F = Frequency)
encoding_type = "F"

# FUNCTIONS

In [261]:
def preprocess_column_values(df: pd.DataFrame, columns_to_preprocess: list) -> pd.DataFrame:
    """
    Normalise string columns: lowercase, spaces replaced by underscores, periods removed.

    The listed columns are overwritten on the DataFrame that is passed in, and that
    same DataFrame is returned. All column names are checked before anything is
    modified, so a missing column never leaves the frame partially cleaned.

    Parameters:
        df (pd.DataFrame): The DataFrame containing columns to preprocess.
        columns_to_preprocess (List[str]): A list of column names to preprocess.

    Returns:
        pd.DataFrame: The DataFrame with preprocessed column values.

    Raises:
        ValueError: If any of the requested columns is not present in `df`.
    """
    # Validate everything up front so a failure cannot leave earlier columns mutated.
    for column in columns_to_preprocess:
        if column not in df.columns:
            raise ValueError(f"The provided DataFrame does not contain a '{column}' column.")

    for column in columns_to_preprocess:
        # regex=False: '.' must be a literal period. The default of `regex` differs
        # between pandas versions, and as a regular expression '.' matches any character.
        df[column] = (
            df[column]
            .str.lower()
            .str.replace(' ', '_', regex=False)
            .str.replace('.', '', regex=False)
        )
    return df

In [262]:
def encode_feature_column_binary(df: pd.DataFrame, column_name: str, prefix: str) -> pd.DataFrame:
    """
    One-hot encode a single column and return the frame with that column replaced
    by its indicator columns.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to be encoded.
        column_name (str): The name of the column to one-hot encode.
        prefix (str): The prefix for the one-hot encoded columns.

    Returns:
        pd.DataFrame: The remaining original columns followed by the integer (0/1)
        indicator columns produced for `column_name`.

    Raises:
        ValueError: If `column_name` is not a column of `df`.
    """
    if column_name not in df.columns:
        raise ValueError(f"The provided DataFrame does not contain a '{column_name}' column.")

    # Indicator columns, cast to plain integers
    indicators = pd.get_dummies(df[column_name], prefix=prefix).astype(int)
    # Everything except the source column, which the indicators replace
    remaining = df.drop(columns=[column_name])
    return pd.concat([remaining, indicators], axis=1)

In [263]:
def encode_feature_column_frequency(df: pd.DataFrame, id_column: str, activity_column: str, prefix: str) -> pd.DataFrame:
    """
    Replace the activity column with per-case cumulative activity counts.

    One column named ``{prefix}_{activity}`` is added for every distinct value of
    the activity column, in order of first appearance. For each row it holds how
    many times that activity has occurred so far (current row included) among the
    rows sharing the same case ID, following the row order of `df`. The input
    DataFrame is left untouched.

    The counts are computed with a one-hot matrix and a grouped cumulative sum
    instead of a Python loop over every cell, and the computation does not depend
    on the index labels being unique.

    Parameters:
        df (pd.DataFrame): The original DataFrame.
        id_column (str): The name of the column containing case IDs.
        activity_column (str): The name of the column containing activities.
        prefix (str): The prefix to use for the new frequency columns.

    Returns:
        pd.DataFrame: The original columns (activity column removed) followed by
        the integer cumulative-frequency columns.

    Raises:
        KeyError: If `id_column` or `activity_column` is not a column of `df`.
    """
    activities = df[activity_column]
    # Group by raw values so the grouping is positional and never index-aligned.
    case_ids = df[id_column].to_numpy()

    # One counter column per distinct activity, in order of first appearance.
    frequency_columns = [f"{prefix}_{activity}" for activity in activities.unique()]

    # drop() returns a new frame, so the caller's DataFrame is never modified.
    remaining = df.drop(columns=[activity_column])
    if not frequency_columns:
        # Empty input: there is nothing to count.
        return remaining

    one_hot = (
        pd.get_dummies(activities, prefix=prefix, dummy_na=True)
        # Impose first-appearance column order; also drops the NaN indicator when unused.
        .reindex(columns=frequency_columns, fill_value=0)
        # Cast before summing so the running totals are not limited by a narrow dtype.
        .astype(int)
    )

    cumulative = (
        one_hot.groupby(case_ids).cumsum()
        # Rows whose case ID is missing belong to no group; keep their counters at 0.
        .fillna(0)
        .astype(int)
    )

    return pd.concat([remaining, cumulative], axis=1)

In [264]:
def move_columns_to_end(df: pd.DataFrame, columns_to_move: list) -> pd.DataFrame:
    """
    Reorder a DataFrame so that the given columns come last.

    Parameters:
        df (pd.DataFrame): The DataFrame from which columns are to be moved.
        columns_to_move (list): A list of column names to move to the end; they
            appear in the result in the order given here.

    Returns:
        pd.DataFrame: A DataFrame with the specified columns moved to the end.

    Raises:
        ValueError: If a requested column is not in `df` (the first missing one is named).
    """
    # Collect the requested names that the frame lacks; report the first one
    absent = [name for name in columns_to_move if name not in df.columns]
    if absent:
        raise ValueError(f"The provided DataFrame does not contain a '{absent[0]}' column.")

    # Columns that keep their relative position at the front
    leading = [name for name in df.columns if name not in columns_to_move]

    return df[leading + columns_to_move]

# MAIN

In [265]:
### MAIN ###
# Banner: blank line, title, blank line
print("", "*** PROGRAM START ***", "", sep="\n")

# Wall-clock start truncated to whole seconds; the closing cell subtracts it from the end time
start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time), end="\n\n")

# print(yaml_config) # debug


*** PROGRAM START ***

Start process: 2024-08-01 17:36:28



In [266]:
print(">> Creating output directories")
# Target directory for the encoded logs
directory_path = Path(encoding_dir)
# parents=True builds missing intermediate folders; exist_ok=True makes re-running this cell harmless
directory_path.mkdir(exist_ok=True, parents=True)
print(f"Directory '{directory_path}' created successfully.")

>> Creating output directories
Directory 'data_encoding' created successfully.


In [267]:
print(">> Listing event log prefixes files")
# Collect the "csv" entries that extract_files reports for the prefix directory
list_files = extract_files(prefix_dir, "csv")
list_files_len = len(list_files)
print("Files found:", list_files_len)
print(list_files)


>> Listing event log prefixes files
Files found: 5
['EVENT-LOG_ED_duration_prefix_1h.csv', 'EVENT-LOG_ED_duration_prefix_2h.csv', 'EVENT-LOG_ED_duration_prefix_3h.csv', 'EVENT-LOG_ED_duration_prefix_4h.csv', 'EVENT-LOG_ED_duration_prefix_5h.csv']


In [268]:
print(">> Encoding event log prefixes files")

# Reject an unknown encoding_type before any file is read. Otherwise the encoded
# frame would stay None and fail later with an unrelated-looking AttributeError.
if encoding_type not in ("B", "F"):
    raise ValueError(
        f"Unsupported encoding_type '{encoding_type}': use 'B' (binary) or 'F' (frequency)."
    )

# For every prefix file: read, clean the string columns, encode, reorder, save.
for file_name in list_files:
    path_data = Path(prefix_dir) / file_name
    print("File:", path_data)
    df_log = pd.read_csv(path_data, sep=";", dtype=dic_types)
    print("Event log shape:", df_log.shape)
    print("Event log cases:", df_log["CaseID"].nunique())
    print(df_log.columns)

    print("> Cleaning the event log")
    df_log = preprocess_column_values(df_log, col_clean_list)
    # Show the distinct cleaned values of each column as a visual sanity check
    for col_name in col_clean_list:
        print(col_name)
        print(df_log[col_name].unique().tolist())
        print("-")

    print(">> Encoding activity and features")
    # The activity column is encoded according to the configured encoding type
    if encoding_type == "B":
        df_log_encoded = encode_feature_column_binary(df_log, col_activity, col_activity)
    else:  # "F", guaranteed by the check above
        df_log_encoded = encode_feature_column_frequency(df_log, "CaseID", col_activity, col_activity)

    # The remaining categorical columns are always one-hot encoded, prefixed with their own name
    for col_name in ("RESOURCE", "OUTCOME", "INPAT-HOSP-DEP"):
        df_log_encoded = encode_feature_column_binary(df_log_encoded, col_name, col_name)

    # Put these three columns last, after all encoded feature columns
    df_log_encoded = move_columns_to_end(df_log_encoded, ['ESI', 'CASE_DURATION_sec', 'REMAINING_TIME_sec'])
    print("Encoded event log columns")
    print(df_log_encoded.columns)

    print(">> Saving event log")
    # Output name: input stem plus the encoding type suffix
    log_out = f"{Path(file_name).stem}_{encoding_type}.csv"
    path_out = Path(encoding_dir) / log_out
    print("File:", path_out)
    df_log_encoded.to_csv(path_out, sep=";", index=False)

>> Listing event log prefixes files
File: data_prefix/EVENT-LOG_ED_duration_prefix_1h.csv
Event log shape: (11289, 9)
Event log cases: 3478
Index(['CaseID', 'ACTIVITY', 'TIMESTAMP', 'RESOURCE', 'ESI', 'OUTCOME',
       'INPAT-HOSP-DEP', 'CASE_DURATION_sec', 'REMAINING_TIME_sec'],
      dtype='object')
> Cleaning the event log
ACTIVITY
['triage', 'presa_in_carico', 'laboratorio', 'tc_cranio', 'tac', 'consulenza', 'rx', 'dimissione', 'elettrocardiogramma', 'rx_thorax_/_th+lat', 'ecografrenovescicale', 'eco-doppler_tronchi_sovraortici', 'rx_thorax_bed', 'ecografia_addome_sup', 'ecografaddome_completo', 'controlli_pretrasf', 'obi', 'ecografia_addome_inferiore', 'esofagogastroduodenoscopia_[egd]', 'eco-doppler_arterioso_arti_inf', 'ecografia_scrotale/testicolare', 'ecografia_muscolo_tendinea', 'broncoscopia_con_fibre_ottiche', 'ecografia_cute_e_tessuto', 'controllo_pm-icd-loop', 'arcata_dentaria_completa_(sup_o_inf)', 'interventi_endoscopici_in_trachea', 'rimozione_endoscopica_di_dispositiv

In [269]:
# program end
# End time truncated to whole seconds, matching how the start time was recorded
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print("End process:", end_time)
print("Time to finish:", delta_time)

# Banner: blank line, title, blank line
print("", "*** PROGRAM END ***", "", sep="\n")

End process: 2024-08-01 17:36:49
Time to finish: 0:00:21

*** PROGRAM END ***

