# T4 Data leakage

In [33]:
import os
import pm4py
from utils import *

## Set up

### Setting up data to read in

In [34]:
log_dir_path = os.path.join('..', 'data', 'full_logs')

os.listdir(log_dir_path)
# path_to_log = os.path.join(path_to_logs, file_name)

['Helpdesk.xes',
 'BPI_Challenge_2012_W_Two_TS.xes',
 'PurchasingExample.xes',
 'BPI_Challenge_2017_W_Two_TS.xes',
 'ConsultaDataMining201618.xes',
 'cvs_pharmacy.xes']

In [35]:
log_names = [
    'BPI_Challenge_2012_W_Two_TS.xes',
    'PurchasingExample.xes',
    'BPI_Challenge_2017_W_Two_TS.xes',
    'ConsultaDataMining201618.xes',
    'cvs_pharmacy.xes'
]

## Helper functions

In [36]:
def group_lifecycle(log):

    # Sample data setup
    # log = pd.DataFrame( ... )

    # Step 0: Filter for relevant lifecycle transitions ("start", "complete", "completed")
    filtered_log = log[log["lifecycle:transition"].isin(["start", "complete", "completed"])].copy()

    # Step 1: Create a shifted version of the filtered dataframe to compare consecutive rows
    filtered_shifted = filtered_log.shift(-1)

    # Step 2: Identify pairs where the first row has "start" and the second has "complete" or "completed"
    mask = (
        (filtered_log["lifecycle:transition"] == "start") &
        (filtered_shifted["lifecycle:transition"].isin(["complete", "completed"]))
    )

    # Step 3: Ensure all columns except "lifecycle:transition" and "time:timestamp" are the same for the pair
    columns_to_compare = filtered_log.columns.difference(["lifecycle:transition", "time:timestamp"])
    same_values_mask = (
        filtered_log[columns_to_compare].eq(filtered_shifted[columns_to_compare]) |
        (filtered_log[columns_to_compare].isna() & filtered_shifted[columns_to_compare].isna())
    ).all(axis=1)

    # Step 4: Combine the two conditions (lifecycle transition and matching values in other columns)
    valid_pairs_mask = mask & same_values_mask

    # Step 5: Create a new column "event_id" and assign unique IDs to the valid pairs
    filtered_log["event_id"] = None  # Initialize with None

    # Assign unique IDs to each valid pair
    event_id_counter = 1
    for idx in filtered_log[valid_pairs_mask].index:
        filtered_log.at[idx, "event_id"] = event_id_counter  # Assign to "start" row
        filtered_log.at[idx + 1, "event_id"] = event_id_counter  # Assign to "complete" row
        event_id_counter += 1

    # The dataframe 'log' now has a unique 'event_id' for each valid consecutive "start" and "complete

    return filtered_log


def convert_timestamps(log):

    log = group_lifecycle(log)

    # slit logs in start and complete
    log_start = log[log['lifecycle:transition'] == 'start']
    log_compl = log[log['lifecycle:transition'] == 'complete']


    # rename and drop column
    log_start = log_start.rename(columns = {'time:timestamp': 'start_timestamp'})
    log_compl = log_compl.rename(columns = {'time:timestamp': 'end_timestamp'})

    # filter for only necessary columns
    key_cols = ['case:concept:name', 'concept:name', 'event_id']
    filter_cols = key_cols.copy()
    filter_cols.append('start_timestamp')
    log_start = log_start[filter_cols]
    
    # displaying
    # display(log_compl.shape, log_start.shape)
    log_merged = log_compl.merge(log_start, left_on=key_cols, right_on=key_cols, how='left')
    # display(log_merged[log_merged['case:concept:name'] == str(13)][['concept:name', 'case:concept:name', 'start_timestamp', 'end_timestamp']])  

    # return log_start, log_compl
    return log_merged

In [37]:
def get_data_leakage_share(logs, log_name, print_bool=False):
    
    extraction_key = 'starting'
    
    if extraction_key not in logs[log_name]['splits'].keys():
        extraction_key = 'full'

    train_log = logs[log_name]['splits'][extraction_key]['regular']['train']
    test_log = logs[log_name]['splits'][extraction_key]['regular']['test']

    timestamp = 'start_timestamp'
    # Step 1: Get the earliest timestamp from the test_log
    earliest_test_timestamp = test_log[timestamp].min()

    # Step 2: Count events in train_log with a timestamp past the earliest timestamp in test_log
    train_events_past_test_start = train_log[train_log[timestamp] > earliest_test_timestamp]
    num_train_events_past_test_start = len(train_events_past_test_start)

    # Step 3: Count the total number of events in logs
    num_train_events = len(train_log)
    num_test_events = len(test_log)

    share_of_train = num_train_events_past_test_start/num_train_events
    share_of_test  = num_train_events_past_test_start/num_test_events

    logs[log_name]['leakage'] = {
        'train':share_of_train,
        'test': share_of_test
        }

    if print_bool:
        # Print results
        print("Earliest timestamp in test_log:", earliest_test_timestamp)
        print("Number of events in train_log past this timestamp:", num_train_events_past_test_start)
        print("Total number of events in train_log:", num_train_events)
        print("Total number of events in test_log:", num_test_events)

        print('Share of intersecting events:')
        print(' - share of train log: ', share_of_train )
        print(' - share of test log: ', share_of_test)


    return logs

## Overalap calculation

In [38]:
logs = {}
target_dir_path = ''
criterium_value = 80

for log_name in log_names:
    path_to_log = os.path.join(log_dir_path, log_name)  
    
    log = pm4py.read_xes(path_to_log)
    log = convert_timestamps(log)
    
    logs = extract_split_logs(logs, log, log_name, target_dir_path, criterium_value, extract=False, cut_dates=None, write_out_logs = False)
    logs = get_data_leakage_share(logs, log_name, print_bool=False)


parsing log, completed traces :: 100%|██████████| 8616/8616 [00:03<00:00, 2521.25it/s]
parsing log, completed traces :: 100%|██████████| 608/608 [00:00<00:00, 2011.38it/s]
parsing log, completed traces :: 100%|██████████| 30276/30276 [00:10<00:00, 2955.77it/s]
parsing log, completed traces :: 100%|██████████| 954/954 [00:00<00:00, 3258.74it/s]
parsing log, completed traces :: 100%|██████████| 10000/10000 [00:05<00:00, 1921.76it/s]


In [39]:
# calculate overlap for running example from T3
log = logs['BPI_Challenge_2012_W_Two_TS.xes']['full_log']
log_name = 'example_t3'
extract = True
start_time = pd.to_datetime('2011-11-1 00:00:00').tz_localize('UTC')
end_time   = pd.to_datetime('2012-01-31 23:59:59').tz_localize('UTC')
cut_dates = [start_time, end_time]
logs = extract_split_logs(logs, log, log_name, target_dir_path, criterium_value, extract=extract, cut_dates=cut_dates, write_out_logs = False)
logs = get_data_leakage_share(logs, log_name, print_bool=False)


In [40]:
for log_name in logs.keys():
    print('\n', log_name)
    print(logs[log_name]['leakage'])



 BPI_Challenge_2012_W_Two_TS.xes
{'train': 0.08778355879292404, 'test': 0.3748666903661571}

 PurchasingExample.xes
{'train': 0.10483356714704757, 'test': 0.6431924882629108}

 BPI_Challenge_2017_W_Two_TS.xes
{'train': 0.028004197827726042, 'test': 0.12335068665290369}

 ConsultaDataMining201618.xes
{'train': 0.06735299249694643, 'test': 0.3341991341991342}

 cvs_pharmacy.xes
{'train': 0.11484507840544114, 'test': 0.4597712016639879}

 example_t3
{'train': 0.15385730858468677, 'test': 0.598589562764457}
