### Make top-level imports

* PM4Py
* OrdinoR (*note: from local repo*)

In [3]:
from os.path import join as path_join

from itertools import product

import pandas as pd
import pm4py

import ordinor.constants as const

### Import original event log data file

- Resource labels refer to the organizational group labels (i.e., `org:group`) in the sepsis log

In [4]:
DIRPATH = './data/DATA_csv'
LOGNAME = 'sepsis'

from ordinor.io import read_csv
from ordinor.utils.log_preprocessing import append_case_duration

fn = path_join(DIRPATH, f'{LOGNAME}.csv')
print(f'Import source event log {fn}')

# Set the resource label manually (the organizational group is used as the resource id).
# A failing import is deliberately left to propagate: catching and merely printing it
# would leave `log` undefined on a fresh kernel -- or silently bound to a stale frame
# from an earlier run of this cell -- so everything below would fail or use wrong data.
log = read_csv(fn, resource_id=const.GROUP)

# NOTE: the sepsis log records a case with case id "NA", which Pandas parses as a
# missing value; after import it shows up in the case id column as the string 'nan'.
# Manually remap that case id to the explicit label "NA (not nan)".
log.loc[log[const.CASE_ID] == 'nan', const.CASE_ID] = 'NA (not nan)'

print(sorted(log.columns))
print(log[const.CASE_ID].unique())

Import source event log ./data/DATA_csv/sepsis.csv
Importing from CSV file ./data/DATA_csv/sepsis.csv
Scanned 15214 events from "./data/DATA_csv/sepsis.csv".
--------------------------------------------------------------------------------
Number of events:		15190
Number of cases:		1049
--------------------------------------------------------------------------------
['@@case_index', '@@index', 'Age', 'CRP', 'Diagnose', 'DiagnosticArtAstrup', 'DiagnosticBlood', 'DiagnosticECG', 'DiagnosticIC', 'DiagnosticLacticAcid', 'DiagnosticLiquor', 'DiagnosticOther', 'DiagnosticSputum', 'DiagnosticUrinaryCulture', 'DiagnosticUrinarySediment', 'DiagnosticXthorax', 'DisfuncOrg', 'Hypotensie', 'Hypoxie', 'InfectionSuspected', 'Infusion', 'LacticAcid', 'Leucocytes', 'Oligurie', 'SIRSCritHeartRate', 'SIRSCritLeucos', 'SIRSCritTachypnea', 'SIRSCritTemperature', 'SIRSCriteria2OrMore', 'Unnamed: 0', 'case:concept:name', 'concept:name', 'lifecycle:transition', 'org:resource', 'time:timestamp']
<StringArray>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[


### Data preprocessing (generic)

- Annotate events to cluster activities:
    - based on whether they belong to the "medical" or the "logistical" category (see data description doc)
    - based on which "phase" they are concerned with (see data description doc)
- Annotate cases:
    - "case:returning": based on whether cases are related to returning patients
    - "case:release_type": based on which release variant cases ended up with
- Propagate case-level attributes to every event of the same case, storing them in new columns prefixed with `case:` (see data description doc)

In [5]:
# Annotate events to cluster activities.
# Each row reads: (activity label, medical/logistical category, phase).
_ACTIVITY_CLUSTER_ROWS = (
    ('ER Registration', 'logistical', 'registration and triaging'),
    ('ER Triage', 'logistical', 'registration and triaging'),
    ('ER Sepsis Triage', 'logistical', 'registration and triaging'),
    ('Leucocytes', 'medical', 'measurement'),
    ('CRP', 'medical', 'measurement'),
    ('LacticAcid', 'medical', 'measurement'),
    ('Admission IC', 'logistical', 'admission or transfer'),
    ('Admission NC', 'logistical', 'admission or transfer'),
    ('Release A', 'logistical', 'discharge'),
    ('Release B', 'logistical', 'discharge'),
    ('Release C', 'logistical', 'discharge'),
    ('Release D', 'logistical', 'discharge'),
    ('Release E', 'logistical', 'discharge'),
    ('Return ER', 'logistical', 'returning'),
    ('IV Liquid', 'medical', 'giving infusions'),
    ('IV Antibiotics', 'medical', 'giving infusions'),
)
# Mapping: activity label -> {'category': ..., 'phase': ...}, in the row order above.
ACTIVITY_CLUSTERS = {
    activity: {'category': category, 'phase': phase}
    for activity, category, phase in _ACTIVITY_CLUSTER_ROWS
}
# Attach both cluster annotations to every event: first the medical/logistical
# category, then the phase. An activity label missing from ACTIVITY_CLUSTERS
# raises a KeyError.
activity_labels = log[const.ACTIVITY]
for annotation in ('category', 'phase'):
    log.loc[:, annotation] = activity_labels.apply(
        lambda label, key=annotation: ACTIVITY_CLUSTERS[label][key]
    )

# Annotate cases (all events within the same case receive the same value):
#   - 'case:returning':    True if the case contains a 'Return ER' event, else False
#   - 'case:release_type': the single 'Release ...' activity of the case,
#                          or 'not released' if the case has none
# Raises ValueError if a case contains more than one distinct release activity.
for case, events in log.groupby(const.CASE_ID):
    unique_activities = events[const.ACTIVITY].unique()
    # Build the row selector for this case once and reuse it for both annotations
    in_case = log[const.CASE_ID] == case
    # returning patients?
    log.loc[in_case, 'case:returning'] = 'Return ER' in unique_activities
    # release type
    release_activity = [a for a in unique_activities if a.startswith('Release')]
    if len(release_activity) > 1:
        # Message reads "len of <labels> is <count>": labels first, then their number
        raise ValueError('len of {} is {}'.format(release_activity, len(release_activity)))
    log.loc[in_case, 'case:release_type'] = (
        release_activity[0] if release_activity else 'not released'
    )

# Populate event attributes that are case-level attributes
MED_ATTRS = [col for col in log.columns if col.startswith('Diagnostic')]
MED_ATTRS += [
    'DisfuncOrg', 'Hypotensie', 'Hypoxie', 'InfectionSuspected', 'Infusion', 'Oligurie',
    'SIRSCritHeartRate', 'SIRSCritLeucos', 'SIRSCritTachypnea', 'SIRSCritTemperature', 'SIRSCriteria2OrMore'
]

# Pre-check: are all of these attributes indeed "case-level"?
#     [YES] -- per the author's one-off check (not re-run here): ignoring missing
#     values, each case showed exactly one combination of values for these attributes.

# For every case, take the first *non-missing* value of each attribute and copy it to
# all events of that case under a new 'case:<attr>' column. GroupBy 'first' skips
# missing values, so a case whose first event does not carry the attribute still gets
# the recorded value instead of NaN; cases with no recorded value at all stay missing.
case_level_values = log.groupby(const.CASE_ID)[MED_ATTRS].transform('first')
for attr in MED_ATTRS:
    log[f'case:{attr}'] = case_level_values[attr]

# Persist the annotated log for later use
log.to_csv(f'data/{LOGNAME}.preprocessed.csv')

### Experiments

##### Preprocessing

- Filter out meaningless resource labels: `?`

##### Config
- Determine CT based on both the original and the "derived" trace attributes (as in the dataset description)
    
    case:returning, case:release_type

- Determine AT by
    
    concept:name (activity label), phase, category
    
- Determine TT by
    
    year, month, weekday
    
- Discover resource groups using AHC, number of groups automatically determined (via CV)

- Apply OverallScore, parameters automatically determined (via Grid Search)