In [1]:
import pandas as pd
import numpy as np
import os
import pm4py as pm
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter

In [2]:
# Folders: the raw log is read from (and its CSV export written to) the input
# folder; the labeled datasets are written to the output folder.
input_data_folder, output_data_folder = "../orig_logs", "../input_files"

# File names of the XES source log and of its CSV export.
in_filename_xes, in_filename_csv = "sepsis_cases.xes", "sepsis_cases.csv"

In [3]:
# Import the XES event log with pm4py's line-by-line importer variant.
variant = xes_importer.Variants.LINE_BY_LINE
# TIMESTAMP_SORT is switched on for this variant; presumably the importer then
# orders the events by timestamp -- TODO confirm against the pm4py documentation.
parameters = {variant.value.Parameters.TIMESTAMP_SORT: True}
log = xes_importer.apply(os.path.join(input_data_folder, in_filename_xes),
                         variant=variant, parameters=parameters)

In [4]:
# Convert the imported log object into a dataframe (TO_DATA_FRAME converter variant).
dataframe = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

In [5]:
# Rename the XES attribute names to the column names used by the rest of this
# notebook.  The result is rebound to `dataframe` instead of using
# inplace=True, so the cell yields a new frame rather than mutating the
# converter's output object.
dataframe = dataframe.rename(columns={
    'case:concept:name': 'Case ID',
    'concept:name': 'Activity',
    'time:timestamp': 'Complete Timestamp',
    'org:group': 'user',
})

In [6]:
# Persist the renamed event table as CSV in the input folder.  The row index
# is written as well (the pandas default, spelled out here).
dataframe.to_csv(
    os.path.join(input_data_folder, in_filename_csv),
    index=True,
)

In [7]:
# Names of the core event-log columns.
case_id_col, activity_col, timestamp_col = "Case ID", "Activity", "Complete Timestamp"

# Name of the outcome column written by the labeling functions, and its two values.
label_col = "label"
pos_label, neg_label = "deviant", "regular"

In [8]:
# Minimum number of occurrences a categorical value needs in order to be kept;
# values seen fewer times are replaced by "other" later in this notebook.
category_freq_threshold = 10

In [9]:
# Event attributes, split into categorical and numeric ones.
dynamic_cat_cols = ["Activity", "user"]
dynamic_num_cols = ["CRP", "LacticAcid", "Leucocytes"]

# Case attributes that are known from the start, categorical ...
static_cat_cols = [
    "Diagnose",
    "DiagnosticArtAstrup",
    "DiagnosticBlood",
    "DiagnosticECG",
    "DiagnosticIC",
    "DiagnosticLacticAcid",
    "DiagnosticLiquor",
    "DiagnosticOther",
    "DiagnosticSputum",
    "DiagnosticUrinaryCulture",
    "DiagnosticUrinarySediment",
    "DiagnosticXthorax",
    "DisfuncOrg",
    "Hypotensie",
    "Hypoxie",
    "InfectionSuspected",
    "Infusion",
    "Oligurie",
    "SIRSCritHeartRate",
    "SIRSCritLeucos",
    "SIRSCritTachypnea",
    "SIRSCritTemperature",
    "SIRSCriteria2OrMore",
]
# ... and numeric.
static_num_cols = ["Age"]

In [10]:
# Combined column groups: all case-level columns (plus the case id), all
# event-level columns (plus the timestamp), and every categorical column.
static_cols = [*static_cat_cols, *static_num_cols, case_id_col]
dynamic_cols = [*dynamic_cat_cols, *dynamic_num_cols, timestamp_col]
cat_cols = [*dynamic_cat_cols, *static_cat_cols]

In [11]:
def extract_timestamp_features(group):
    """Add elapsed-time features and a running event number to one case.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case; must contain the ``timestamp_col`` column
        holding datetime values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``group`` sorted by ascending timestamp
        and three extra columns:

        * ``timesincelastevent`` -- minutes since the previous event of the
          case (NaN for the earliest event, which has no predecessor),
        * ``timesincecasestart`` -- minutes since the earliest event,
        * ``event_nr`` -- 1-based position of the event in the sorted case.
    """
    def _as_minutes(delta):
        # Express one timedelta as a float number of minutes.
        return float(delta / pd.Timedelta(1, 'm'))

    # Newest first: the chronologically previous event is then the next row.
    newest_first = group.sort_values(timestamp_col, ascending=False, kind='mergesort')
    stamps = newest_first[timestamp_col]

    newest_first["timesincelastevent"] = (stamps - stamps.shift(-1)).apply(_as_minutes)
    # The last row in this ordering is the earliest event, i.e. the case start.
    newest_first["timesincecasestart"] = (stamps - stamps.iloc[-1]).apply(_as_minutes)

    # Back to chronological order (stable sort) before numbering the events.
    chronological = newest_first.sort_values(timestamp_col, ascending=True, kind='mergesort')
    chronological["event_nr"] = range(1, len(chronological) + 1)

    return chronological

In [12]:
def cut_before_activity(group, activity=None):
    """Return the events of a case that precede the first occurrence of an activity.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case, in the order in which they should be scanned;
        must contain the ``activity_col`` column.
    activity : optional
        Activity name to cut at.  When omitted, the module-level
        ``relevant_activity`` variable is used (the original behaviour); that
        name is not defined in the visible part of this notebook, so it must
        exist in the session or a NameError is raised.

    Returns
    -------
    pandas.DataFrame
        The rows positioned before the first matching event, or ``group``
        itself when the activity never occurs.
    """
    if activity is None:
        # Backward-compatible fallback to the global the function used to require.
        activity = relevant_activity
    relevant_activity_idxs = np.where(group[activity_col] == activity)[0]
    if len(relevant_activity_idxs) == 0:
        return group
    # Positional slice: everything strictly before the first match.
    return group.iloc[:relevant_activity_idxs[0]]

In [13]:
def get_open_cases(date):
    """Count the cases that are open at ``date``.

    A case counts as open when its ``start_time`` is at or before ``date`` and
    its ``end_time`` is strictly after ``date``.  The case spans are read from
    the module-level ``dt_first_last_timestamps`` frame, which must be defined
    before this function is called.

    Parameters
    ----------
    date
        Point in time, comparable with the ``start_time`` / ``end_time`` columns.

    Returns
    -------
    Integer count of open cases.
    """
    spans = dt_first_last_timestamps
    is_open = (spans["start_time"] <= date) & (spans["end_time"] > date)
    # Vectorized count: the builtin sum() would walk the boolean Series element
    # by element in Python, and this function is called once per event.
    return is_open.sum()

In [14]:
def check_if_activity_exists(group, activity):
    """Label a case by whether ``activity`` occurs and cut it before that event.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case, in the order in which they should be scanned;
        must contain the ``activity_col`` column.
    activity
        Activity name to look for.

    Returns
    -------
    pandas.DataFrame
        If ``activity`` occurs: the rows before its first occurrence, with
        ``label_col`` set to ``pos_label``.  Otherwise: all rows, with
        ``label_col`` set to ``neg_label``.

    Notes
    -----
    The label is written to a copy, so the frame passed in is left untouched
    (the previous version added the label column to its argument).
    """
    labeled = group.copy()
    relevant_activity_idxs = np.where(labeled[activity_col] == activity)[0]
    if len(relevant_activity_idxs) == 0:
        labeled[label_col] = neg_label
        return labeled
    labeled[label_col] = pos_label
    # Keep only what happened strictly before the first matching event.
    return labeled.iloc[:relevant_activity_idxs[0]]

In [15]:
def check_if_activity_exists_and_time_less_than(group, activity, max_minutes=28 * 1440):
    """Label a case by whether ``activity`` occurs soon after the preceding event.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case, in the order in which they should be scanned;
        must contain the ``activity_col`` and ``"timesincelastevent"`` columns.
    activity
        Activity name to look for.
    max_minutes : number, optional
        Largest ``timesincelastevent`` value of the first matching event that
        still yields a positive label.  Defaults to ``28 * 1440``, i.e. 28 days
        when that column is expressed in minutes.

    Returns
    -------
    pandas.DataFrame
        If ``activity`` occurs: the rows before its first occurrence, labeled
        ``pos_label`` when that event's ``timesincelastevent`` is at most
        ``max_minutes`` and ``neg_label`` otherwise.  If it never occurs: all
        rows, labeled ``neg_label``.

    Notes
    -----
    The label is written to a copy, so the frame passed in is left untouched
    (the previous version added the label column to its argument).
    """
    labeled = group.copy()
    relevant_activity_idxs = np.where(labeled[activity_col] == activity)[0]
    if len(relevant_activity_idxs) == 0:
        labeled[label_col] = neg_label
        return labeled

    idx = relevant_activity_idxs[0]
    within_limit = labeled["timesincelastevent"].iloc[idx] <= max_minutes
    labeled[label_col] = pos_label if within_limit else neg_label
    # Both outcomes keep only what happened strictly before the matching event.
    return labeled.iloc[:idx]

In [16]:
def check_if_any_of_activities_exist(group, activities):
    """Tell whether at least one event of ``group`` has an activity from ``activities``.

    Parameters
    ----------
    group : pandas.DataFrame
        Events to inspect; must contain the ``activity_col`` column.
    activities : list-like
        Activity names to look for.

    Returns
    -------
    bool
        True if any row's activity is contained in ``activities``, else False.
    """
    matches = group[activity_col].isin(activities)
    return bool(matches.any())

In [17]:
# Read the CSV export back in; the unnamed first column (the index written at
# export time) is renamed to "event_nr".
data = (
    pd.read_csv(os.path.join(input_data_folder, in_filename_csv), sep=",")
    .rename(columns={"Unnamed: 0": "event_nr"})
)
# Events without a case id are collected under a placeholder id.
data[case_id_col] = data[case_id_col].fillna("missing_caseid")

In [18]:
# Remove incomplete cases, i.e. cases that contain none of the release activities.
has_release = data.groupby(case_id_col).apply(
    check_if_any_of_activities_exist,
    activities=["Release A", "Release B", "Release C", "Release D", "Release E"],
)
incomplete_cases = has_release.index[~has_release]
data = data[~data[case_id_col].isin(incomplete_cases)]

In [19]:
# Keep only the modelled columns.  The explicit copy detaches the result from
# the filtered frame it is taken from, so the column assignments made on
# `data` afterwards write to an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
data = data[static_cols + dynamic_cols].copy()

In [20]:
# add features extracted from timestamp
data[timestamp_col] = pd.to_datetime(data[timestamp_col], utc=True)
data["timesincemidnight"] = data[timestamp_col].dt.hour * 60 + data[timestamp_col].dt.minute
data["month"] = data[timestamp_col].dt.month
data["weekday"] = data[timestamp_col].dt.weekday
data["hour"] = data[timestamp_col].dt.hour
# group_keys=False keeps the case id out of the result's index.  With the
# pandas >= 2.0 default (group_keys=True) the case id would become an index
# level while also remaining a column, and the later groupby(case_id_col)
# calls would fail because the label is ambiguous.
data = data.groupby(case_id_col, group_keys=False).apply(extract_timestamp_features)

In [21]:
# add inter-case features
data = data.sort_values([timestamp_col], ascending=True, kind='mergesort')
# First and last timestamp of every case.  The aggregations are given by name
# (passing the builtins min/max to agg raises a FutureWarning in recent
# pandas), and named aggregation labels the result columns directly.
dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg(
    start_time="min",
    end_time="max",
)
# Number of cases open at the moment of each event.
data["open_cases"] = data[timestamp_col].apply(get_open_cases)

In [22]:
# impute missing values: within each case, carry the last known value forward
# in timestamp order
grouped = data.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col)
for col in static_cols + dynamic_cols:
    # Series.ffill() is the supported spelling of the deprecated
    # fillna(method='ffill').
    data[col] = grouped[col].transform(lambda grp: grp.ffill())

In [23]:
# Whatever is still missing gets a constant: 'missing' in the categorical
# columns, 0 in every other column.
data = data.fillna({cat_col: 'missing' for cat_col in cat_cols}).fillna(0)

In [24]:
# Collapse rare categories: in every categorical column, values that occur
# fewer than category_freq_threshold times are replaced by "other".
for col in cat_cols:
    frequencies = data[col].value_counts()
    frequent_values = frequencies.index[frequencies >= category_freq_threshold]
    is_rare = ~data[col].isin(frequent_values)
    data.loc[is_rare, col] = "other"

In [25]:
# first labeling: based on "Return ER" and the time since the preceding event
dt_labeled = (
    data
    .sort_values(timestamp_col, ascending=True, kind="mergesort")
    .groupby(case_id_col)
    .apply(check_if_activity_exists_and_time_less_than, activity="Return ER")
)
dt_labeled.to_csv(
    os.path.join(output_data_folder, "sepsis_cases_1.csv"),
    sep=",",
    index=False,
)

In [26]:
# second labeling: based on the occurrence of "Admission IC"
dt_labeled = (
    data
    .sort_values(timestamp_col, ascending=True, kind="mergesort")
    .groupby(case_id_col)
    .apply(check_if_activity_exists, activity="Admission IC")
)
dt_labeled.to_csv(
    os.path.join(output_data_folder, "sepsis_cases_2.csv"),
    sep=",",
    index=False,
)

In [27]:
# third labeling (written to sepsis_cases_3.csv): based on the occurrence of "Release A"
dt_labeled = (
    data
    .sort_values(timestamp_col, ascending=True, kind="mergesort")
    .groupby(case_id_col)
    .apply(check_if_activity_exists, activity="Release A")
)
dt_labeled.to_csv(
    os.path.join(output_data_folder, "sepsis_cases_3.csv"),
    sep=",",
    index=False,
)