In [1]:
import os
import pm4py as pm
import numpy as np
import pandas as pd
#------------------------------------------------------------------
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
#------------------------------------------------------------------
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter

In [38]:
# Locations and file names used by the cells below.
# The XES log is read from `input_data_folder`; the CSV converted from it is written
# to the same folder. Labelled outputs go to `output_data_folder`.
input_data_folder = '../orig_logs'
output_data_folder = '../input_files'

in_filename_xes = 'sepsis_cases.xes'
in_filename_csv = 'sepsis_cases.csv'

In [39]:
# Import the XES log with pm4py's line-by-line importer, with its TIMESTAMP_SORT
# option switched on.
variant = xes_importer.Variants.LINE_BY_LINE
parameters = {variant.value.Parameters.TIMESTAMP_SORT: True}

xes_path = os.path.join(input_data_folder, in_filename_xes)
log = xes_importer.apply(xes_path, variant=variant, parameters=parameters)

In [40]:
# Convert the imported event log into a pandas DataFrame.
dataframe = log_converter.apply(
    log,
    variant=log_converter.Variants.TO_DATA_FRAME,
)

In [41]:
# Map the XES attribute names onto the column names the rest of the notebook uses.
xes_to_notebook_names = {
    'case:concept:name': 'Case ID',
    'concept:name': 'Activity',
    'time:timestamp': 'Complete Timestamp',
    'org:group': 'user',
}
dataframe.rename(columns=xes_to_notebook_names, inplace=True)

In [42]:
# Persist the converted log as CSV in the input folder. The frame's index is written
# too; the loading cell further down renames the resulting "Unnamed: 0" column.
csv_path = os.path.join(input_data_folder, in_filename_csv)
dataframe.to_csv(csv_path)

In [43]:
# Names of the key columns in the event table.
case_id_col = 'Case ID'
activity_col = 'Activity'
timestamp_col = 'Complete Timestamp'

# Name of the outcome column added by the labelling functions, and its two values.
label_col = 'label'
pos_label = 'deviant'
neg_label = 'regular'

In [44]:
# Minimum number of rows a categorical value must occur in to be kept as its own level;
# values seen less often are replaced with "other" further down.
category_freq_threshold = 10

In [45]:
# Event attributes (may change from event to event within a case).
dynamic_cat_cols = ['Activity', 'user']
dynamic_num_cols = ['CRP', 'LacticAcid', 'Leucocytes']

# Case attributes that are known from the start of a case.
static_cat_cols = [
    'Diagnose',
    'DiagnosticArtAstrup',
    'DiagnosticBlood',
    'DiagnosticECG',
    'DiagnosticIC',
    'DiagnosticLacticAcid',
    'DiagnosticLiquor',
    'DiagnosticOther',
    'DiagnosticSputum',
    'DiagnosticUrinaryCulture',
    'DiagnosticUrinarySediment',
    'DiagnosticXthorax',
    'DisfuncOrg',
    'Hypotensie',
    'Hypoxie',
    'InfectionSuspected',
    'Infusion',
    'Oligurie',
    'SIRSCritHeartRate',
    'SIRSCritLeucos',
    'SIRSCritTachypnea',
    'SIRSCritTemperature',
    'SIRSCriteria2OrMore',
]
static_num_cols = ['Age']

In [46]:
# Column groups derived from the attribute lists:
#   static_cols  - case attributes plus the case identifier
#   dynamic_cols - event attributes plus the event timestamp
#   cat_cols     - every categorical attribute, event-level first
static_cols = [*static_cat_cols, *static_num_cols, case_id_col]
dynamic_cols = [*dynamic_cat_cols, *dynamic_num_cols, timestamp_col]
cat_cols = [*dynamic_cat_cols, *static_cat_cols]

In [47]:
def extract_timestamp_features(group):
    """Add timing features and an event counter to the events of one case.

    Reads the module-level ``timestamp_col``.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case with a datetime column named ``timestamp_col``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` sorted by ascending timestamp with three extra columns:
        ``timesincelastevent`` (minutes since the previous event; NaN for the earliest
        event), ``timesincecasestart`` (minutes since the earliest event) and
        ``event_nr`` (1-based position in the sorted case).
    """
    one_minute = pd.Timedelta(1, 'm')

    def to_minutes(delta):
        return float(delta / one_minute)

    # Latest event first, so shift(-1) pairs every event with the one just before it in
    # time and iloc[-1] is the earliest event of the case.
    latest_first = group.sort_values(timestamp_col, ascending=False, kind='mergesort')
    stamps = latest_first[timestamp_col]

    latest_first["timesincelastevent"] = (stamps - stamps.shift(-1)).apply(to_minutes)
    latest_first["timesincecasestart"] = (stamps - stamps.iloc[-1]).apply(to_minutes)

    chronological = latest_first.sort_values(timestamp_col, ascending=True, kind='mergesort')
    chronological["event_nr"] = range(1, len(chronological) + 1)
    return chronological

In [48]:
def cut_before_activity(group, activity=None):
    """Return the events of one case that precede the first occurrence of an activity.

    Reads the module-level ``activity_col``.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case; the row order is used as given.
    activity : optional
        Activity value to cut at. When omitted, the module-level ``relevant_activity``
        is used, so the original one-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        The rows positioned before the first matching event, or ``group`` unchanged when
        the activity does not occur.

    Raises
    ------
    NameError
        If ``activity`` is omitted and no global ``relevant_activity`` exists.
    """
    if activity is None:
        # NOTE(review): no cell visible in this notebook defines `relevant_activity`;
        # pass `activity` explicitly or define the global before calling.
        activity = relevant_activity

    match_positions = np.where(group[activity_col] == activity)[0]
    if len(match_positions) == 0:
        return group
    # Positional slice: everything strictly before the first match.
    return group.iloc[:match_positions[0]]

In [49]:
def get_open_cases(date):
    """Count the cases that are open at ``date``.

    A case counts as open when ``start_time <= date < end_time``. Both columns are read
    from the module-level ``dt_first_last_timestamps`` frame (one row per case).

    Parameters
    ----------
    date
        Timestamp comparable with the ``start_time`` / ``end_time`` columns.

    Returns
    -------
    int
        Number of rows of ``dt_first_last_timestamps`` satisfying the condition.
    """
    already_started = dt_first_last_timestamps["start_time"] <= date
    not_yet_ended = dt_first_last_timestamps["end_time"] > date
    # Vectorised count; the built-in sum() would walk the Series element by element,
    # and this function is called once per event.
    return int((already_started & not_yet_ended).sum())

In [50]:
def check_if_activity_exists(group, activity):
    """Label one case by whether ``activity`` occurs, and cut the case before it.

    Reads the module-level ``activity_col``, ``label_col``, ``pos_label`` and
    ``neg_label``.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case; the row order is used as given.
    activity
        Activity value to look for.

    Returns
    -------
    pandas.DataFrame
        When the activity occurs: the rows before its first occurrence, with
        ``label_col`` set to ``pos_label``. Otherwise: all rows, with ``label_col`` set
        to ``neg_label``. The input frame is not modified.
    """
    match_positions = np.where(group[activity_col] == activity)[0]
    # assign() returns a new frame; functions passed to groupby.apply should not mutate
    # the group they receive.
    if len(match_positions) > 0:
        labelled = group.assign(**{label_col: pos_label})
        return labelled.iloc[:match_positions[0]]
    return group.assign(**{label_col: neg_label})

In [51]:
def check_if_activity_exists_and_time_less_than(group, activity, max_minutes=28 * 1440):
    """Label one case by whether ``activity`` occurs soon enough, and cut the case before it.

    Reads the module-level ``activity_col``, ``label_col``, ``pos_label`` and
    ``neg_label``, and the ``timesincelastevent`` column of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case; the row order is used as given.
    activity
        Activity value to look for.
    max_minutes : number, optional
        Largest ``timesincelastevent`` value of the first matching event that still
        yields ``pos_label``. Defaults to 28 * 1440, i.e. 28 days expressed in minutes.

    Returns
    -------
    pandas.DataFrame
        When the activity occurs: the rows before its first occurrence, labelled
        ``pos_label`` if that event's ``timesincelastevent`` is ``<= max_minutes`` and
        ``neg_label`` otherwise. When it does not occur: all rows, labelled
        ``neg_label``. The input frame is not modified.
    """
    match_positions = np.where(group[activity_col] == activity)[0]
    if len(match_positions) == 0:
        return group.assign(**{label_col: neg_label})

    first_match = match_positions[0]
    happened_in_time = group["timesincelastevent"].iloc[first_match] <= max_minutes
    label = pos_label if happened_in_time else neg_label
    # assign() returns a new frame; functions passed to groupby.apply should not mutate
    # the group they receive. The case is cut before the matching event either way.
    return group.assign(**{label_col: label}).iloc[:first_match]

In [52]:
def check_if_any_of_activities_exist(group, activities):
    """Return True when at least one event of ``group`` has an activity in ``activities``.

    Reads the module-level ``activity_col``.
    """
    return bool(group[activity_col].isin(activities).any())

In [53]:
# Load the converted log. The unnamed first CSV column is renamed to "event_nr", and
# events without a case identifier are grouped under a placeholder id.
data = (
    pd.read_csv(os.path.join(input_data_folder, in_filename_csv), sep=",")
    .rename(columns={"Unnamed: 0": "event_nr"})
)
data[case_id_col] = data[case_id_col].fillna("missing_caseid")

In [54]:
# remove incomplete cases: a case is complete only if it contains a release activity
release_activities = ["Release A", "Release B", "Release C", "Release D", "Release E"]
tmp = data.groupby(case_id_col).apply(
    check_if_any_of_activities_exist,
    activities=release_activities,
)
incomplete_cases = tmp[tmp == False].index
belongs_to_incomplete_case = data[case_id_col].isin(incomplete_cases)
data = data[~belongs_to_incomplete_case]

In [55]:
# Keep only the case and event attributes listed in the configuration cells.
# The explicit copy makes `data` an independent frame, so the column assignments in the
# following cells do not write into a slice of the frame loaded from CSV (which pandas
# reports as SettingWithCopyWarning).
data = data[static_cols + dynamic_cols].copy()

In [56]:
# add features extracted from timestamp
data[timestamp_col] = pd.to_datetime(data[timestamp_col], utc=True)
data["timesincemidnight"] = data[timestamp_col].dt.hour * 60 + data[timestamp_col].dt.minute
data["month"] = data[timestamp_col].dt.month
data["weekday"] = data[timestamp_col].dt.weekday
data["hour"] = data[timestamp_col].dt.hour
# group_keys=False keeps the original row index. Without it, recent pandas versions add
# the case id as an extra index level next to the "Case ID" column, which makes the
# groupby(case_id_col) calls in later cells ambiguous.
data = data.groupby(case_id_col, group_keys=False).apply(extract_timestamp_features)

In [57]:
# add inter-case features
data = data.sort_values([timestamp_col], ascending=True, kind='mergesort')
# First and last event time of every case. The aggregations are given by name; passing
# the builtins min/max is deprecated in recent pandas versions.
dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg(["min", "max"])
dt_first_last_timestamps.columns = ["start_time", "end_time"]
# For every event: how many cases had started but not yet ended at that moment.
data["open_cases"] = data[timestamp_col].apply(get_open_cases)

In [58]:
# impute missing values
# Within each case, in timestamp order, carry the last observed value forward.
grouped = data.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col)
for col in static_cols + dynamic_cols:
    # Series.ffill() is the supported spelling; fillna(method='ffill') is deprecated.
    data[col] = grouped[col].transform(lambda grp: grp.ffill())

In [59]:
# Values still missing after forward-filling: categorical attributes get the explicit
# level 'missing', everything else becomes 0.
categorical_part = data[cat_cols]
data[cat_cols] = categorical_part.fillna('missing')
data = data.fillna(0)

In [60]:
# Collapse categorical values that occur in fewer than `category_freq_threshold` rows
# into the single level "other".
for col in cat_cols:
    level_counts = data[col].value_counts()
    frequent_levels = level_counts[level_counts >= category_freq_threshold].index
    is_rare = ~data[col].isin(frequent_levels)
    data.loc[is_rare, col] = "other"

In [62]:
# first labeling, written with every column (before feature selection)
dt_labeledbf = (
    data.sort_values(timestamp_col, ascending=True, kind="mergesort")
    .groupby(case_id_col)
    .apply(check_if_activity_exists_and_time_less_than, activity="Return ER")
)
dt_labeledbf.to_csv(
    os.path.join(output_data_folder, "sepsis_cases_before.csv"),
    sep=",",
    index=False,
)

In [63]:
# Show (rows, columns) of the preprocessed event table.
data.shape

(13422, 39)

In [64]:
# Build the modelling frame: drop the identifying columns and the calendar features
# derived from the timestamp.
essencial_col1 = ['Case ID', 'Complete Timestamp', 'user']
additional_cols = ['timesincemidnight', 'month', 'weekday', 'hour']
columns_to_drop = essencial_col1 + additional_cols

dataf = data.drop(columns=columns_to_drop)

feature_cols = static_cat_cols + static_num_cols + ["Activity"]

# Split the remaining columns into numeric ones and categorical ones (everything else).
# The public dtype check replaces the private DataFrame._get_numeric_data(), and the
# list comprehension keeps the categorical columns in frame order instead of the
# arbitrary order of a set difference.
data_list = dataf.columns
is_numeric_col = [pd.api.types.is_numeric_dtype(dataf[col]) for col in data_list]
data_num_cols = data_list[is_numeric_col]
data_cat_cols = [col for col, numeric in zip(data_list, is_numeric_col) if not numeric]

In [65]:
# Display the non-numeric columns; these are the ones label-encoded further down.
data_cat_cols

['DisfuncOrg',
 'Oligurie',
 'InfectionSuspected',
 'DiagnosticIC',
 'SIRSCritTachypnea',
 'Hypoxie',
 'DiagnosticOther',
 'SIRSCriteria2OrMore',
 'Activity',
 'DiagnosticLacticAcid',
 'DiagnosticUrinarySediment',
 'SIRSCritHeartRate',
 'DiagnosticBlood',
 'Hypotensie',
 'DiagnosticXthorax',
 'DiagnosticSputum',
 'SIRSCritTemperature',
 'DiagnosticLiquor',
 'DiagnosticArtAstrup',
 'Diagnose',
 'DiagnosticUrinaryCulture',
 'Infusion',
 'DiagnosticECG',
 'SIRSCritLeucos']

In [66]:
# Drop events whose Diagnose or DiagnosticArtAstrup value is the 'missing' placeholder.
# The copy makes the filtered frame independent, so the label-encoding cell can assign
# columns on it without writing into a slice of the unfiltered frame.
dataf = dataf[~dataf.Diagnose.isin(['missing']) & ~dataf.DiagnosticArtAstrup.isin(['missing'])].copy()

In [67]:
# Label-encode every categorical column.
# data_cat_dict[column] maps each original value to its integer code; codes start at 1
# and follow the sorted order of the values present in the column.
data_cat_dict = {}
for _label in data_cat_cols:
    _index = {
        value: code
        for code, value in enumerate(sorted(set(dataf[_label].tolist())), start=1)
    }
    data_cat_dict[_label] = _index
    # Vectorised lookup on the one column, instead of a row-wise apply over the frame.
    dataf[_label] = dataf[_label].map(_index)

In [68]:
# Prediction target: the (label-encoded) activity of each event.
labels_activity = np.array(dataf['Activity'])

# Everything except the target is a feature.
dataf = dataf.drop(columns=['Activity'])

# Remember the feature names before the frame becomes a plain array.
data_list = dataf.columns.tolist()
dataf = np.array(dataf)

In [69]:
# Hold out 25% of the events for testing; the fixed seed makes the split repeatable.
train_features, test_features, train_labels, test_labels = train_test_split(
    dataf,
    labels_activity,
    test_size=0.25,
    random_state=42,
)

In [70]:
# Random forest trained on all features (1000 trees, fixed seed, all CPU cores).
clf = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
    n_jobs=-1,
)
clf.fit(train_features, train_labels)

RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0)

In [71]:
# Pair every feature name with its importance in the fitted forest ...
feature_list = list(zip(data_list, clf.feature_importances_))

# ... and keep them as a dict ordered from most to least important.
result = dict(sorted(dict(feature_list).items(), key=lambda name_score: name_score[1], reverse=True))

In [72]:
# Display the feature importances, largest first.
result

{'event_nr': 0.19652179878610154,
 'timesincecasestart': 0.16603291762528904,
 'timesincelastevent': 0.15659215822247052,
 'Leucocytes': 0.09520168092038814,
 'CRP': 0.0901182982089405,
 'LacticAcid': 0.0682098557399746,
 'open_cases': 0.05878026783010485,
 'Diagnose': 0.040121883532746566,
 'Age': 0.034871027545316235,
 'DiagnosticArtAstrup': 0.009793070261011097,
 'SIRSCritTachypnea': 0.009155887479117674,
 'DiagnosticUrinaryCulture': 0.00898765509923254,
 'DiagnosticUrinarySediment': 0.008523268512213342,
 'SIRSCritTemperature': 0.005794842186913388,
 'SIRSCritHeartRate': 0.005392555242714382,
 'DiagnosticECG': 0.005208364028590732,
 'Infusion': 0.004236784892930051,
 'DiagnosticXthorax': 0.004229179674688459,
 'SIRSCritLeucos': 0.004104135462359808,
 'Hypotensie': 0.003923020943745561,
 'DiagnosticLacticAcid': 0.003666933299168466,
 'DisfuncOrg': 0.003501430152508376,
 'DiagnosticBlood': 0.003219469071041575,
 'Oligurie': 0.002607188262054725,
 'DiagnosticSputum': 0.002499529249582

In [100]:
# Selector that keeps the features whose importance in the random forest reaches the
# 0.15 threshold.
sfm = SelectFromModel(estimator=clf, threshold=0.15)

# Fit the selector on the training split.
sfm.fit(train_features, train_labels)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=1000, n_jobs=-1,
                                                 random_state=0),
                threshold=0.15)

In [101]:
# Collect and print the names of the features the selector kept.
# NOTE(review): the output recorded for this cell lists Diagnose and Age, although the
# importances recorded above put both well below the 0.15 threshold - it looks like
# output from a different run; re-run the notebook top to bottom to confirm.
data_list_important = [data_list[position] for position in sfm.get_support(indices=True)]
for feature_name in data_list_important:
    print(feature_name)

Diagnose
Age


In [102]:
# Reduce both splits to the selected features; the same fitted selector has to be
# applied to the training and the test features.
X_important_train, X_important_test = (
    sfm.transform(split) for split in (train_features, test_features)
)

In [103]:
# Second random forest with the same settings, trained on the selected features only.
clf_important = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
    n_jobs=-1,
)
clf_important.fit(X_important_train, train_labels)

RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0)

In [104]:
# Predict the test split with the classifier trained on all features ...
y_pred = clf.predict(test_features)

# ... and report its accuracy.
accuracy_score(y_true=test_labels, y_pred=y_pred)

0.1441061199879409

In [105]:
# Predict the test split with the classifier trained on the selected features only ...
y_important_pred = clf_important.predict(X_important_test)

# ... and report its accuracy.
accuracy_score(y_true=test_labels, y_pred=y_important_pred)

0.1863129333735303

In [106]:
# Columns written to the labelled output files: the event-identifying columns, the
# label, and the features picked by the selector.
essencial_cols = ['Case ID', 'Activity', 'Complete Timestamp', 'user']
cols = [*essencial_cols, 'label', *data_list_important]

In [140]:
# first labeling: outcome based on "Return ER" and the time limit applied by the
# labelling function
dt_labeled1 = (
    data.sort_values(timestamp_col, ascending=True, kind="mergesort")
    .groupby(case_id_col)
    .apply(check_if_activity_exists_and_time_less_than, activity="Return ER")
)
dt_labeled1[cols].to_csv(
    os.path.join(output_data_folder, "sepsis_cases_1.csv"),
    sep=",",
    index=False,
)

In [122]:
# second labeling: outcome based on whether "Admission IC" occurs in the case
dt_labeled2 = (
    data.sort_values(timestamp_col, ascending=True, kind="mergesort")
    .groupby(case_id_col)
    .apply(check_if_activity_exists, activity="Admission IC")
)
dt_labeled2[cols].to_csv(
    os.path.join(output_data_folder, "sepsis_cases_2.csv"),
    sep=",",
    index=False,
)

In [123]:
# fourth labeling (the third one produced in this notebook; kept in dt_labeled3 and
# written as sepsis_cases_4.csv): outcome based on whether "Release A" occurs
dt_labeled3 = (
    data.sort_values(timestamp_col, ascending=True, kind="mergesort")
    .groupby(case_id_col)
    .apply(check_if_activity_exists, activity="Release A")
)
dt_labeled3[cols].to_csv(
    os.path.join(output_data_folder, "sepsis_cases_4.csv"),
    sep=",",
    index=False,
)

In [110]:
# Write the unlabelled events (identifying columns plus selected features).
# NOTE(review): a later cell writes dt_labeled[cols] to this same path, so whichever of
# the two cells runs last determines the content of sepsis_cases.csv.
data[essencial_cols + data_list_important].to_csv(os.path.join(output_data_folder, "sepsis_cases.csv"), sep=",", index=False)

In [111]:
# Stack the three labelled frames on top of each other.
pdList = [
    dt_labeled1,
    dt_labeled2,
    dt_labeled3,
]
dt_labeled = pd.concat(objs=pdList)

In [112]:
# Show (rows, columns) of the stacked labelled frames.
dt_labeled.shape

(36507, 40)

In [113]:
# Show (rows, columns) of the unlabelled event table, for comparison.
data.shape

(13422, 39)

In [114]:
# Keep one row per event. 'label' is not part of the key, so an event that appears in
# several of the stacked labelings is reduced to a single row.
event_key = ['Case ID', 'Activity', 'Complete Timestamp', 'user'] + data_list_important
dt_labeled = dt_labeled.drop_duplicates(subset=event_key)

In [115]:
#dt_labeled = dt_labeled.drop_duplicates(subset=essencial_cols+static_cat_cols+dynamic_num_cols+static_num_cols+['label'])

In [116]:
# Show (rows, columns) after removing duplicate events.
dt_labeled.shape

(13373, 40)

In [117]:
# Write the de-duplicated labelled events, restricted to the output columns.
combined_csv_path = os.path.join(output_data_folder, "sepsis_cases.csv")
dt_labeled[cols].to_csv(combined_csv_path, sep=",", index=False)

In [2]:
# Scratch example: turn a small DataFrame into a list of rows with the header first.
products = {
    'Product': ['Tablet', 'iPhone', 'Laptop', 'Monitor'],
    'Price': [250, 800, 1200, 300],
}
df = pd.DataFrame(products, columns=['Product', 'Price'])

products_list = [df.columns.tolist(), *df.values.tolist()]
print (products_list)

[['Product', 'Price'], ['Tablet', 250], ['iPhone', 800], ['Laptop', 1200], ['Monitor', 300]]


In [11]:
import pandas as pd

# Scratch example: print a small DataFrame as left-aligned, pipe-separated text rows.
products = {
    'Product': ['Tablet', 'iPhone', 'Laptop', 'Monitor'],
    'Price': [250, 800, 1200, 300],
}
df = pd.DataFrame(products, columns=['Product', 'Price'])

# Header row first, then one list per data row.
products_list = [df.columns.tolist(), *df.values.tolist()]
f = '{:<8}|{:<15}' # formatting

for row in products_list:
    print(f.format(*row))

Product |Price          
Tablet  |250            
iPhone  |800            
Laptop  |1200           
Monitor |300            


In [12]:
# Display the example frame.
df

Unnamed: 0,Product,Price
0,Tablet,250
1,iPhone,800
2,Laptop,1200
3,Monitor,300


In [13]:
# With inplace=False, rename returns a renamed copy (shown as this cell's output);
# `df` itself keeps its "Product" column.
df.rename(columns={"Product" : 'Predicted'}, inplace=False)

Unnamed: 0,Predicted,Price
0,Tablet,250
1,iPhone,800
2,Laptop,1200
3,Monitor,300


In [14]:
# Display `df` again: the column is still called "Product".
df

Unnamed: 0,Product,Price
0,Tablet,250
1,iPhone,800
2,Laptop,1200
3,Monitor,300
