In [None]:
import os

import numpy as np
import pandas as pd
import pm4py

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Location of the BPI Challenge 2017 event log (XES). Set the BPI2017_XES_PATH
# environment variable to point at a local copy; the fallback is the path that
# was originally hard-coded here, so existing setups keep working unchanged.
XES_PATH = os.environ.get(
    "BPI2017_XES_PATH",
    "/Users/simonimmler/PycharmProjects/Praktikum/data/BPI Challenge 2017.xes",
)
if not os.path.isfile(XES_PATH):
    # Fail early with an actionable message instead of an error from inside the parser.
    raise FileNotFoundError(
        f"Event log not found at {XES_PATH!r}; set BPI2017_XES_PATH to the XES file."
    )

# Read the log and flatten it into one row per event.
log = pm4py.read_xes(XES_PATH)
df = pm4py.convert_to_dataframe(log)

Note: This analysis was conducted with the help of ChatGPT.

# Advanced analysis - Outcome prediction

First attempt: label each case as successful if its trace contains the activity "A_Pending".

In [71]:
# Column names of the event table: case identifier and activity name.
case_col = "case:concept:name"
act_col = "concept:name"

# Make sure timestamps are real datetimes (no-op if they already are).
df["time:timestamp"] = pd.to_datetime(df["time:timestamp"])

# Per case: the set of distinct activities that occur anywhere in the trace.
case_acts = df.groupby(case_col)[act_col].agg(set)

# Naive label: 1 if the case ever reaches "A_Pending", otherwise 0.
has_pending = case_acts.map(lambda activities: int("A_Pending" in activities))
label_df = has_pending.to_frame("label")

print("Label distribution (0 = no A_Pending, 1 = has A_Pending):")
print(label_df["label"].value_counts())

Label distribution (0 = no A_Pending, 1 = has A_Pending):
label
1    17228
0    14281
Name: count, dtype: int64


Count how often each activity occurs among the last four events of each case

In [74]:
# Keep the last four events of every case in chronological order and count which
# activities appear there. kind="stable" keeps the row order of df for events that
# share a timestamp, so which rows land in the tail does not depend on how the
# sort breaks ties.
last_events = (
    df.sort_values("time:timestamp", kind="stable")
    .groupby(case_col)
    .tail(4)
)
print(last_events[act_col].value_counts().head(10))
# Recorded run: O_Cancelled occurs 18850 times among these tail events, more than the
# 14281 cases without A_Pending -> some cases are cancelled even after they were
# marked as pending.
# NOTE(review): 18850 is an event count, not a case count (one case can contribute
# several O_Cancelled events to its last four) - confirm before quoting it per case.

concept:name
W_Validate application     27327
O_Cancelled                18850
A_Pending                  16946
W_Call after offers        16907
O_Accepted                 16263
W_Call incomplete files    10435
A_Cancelled                10060
O_Refused                   4638
A_Denied                    3609
W_Complete application       193
Name: count, dtype: int64


Show the cases that were marked as pending ("A_Pending") but were still cancelled afterwards (4436 cases)

In [79]:
case_col = "case:concept:name"
act_col = "concept:name"
# Chronological event order. kind="stable" keeps the relative order of df for events
# with identical timestamps, so the order-sensitive "first ... then later" check
# does not depend on how the sort breaks ties.
df_sorted = df.sort_values("time:timestamp", kind="stable")

# Activity that has to occur first ...
first_event = "A_Pending"
# ... and the activities that count as a cancellation when they follow it.
# Use ["A_Cancelled"] or ["O_Cancelled"] alone to look at one kind only.
later_events = ["A_Cancelled", "O_Cancelled"]  # both

def has_first_then_later(group):
    """Return True if a `later_events` activity occurs after `first_event`.

    `group` holds the events of one case; its `act_col` entry is iterated in
    the order given, so the caller is responsible for sorting by time.
    Reads the module-level names `act_col`, `first_event` and `later_events`.
    """
    remaining = iter(group[act_col])

    # Advance to (and consume) the first occurrence of first_event.
    for activity in remaining:
        if activity == first_event:
            break
    else:
        # first_event never occurred in this case.
        return False

    # Anything left on the iterator happened after first_event. Repeated
    # first_event occurrences are never counted as a "later" event.
    return any(
        activity != first_event and activity in later_events
        for activity in remaining
    )

# One boolean per case: True when a later_events activity follows first_event.
flags_first_then_later = df_sorted.groupby(case_col).apply(has_first_then_later, include_groups=False)
# Case ids for which the flag is True.
cases_first_then_later = flags_first_then_later[flags_first_then_later].index.tolist()
# The detected pattern is "later event AFTER first_event", so report it in that
# order (the previous wording named the two the wrong way round).
print("Cases with", later_events, "after", first_event, ":", len(cases_first_then_later))

Cases with A_Pending after ['A_Cancelled', 'O_Cancelled'] : 4436
expl: ['Application_1001114274', 'Application_1002626536', 'Application_1002961837', 'Application_1003879772', 'Application_1004601801', 'Application_1005228320', 'Application_1005604553', 'Application_1005734659', 'Application_1005878170', 'Application_1006138809']


Prepare the data

- Define a binary outcome label for each case based on the position of the activity "A_Pending" in the trace.
- A case is labelled as successful (1) if the last "A_Pending" occurs within the last 4 events of the trace and none of the negative-outcome activities ("A_Cancelled", "O_Cancelled", "A_Denied", "O_Refused") appears after it.
- All other cases are labelled as not successful (0).

In [51]:
# Make sure timestamps are real datetimes (no-op if they already are).
df["time:timestamp"] = pd.to_datetime(df["time:timestamp"])

# Per case: the list of activities in chronological order. kind="stable" keeps the
# row order of df for events with identical timestamps, so the position-based
# labels derived from these sequences do not depend on how the sort breaks ties.
case_seqs = (
    df.sort_values("time:timestamp", kind="stable")
    .groupby(case_col)[act_col]
    .apply(list)
)

In [82]:
# Activities that void a success when they occur after the final "A_Pending".
cancel_events = {"A_Cancelled", "O_Cancelled", "A_Denied", "O_Refused"}


def label_success(acts, window=4):
    """Return 1 if the trace ends in a successful "A_Pending", else 0.

    Parameters
    ----------
    acts : sequence of activity names of one case, in trace order.
    window : how many trailing events may contain the last "A_Pending".

    A trace is labelled 1 only when all of the following hold:
    it contains "A_Pending"; the last "A_Pending" lies within the final
    `window` events; and no activity from `cancel_events` follows it.
    """
    pending_positions = [pos for pos, name in enumerate(acts) if name == "A_Pending"]
    if not pending_positions:
        return 0

    last_pending = pending_positions[-1]
    # The last A_Pending sits before the trailing window -> too early to count.
    if len(acts) - window > last_pending:
        return 0

    after_pending = acts[last_pending + 1:]
    # Success only if nothing after the last A_Pending is a cancel event.
    return int(cancel_events.isdisjoint(after_pending))

# Label every case from its activity sequence (label_success default: window=4) ...
label_series = case_seqs.map(label_success)
# ... and keep the result as a one-column frame indexed by case id.
label_df = pd.DataFrame({"label": label_series})


Construct the case level table

For each case, take the first event (by timestamp) and extract the static case attributes "case:ApplicationType", "case:LoanGoal" and "case:RequestedAmount".
These attributes are then joined with the previously defined outcome label to form a case-level table that serves as input for the prediction model.

In [83]:
# Case-level predictors that are read off the first event of each case.
attr_cols = ["case:ApplicationType", "case:LoanGoal", "case:RequestedAmount"]

# Earliest event per case, re-indexed by case id so it lines up with label_df.
events_by_time = df.sort_values("time:timestamp")
first_events = events_by_time.groupby(case_col).head(1).set_index(case_col)
case_attrs = first_events.loc[:, attr_cols]

# Left join on the case-id index: every labelled case is kept.
case_table = label_df.join(case_attrs)

print("case_table shape:", case_table.shape)
print("Cols:", case_table.columns.tolist())
print("Label distribution in case_table:\n", case_table["label"].value_counts())

case_table shape: (31509, 4)
Cols: ['label', 'case:ApplicationType', 'case:LoanGoal', 'case:RequestedAmount']
Label distribution in case_table:
 label
0    18718
1    12791
Name: count, dtype: int64


Feature selection and preprocessing

- Predictors: "case:ApplicationType", "case:LoanGoal", "case:RequestedAmount".
- "ApplicationType" and "LoanGoal" are treated as categorical features.
- "RequestedAmount" is used as a numerical feature.
- Numerical missing values are imputed with the median.
- Categorical missing values are replaced by a "missing" category.
- The final case-level table is split into $X$ (predictors) and $y$ (binary outcome label).


In [81]:
# Predictor groups: categorical columns are one-hot encoded later, numerical ones scaled.
cat_features = ["case:ApplicationType", "case:LoanGoal"]
num_features = ["case:RequestedAmount"]

# Work on a copy so case_table itself stays untouched.
data = case_table.copy()

# Small overview of the predictors and how each one is treated.
predictor_table = pd.DataFrame(
    [
        (col, "categorical" if col in cat_features else "numerical")
        for col in attr_cols
    ],
    columns=["Feature", "Type"],
)
print(predictor_table)

# Numerical predictors: cast to float, fill gaps with the column median.
# NOTE(review): the median is taken over all cases, i.e. before the train/test
# split - harmless if the column has no missing values, otherwise a mild leak.
for col in num_features:
    col_median = data[col].median()
    data[col] = data[col].astype(float).fillna(col_median)

# Categorical predictors: missing values become their own "missing" category.
for col in cat_features:
    data[col] = data[col].fillna("missing")

# Design matrix and target.
X = data.loc[:, attr_cols]
y = data["label"]

                Feature         Type
0  case:ApplicationType  categorical
1         case:LoanGoal  categorical
2  case:RequestedAmount    numerical


Model training and evaluation

In [80]:
# TODO(review): re-run with "Restart & Run All" - the saved execution counts
# (In [80] for this cell, In [81]-In [83] for the cells producing X, y and the
# labels) indicate the recorded output may predate the current inputs.

# Hold out 30% of the cases for evaluation, stratified on the label and with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Preprocessing: one-hot encode the categorical predictors (categories not seen
# during fitting are ignored instead of raising) and standardise the numerical one.
cat_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
num_step = ("num", StandardScaler(), num_features)
preprocess = ColumnTransformer(transformers=[cat_step, num_step])

# Classifier: logistic regression.
log_reg = LogisticRegression(solver="lbfgs", max_iter=1000)

# Preprocessing and model are fitted together on the training split only.
clf = Pipeline(steps=[("preprocess", preprocess), ("model", log_reg)])
clf.fit(X_train, y_train)

# Pull the fitted steps back out of the pipeline to inspect the learned weights.
pre_fitted = clf.named_steps["preprocess"]
lr = clf.named_steps["model"]
ohe = pre_fitted.named_transformers_["cat"]

# Names for the model inputs: one-hot columns first, then the numerical features.
# This assumes the same order as the transformers in the ColumnTransformer.
cat_feature_names = ohe.get_feature_names_out(cat_features)
all_features = [*cat_feature_names, *num_features]

# One row per model input, strongest effect (largest |beta|) first.
# Odds ratio = exp(beta); numerical inputs pass through StandardScaler, so their
# ratio refers to a change of one standard deviation.
coef_table = pd.DataFrame(
    {
        "Feature": all_features,
        "Beta (Estimate)": lr.coef_[0],
        "Odds Ratio": np.exp(lr.coef_[0]),
    }
).sort_values(by="Beta (Estimate)", key=pd.Series.abs, ascending=False)

print(coef_table)

# Score the held-out cases: probability from column index 1 of predict_proba
# (presumably label 1, the second class - verify against clf.classes_) and the
# hard class prediction.
y_proba = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

# Headline metrics on the test split.
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average="weighted")
roc_auc = roc_auc_score(y_test, y_proba)

print("Accuracy:", accuracy)
print("Weighted F1:", weighted_f1)
print("ROC-AUC:", roc_auc)
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

                                 Feature  Beta (Estimate)  Odds Ratio
3            case:LoanGoal_Business goal        -0.725285    0.484186
1        case:ApplicationType_New credit        -0.582911    0.558271
0       case:ApplicationType_Limit raise         0.540820    1.717415
11           case:LoanGoal_Not speficied        -0.397194    0.672204
15                 case:LoanGoal_Unknown         0.374310    1.453988
9         case:LoanGoal_Home improvement         0.257755    1.294022
14            case:LoanGoal_Tax payments        -0.213643    0.807637
5         case:LoanGoal_Caravan / Camper         0.180308    1.197586
13     case:LoanGoal_Remaining debt home         0.152228    1.164425
7   case:LoanGoal_Existing loan takeover         0.139721    1.149953
10              case:LoanGoal_Motorcycle         0.127199    1.135643
8     case:LoanGoal_Extra spending limit         0.082761    1.086282
6       case:LoanGoal_Debt restructuring        -0.082424    0.920882
4                   