1. Setup and Libraries

In [1]:
import pandas as pd          
import numpy as np           
import matplotlib.pyplot as plt  
import seaborn as sns        
import pm4py


2. Load the XES file

In [2]:
# Load the BPI Challenge 2017 event log from its XES file using pm4py's XES importer.
from pm4py.objects.log.importer.xes import importer as xes_importer
# NOTE: absolute, machine-specific path -- adjust it to where the file lives locally.
LOG_PATH = '/Users/simonimmler/PycharmProjects/Praktikum/data/BPI Challenge 2017.xes'  # adjust path
# `log` is the imported event log that every later cell works on.
log = xes_importer.apply(LOG_PATH)

parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

# 3.1. Basic Analysis

In [3]:
# Flatten the event log into one dataframe row per event.
df = pm4py.convert_to_dataframe(log)

# Widen the pandas display limits so the wide event table can be inspected.
display_options = {
    "display.max_rows": 200,       # number of rows shown
    "display.max_columns": None,   # show all columns
    "display.width": 0,            # full width
    "display.max_colwidth": None,  # do not cut off column contents
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

df.head(100)


Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
5,statechange,User_1,A_Concept,Application,ApplState_642383566,complete,2016-01-01 09:52:36.413000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
6,Obtained,User_17,W_Complete application,Workflow,Workitem_1875340971,start,2016-01-02 10:45:22.429000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
7,Released,User_17,W_Complete application,Workflow,Workitem_1452291795,suspend,2016-01-02 10:49:28.816000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
8,statechange,User_52,A_Accepted,Application,ApplState_99568828,complete,2016-01-02 11:23:04.299000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
9,Created,User_52,O_Create Offer,Offer,Offer_148581083,complete,2016-01-02 11:29:03.994000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,20000.0,44.0,True,498.29,True,979.0,20000.0,


a) Count Cases

In [4]:
# The length of the event log is its number of traces, i.e. cases.
num_cases = len(log)
print(f"Cases: {num_cases}")

Cases: 31509


b) Count Events (after Filtering in 3.3_Process_Model_Creation_and_Validation)

In [5]:
# Total number of events = sum of the lengths of all traces in the log.
num_events = sum(map(len, log))
print(f"Events: {num_events}")

Events: 1202267


c) Count distinct variants (after filtering in 3.3_Process_Model_Creation_and_Validation)

In [6]:
# Let pm4py group the traces of the log into variants, then count the groups.
variants = pm4py.get_variants(log)
num_variants = len(variants)
print(f"Variants: {num_variants}")

Variants: 15930



d) Mean case length and standard deviation of case length

In [7]:

# Case length = number of events in a trace.
case_lengths = list(map(len, log))

# np.std defaults to the population standard deviation (ddof=0).
mean_case_length = np.mean(case_lengths)
std_case_length = np.std(case_lengths)

print(f"Mean case length: {mean_case_length:.2f}")
print(f"Standard devation of case length: {std_case_length:.2f}")

Mean case length: 38.16
Standard devation of case length: 16.72



e) Mean case duration and standard deviation of case duration

In [18]:
# One duration per case, as returned by pm4py.
case_durations = pm4py.get_all_case_durations(log)  # in seconds

# Mean and standard deviation over all cases (np.std defaults to ddof=0,
# i.e. the population standard deviation); both values are in seconds.
mean_sec = np.mean(case_durations)
std_sec  = np.std(case_durations)

def format_dms(seconds):
    """Render a duration in seconds as "<d> days, <m> minutes, <s> seconds".

    Hours are not a separate unit: whatever remains after removing whole
    days is expressed as whole minutes plus leftover seconds, so the minute
    count can exceed 59. Fractional parts are truncated by ``int``.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The formatted duration string.
    """
    whole_days, remainder = divmod(seconds, 86400)   # 86400 seconds per day
    whole_minutes, leftover = divmod(remainder, 60)
    return f"{int(whole_days)} days, {int(whole_minutes)} minutes, {int(leftover)} seconds"

# Report both statistics in the days / minutes / seconds layout of format_dms
# (hours are folded into the minute count, so it can exceed 59).
print("Mean case duration:", format_dms(mean_sec))
print("Standard deviation of case duration:", format_dms(std_sec))

Mean case duration: 21 days, 1295 minutes, 25 seconds
Standard deviation of case duration: 13 days, 243 minutes, 23 seconds


f) event labels and case labels

In [15]:
# Attribute names that pm4py reports at event level for the log.
event_attrs = pm4py.get_event_attributes(log)
# Case-level attributes carry the "case:" prefix (see the dataframe columns).
# Match it strictly as a prefix so that an event attribute that merely
# contains "case:" somewhere in its name is not dropped by mistake.
cleaned_ev_attr = sorted(a for a in event_attrs if not a.startswith("case:"))
print("Event attributes:", cleaned_ev_attr)
print("Number of event labels:", len(cleaned_ev_attr))

# Attribute names that pm4py reports at trace (case) level.
case_attrs = pm4py.get_trace_attributes(log)
print("Case attributes:", case_attrs)
print("Number of case labels:", len(case_attrs))


Event attributes: ['Accepted', 'Action', 'CreditScore', 'EventID', 'EventOrigin', 'FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms', 'OfferID', 'OfferedAmount', 'Selected', 'concept:name', 'org:resource', 'time:timestamp']
Number of event labels: 14
Case attributes: ['ApplicationType', 'RequestedAmount', 'LoanGoal']
Number of case labels: 3


g) Number of categorical event attributes (incl. IDs)

In [17]:
# An event attribute is treated as categorical when its dataframe column has
# the generic "object" dtype.
categorical_event_attrs = [
    attr for attr in cleaned_ev_attr if df[attr].dtype == "object"
]

print("Categorical event attributes:", categorical_event_attrs)
print("Number of categorical event attributes:", len(categorical_event_attrs))

Categorical event attributes: ['Accepted', 'Action', 'EventID', 'EventOrigin', 'OfferID', 'Selected', 'concept:name', 'org:resource']
Number of categorical event attributes: 8


h) (optional) Events per Day

In [11]:
# Rebuild the event dataframe and tag every event with its calendar date.
df = pm4py.convert_to_dataframe(log)
event_timestamps = pd.to_datetime(df["time:timestamp"])
df["date"] = event_timestamps.dt.date

# Count events per date, then average those counts. Only dates that occur in
# the data form a group, so dates without any event do not enter the mean.
daily_event_counts = df.groupby("date").size()
events_per_day = daily_event_counts.mean()

print(f"Average number of events per day: {events_per_day:.2f}")

Average number of events per day: 3020.77


In [12]:
#print("Unique dates", df["date"].nunique())
#print(df["date"].min(), "bis", df["date"].max())
#print(df["date"].value_counts().sort_index().head(100))


i) (optional) most frequent activities

In [13]:
# Occurrences per activity name; value_counts sorts them in descending order.
top_n = df["concept:name"].value_counts()
# Show only the 15 most frequent activities.
print(top_n.head(15))

concept:name
W_Validate application      209496
W_Call after offers         191092
W_Call incomplete files     168529
W_Complete application      148900
W_Handle leads               47264
O_Create Offer               42995
O_Created                    42995
O_Sent (mail and online)     39707
A_Validating                 38816
A_Create Application         31509
A_Concept                    31509
A_Accepted                   31509
A_Complete                   31362
O_Returned                   23305
A_Incomplete                 23055
Name: count, dtype: int64


j) (optional) shortest and longest durations of the cases

In [14]:
# Keep only events that have both a timestamp and a case id, parse the
# timestamps as UTC (unparseable values become NaT) and drop those rows too.
# Every step returns a new frame, so `df` itself is left untouched.
df1 = (
    df.dropna(subset=["time:timestamp", "case:concept:name"])
    .assign(
        ts=lambda events: pd.to_datetime(
            events["time:timestamp"], utc=True, errors="coerce"
        )
    )
    .dropna(subset=["ts"])
)

# First and last event timestamp of every case.
ts_by_case = df1.groupby("case:concept:name")["ts"]
span = pd.DataFrame({"first_ts": ts_by_case.min(), "last_ts": ts_by_case.max()})

# Case duration in days (86400 seconds per day).
elapsed = span["last_ts"] - span["first_ts"]
span["duration_days"] = elapsed.dt.total_seconds() / 86400

duration_days = span["duration_days"]
min_d, max_d = duration_days.min(), duration_days.max()
# The shortest case is reported in minutes, the longest in days.
print(f"shortest duration: {min_d*24*60:.4f} min | longest duration: {max_d:.2f} days")


shortest duration: 3.3510 min | longest duration: 286.07 days
