In [1]:
import os
import pandas as pd
import statistics
from pm4py.objects.conversion.log import converter as log_converter

In [7]:
# Build the CSV location from the current working directory, load the file
# into a DataFrame, and print the frame's schema summary.
path = os.path.join(
    os.getcwd(),
    "incidentprocess_custom1.csv",
)
data = pd.read_csv(path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303819 entries, 0 to 303818
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Incident ID              303819 non-null  object 
 1   Activity                 303819 non-null  object 
 2   ActivityTimeStamp        303819 non-null  object 
 3   Asset Affected           303819 non-null  object 
 4   Asset Type Affected      303819 non-null  object 
 5   Asset SubType Affected   303819 non-null  object 
 6   Service Affected         303819 non-null  object 
 7   Status                   303819 non-null  object 
 8   Impact                   303819 non-null  int64  
 9   Urgency                  303819 non-null  int64  
 10  Priority                 303819 non-null  int64  
 11  Category                 303819 non-null  object 
 12  Number of Reassignments  303810 non-null  float64
 13  Open Time                303819 non-null  object 
 14  Reop

In [8]:
# Source column name (after spaces become underscores) -> XES-style name.
rename_map = dict(
    Incident_ID="case:concept:name",
    Activity="concept:name",
    ActivityTimeStamp="time:timestamp",
)

# Swap every space in the column labels for an underscore, then apply the
# mapping above to the three case / activity / timestamp columns.
data.columns = data.columns.str.replace(" ", "_")
data = data.rename(columns=rename_map)

In [9]:
# Display the column labels so the effect of the rename can be checked by eye.
data.columns

Index(['case:concept:name', 'concept:name', 'time:timestamp', 'Asset_Affected',
       'Asset_Type_Affected', 'Asset_SubType_Affected', 'Service_Affected',
       'Status', 'Impact', 'Urgency', 'Priority', 'Category',
       'Number_of_Reassignments', 'Open_Time', 'Reopen_Time', 'Resolved_Time',
       'Close_Time', 'Handle_Time_(Hours)', 'Closure_Code', 'Asset_Caused',
       'Asset_Type_Caused', 'Asset_SubType_Caused', 'Service_Caused',
       'Assignment_Group'],
      dtype='object')

In [10]:
# Fail fast if any of the three renamed key columns is absent from the frame.
required_columns = {"case:concept:name", "concept:name", "time:timestamp"}
assert required_columns <= set(data.columns), \
    "Missing one of the mandatory columns after rename!"

In [12]:
# Parse the timestamp column into datetimes; dayfirst=True makes pandas prefer
# the day-before-month reading of ambiguous dates.
# NOTE(review): no explicit format is given — confirm the CSV really uses a
# day-first layout throughout.
parsed_timestamps = pd.to_datetime(data["time:timestamp"], dayfirst=True)
data["time:timestamp"] = parsed_timestamps

In [14]:
# Frame-level counts: rows are events, columns are attributes.
total_events, num_columns = data.shape
total_cases = data["case:concept:name"].nunique()
distinct_activities = data["concept:name"].nunique()

# Number of events recorded for each case, and summary statistics over it.
epc = data.groupby("case:concept:name").size()
events_mean = epc.mean()
events_median = epc.median()
events_stdev = epc.std(ddof=0)  # ddof=0: divide by N (population form)

In [15]:
# Assemble the report lines first, then emit them with a single print call;
# the text written to stdout is the same as printing each line separately.
summary_lines = [
    f"Total cases:               {total_cases}",
    f"Total events:              {total_events}",
    f"Distinct activities:       {distinct_activities}",
    f"Events per case (mean):    {events_mean:.2f}",
    f"Events per case (median):  {events_median}",
    f"Events per case (stdev):   {events_stdev:.2f}",
    f"Number of columns:         {num_columns}",
]
print("\n".join(summary_lines))

Total cases:               46601
Total events:              303819
Distinct activities:       18
Events per case (mean):    6.52
Events per case (median):  5.0
Events per case (stdev):   5.71
Number of columns:         24


In [13]:
# Convert the DataFrame into a PM4Py EventLog.
#
# The events are ordered by timestamp before conversion, as the PM4Py
# CSV-import example does, so that each trace lists its events in time order
# instead of in CSV row order.
# NOTE(review): this assumes the converter keeps events in the row order it is
# given — confirm for the installed PM4Py version.
#
# kind="mergesort" is a stable sort, so events that share a timestamp keep
# their original relative order. sort_values returns a new frame, which means
# `data` itself is left unchanged and this cell can be re-run safely.
log = log_converter.apply(
    data.sort_values("time:timestamp", kind="mergesort"),
    variant=log_converter.Variants.TO_EVENT_LOG
)