<a href="https://colab.research.google.com/github/rohandawar/ProcessMining/blob/main/ProcessMining_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, I am learning process mining through a **Health Care** industry example.

In [2]:
! pip install pm4py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pm4py
  Downloading pm4py-2.7.4-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecation (from pm4py)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting intervaltree (from pm4py)
  Downloading intervaltree-3.1.0.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting stringdist (from pm4py)
  Downloading StringDist-1.0.9.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: intervaltree, stringdist
  Building wheel for intervaltree (setup.py) ... [?25l[?25hdone
  Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26099 sha256=ea15cabea2ed2690a3f3d914b53e5a60b4d33fd1b6b4d6ef88c29eb68bb2f20f
  Stored in directory: /root/.cache/pip/

In [14]:
# Import Libs 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Ipython
from IPython.display import display, Markdown

# Pm4py
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer

# Process mining
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery

# Viz

from pm4py.visualization.petri_net import visualizer as pn_visualizer


In [None]:
# Function Definition
def printmd(string):
  """Render ``string`` as Markdown in the cell output."""
  rendered = Markdown(string)
  display(rendered)

In [None]:
# Data source (GitLab repository): https://gitlab.com/healthcare2/process-mining-tutorial
# Reference tutorial (Medium article): https://medium.com/@c3_62722/process-mining-with-python-tutorial-a-healthcare-application-part-2-4cf57053421f

In [None]:
# Read the raw event log (one row per recorded event) directly from the remote CSV file
data_path = 'https://gitlab.com/healthcare2/process-mining-tutorial/-/raw/master/ArtificialPatientTreatment.csv'
events = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows
events.head(n=5)

Unnamed: 0,patient,action,org:resource,DateTime
0,patient 0,First consult,Dr. Anna,2017-01-02 11:40:11
1,patient 0,Blood test,Lab,2017-01-02 12:47:33
2,patient 0,Physical test,Nurse Jesse,2017-01-02 12:53:50
3,patient 0,Second consult,Dr. Anna,2017-01-02 16:21:06
4,patient 0,Surgery,Dr. Charlie,2017-01-05 13:23:09


In [None]:
# Give the columns short, consistent names.
# Renaming by label (rather than assigning a positional list to `events.columns`) cannot
# mislabel a column if the CSV column order changes, and is a harmless no-op when the cell
# is re-run after later cells have added more columns.
events = events.rename(columns={'org:resource': 'resources', 'DateTime': 'datetime'})

# Parse the event timestamps so that min/max and time differences can be computed on them
events['datetime'] = pd.to_datetime(events['datetime'])
events.head()

Unnamed: 0,patient,action,resources,datetime
0,patient 0,First consult,Dr. Anna,2017-01-02 11:40:11
1,patient 0,Blood test,Lab,2017-01-02 12:47:33
2,patient 0,Physical test,Nurse Jesse,2017-01-02 12:53:50
3,patient 0,Second consult,Dr. Anna,2017-01-02 16:21:06
4,patient 0,Surgery,Dr. Charlie,2017-01-05 13:23:09


In [None]:
# Get the start (earliest event) and end (latest event) time of every patient's case.
# Named aggregation binds each output label to its aggregate explicitly, instead of
# relabelling pivot_table's output by position and relying on its column ordering.
# NOTE: the column name 'casened' (sic, "case end") is kept unchanged because other cells
# may reference it by this spelling.
case_start_ends = (
    events
    .groupby('patient')['datetime']
    .agg(casened='max', casestart='min')
    .reset_index()
)
case_start_ends.head()



Unnamed: 0,patient,casened,casestart
0,patient 0,2017-01-09 08:29:28,2017-01-02 11:40:11
1,patient 1,2017-01-06 16:49:21,2017-01-02 12:50:35
2,patient 10,2017-01-30 11:19:19,2017-01-17 14:13:17
3,patient 11,2017-02-02 10:13:13,2017-01-19 13:35:20
4,patient 12,2017-01-27 11:18:57,2017-01-20 11:43:38


In [None]:
# Merge the two dataframes so every event row carries its patient's case start and end time.
# Any previously derived columns are dropped first so the cell is idempotent: without this,
# a second run would create casened_x/casened_y-style suffixed columns and the
# events['casestart'] lookup below would raise KeyError.
events = (
    events
    .drop(columns=['casened', 'casestart', 'relativetime'], errors='ignore')
    .merge(case_start_ends, on='patient')
)

# Time elapsed between the first event of the patient's case and this event
events['relativetime'] = events['datetime'] - events['casestart']

# Trim surrounding whitespace from the action labels. The vectorised .str accessor leaves
# missing values as NaN, whereas apply(lambda x: x.strip()) would raise on them.
events['action'] = events['action'].str.strip()
events.head()

Unnamed: 0,patient,action,resources,datetime,casened,casestart,relativetime
0,patient 0,First consult,Dr. Anna,2017-01-02 11:40:11,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 00:00:00
1,patient 0,Blood test,Lab,2017-01-02 12:47:33,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 01:07:22
2,patient 0,Physical test,Nurse Jesse,2017-01-02 12:53:50,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 01:13:39
3,patient 0,Second consult,Dr. Anna,2017-01-02 16:21:06,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 04:40:55
4,patient 0,Surgery,Dr. Charlie,2017-01-05 13:23:09,2017-01-09 08:29:28,2017-01-02 11:40:11,3 days 01:42:58


In [None]:
# Helpers for creating the action-sequence column.
# Defined with `def` rather than lambdas assigned to names, so each function carries its
# own __name__ without patching it by hand.
delimiter = '___'


def nameEventString(x):
    """Join the strings in ``x`` into a single string separated by ``delimiter``."""
    return delimiter.join(x)


def numEvents(x):
    """Return the number of items in ``x``."""
    return len(x)

# caselogs = events.pivot