In [3]:
import pandas as pd
import numpy as np
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.visualization.petri_net import visualizer as pn_visualizer


In [5]:
# Load Excel log data
df = pd.read_excel("./log_cleaning.xlsx")

# Clean data
df.columns = [col.strip().lower() for col in df.columns]
df['content'] = df['content'].astype(str).str.strip().str.replace('"', '').replace('Unknown', np.nan).replace('', np.nan)
df['type'] = df['type'].astype(str).str.lower().str.strip().replace('', np.nan)

df = df.dropna(subset=['content', 'type'])
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df = df.dropna(subset=['timestamp']).sort_values(by='timestamp')
df = df.drop_duplicates().reset_index(drop=True)
print(df,"Data cleaned successfully.") 


             timestamp    type                                        content
0  2025-07-18 09:52:00   filed                                            nan
1  2025-07-18 09:52:00  button                                          PDF下载
2  2025-07-18 09:52:00  button  基于CRISPR/Cas12a系统联合重组酶聚合酶扩增的鸭星状病毒2型核酸检测试纸条的制备
3  2025-07-18 09:52:00  button                 基于RPA-LFD可视化快速检测肺炎克雷伯菌方法的建立及评价
4  2025-07-18 09:52:00  button                  热带水产品中溶藻弧菌重组酶聚合酶等温扩增快速检测方法的建立
5  2025-07-18 09:52:00  button               基因编辑水稻RPA-CRISPR/Cas12b快速检测方法的建立
6  2025-07-18 09:52:00  button                           RPA+AI在企业财务领域应用研究及实践
7  2025-07-18 09:52:00  button                                             检索
8  2025-07-18 09:52:00  button                                             搜索
9  2025-07-18 09:52:00  button                                            nan
10 2025-07-18 09:52:00  button               CRISPR-Cas检测系统在食品供应链中安全防控的应用研究进展 Data cleaned successfully.


In [6]:
df['session_id'] = (df['timestamp'].diff().dt.total_seconds() > 10).cumsum()


In [7]:
df_pm4py = df.rename(columns={
    'session_id': 'case:concept:name',
    'content': 'concept:name',
    'timestamp': 'time:timestamp'
})
df_pm4py = dataframe_utils.convert_timestamp_columns_in_df(df_pm4py)
event_log = log_converter.apply(df_pm4py)


In [10]:
net, initial_marking, final_marking = alpha_miner.apply(event_log)
# gviz = pn_visualizer.apply(net, initial_marking, final_marking)
# pn_visualizer.view(gviz)
