In [1]:
import ast
import re
from event_loop.preprocessing.dataframe import *

import metrics
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report

%load_ext autoreload
%load_ext memory_profiler

# Prerequisuites

## Activity Action Model
Train a Model with the task of classifying Start, End and NoAction events in the interleaved data. 
Training is done during a "warmup" phase with generated training data. 

### Load Data

In [2]:
# HR data in data/Train/R1 is missing frame.number. We take another (already filtered) dataset and apply our feature extraction to this one
df_train_in = pd.read_csv('../../data_v3/hr_extended_features.csv', converters={"MessageAttributes": ast.literal_eval})

In [3]:
# This is the Interleaved Data Set for our pipeline
df_il_in = pd.read_csv('../../data/HR-INTERLEAVED/R1/R1.csv', converters={"MessageAttributes": ast.literal_eval})

### Preprocessing Training Data

In [4]:
%autoreload 2
# data is at R1 Level. Apply filter and feature extraction
df_train = pre_process(df_train_in)
df_test = pre_process(df_il_in)


In [47]:
# Load start and end events from ground truth data.
# Tag according frames in interleaved data for testing
df_gt = pd.read_csv("../../data_v3/hr_ground_truth.csv")

start_indices = df_gt["start"].tolist()
end_indices = df_gt["actual_end"].tolist()

df_test["ActivityAction"] = df_test["frame.number"].apply(lambda x: "Activity Start" if x in start_indices else
("Activity End" if x in end_indices else "NoAction"))

def extract_labels(labels):
    return [[y] for y in labels]

test_labels = extract_labels(df_test["ActivityAction"])

In [6]:
# Form sequences in training data by grouping
df_train = df_train.sort_values(by=["InstanceNumber", "BusinessActivity", "frame.number"])
df_train["SequenceNumber"] = df_train.groupby(["BusinessActivity", "InstanceNumber"]).ngroup()
df_train["SequenceNumber"] -= df_train['SequenceNumber'].min()

In [7]:
def mark_start_end(df):
    # Mark start event of each BusinessActivity Instance
    df["activityStart"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount() == 0
    # Mark end event of each Business Activity Instance
    df["activityEnd"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount(ascending=False) == 0
    # Merge start and end columns to form labels
    df["ActivityAction"] = df.apply(lambda row: "Activity Start" if row["activityStart"] else (
        "Activity End" if row["activityEnd"] else 'NoAction'), axis=1)

    return df.drop(["activityStart", 'activityEnd'], axis=1)


df_train = mark_start_end(df_train)

In [8]:
# exclude from training data 
df_train_filt = df_train[~df_train["SequenceNumber"].isin([128])]


## Activity Model
The activity model utilises multiple sliding windows over the training data for pattern matching



In [9]:
from event_loop.event_activity import EventActivityAssignment
from event_loop.activity_type import ActivityTypeClassifier
from event_loop.activity_boundaries import ActivityBoundariesClassifier

# working
activity_boundaries_classifier = ActivityBoundariesClassifier(df_train_filt, None)


In [11]:
activity_type_classifier = ActivityTypeClassifier(df_train)

event_activity_model = EventActivityAssignment(df_train,10,["applicant_id", "activity_id"])

# Action Loop

Main loop. Gets raw R1 data as input. 
Applies filtering, activity action and sequence classification

In [12]:
records = df_il_in.to_dict("records")

In [36]:
from event_loop.event import Event
from event_loop.stack import Stack
from event_loop.event_activity import search_stack_for_request_frame, search_stream_index
%autoreload 2

import time
from event_loop.preprocessing.event import keep_event

# Parameter
EVENT_LOOP_CUTOFF_NO_ACTION = 3
EVENT_LOOP_CUTOFF_END_EVENT = 3
ENTROPY_THRESHOLD = 0.4 #0.5
MAX_WINDOW_SIZE = 10
VERBOSE = False
SETTING = "HR"

# init variables
event_buffer: list[Event] = []
attribute_buffer: list[dict] = []
stacks: list[Stack] = []
stacks_out: list[Stack] = []
event_loop_index = 0


HR_ATTRIBUTES = ["applicant_id", "activity_id"]
PTP_ATTRIBUTES = ["sale_order_id", "sale_order_line_id","purchase_requisition_id","purchase_requisition_line_id",]


processing_times = []
processing_times_filter = []
buffer_sizes = []


for i, event_data in enumerate(records):
    start_time = time.time()

    buffer_sizes.append(sum([len(stack) for stack in stacks]))
    # Filter Event Stream
    if not keep_event(event_data):
        end_time = time.time()
        processing_times_filter.append(end_time - start_time)
        # skip event in loop
        continue
        
    
    
    # count every not filtered event for event loop index
    event_loop_index += 1

    # Extract Features and generate Event Object
    event = Event(event_data, event_loop_index, event_buffer, SETTING)
    event_buffer.append(event)
    
    
    activity_boundaries_classifier.classify_event(event)
    
    # Activity Action Classification
    activity_action = event.activity_action
    
    # Activity Matching
    if activity_action == "Activity Start": 
        stacks.append(Stack(SETTING,event))
        
    if activity_action == "NoAction": 
        if len(stacks) == 1: 
            stacks[0].append_event(event)
        elif event.origin_request_frame: 
            idx = search_stack_for_request_frame(event.origin_request_frame, stacks)
            stacks[idx].append_event(event)
        else: 
            # Check attributes of each stack
            
            # we can filter out stacks that already have attributes different to the event
            exclude_indices =  event_activity_model.exclude_stacks_by_attribute(stacks, event)
    
            stack_index:int = event_activity_model.check_stack_attributes(stacks, event, exclude_indices)
                    
            if stack_index == -1:        
                stack_index = event_activity_model.assign_to_sequence(event,stacks, 4, exclude_indices)
            
            # for elements that are not matchable based on 2 sequences we fall back to stream index
            if stack_index == -1: 
                stack_index = search_stream_index(stacks, event, exclude_indices)    
            
            # fallback - no match add to first stack
            if stack_index == -1:
                res = next((i for i in range(len(stacks)) if i not in exclude_indices and stacks[i].confidence),-1)
                stack_index = res
                
            stacks[stack_index].append_event(event)
        
    if activity_action == "Activity End":
        
        stack_index = search_stack_for_request_frame(event.origin_request_frame, stacks)
        stacks[stack_index].append_event(event)
        
        if event.confidence: 
            if len(stacks) > 1: 
                stack = stacks.pop(stack_index)
                stacks_out.append(stack)
            else: 
                event.confidence = False
     

    # Loop through all currently open stacks
    for idx, stack in enumerate(stacks):
        last_event = stack[-1]
        # check for non-confident "No Action" Classifications. These could be "Activity End" Instead
        if not last_event.confidence and last_event.activity_action == "NoAction":
            # If a stack has not been continued for N event loops 
            if event_loop_index - last_event.event_loop_index > EVENT_LOOP_CUTOFF_NO_ACTION: 
                stacks.pop(idx)
                stacks_out.append(stack)
                
    for idx, stack in enumerate(stacks): 
        last_event = stack.events[-1]
        if not last_event.confidence and last_event.activity_action == "Activity End": 
            if event_loop_index - last_event.event_loop_index > EVENT_LOOP_CUTOFF_END_EVENT: 
            
                # we are now sure to pop the stack. 
                stacks.pop(idx)
                stacks_out.append(stack) 
                
    end_time = time.time()
    processing_times.append(end_time - start_time)
                
# pop all stacks that are still left
for stack in stacks: 
    stacks_out.append(stack)  


In [37]:
import sys
import pickle
model_size = [
    #sys.getsizeof(pickle.dumps(crf)),
    #sys.getsizeof(pickle.dumps(activity_classifier)),
    sys.getsizeof(pickle.dumps(event_activity_model.model))
]

In [38]:
from statistics import mean,stdev
print("average processing time:",mean(processing_times)*1000)
print("max buffer size:", max(buffer_sizes),f"({max(buffer_sizes) / len(records) * 100})")

average processing time: 0.11764658559667002
max buffer size: 154 (0.25080207807436117)


In [39]:
stack_predictions = [activity_type_classifier.classify_stack(stack) for stack in stacks_out]

In [40]:
start = [stack[0].frame_number for stack in stacks_out]
end = [stack[-1].frame_number for stack in stacks_out]

res_df = pd.DataFrame({"start_pred":start, "end_pred":end})

eval_df = df_gt[["start", "actual_end"]].merge(res_df,how="left", left_on ="start", right_on = "start_pred").fillna(-1).astype(int)
eval_df["end_pred_true"] = eval_df["actual_end"] == eval_df["end_pred"]
eval_df["start_pred_true"] = eval_df["start"] == eval_df["start_pred"]
eval_df["start_end_true"] =eval_df["start_pred_true"] == eval_df["end_pred_true"]

display(eval_df)
print(f"Overall matching accuracy: {0.5 + eval_df['end_pred_true'].mean()/2}")

Unnamed: 0,start,actual_end,start_pred,end_pred,end_pred_true,start_pred_true,start_end_true
0,17,325,17,325,True,True,True
1,356,1192,356,1192,True,True,True
2,1212,1520,1212,1520,True,True,True
3,1582,2336,1582,2336,True,True,True
4,2354,2664,2354,2664,True,True,True
5,2708,4461,2708,4461,True,True,True
6,3057,4871,3057,4871,True,True,True
7,4467,4881,4467,4881,True,True,True
8,4939,6164,4939,6164,True,True,True
9,5606,6859,5606,6859,True,True,True


Overall matching accuracy: 1.0


In [62]:
df_aa_test = pd.DataFrame(df_test[["frame.number", "ActivityAction"]])
df_aa_test["ActivityAction"] = "NoAction"
df_aa_test.loc[df_aa_test["frame.number"].isin(eval_df["end_pred"]), "ActivityAction"] = "Activity End"
df_aa_test.loc[df_aa_test["frame.number"].isin(eval_df["start_pred"]), "ActivityAction"] = "Activity Start"
print(classification_report(test_labels, df_aa_test["ActivityAction"]))

                precision    recall  f1-score   support

  Activity End       1.00      1.00      1.00        37
Activity Start       1.00      1.00      1.00        37
      NoAction       1.00      1.00      1.00      1239

      accuracy                           1.00      1313
     macro avg       1.00      1.00      1.00      1313
  weighted avg       1.00      1.00      1.00      1313


In [63]:
# Function to check if intervals overlap
def intervals_overlap(row, df):
    overlapping_names = []
    overlapping_bps = set()
    for index, other_row in df.iterrows():
        if row.name != index and row['start'] <= other_row['actual_end'] and row['actual_end'] >= other_row['start']:
            overlapping_names.append(f"{other_row['activity_name']} {other_row['bp_id']}")
            overlapping_bps.add(other_row['bp_id'])
    return overlapping_names, list(overlapping_bps)

df_gt[["overlapping_activities", "overlapping_bps"]] = df_gt.apply(intervals_overlap, axis=1, df = df_gt, result_type="expand")

In [64]:
# Create dataframe with mapping of frame numbers to event stacks
frame_numbers = [event.frame_number for idx,stack in enumerate(stacks_out) for event in stack]
stack_numbers = [idx for idx,stack in enumerate(stacks_out) for event in stack]
applicant_ids = [event.attributes["applicant_id"] for idx,stack in enumerate(stacks_out) for event in stack]
activity_ids = [event.attributes["activity_id"] for idx,stack in enumerate(stacks_out) for event in stack]
mail_ids = [event.attributes["mail_id"] for idx,stack in enumerate(stacks_out) for event in stack]
sniff_time =  [event.sniff_time for idx,stack in enumerate(stacks_out) for event in stack]
case_id = [stack.case_id["id"]  if stack.case_id else -1 for idx, stack in enumerate(stacks_out) for event in stack]

df_frame_numbers = pd.DataFrame(data={"frame.number": frame_numbers, "sniff_time": sniff_time, "stack_idx": stack_numbers, "applicant_id": applicant_ids,"activity_id": activity_ids, "mail_id":mail_ids,"case_id": case_id})

# Merge Activity Name from ground truth frame to event sequences for evaluation
merged_df = df_frame_numbers.merge(df_gt[["activity_name","start","bp_id"]], how="left",left_on="frame.number", right_on="start").drop(columns="start")

merged_df[["activity_name","bp_id"]] = merged_df.groupby("stack_idx")[["activity_name","bp_id"]].ffill()
#merged_df["activity_name"] = merged_df.groupby("stack_idx")["bp_id"].ffill()

# Merge with filtered interleaved test data
merged_df = df_test.merge(merged_df, on="frame.number")

In [65]:
merged_df

Unnamed: 0,BusinessActivity,InstanceNumber,frame.number,sniff_time_x,synthetic_sniff_time,event_with_roles,request_method_call,file_data,pgsql.query,pgsql.target,...,origin_file_data,ActivityAction,sniff_time_y,stack_idx,applicant_id,activity_id,mail_id,case_id,activity_name,bp_id
0,HrRecruitmentProcess,over,17,2020-09-18 01:07:22.425912,2020-09-18 13:37:23.875912,End Point (HR Manager)->Odoo Application:[Http...,version,[],,,...,,Activity Start,2020-09-18 13:37:23.875912,0,,,,1,GenerateJobApplicationActivity,1224.0
1,HrRecruitmentProcess,over,49,2020-09-18 01:07:22.473472,2020-09-18 13:45:19.523472,Odoo Application->End Point (HR Manager):[Http...,server_version,"[server_version, 12.0-20190820, server_version...",,,...,version,NoAction,2020-09-18 13:45:19.523472,0,,,,1,GenerateJobApplicationActivity,1224.0
2,HrRecruitmentProcess,over,57,2020-09-18 01:07:22.476403,2020-09-18 13:45:48.836403,End Point (HR Manager)->Odoo Application:[Http...,authenticate,"[odoo01, dana.wireless@gmail.com, 123456789]",,,...,,NoAction,2020-09-18 13:45:48.836403,0,,,,1,GenerateJobApplicationActivity,1224.0
3,HrRecruitmentProcess,over,101,2020-09-18 01:07:22.742117,2020-09-18 14:30:06.242117,Odoo Application->db Server/Mail Server:[Pgsql...,,,"INSERT INTO ""res_users_log"" (""id"", ""create_uid...",res_users_log,...,,NoAction,2020-09-18 14:30:06.242117,0,,,,1,GenerateJobApplicationActivity,1224.0
4,HrRecruitmentProcess,over,110,2020-09-18 01:07:22.748067,2020-09-18 14:31:05.748067,Odoo Application->End Point (HR Manager):[Http...,IsNumber,[6],,,...,,NoAction,2020-09-18 14:31:05.748067,0,,,,1,GenerateJobApplicationActivity,1224.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1308,HrRecruitmentProcess,over,28267,2020-09-18 01:08:59.750937,2020-09-29 19:59:51.450937,Odoo Application->db Server/Mail Server:[Pgsql...,,,"UPDATE ""mail_message_res_partner_needaction_re...",mail_message_res_partner_needaction_rel,...,,NoAction,2020-09-29 19:59:51.450937,36,,,,8,PerformAnInterviewMeeting,1231.0
1309,HrRecruitmentProcess,over,28304,2020-09-18 01:08:59.764501,2020-09-29 20:02:07.104501,Odoo Application->End Point (Recruiting manage...,IsNumber,[1],,,...,hr.applicant_write_stage_id_8,NoAction,2020-09-29 20:02:07.104501,36,,,,8,PerformAnInterviewMeeting,1231.0
1310,HrRecruitmentProcess,over,28312,2020-09-18 01:08:59.766260,2020-09-29 20:02:24.696260,End Point (Recruiting manager)->Odoo Applicati...,execute_kw,"[odoo01, 7, 123456789, hr.applicant, archive_a...",,,...,,NoAction,2020-09-29 20:02:24.696260,36,1231,,,8,PerformAnInterviewMeeting,1231.0
1311,HrRecruitmentProcess,over,28377,2020-09-18 01:08:59.823539,2020-09-29 20:11:57.543539,Odoo Application->db Server/Mail Server:[Pgsql...,,,"UPDATE ""hr_applicant"" SET ""active""=false,""writ...",hr_applicant,...,,NoAction,2020-09-29 20:11:57.543539,36,1231,,,8,PerformAnInterviewMeeting,1231.0


In [66]:
unique_no_nan = lambda x: list(filter(None, pd.unique(x)))
first_unique = lambda x: unique_no_nan(x)[0]

def compare_values(x,y):
    # Multi index and casting magic - I just want to compare the bp_ids lol
    x = int(x[0])
    y = int(y[0])

    return x == y


res = merged_df.groupby("stack_idx").agg(applicant_id = ("applicant_id", unique_no_nan),activity_id=("activity_id", unique_no_nan), mail_id=("mail_id", unique_no_nan),case_id=("case_id", first_unique),bp_id=("bp_id", unique_no_nan),frame_number_min=("frame.number","min"),frame_number_max =  ("frame.number","max"),sniff_time_min=("sniff_time_x","min"),sniff_time_max=("sniff_time_x","min"), activity_name=("activity_name", lambda x: x.head(1)))
res["stack_prediction"] = stack_predictions
# Apply the custom function to compare 'sale_order_line_id' and 'sale_order_line_id_case_id'
res["bp_true"] = res.apply(lambda x: compare_values(x["applicant_id"], x["bp_id"]), axis = 1)
res["activity_true"] = res["activity_name"] ==  res["stack_prediction"]
#res.loc["Mean","bp_true"] = res["bp_true"].mean()
#res.loc["Mean","activity_true"] = res["activity_true"].mean()

In [67]:
res

Unnamed: 0_level_0,applicant_id,activity_id,mail_id,case_id,bp_id,frame_number_min,frame_number_max,sniff_time_min,sniff_time_max,activity_name,stack_prediction,bp_true,activity_true
stack_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,[1224],[],[],1,[1224.0],17,325,2020-09-18 01:07:22.425912,2020-09-18 01:07:22.425912,GenerateJobApplicationActivity,GenerateJobApplicationActivity,True,True
1,[1224],[],[7950],1,[1224.0],356,1192,2020-09-18 01:07:28.935647,2020-09-18 01:07:28.935647,ResumeReviewActivity,ResumeReviewActivity,True,True
2,[1225],[],[],2,[1225.0],1212,1520,2020-09-18 01:07:30.784745,2020-09-18 01:07:30.784745,GenerateJobApplicationActivity,GenerateJobApplicationActivity,True,True
3,[1225],[],[7954],2,[1225.0],1582,2336,2020-09-18 01:07:35.257826,2020-09-18 01:07:35.257826,ResumeReviewActivity,ResumeReviewActivity,True,True
4,[1226],[],[],3,[1226.0],2354,2664,2020-09-18 01:07:39.037302,2020-09-18 01:07:39.037302,GenerateJobApplicationActivity,GenerateJobApplicationActivity,True,True
5,[1226],[],[7958],3,[1226.0],2708,4461,2020-09-18 01:07:45.544565,2020-09-18 01:07:45.544565,ResumeReviewActivity,ResumeReviewActivity,True,True
6,[1225],[732],[7960],2,[1225.0],3057,4871,2020-09-18 01:07:46.041104,2020-09-18 01:07:46.041104,ScheduleAnInterviewActivityCall,ScheduleAnInterviewActivityCall,True,True
7,[1227],[],[],4,[1227.0],4467,4881,2020-09-18 01:07:47.605781,2020-09-18 01:07:47.605781,GenerateJobApplicationActivity,GenerateJobApplicationActivity,True,True
8,[1227],[],[7964],4,[1227.0],4939,6164,2020-09-18 01:07:52.101027,2020-09-18 01:07:52.101027,ResumeReviewActivity,ResumeReviewActivity,True,True
9,[1225],[732],"[7965, 7967]",2,[1225.0],5606,6859,2020-09-18 01:07:53.104460,2020-09-18 01:07:53.104460,PerformAnInterviewCall,PerformAnInterviewCall,True,True


In [68]:
print("------------------ Activity Type --------------")
print(classification_report(res["activity_name"], res["stack_prediction"]))

------------------ Activity Type --------------
                                 precision    recall  f1-score   support

               ContractProposal       1.00      1.00      1.00         1
 GenerateJobApplicationActivity       1.00      1.00      1.00        10
         PerformAnInterviewCall       1.00      1.00      1.00         5
      PerformAnInterviewMeeting       1.00      1.00      1.00         3
           ResumeReviewActivity       1.00      1.00      1.00        10
ScheduleAnInterviewActivityCall       1.00      1.00      1.00         5
     ScheduleAnInterviewMeeting       1.00      1.00      1.00         3

                       accuracy                           1.00        37
                      macro avg       1.00      1.00      1.00        37
                   weighted avg       1.00      1.00      1.00        37


In [69]:
first_int = lambda x: int(x[0])

pred = res["applicant_id"].map(first_int)
true = res["bp_id"].map(first_int)

print("------------------ Activity Type --------------")
print(classification_report(true[pred!= -1], pred[pred!= -1], zero_division=0.0))

------------------ Activity Type --------------
              precision    recall  f1-score   support

        1224       1.00      1.00      1.00         2
        1225       1.00      1.00      1.00         4
        1226       1.00      1.00      1.00         2
        1227       1.00      1.00      1.00         2
        1228       1.00      1.00      1.00         6
        1229       1.00      1.00      1.00         7
        1230       1.00      1.00      1.00         2
        1231       1.00      1.00      1.00         6
        1232       1.00      1.00      1.00         4
        1233       1.00      1.00      1.00         2

    accuracy                           1.00        37
   macro avg       1.00      1.00      1.00        37
weighted avg       1.00      1.00      1.00        37


In [70]:
out = res.sort_values(by= "sniff_time_min")[["sniff_time_min","stack_prediction","case_id"]].reset_index(drop=True)
out.columns = ["timestamp", "activity", "case_id"]

In [25]:
out.to_csv("../../data_v3/out/ptp_xes_out.csv", index = False)