In [65]:
import ast
import re
from event_loop.preprocessing.dataframe import *

import metrics
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report

%load_ext autoreload
%load_ext memory_profiler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


# Prerequisuites

## Activity Action Model
Train a Model with the task of classifying Start, End and NoAction events in the interleaved data. 
Training is done during a "warmup" phase with generated training data. 

### Load Data

In [66]:
# HR data in data/Train/R1 is missing frame.number. We take another (already filtered) dataset and apply our feature extraction to this one
df_train_in = pd.read_csv('../../data_v3/hr_extended_features.csv', converters={"MessageAttributes": ast.literal_eval})

In [67]:
# This is the Interleaved Data Set for our pipeline
df_il_in = pd.read_csv('../../data/HR-INTERLEAVED/R1/R1.csv', converters={"MessageAttributes": ast.literal_eval})

In [68]:
len(df_il_in)

61403

### Preprocessing

In [69]:
# data is at R1 Level. Apply filter and feature extraction
df_train = pre_process(df_train_in)

df_test = pre_process(df_il_in)

In [70]:
len(df_test)

1313

In [71]:
# Load start and end events from ground truth data.
# Tag according frames in interleaved data for testing
df_gt = pd.read_csv("../../data_v3/hr_ground_truth.csv")

start_indices = df_gt["start"].tolist()
end_indices = df_gt["actual_end"].tolist()

df_test["ActivityAction"] = df_test["frame.number"].apply(lambda x: "Activity Start" if x in start_indices else
("Activity End" if x in end_indices else "NoAction"))

In [72]:
# ------------ OPTIONAL ---------------
# TODO Duplicate with Activity Model - move down and delete
# Form sequences in training data by grouping
df_train = df_train.sort_values(by=["InstanceNumber", "BusinessActivity", "frame.number"])
df_train["SequenceNumber"] = df_train.groupby(["BusinessActivity", "InstanceNumber"]).ngroup()
df_train["SequenceNumber"] -= df_train['SequenceNumber'].min()

# check sequence length of training data
df_train["SequenceNumber"].value_counts()

SequenceNumber
13     132
21     115
6      115
1      115
5       96
      ... 
404     15
158     15
157     15
25      15
154     14
Name: count, Length: 933, dtype: int64

In [73]:
def mark_start_end(df):
    # Mark start event of each BusinessActivity Instance
    df["activityStart"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount() == 0
    # Mark end event of each Business Activity Instance
    df["activityEnd"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount(ascending=False) == 0
    # Merge start and end columns to form labels
    df["ActivityAction"] = df.apply(lambda row: "Activity Start" if row["activityStart"] else (
        "Activity End" if row["activityEnd"] else 'NoAction'), axis=1)

    return df.drop(["activityStart", 'activityEnd'], axis=1)


df_train = mark_start_end(df_train)

In [74]:
cols = ["event_with_roles", "request_method_call", "selective_file_data", 
        "origin_method","origin_file_data"]


In [75]:
def dict_to_features(dict):
    return [[{**d, "bias": 1.0}] for d in dict]

def extract_labels(labels):
    return [[y] for y in labels]

In [76]:
# exclude from training data 
df_train_filt = df_train[~df_train["SequenceNumber"].isin([128])]


In [77]:
train_features = df_train_filt[cols].to_dict("records")
train_features = dict_to_features(train_features)
train_labels = extract_labels(df_train_filt["ActivityAction"])

In [78]:
test_features = df_test[cols].to_dict("records")
test_features = dict_to_features(test_features)
test_labels = extract_labels(df_test["ActivityAction"])

### Model Training

In [79]:
# optional Train Test split for evaluation on training data
# In prod case, we train on 100% training data and evaluate on interleaved data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.3, random_state=42)

In [80]:
%%time
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    max_iterations=200,
    c1=0.1,
    c2=0.01,
    all_possible_transitions=True
    #all_possible_transitions=True
)
crf.fit(train_features, train_labels)

CPU times: user 1.16 s, sys: 6.55 ms, total: 1.17 s
Wall time: 1.19 s


### Optimization

In [81]:
from sklearn.metrics import make_scorer
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics

# define fixed parameters and parameters to search
crf2 = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=200, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='macro', labels=np.unique(test_labels))

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=5, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=150, 
                        scoring=f1_scorer)
#rs.fit(train_features, train_labels)

#crf = rs.best_estimator_

### Evaluation

In [82]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn_crfsuite import metrics


def flatten(xss):
    return [x for xs in xss for x in xs]


def evaluate(model, x, y_true):
    y_pred = model.predict(x)
    print(metrics.flat_f1_score(y_true, y_pred, average='macro', labels=model.classes_))
    print(metrics.flat_classification_report(y_true, y_pred, model.classes_))
    [print(label, "\n", matrix) for matrix, label in
     zip(multilabel_confusion_matrix(flatten(y_true), flatten(y_pred), labels=model.classes_), model.classes_)]


In [83]:
evaluate(crf, test_features, test_labels)

0.9608303595410767
                precision    recall  f1-score   support

Activity Start       1.00      1.00      1.00        37
      NoAction       1.00      1.00      1.00      1239
  Activity End       0.94      0.84      0.89        37

      accuracy                           0.99      1313
     macro avg       0.98      0.95      0.96      1313
  weighted avg       0.99      0.99      0.99      1313

Activity Start 
 [[1276    0]
 [   0   37]]
NoAction 
 [[  68    6]
 [   2 1237]]
Activity End 
 [[1274    2]
 [   6   31]]


In [84]:
import pickle

with open('hr_activity_action.pkl', 'wb') as f:  # open a text file
    pickle.dump(crf, f)

In [85]:
pred = crf.predict(test_features)

In [86]:
pred_mg = crf.predict_marginals(test_features)

In [87]:
#margs = [pred_mg[i] for i in wrong_pred_idx]
columns = pred_mg[0][0].keys()
flat_margs = [[entry[column] for column in columns] for sublist in pred_mg for entry in sublist]
df_margs = pd.DataFrame(flat_margs, columns=columns)

In [88]:
#df_eval = pd.DataFrame([(pred[i], test_labels[i], df_test.iloc[i]["frame.number"] ) for i in wrong_pred_idx],columns = ["predicted","true","frame.number"],)

df_eval = pd.DataFrame({"predicted": pred, "true": test_labels, "frame.number":df_test["frame.number"]}).reset_index(drop=True)

df_eval = pd.concat([df_eval, df_margs], axis = 1)

df_eval["pred_true"] = df_eval["predicted"] == df_eval["true"]



In [89]:
from scipy.stats import entropy

entropy_cols = ['Activity Start', 'NoAction', 'Activity End']

# Calculate entropy for each row using the specified columns
df_eval["entropy"] = df_eval[entropy_cols].apply(entropy, axis=1)


In [90]:
df_eval[~ df_eval["pred_true"]].sort_values(by='entropy', ascending=False)

Unnamed: 0,predicted,true,frame.number,Activity Start,NoAction,Activity End,pred_true,entropy
126,[NoAction],[Activity End],2336,0.000312,0.53365,0.466037,False,0.693477
403,[NoAction],[Activity End],8044,0.000312,0.53365,0.466037,False,0.693477
471,[NoAction],[Activity End],9329,0.000312,0.53365,0.466037,False,0.693477
749,[NoAction],[Activity End],15350,0.000312,0.53365,0.466037,False,0.693477
983,[NoAction],[Activity End],20665,0.000312,0.53365,0.466037,False,0.693477
1013,[NoAction],[Activity End],21651,0.000192,0.711912,0.287896,False,0.602024
313,[Activity End],[NoAction],6266,0.000212,0.259278,0.74051,False,0.574246
1214,[Activity End],[NoAction],26182,0.000212,0.259278,0.74051,False,0.574246


In [91]:
eval_cols = ["event_with_roles","pgsql.query", "request_method_call", "selective_file_data", 
        "origin_method","origin_file_data"]

df_eval = df_eval.merge(df_test[["frame.number", *eval_cols]], how="left",left_on="frame.number", right_on="frame.number")

We observe a high entropy > 0.5 for all wrong classifications

-> Apply fallback model for this cases

## Activity Classifier

In [92]:
def sequence_by_activities(data, seq_data):
    return [data[seq_data == i] for i in range(seq_data.max())]

In [93]:
feature_cols = ["event_with_roles", "request_method_call", "selective_file_data", 
        "origin_method","origin_file_data"]


In [94]:
# List of dataframes each containing one activity sequence
train_activity_sequences = sequence_by_activities(df_train, df_train["SequenceNumber"])

In [95]:
# Sequences without window features

def dict_to_feature_sequence(dict):
    return [{**d, "bias": 1.0} for d in dict]

def df_to_features(df):
    return dict_to_feature_sequence(df.to_dict("records"))

train_features_seq = [df_to_features(df[feature_cols]) for df in train_activity_sequences]
train_labels_seq = [df["BusinessActivity"].values for df in train_activity_sequences]

In [96]:
# Single Events no window features 

def dict_to_feature(dict):
    return [[{**d, "bias": 1.0}] for d in dict]

def extract_labels(labels):
    return [[y] for y in labels]

train_features = dict_to_feature(df_train[feature_cols].to_dict("records"))
train_labels = extract_labels(df_train["BusinessActivity"])

In [97]:
# Single Events w. window features

# Apply sequencing - flatten later

def seq2features(seq, bw, fw): 
    return [event2features(seq, i, bw, fw) for i in range(len(seq))]

def event2features(seq, i, bw, fw):
    features = {"bias": 1.0}
    
    features.update({
        f"0:{k}": v for k,v in seq[i].items()
    })
    
    for j in range(1, bw+1): 
        index = i-j
        if index >= 0: 
            features.update({
                f"-{j}:{k}": v for k,v in seq[index].items()
            })
        else: 
            features.update({
                 f"-{j}:{k}": "NoMessage" for k,_ in seq[i].items()
            })
        
    for j in range(1,fw+1): 
        index = i + j
        if index < len(seq): 
             features.update({
                f"+{j}:{k}": v for k,v in seq[index].items()
            })
        else: 
            features.update({
                 f"+{j}:{k}": "NoMessage" for k,_ in seq[i].items()
            })
            
    return features

train_features_seq_window = [seq2features(seq[feature_cols].to_dict("records"), 10,10) for seq in train_activity_sequences]
train_labels_seq_window = [seq["BusinessActivity"] for seq in train_activity_sequences]

In [98]:
def flatten_and_encapsulate(list_of_list):
    return [[item] for sublist in list_of_list for item in sublist]

X_train = flatten_and_encapsulate(train_features_seq_window)
y_train = flatten_and_encapsulate(train_labels_seq_window)

In [99]:
%%time
import sklearn_crfsuite

activity_classifier= sklearn_crfsuite.CRF(
    max_iterations=200,
    c1=0.1,
    c2=0.01,
    all_possible_transitions=True
    #all_possible_transitions=True
)
activity_classifier.fit(X_train, y_train)

CPU times: user 13.7 s, sys: 227 ms, total: 14 s
Wall time: 15.6 s


In [100]:
def confidence_weighted_majority_voting(predictions):
    """
    Perform confidence-weighted majority voting on each sublist of predictions.

    :param predictions: A list of dictionaries where each dictionary contains predictions and their confidences.
    :return: A list of majority voted predictions for each sublist.
    """
    majority_voted_predictions = []
    for sublist in predictions:
        if not sublist:
            # If the sublist is empty, append None to the majority voted predictions
            majority_voted_predictions.append(None)
        else:
            # Initialize variables to store cumulative confidences for each prediction
            cumulative_confidences = {label: 0.0 for label in sublist[0].keys()}
            
            # Calculate cumulative confidences for each prediction across all dictionaries in the sublist
            for prediction_dict in sublist:
                for label, confidence in prediction_dict.items():
                    cumulative_confidences[label] += confidence
            
            # Find the prediction with the maximum cumulative confidence
            majority_voted_prediction = max(cumulative_confidences, key=cumulative_confidences.get)
            majority_voted_predictions.append(majority_voted_prediction)

    return majority_voted_predictions

## Activity Model
The activity model utilises multiple sliding windows over the training data for pattern matching



In [101]:
from sklearn.preprocessing import LabelEncoder
from numpy.lib.stride_tricks import sliding_window_view


def get_unique_sequences(seq_data):
    # Convert each array to a tuple and create a set of tuples
    array_set = set(tuple(arr) for arr in seq_data)

    # Convert the set of tuples back to a list of NumPy arrays
    return [np.array(arr) for arr in array_set]


df_train["joined"] = df_train["event_with_roles"] + df_train["selective_file_data"]

# Label Encode Training Data 
le = LabelEncoder()
df_train["joined_LE"] = le.fit_transform(df_train["joined"])

# Mark groups of Instance Number and BusinessActivity with sequence numbers
df_train = df_train.sort_values(by=["InstanceNumber", "BusinessActivity", "frame.number"])
df_train["SequenceNumber"] = df_train.groupby(["BusinessActivity", "InstanceNumber"]).ngroup()
# Align Sequence Numbers so that they start at 0
df_train["SequenceNumber"] -= df_train['SequenceNumber'].min()

# Divides dataframe into arrays according to to Sequence Data Indicator
data_joined_LE = sequence_by_activities(df_train["joined_LE"], df_train["SequenceNumber"])

unique_sequences = get_unique_sequences(data_joined_LE)

print(f"Reduced the number of sequences from {len(data_joined_LE)} to {len(unique_sequences)} unique ones")

def get_activity_model_data(max_window_length):
    return [np.concatenate([sliding_window_view(seq, i) for seq in unique_sequences], axis=0) for i in
                       range(max_window_length)]
    

# form sliding window sequences of Size N for Training Data 
#activity_model_data = get_activity_model_data(4)

Reduced the number of sequences from 932 to 26 unique ones


# Action Loop

Main loop. Gets raw R1 data as input. 
Applies filtering, activity action and sequence classification

In [102]:
records = df_il_in.to_dict("records")

In [103]:
from event_loop.event import Event


def get_max_from_dict(d: dict):
    return max(d, key= lambda k: d[k])
    

def classify_event(event: Event): 
    margs = crf.predict_marginals_single([event.to_features()])[0]
    pred = get_max_from_dict(margs)
    
    e = entropy([p for p in margs.values()])
    
    #print(f"{event.frame_number} {pred} {margs[pred]:.3f} {e:.3f},")
    
    
    true_val = df_eval[df_eval["frame.number"] == event.frame_number]["true"].iloc[0][0]
    #print(true_val)
    
    # Change to entropy of prediction
    if e > ENTROPY_THRESHOLD: 
    #if pred != true_val:
        # If pred is wrong we have two options for "wrong classifications" 
        # 1 -> We have No Action predicted although the stack should end here 
        # Idea 1: If the stack did not change after N events, emit it. 
        
        # Mark the confidence on the event.
        event.confidence = False 


        # 2 -> We have End predicted although the stack should continue. 
        if pred != true_val:
            print(f"False {event.frame_number} {pred} {margs[pred]:.3f} {e:.3f} should be {true_val}")
       
        event.activity_action = pred
            
    else: 
        event.confidence = True
        event.activity_action = pred

In [104]:
from event_loop.stack import Stack


def search_stack_for_request_frame(frame_number):
    for index, stack in enumerate(stacks):
        if stack.contains_request_frame(frame_number): 
            return index
    return -1

def search_window_for_sequence(seq): 
    """
    Check for pattern matches with the training data and return the count
    :param seq: array_like
                sequence of events
            
    :return: number of occurences of seq in training data
    """
    return np.sum(np.all(activity_model_data[len(seq)] == seq, axis = 1))


def classify_by_train_sequences(event: Event, n : int, exclude_indices: list[int]): 
    # search for existing stacks in training data 
    sequences = [le.transform([ e.to_activity_model_string() for e in stack]+[event.to_activity_model_string()]) for stack in stacks]  
    
    # loop to max 2 elements down
    for i in range(n, 1, -1):   
        res = [search_window_for_sequence(seq[-i:]) if j not in exclude_indices else -1 for j,seq in enumerate(sequences)]
        
        max_res = max(res)
        max_res_count = res.count(max_res)
        idx = np.argmax(res)
        
        if max_res > 0: 
            #print("res:",res, max_res, max_res_count, "->", idx)
            return idx
        
    return -1

def search_stream_index(event: Event, exclude_indices: list[int]) -> int: 
    indices = [i for i,stack in enumerate(stacks) if stack.contains_stream_index(event.stream_index) and i not in exclude_indices]

    if len(indices) == 1: 
        return indices[0]
    else:
        return -1
    

def check_stack_attributes(stacks: list[Stack], event: Event, exclude_indices: list[int]) -> int:
    for key, value in event.attributes.items():
        if key in PTP_ATTRIBUTES and value:
            indices = [i for i, stack in enumerate(stacks) if stack.contains_attribute(key,value) and i not in exclude_indices]
            print()
            
            if len(indices) ==1: 
                print("MATCH", indices)
                # we have a clear match -> return idx
                return indices[0]

    return -1

def check_stack_attributes_case_id(stacks: list[Stack], event: Event, exclude_indices: list[int]) -> int:
    for key, value in event.attributes.items():
        if key in PTP_ATTRIBUTES and value:
            indices = [i for i, stack in enumerate(stacks) if stack.case_id == Stack.case_id_from_attribute(key, value)]
            
            if len(indices) ==1: 
                # we have a clear match -> return idx
                return indices[0]

    return -1


def exclude_stacks_by_attribute(stacks: list[Stack], event: Event, stacks_out: list[Stack]) -> list[int]: 
    
    exclude_indices = []

        
    for key, value in event.attributes.items():
        if key in PTP_ATTRIBUTES and value:
            # exlucde all stacks that have a different attribute 
            exclude_indices.extend(i for i, stack in enumerate(stacks) if stack.has_attribute(key) and not stack.contains_attribute(key, value))
    return exclude_indices

def exclude_stacks_by_attribute_case_id(stacks: list[Stack], event: Event, stacks_out: list[Stack]) -> list[int]: 
    
    exclude_indices = []

    for key, value in event.attributes.items():
        if key in PTP_ATTRIBUTES and value:
            # exclude all stacks that have a different attribute 
            # 
            for i, stack in enumerate(stacks): 
                event_case_id = Stack.case_id_from_attribute(key, value)
                if event_case_id and stack.case_id and event_case_id != stack.case_id: 
                    exclude_indices.append(i)
            #exclude_indices.extend(i for i, stack in enumerate(stacks) if Stack.case_id_from_attribute(key, value) != stack.case_id and stack.case_id)
    return exclude_indices

In [105]:
import time

from event_loop.preprocessing.event import keep_event
%autoreload 2




# Parameter
EVENT_LOOP_CUTOFF_NO_ACTION = 3
EVENT_LOOP_CUTOFF_END_EVENT = 3
ENTROPY_THRESHOLD = 0.4 #0.5
MAX_WINDOW_SIZE = 10
VERBOSE = False
SETTING = "HR"

# init variables
event_buffer: list[Event] = []
attribute_buffer: list[dict] = []
stacks: list[Stack] = []
stacks_out: list[Stack] = []
event_loop_index = 0


# TODO Change config to something like this:
config_dict = {
    "PTP": {
        "1to1" : ["applicant_id", "activity_id"],
        "1toN": ["mail_id"]
    }
}

HR_ATTRIBUTES = ["applicant_id", "activity_id"]
PTP_ATTRIBUTES = ["sale_order_id", "sale_order_line_id","purchase_requisition_id","purchase_requisition_line_id",]


activity_model_data = get_activity_model_data(MAX_WINDOW_SIZE)


processing_times = []
buffer_sizes = []

for i, event_data in enumerate(records):
    
    buffer_sizes.append(sum([len(stack) for stack in stacks]))
    start_time = time.time()

    # Filter Event Stream
    if not keep_event(event_data):
        # skip event in loop
        continue
    
    # count every not filtered event for event loop index
    event_loop_index += 1

    # Extract Features and generate Event Object
    event = Event(event_data, event_loop_index, event_buffer, SETTING)
    event_buffer.append(event)
    
    classify_event(event)
    
    # Activity Action Classification
    activity_action = event.activity_action
    
    # Activity Matching
    if activity_action == "Activity Start": 
        print(f"Add new stack {event.frame_number}")
        stacks.append(Stack(SETTING,event))
        
    if activity_action == "NoAction": 
        if len(stacks) == 1: 
            if VERBOSE: print(f"Add Between Event {event.frame_number} to only stack")
            stacks[0].append_event(event)
        elif event.origin_request_frame: 
            idx = search_stack_for_request_frame(event.origin_request_frame)
            if VERBOSE: print(f"Add Between Event {event.frame_number} by request frame\t{idx}")
            stacks[idx].append_event(event)
        else: 
            if VERBOSE: print("Classify Between Event", event.frame_number)
            # Check attributes of each stack
            
            # we can filter out stacks that already have attributes different to the event
            exclude_indices =  exclude_stacks_by_attribute(stacks, event, stacks_out)
    
            stack_index:int = check_stack_attributes(stacks, event, exclude_indices)
                    
            if stack_index == -1:        
                stack_index = classify_by_train_sequences(event, 4, exclude_indices)
            
            # for elements that are not matchable based on 2 sequences we fall back to stream index
            if stack_index == -1: 
                stack_index = search_stream_index(event, exclude_indices)    
            
            # fallback - no match add to first stack
            if stack_index == -1:
                res = next((i for i in range(len(stacks)) if i not in exclude_indices and stacks[i].confidence),-1)
                if VERBOSE: print("NO NO MATCH", res, exclude_indices)
                stack_index = res
                
            stacks[stack_index].append_event(event)
        
    if activity_action == "Activity End":
        
        stack_index = search_stack_for_request_frame(event.origin_request_frame)
        if VERBOSE : print("Search by request frame", stack_index)
        stacks[stack_index].append_event(event)
        
        #if not event.confidence: 
        #check_pop_idx = idx
        #else:
        if event.confidence: 
            if len(stacks) > 1: 
                if VERBOSE: print("POP Confident Stack")
                stack = stacks.pop(stack_index)
                stacks_out.append(stack)
            else: 
                event.confidence = False
     

    # Loop through all currently open stacks
    for idx, stack in enumerate(stacks):
        last_event = stack[-1]
        # check for non-confident "No Action" Classifications. These could be "Activity End" Instead
        if not last_event.confidence and last_event.activity_action == "NoAction":
            # If a stack has not been continued for N event loops 
            if event_loop_index - last_event.event_loop_index > EVENT_LOOP_CUTOFF_NO_ACTION: 
                stacks.pop(idx)
                stacks_out.append(stack)
                if VERBOSE: print("POP Unsure No Action Event", event_loop_index, last_event.event_loop_index)
                
    for idx, stack in enumerate(stacks): 
        last_event = stack.events[-1]
        if not last_event.confidence and last_event.activity_action == "Activity End": 
            if event_loop_index - last_event.event_loop_index > EVENT_LOOP_CUTOFF_END_EVENT: 
            
                # we are now sure to pop the stack. 
                if VERBOSE: print("POP Unsure Activity End Stack", event_loop_index, last_event.event_loop_index)
                stacks.pop(idx)
                stacks_out.append(stack)  
                
    end_time = time.time()
    processing_times.append(end_time - start_time)
                
# pop all stacks that are still left
for stack in stacks: 
    stacks_out.append(stack)  


Add new stack 17
Add new stack 356
Add new stack 1212
Add new stack 1582
False 2336 NoAction 0.534 0.693 should be Activity End
Add new stack 2354
Add new stack 2708
Add new stack 3057
Add new stack 4467
Add new stack 4939
Add new stack 5606
False 6266 Activity End 0.741 0.574 should be NoAction
Add new stack 6879
Add new stack 7261
False 8044 NoAction 0.534 0.693 should be Activity End
Add new stack 8060
Add new stack 8421
Add new stack 9105
False 9329 NoAction 0.534 0.693 should be Activity End
Add new stack 10272
Add new stack 10384
Add new stack 11099
Add new stack 11974
Add new stack 12074
Add new stack 13457
Add new stack 14576
False 15350 NoAction 0.534 0.693 should be Activity End
Add new stack 15377
Add new stack 15720
Add new stack 16049
NO MATCH RES 16032
Add new stack 17273
Add new stack 17801
Add new stack 19585
Add new stack 20245
False 20665 NoAction 0.534 0.693 should be Activity End
Add new stack 20696
Add new stack 20928
False 21651 NoAction 0.712 0.602 should be Acti

In [106]:
import sys
import pickle
model_size = [
    sys.getsizeof(pickle.dumps(crf)),
    sys.getsizeof(pickle.dumps(activity_classifier)),
    sys.getsizeof(pickle.dumps(activity_model_data))
]

In [107]:
sum(model_size) / (1024**2)

0.7847757339477539

In [108]:
from statistics import mean,stdev
print("average processing time:",mean(processing_times)*1000)
print("max buffer size:", max(buffer_sizes),f"({max(buffer_sizes) / len(records) * 100})")


average processing time: 0.2697816080295304
max buffer size: 152 (0.24754490822923964)


In [109]:
 # classify stacks
# TODO Move into Model
def confidence_weighted_majority_voting(predictions):
    """
    Perform confidence-weighted majority voting on each sublist of predictions.

    :param predictions: A list of dictionaries where each dictionary contains predictions and their confidences.
    :return: A list of majority voted predictions for each sublist.
    """    

    # Initialize variables to store cumulative confidences for each prediction
    cumulative_confidences = {label: 0.0 for label in predictions[0][0].keys()}
    
    # Calculate cumulative confidences for each prediction across all dictionaries in the sublist
    for prediction_dict in predictions:
        for label, confidence in prediction_dict[0].items():
            cumulative_confidences[label] += confidence
    
    # Find the prediction with the maximum cumulative confidence
    return max(cumulative_confidences, key=cumulative_confidences.get)

def classify_stack(stack: Stack):
    seq = seq2features([event.to_features() for event in stack], 10,10)
    pred = activity_classifier.predict_marginals([[ele] for ele in seq])
    pred_cwmv = confidence_weighted_majority_voting(pred)
    return pred_cwmv
    
stack_predictions = [classify_stack(stack )for stack in stacks_out]


In [110]:
print(f"Contained {len(stacks)} stack in queue")
if len(stacks)> 0: 
    print([[e.frame_number for e in stack ] for stack in stacks])

start = [stack[0].frame_number for stack in stacks_out]
end = [stack[-1].frame_number for stack in stacks_out]

res_df = pd.DataFrame({"start_pred":start, "end_pred":end})

eval_df = df_gt[["start", "actual_end"]].merge(res_df,how="left", left_on ="start", right_on = "start_pred").fillna(-1).astype(int)
eval_df["pred_true"] = eval_df["actual_end"] == eval_df["end_pred"]
display(eval_df)
print(f"Accuracy of matching start and end sequences: {eval_df['pred_true'].mean()}")
print(f"Overall matching accuracy: {0.5 + eval_df['pred_true'].mean()/2}")

Contained 1 stack in queue
[[27306, 27339, 27347, 27397, 27406, 27414, 27469, 27477, 27544, 27551, 27559, 27637, 27677, 27726, 27782, 27790, 27842, 27852, 27921, 27935, 27961, 27971, 27985, 28049, 28054, 28086, 28089, 28111, 28115, 28149, 28154, 28157, 28167, 28228, 28232, 28251, 28256, 28258, 28267, 28304, 28312, 28377, 28384]]


Unnamed: 0,start,actual_end,start_pred,end_pred,pred_true
0,17,325,17,325,True
1,356,1192,356,1192,True
2,1212,1520,1212,1520,True
3,1582,2336,1582,2336,True
4,2354,2664,2354,2664,True
5,2708,4461,2708,4461,True
6,3057,4871,3057,4871,True
7,4467,4881,4467,4881,True
8,4939,6164,4939,6164,True
9,5606,6859,5606,6859,True


Accuracy of matching start and end sequences: 1.0
Overall matching accuracy: 1.0


In [111]:
df_aa_test = pd.DataFrame(df_test[["frame.number", "ActivityAction"]])
df_aa_test["ActivityAction"] = "NoAction"
df_aa_test.loc[df_aa_test["frame.number"].isin(eval_df["end_pred"]), "ActivityAction"] = "Activity End"
df_aa_test.loc[df_aa_test["frame.number"].isin(eval_df["start_pred"]), "ActivityAction"] = "Activity Start"
print(classification_report(test_labels, df_aa_test["ActivityAction"]))

                precision    recall  f1-score   support

  Activity End       1.00      1.00      1.00        37
Activity Start       1.00      1.00      1.00        37
      NoAction       1.00      1.00      1.00      1239

      accuracy                           1.00      1313
     macro avg       1.00      1.00      1.00      1313
  weighted avg       1.00      1.00      1.00      1313


In [112]:
# Function to check if intervals overlap
def intervals_overlap(row, df):
    overlapping_names = []
    overlapping_bps = set()
    for index, other_row in df.iterrows():
        if row.name != index and row['start'] <= other_row['actual_end'] and row['actual_end'] >= other_row['start']:
            overlapping_names.append(f"{other_row['activity_name']} {other_row['bp_id']}")
            overlapping_bps.add(other_row['bp_id'])
    return overlapping_names, list(overlapping_bps)

df_gt[["overlapping_activities", "overlapping_bps"]] = df_gt.apply(intervals_overlap, axis=1, df = df_gt, result_type="expand")

In [113]:
# Create dataframe with mapping of frame numbers to event stacks
frame_numbers = [event.frame_number for idx,stack in enumerate(stacks_out) for event in stack]
stack_numbers = [idx for idx,stack in enumerate(stacks_out) for event in stack]
applicant_ids = [event.attributes["applicant_id"] for idx,stack in enumerate(stacks_out) for event in stack]
activity_ids = [event.attributes["activity_id"] for idx,stack in enumerate(stacks_out) for event in stack]
mail_ids = [event.attributes["mail_id"] for idx,stack in enumerate(stacks_out) for event in stack]
sniff_time =  [event.sniff_time for idx,stack in enumerate(stacks_out) for event in stack]
case_id = [stack.case_id["id"]  if stack.case_id else -1 for idx, stack in enumerate(stacks_out) for event in stack]

df_frame_numbers = pd.DataFrame(data={"frame.number": frame_numbers, "sniff_time": sniff_time, "stack_idx": stack_numbers, "applicant_id": applicant_ids,"activity_id": activity_ids, "mail_id":mail_ids,"case_id": case_id})

# Merge Activity Name from ground truth frame to event sequences for evaluation
merged_df = df_frame_numbers.merge(df_gt[["activity_name","start","bp_id"]], how="left",left_on="frame.number", right_on="start").drop(columns="start")

merged_df[["activity_name","bp_id"]] = merged_df.groupby("stack_idx")[["activity_name","bp_id"]].ffill()
#merged_df["activity_name"] = merged_df.groupby("stack_idx")["bp_id"].ffill()

# Merge with filtered interleaved test data
merged_df = df_test.merge(merged_df, on="frame.number")

unique_no_nan = lambda x: list(filter(None, pd.unique(x)))
first_unique = lambda x: unique_no_nan(x)[0]

def compare_values(x,y):
    # Multi index and casting magic - I just want to compare the bp_ids lol
    x = int(x[0])
    y = int(y[0])

    return x == y


res = merged_df.groupby("stack_idx").agg(applicant_id = ("applicant_id", unique_no_nan),activity_id=("activity_id", unique_no_nan), mail_id=("mail_id", unique_no_nan),case_id=("case_id", first_unique),bp_id=("bp_id", unique_no_nan),frame_number_min=("frame.number","min"),frame_number_max =  ("frame.number","max"),sniff_time_min=("sniff_time_x","min"),sniff_time_max=("sniff_time_x","min"), activity_name=("activity_name", lambda x: x.head(1)))
res["stack_prediction"] = stack_predictions
# Apply the custom function to compare 'sale_order_line_id' and 'sale_order_line_id_case_id'
res["bp_true"] = res.apply(lambda x: compare_values(x["applicant_id"], x["bp_id"]), axis = 1)
res["activity_true"] = res["activity_name"] ==  res["stack_prediction"]
#res.loc["Mean","bp_true"] = res["bp_true"].mean()
#res.loc["Mean","activity_true"] = res["activity_true"].mean()

In [119]:
res["bp_id_1"] = res["bp_id"].apply(first_unique)

  unique_no_nan = lambda x: list(filter(None, pd.unique(x)))


In [120]:
res

Unnamed: 0_level_0,applicant_id,activity_id,mail_id,case_id,bp_id,frame_number_min,frame_number_max,sniff_time_min,sniff_time_max,activity_name,stack_prediction,bp_true,activity_true,bp_id_1
stack_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,[1224],[],[],1,[1224.0],17,325,2020-09-18 01:07:22.425912,2020-09-18 01:07:22.425912,GenerateJobApplicationActivity,GenerateJobApplicationActivity,True,True,1224.0
1,[1224],[],[7950],1,[1224.0],356,1192,2020-09-18 01:07:28.935647,2020-09-18 01:07:28.935647,ResumeReviewActivity,ResumeReviewActivity,True,True,1224.0
2,[1225],[],[],2,[1225.0],1212,1520,2020-09-18 01:07:30.784745,2020-09-18 01:07:30.784745,GenerateJobApplicationActivity,GenerateJobApplicationActivity,True,True,1225.0
3,[1225],[],[7954],2,[1225.0],1582,2336,2020-09-18 01:07:35.257826,2020-09-18 01:07:35.257826,ResumeReviewActivity,ResumeReviewActivity,True,True,1225.0
4,[1226],[],[],3,[1226.0],2354,2664,2020-09-18 01:07:39.037302,2020-09-18 01:07:39.037302,GenerateJobApplicationActivity,GenerateJobApplicationActivity,True,True,1226.0
5,[1226],[],[7958],3,[1226.0],2708,4461,2020-09-18 01:07:45.544565,2020-09-18 01:07:45.544565,ResumeReviewActivity,ResumeReviewActivity,True,True,1226.0
6,"[1225, 1227]",[732],[7960],2,[1225.0],3057,4871,2020-09-18 01:07:46.041104,2020-09-18 01:07:46.041104,ScheduleAnInterviewActivityCall,ScheduleAnInterviewActivityCall,True,True,1225.0
7,[1227],[],[],4,[1227.0],4467,4881,2020-09-18 01:07:47.605781,2020-09-18 01:07:47.605781,GenerateJobApplicationActivity,GenerateJobApplicationActivity,True,True,1227.0
8,[1227],[],[7964],4,[1227.0],4939,6164,2020-09-18 01:07:52.101027,2020-09-18 01:07:52.101027,ResumeReviewActivity,ResumeReviewActivity,True,True,1227.0
9,[1225],[732],"[7965, 7967]",2,[1225.0],5606,6859,2020-09-18 01:07:53.104460,2020-09-18 01:07:53.104460,PerformAnInterviewCall,PerformAnInterviewCall,True,True,1225.0


In [121]:
from sklearn.metrics import classification_report

print("------------------ Activity Type --------------")
print(classification_report(res["activity_name"], res["stack_prediction"]))

------------------ Activity Type --------------
                                 precision    recall  f1-score   support

               ContractProposal       1.00      1.00      1.00         1
 GenerateJobApplicationActivity       1.00      1.00      1.00        10
         PerformAnInterviewCall       1.00      1.00      1.00         5
      PerformAnInterviewMeeting       1.00      1.00      1.00         3
           ResumeReviewActivity       1.00      1.00      1.00        10
ScheduleAnInterviewActivityCall       1.00      1.00      1.00         5
     ScheduleAnInterviewMeeting       1.00      1.00      1.00         3

                       accuracy                           1.00        37
                      macro avg       1.00      1.00      1.00        37
                   weighted avg       1.00      1.00      1.00        37


In [122]:
first_int = lambda x: int(x[0])


pred = res["applicant_id"].map(first_int)
true = res["bp_id"].map(first_int)


print("------------------ Activity Type --------------")
print(classification_report(true, pred))

------------------ Activity Type --------------
              precision    recall  f1-score   support

        1224       1.00      1.00      1.00         2
        1225       1.00      1.00      1.00         4
        1226       1.00      1.00      1.00         2
        1227       1.00      1.00      1.00         2
        1228       1.00      1.00      1.00         6
        1229       1.00      1.00      1.00         7
        1230       1.00      1.00      1.00         2
        1231       1.00      1.00      1.00         6
        1232       1.00      1.00      1.00         4
        1233       1.00      1.00      1.00         2

    accuracy                           1.00        37
   macro avg       1.00      1.00      1.00        37
weighted avg       1.00      1.00      1.00        37


In [123]:
res[["case_id","bp_id_1"]].value_counts()

case_id  bp_id_1
6        1229.0     7
5        1228.0     6
8        1231.0     6
2        1225.0     4
9        1232.0     4
1        1224.0     2
3        1226.0     2
4        1227.0     2
7        1230.0     2
10       1233.0     2
Name: count, dtype: int64

-> 1 to 1 Matching of Case_id and Bp_id -> 1.0 is correct

In [124]:
res["activity_true"].mean()

1.0

In [125]:
res["bp_true"].mean()

1.0

In [126]:
df_gt

Unnamed: 0,activity_name,start,end,actual_end,overlapping_activities,bp_id,overlapping_bps,classification,Multi Class Classification,Single Class Classification,probability
0,GenerateJobApplicationActivity,17,356,325,[],1224,[],GenerateJobApplicationActivity,TRUE,True,0.994192
1,ResumeReviewActivity,356,1212,1192,[],1224,[],ResumeReviewActivity,TRUE,True,0.993149
2,GenerateJobApplicationActivity,1212,1582,1520,[],1225,[],GenerateJobApplicationActivity,TRUE,True,0.994192
3,ResumeReviewActivity,1582,2354,2336,[],1225,[],ResumeReviewActivity,TRUE,True,0.993329
4,GenerateJobApplicationActivity,2354,2708,2664,[],1226,[],GenerateJobApplicationActivity,TRUE,True,0.994192
5,ResumeReviewActivity,2708,4467,4461,[ScheduleAnInterviewActivityCall 1225],1226,[1225],"ResumeReviewActivity, ScheduleAnInterviewActiv...",TRUE,True,0.211112
6,ScheduleAnInterviewActivityCall,3057,4881,4871,"[ResumeReviewActivity 1226, GenerateJobApplica...",1225,"[1226, 1227]","ResumeReviewActivity, ScheduleAnInterviewActiv...",TRUE,True,0.021792
7,GenerateJobApplicationActivity,4467,4939,4881,[ScheduleAnInterviewActivityCall 1225],1227,[1225],GenerateJobApplicationActivity,FALSE,True,0.986619
8,ResumeReviewActivity,4939,6205,6164,[PerformAnInterviewCall 1225],1227,[1225],ResumeReviewActivity,FALSE,True,0.83989
9,PerformAnInterviewCall,5606,6879,6859,[ResumeReviewActivity 1227],1225,[1227],PerformAnInterviewCall,FALSE,True,0.962106


In [None]:
out = res.sort_values(by= "sniff_time_min")[["sniff_time_min","stack_prediction","case_id"]].reset_index(drop=True)
out.columns = ["timestamp", "activity", "case_id"]

In [None]:
out.to_csv("../../data_v3/out/hr_xes_out.csv", index = False)