In [1]:
import ast
import re

import metrics
import numpy as np
import pandas as pd
from event_loop.stack import Stack
from event_loop.event import keep_event

%load_ext autoreload
%load_ext memory_profiler

# Prerequisuites

## Activity Action Model
Train a Model with the task of classifying Start, End and NoAction events in the interleaved data. 
Training is done during a "warmup" phase with generated training data. 

### Load Data

In [2]:
# HR data in data/Train/R1 is missing frame.number. We take another (already filtered) dataset and apply our feature extraction to this one
df_train_in = pd.read_csv('../../data_v3/hr_extended_features.csv', converters={"MessageAttributes": ast.literal_eval})

In [3]:
# This is the Interleaved Data Set for our pipeline
df_il_in = pd.read_csv('../../data/HR-INTERLEAVED/R1/R1.csv', converters={"MessageAttributes": ast.literal_eval})

### Preprocessing

In [4]:
from event_loop.preprocessing.dataframe import extract_features, pre_process

# df_train is already at R4 filter level. Only feature extraction necessary
df_train = extract_features(df_train_in)

# data is at R1 Level. Apply filter and feature extraction
df_test = pre_process(df_il_in)

In [5]:
# Load start and end events from ground truth data.
# Tag according frames in interleaved data for testing
df_gt = pd.read_csv("../../data_v3/hr_ground_truth.csv")

start_indices = df_gt["start"].tolist()
end_indices = df_gt["actual_end"].tolist()

df_test["ActivityAction"] = df_test["frame.number"].apply(lambda x: "Activity Start" if x in start_indices else
("Activity End" if x in end_indices else "NoAction"))

In [6]:
# ------------ OPTIONAL ---------------
# TODO Duplicate with Activity Model - move down and delete
# Form sequences in training data by grouping
df_train = df_train.sort_values(by=["InstanceNumber", "BusinessActivity", "frame.number"])
df_train["SequenceNumber"] = df_train.groupby(["BusinessActivity", "InstanceNumber"]).ngroup()
df_train["SequenceNumber"] -= df_train['SequenceNumber'].min()

# check sequence length of training data
df_train["SequenceNumber"].value_counts()

SequenceNumber
13     132
21     115
6      115
1      115
5       96
      ... 
404     15
158     15
157     15
25      15
154     14
Name: count, Length: 933, dtype: int64

In [7]:
def mark_start_end(df):
    # Mark start event of each BusinessActivity Instance
    df["activityStart"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount() == 0
    # Mark end event of each Business Activity Instance
    df["activityEnd"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount(ascending=False) == 0
    # Merge start and end columns to form labels
    df["ActivityAction"] = df.apply(lambda row: "Activity Start" if row["activityStart"] else (
        "Activity End" if row["activityEnd"] else 'NoAction'), axis=1)

    return df.drop(["activityStart", 'activityEnd'], axis=1)

df_train = mark_start_end(df_train)

In [8]:
cols = ["event_with_roles", "request_method_call", "selective_file_data", 
        "origin_method","origin_file_data"]

In [9]:
def dict_to_features(dict):
    return [[{**d, "bias": 1.0}] for d in dict]


def extract_labels(labels):
    return [[y] for y in labels]

In [10]:
train_features = df_train[cols].to_dict("records")
train_features = dict_to_features(train_features)
train_labels = extract_labels(df_train["ActivityAction"])

In [11]:
test_features = df_test[cols].to_dict("records")
test_features = dict_to_features(test_features)
test_labels = extract_labels(df_test["ActivityAction"])

### Model Training

In [12]:
# optional Train Test split for evaluation on training data
# In prod case, we train on 100% training data and evaluate on interleaved data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.3, random_state=42)

In [13]:
%%memit
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    max_iterations=200,
    c1=0.1,
    c2=0.01,
    all_possible_transitions=True
    #all_possible_transitions=True
)
crf.fit(train_features, train_labels)

peak memory: 529.62 MiB, increment: 0.02 MiB


### Optimization

In [14]:
from sklearn.metrics import make_scorer
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics

# define fixed parameters and parameters to search
crf2 = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=200, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='macro', labels=np.unique(test_labels))

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=5, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=150, 
                        scoring=f1_scorer)
#rs.fit(train_features, train_labels)

#crf = rs.best_estimator_

### Evaluation

In [15]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn_crfsuite import metrics


def flatten(xss):
    return [x for xs in xss for x in xs]


def evaluate(model, x, y_true):
    y_pred = model.predict(x)
    print(metrics.flat_f1_score(y_true, y_pred, average='macro', labels=model.classes_))
    print(metrics.flat_classification_report(y_true, y_pred, model.classes_))
    [print(label, "\n", matrix) for matrix, label in
     zip(multilabel_confusion_matrix(flatten(y_true), flatten(y_pred), labels=model.classes_), model.classes_)]


In [16]:
evaluate(crf, test_features, test_labels)

0.9608303595410767
                precision    recall  f1-score   support

Activity Start       1.00      1.00      1.00        37
      NoAction       1.00      1.00      1.00      1239
  Activity End       0.94      0.84      0.89        37

      accuracy                           0.99      1313
     macro avg       0.98      0.95      0.96      1313
  weighted avg       0.99      0.99      0.99      1313

Activity Start 
 [[1276    0]
 [   0   37]]
NoAction 
 [[  68    6]
 [   2 1237]]
Activity End 
 [[1274    2]
 [   6   31]]


In [17]:
pred = crf.predict(test_features)

In [18]:
pred_mg = crf.predict_marginals(test_features)

In [19]:
#wrong_pred_idx = [i for i,(e1,e2) in enumerate(zip(pred,test_labels)) if e1 != e2]

In [20]:
#margs = [pred_mg[i] for i in wrong_pred_idx]
columns = pred_mg[0][0].keys()
flat_margs = [[entry[column] for column in columns] for sublist in pred_mg for entry in sublist]
df_margs = pd.DataFrame(flat_margs, columns=columns)

In [21]:
#df_eval = pd.DataFrame([(pred[i], test_labels[i], df_test.iloc[i]["frame.number"] ) for i in wrong_pred_idx],columns = ["predicted","true","frame.number"],)

df_eval = pd.DataFrame({"predicted": pred, "true": test_labels, "frame.number":df_test["frame.number"]}).reset_index(drop=True)

df_eval = pd.concat([df_eval, df_margs], axis = 1)

df_eval["pred_true"] = df_eval["predicted"] == df_eval["true"]



In [22]:
from scipy.stats import entropy

cols = ['Activity Start', 'NoAction', 'Activity End']

# Calculate entropy for each row using the specified columns
df_eval["entropy"] = df_eval[cols].apply(entropy, axis=1)


In [23]:
df_eval[~ df_eval["pred_true"]].sort_values(by='entropy', ascending=False)

Unnamed: 0,predicted,true,frame.number,Activity Start,NoAction,Activity End,pred_true,entropy
126,[NoAction],[Activity End],2336,0.000325,0.531728,0.467948,False,0.693818
403,[NoAction],[Activity End],8044,0.000325,0.531728,0.467948,False,0.693818
471,[NoAction],[Activity End],9329,0.000325,0.531728,0.467948,False,0.693818
749,[NoAction],[Activity End],15350,0.000325,0.531728,0.467948,False,0.693818
983,[NoAction],[Activity End],20665,0.000325,0.531728,0.467948,False,0.693818
1013,[NoAction],[Activity End],21651,0.000145,0.710542,0.289313,False,0.602914
313,[Activity End],[NoAction],6266,0.000151,0.257351,0.742498,False,0.571706
1214,[Activity End],[NoAction],26182,0.000151,0.257351,0.742498,False,0.571706


We observe a high entropy > 0.5 for all wrong classifications

-> Apply fallback model for this cases

## Activity Model
The activity model utilises multiple sliding windows over the training data for pattern matching



In [24]:
from sklearn.preprocessing import LabelEncoder
from numpy.lib.stride_tricks import sliding_window_view


def sequence_by_activities(data, seq_data):
    return [data[seq_data == i].values for i in range(seq_data.max())]


def get_unique_sequences(seq_data):
    # Convert each array to a tuple and create a set of tuples
    array_set = set(tuple(arr) for arr in seq_data)

    # Convert the set of tuples back to a list of NumPy arrays
    return [np.array(arr) for arr in array_set]


df_train["joined"] = df_train["event_with_roles"] + df_train["selective_file_data"]

# Label Encode Training Data 
le = LabelEncoder()
df_train["joined_LE"] = le.fit_transform(df_train["joined"])

# Mark groups of Instance Number and BusinessActivity with sequence numbers
df_train = df_train.sort_values(by=["InstanceNumber", "BusinessActivity", "frame.number"])
df_train["SequenceNumber"] = df_train.groupby(["BusinessActivity", "InstanceNumber"]).ngroup()
# Align Sequence Numbers so that they start at 0
df_train["SequenceNumber"] -= df_train['SequenceNumber'].min()

# Divides dataframe into arrays according to to Sequence Data Indicator
data_joined_LE = sequence_by_activities(df_train["joined_LE"], df_train["SequenceNumber"])

unique_sequences = get_unique_sequences(data_joined_LE)

print(f"Reduced the number of sequences from {len(data_joined_LE)} to {len(unique_sequences)} unique ones")

def get_activity_model_data(max_window_length):
    return [np.concatenate([sliding_window_view(seq, i) for seq in unique_sequences], axis=0) for i in
                       range(max_window_length)]
    


# form sliding window sequences of Size N for Training Data 
#activity_model_data = [np.concatenate([sliding_window_view(seq, i) for seq in unique_sequences], axis=0) for i in range(N)]

Reduced the number of sequences from 932 to 26 unique ones


# Action Loop

Main loop. Gets raw R1 data as input. 
Applies filtering, activity action and sequence classification

In [25]:
records = df_il_in.to_dict("records")

In [26]:
df_il_in

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,FileName,BusinessActivity,InstanceNumber,sniff_time,frame.number,synthetic_sniff_time,synthetic_sniff_time_str,session_generalized,HighestLayerProtocol,MessageType_WithRole,MessageType,MessageAttributes,pgsql.query,pgsql.target
0,0,0,rec_over_http_prod_1.pcap,HrRecruitmentProcess,over,2020-09-18 01:07:22.425396,13,2020-09-18 13:37:18.715396,2020-09-18 13:37:18.715396,192.168.11.2-192.168.11.6 (1),tcp,End Point (HR Manager)->Odoo Application:[Conn...,Connection establish request (SYN),{},,
1,1,1,rec_over_http_prod_1.pcap,HrRecruitmentProcess,over,2020-09-18 01:07:22.425634,14,2020-09-18 13:37:21.095634,2020-09-18 13:37:21.095634,192.168.11.2-192.168.11.6 (1),tcp,Odoo Application->End Point (HR Manager):[Conn...,Connection establish acknowledge (SYN+ACK),{},,
2,2,3,rec_over_http_prod_1.pcap,HrRecruitmentProcess,over,2020-09-18 01:07:22.425912,17,2020-09-18 13:37:23.875912,2020-09-18 13:37:23.875912,192.168.11.2-192.168.11.6 (1),http,End Point (HR Manager)->Odoo Application:[Http...,HttpRequest:POST /xmlrpc/2/common HTTP/1.1\r\n,"{'': 'POST /xmlrpc/2/common HTTP/1.1\r\n', '_w...",,
3,3,5,rec_over_http_prod_1.pcap,HrRecruitmentProcess,over,2020-09-18 01:07:22.429086,19,2020-09-18 13:37:55.619086,2020-09-18 13:37:55.619086,192.168.11.1-192.168.11.2 (1),tcp,Odoo Application->db Server/Mail Server:[Conne...,Connection establish request (SYN),{},,
4,4,6,rec_over_http_prod_1.pcap,HrRecruitmentProcess,over,2020-09-18 01:07:22.429299,20,2020-09-18 13:37:57.749299,2020-09-18 13:37:57.749299,192.168.11.1-192.168.11.2 (1),tcp,db Server/Mail Server->Odoo Application:[Conne...,Connection establish acknowledge (SYN+ACK),{},,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61398,61398,65564,rec_over_http_prod_1.pcap,HrRecruitmentProcess,over,2020-09-18 01:11:38.925295,28860,2020-10-18 06:11:34.205295,2020-10-18 06:11:34.205295,192.168.11.1-192.168.11.2 (231),pgsql,db Server/Mail Server->Odoo Application:[Pgsql...,PgsqlResponse:Ready for query,"{'pgsql.type': 'Ready for query', 'pgsql.lengt...",,
61399,61399,65562,rec_over_http_prod_1.pcap,HrRecruitmentProcess,over,2020-09-18 01:11:38.925295,28860,2020-10-18 06:11:34.205295,2020-10-18 06:11:34.205295,192.168.11.1-192.168.11.2 (231),pgsql,db Server/Mail Server->Odoo Application:[Pgsql...,PgsqlResponse:Row description,"{'pgsql.type': 'Row description', 'pgsql.lengt...",,
61400,61400,65565,rec_over_http_prod_1.pcap,HrRecruitmentProcess,over,2020-09-18 01:11:38.925470,28861,2020-10-18 06:11:35.955470,2020-10-18 06:11:35.955470,192.168.11.1-192.168.11.2 (231),pgsql,Odoo Application->db Server/Mail Server:[Pgsql...,PgsqlRequest:Simple query,"{'pgsql.type': 'Simple query', 'pgsql.length':...",COMMIT,
61401,61401,65566,rec_over_http_prod_1.pcap,HrRecruitmentProcess,over,2020-09-18 01:11:38.925552,28862,2020-10-18 06:11:36.775552,2020-10-18 06:11:36.775552,192.168.11.1-192.168.11.2 (231),pgsql,db Server/Mail Server->Odoo Application:[Pgsql...,PgsqlResponse:Command completion,"{'pgsql.type': 'Command completion', 'pgsql.le...",,


In [27]:
from event_loop.event import Event


def get_max_from_dict(d: dict):
    return max(d, key= lambda k: d[k])
    

def classify_event(event: Event): 
    margs = crf.predict_marginals_single([event.to_features()])[0]
    pred = get_max_from_dict(margs)
    
    e = entropy([p for p in margs.values()])
    
    #print(f"{event.frame_number} {pred} {margs[pred]:.3f} {e:.3f},")
    
    
    true_val = df_eval[df_eval["frame.number"] == event.frame_number]["true"].iloc[0][0]
    #print(true_val)
    
    # Change to entropy of prediction
    if e > ENTROPY_THRESHOLD: 
    #if pred != true_val:
        # If pred is wrong we have two options for "wrong classifications" 
        # 1 -> We have No Action predicted although the stack should end here 
        # Idea 1: If the stack did not change after N events, emit it. 
        
        # Mark the confidence on the event.
        event.confidence = False 


        # 2 -> We have End predicted although the stack should continue. 
        if pred != true_val:
            print(f"False {event.frame_number} {pred} {margs[pred]:.3f} {e:.3f} should be {true_val}")
       
        event.activity_action = pred
            
    else: 
        event.confidence = True
        event.activity_action = pred

In [28]:
def search_stack_for_request_frame(frame_number):
    for index, stack in enumerate(stacks):
        if stack.contains_request_frame(frame_number): 
            return index
    return -1

def search_window_for_sequence(seq): 
    """
    Check for pattern matches with the training data and return the count
    :param seq: array_like
                sequence of events
            
    :return: number of occurences of seq in training data
    """
    return np.sum(np.all(activity_model_data[len(seq)] == seq, axis = 1))


def classify_by_train_sequences(event: Event,n : int, exclude_indices: list[int]): 
    # search for existing stacks in training data 
    sequences = [le.transform([ e.to_activity_model_string() for e in stack]+[event.to_activity_model_string()]) for stack in stacks]  
    
    # loop to max 2 elements down
    for i in range(n, 1, -1):   
        res = [search_window_for_sequence(seq[-i:]) if j not in exclude_indices else -1 for j,seq in enumerate(sequences)]
        
        max_res = max(res)
        max_res_count = res.count(max_res)
        idx = np.argmax(res)
        
        if max_res > 0: 
            #print("res:",res, max_res, max_res_count, "->", idx)
            return idx
        
    return -1

def search_stream_index(event: Event, exclude_indices: list[int]) -> int: 
    indices = [i for i,stack in enumerate(stacks) if stack.contains_stream_index(event.stream_index) and i not in exclude_indices]

    if len(indices) == 1: 
        return indices[0]
    else:
        return -1
    

def check_stack_attributes(stacks: list[Stack], event: Event) -> int:
    for key, value in event.attributes.items():
        if key in HR_ATTRIBUTES and value:
            indices = [i for i, stack in enumerate(stacks) if stack.contains_attribute(key,value)]
            print()
            
            if len(indices) ==1: 
                print("MATCH", indices)
                # we have a clear match -> return idx
                return indices[0]

    return -1


def exclude_stacks_by_attribute(stacks: list[Stack], event: Event): 
    
    exclude_indices = []
    
    for key, value in event.attributes.items():
        if key in HR_ATTRIBUTES and value:
            exclude_indices.extend(i for i, stack in enumerate(stacks) if stack.has_attribute(key) and not stack.contains_attribute(key, value))


    return exclude_indices
    
    

In [29]:
%autoreload 1

import time
# Parameter
EVENT_LOOP_CUTOFF_NO_ACTION = 3
EVENT_LOOP_CUTOFF_END_EVENT = 3
ENTROPY_THRESHOLD = 0.5 #0.5
MAX_WINDOW_SIZE = 5
VERBOSE = False
SETTING = "HR"

# init variables
event_buffer: list[Event] = []
stacks: list[Stack] = []
stacks_out: list[Stack] = []
event_loop_index = 0


# TODO Change config to something like this:
config_dict = {
    "PTP": {
        "1to1" : ["applicant_id", "activity_id"],
        "1toN": ["mail_id"]
    }
}

HR_ATTRIBUTES = ["applicant_id", "activity_id"]




check_pop_idx = None

activity_model_data = get_activity_model_data(MAX_WINDOW_SIZE)


processing_times = []
buffer_sizes = []
for i, event_data in enumerate(records):
    # log size of event stacks
    buffer_sizes.append(sum([len(stack) for stack in stacks]))
    start_time = time.time()

    # Filter Event Stream
    if not keep_event(event_data):
        # skip event in loop
        continue
    
    # count every not filtered event for event loop index
    event_loop_index += 1

    # Extract Features and generate Event Object
    event = Event(event_data, event_loop_index, event_buffer, SETTING)
    event_buffer.append(event)
    
    classify_event(event)
    
    # Activity Action Classification
    activity_action = event.activity_action
    
    # Activity Matching
    if activity_action == "Activity Start": 
        print(f"Add new stack {event.frame_number}")
        stacks.append(Stack(SETTING,event))
        
    if activity_action == "NoAction": 
        if len(stacks) == 1: 
            if VERBOSE: print(f"Add Between Event {event.frame_number} to only stack")
            stacks[0].append_event(event)
        elif event.origin_request_frame: 
            idx = search_stack_for_request_frame(event.origin_request_frame)
            if VERBOSE: print(f"Add Between Event {event.frame_number} by request frame\t{idx}")
            stacks[idx].append_event(event)
        else: 
            if VERBOSE: print("Classify Between Event", event.frame_number)
            # Check attributes of each stack
            
            # we can filter out stacks that already have attributes different to the event
            exclude_indices = exclude_stacks_by_attribute(stacks, event)
    
            stack_index = -1
                    
            if stack_index == -1:                   
                stack_index = classify_by_train_sequences(event, 4, exclude_indices)
            
            if stack_index == -1: 
                stack_index = check_stack_attributes(stacks, event)
            # for elements that are not matchable based on 2 sequences we fall back to stream index
            if stack_index == -1: 
                stack_index = search_stream_index(event, exclude_indices)    
            
            # fallback - no match add to first stack
            if stack_index == -1:
                print("NO NO MATCH")
                stack_index = 0
                
            stacks[stack_index].append_event(event)
        
    if activity_action == "Activity End":
        
        stack_index = search_stack_for_request_frame(event.origin_request_frame)
        print("Search by request frame", stack_index)
        stacks[stack_index].append_event(event)
        
        #if not event.confidence: 
        #check_pop_idx = idx
        #else:
        if event.confidence: 
            stack = stacks.pop(stack_index)
            stacks_out.append(stack)
     
    
    # Loop through all currently open stacks
    for idx, stack in enumerate(stacks):
        last_event = stack[-1]
        # check for non-confident "No Action" Classifications. These could be "Activity End" Instead
        if not last_event.confidence and last_event.activity_action == "NoAction":
            # If a stack has not been continued for N event loops 
            if event_loop_index - last_event.event_loop_index > EVENT_LOOP_CUTOFF_NO_ACTION: 
                stacks.pop(idx)
                stacks_out.append(stack)
                print("POP Unsure No Action Event", event_loop_index, last_event.event_loop_index)
                
    for idx, stack in enumerate(stacks): 
        last_event = stack.events[-1]
        if not last_event.confidence and last_event.activity_action == "Activity End": 
            if event_loop_index - last_event.event_loop_index > EVENT_LOOP_CUTOFF_END_EVENT: 
            
                # we are now sure to pop the stack. 
                print("POP Unsure Activity End Stack", event_loop_index, last_event.event_loop_index)
                stacks.pop(idx)
                stacks_out.append(stack)  
    end_time = time.time()
    processing_times.append(end_time - start_time)
                
# pop all stacks that are still left
for stack in stacks: 
    stacks_out.append(stack)  

Add new stack 17
Search by request frame 0
Add new stack 356
Search by request frame 0
Add new stack 1212
Search by request frame 0
Add new stack 1582
False 2336 NoAction 0.532 0.694 should be Activity End
Add new stack 2354
POP Unsure No Action Event 131 127
Search by request frame 0
Add new stack 2708
Add new stack 3057
Search by request frame 0
Add new stack 4467
Search by request frame 0
Search by request frame 0
Add new stack 4939
Add new stack 5606
Search by request frame 0
False 6266 Activity End 0.742 0.572 should be NoAction
Search by request frame 0
Search by request frame 0
Add new stack 6879
Search by request frame 0
Add new stack 7261
False 8044 NoAction 0.532 0.694 should be Activity End
Add new stack 8060
POP Unsure No Action Event 408 404
Search by request frame 0
Add new stack 8421
Add new stack 9105
False 9329 NoAction 0.532 0.694 should be Activity End
POP Unsure No Action Event 476 472
Search by request frame 0
Add new stack 10272
Add new stack 10384
Search by reque

In [30]:
len(processing_times)

1313

In [31]:
max(buffer_sizes) / len(records)

0.002508020780743612

In [32]:
import pickle
with open("../../output/models/HR_activity_boundaries.pkl", "wb") as f:
    pickle.dump(crf, f)
    
with open("../../output/models/HR_activity.pkl", "wb") as f:
    pickle.dump(crf, f)

In [33]:
from statistics import mean,stdev
mean(processing_times ) *1000

0.26438768274985647

In [34]:
print(f"Contained {len(stacks)} stack in queue")
if len(stacks)> 0: 
    print([[e.frame_number for e in stack ] for stack in stacks])

start = [stack[0].frame_number for stack in stacks_out]
end = [stack[-1].frame_number for stack in stacks_out]

res_df = pd.DataFrame({"start_pred":start, "end_pred":end})

eval_df = df_gt[["start", "actual_end"]].merge(res_df,how="left", left_on ="start", right_on = "start_pred").fillna(-1).astype(int)
eval_df["pred_true"] = eval_df["actual_end"] == eval_df["end_pred"]

display(eval_df)
print(f"Accuracy of matching start and end sequences: {eval_df['pred_true'].mean()}")

Contained 0 stack in queue


Unnamed: 0,start,actual_end,start_pred,end_pred,pred_true
0,17,325,17,325,True
1,356,1192,356,1192,True
2,1212,1520,1212,1520,True
3,1582,2336,1582,2336,True
4,2354,2664,2354,2664,True
5,2708,4461,2708,4461,True
6,3057,4871,3057,4871,True
7,4467,4881,4467,4881,True
8,4939,6164,4939,6164,True
9,5606,6859,5606,6859,True


Accuracy of matching start and end sequences: 1.0


In [35]:
# Function to check if intervals overlap
def intervals_overlap(row, df):
    overlapping_names = []
    overlapping_bps = set()
    for index, other_row in df.iterrows():
        if row.name != index and row['start'] <= other_row['actual_end'] and row['actual_end'] >= other_row['start']:
            overlapping_names.append(f"{other_row['activity_name']} {other_row['bp_id']}")
            overlapping_bps.add(other_row['bp_id'])
    return overlapping_names, list(overlapping_bps)

df_gt[["overlapping_activities", "overlapping_bps"]] = df_gt.apply(intervals_overlap, axis=1, df = df_gt, result_type="expand")

In [36]:
# Create dataframe with mapping of frame numbers to event stacks
frame_numbers = [event.frame_number for idx,stack in enumerate(stacks_out) for event in stack]
stack_numbers = [idx for idx,stack in enumerate(stacks_out) for event in stack]
applicant_ids = [event.attributes["applicant_id"] for idx,stack in enumerate(stacks_out) for event in stack]
activity_ids = [event.attributes["activity_id"] for idx,stack in enumerate(stacks_out) for event in stack]
mail_ids = [event.attributes["mail_id"] for idx,stack in enumerate(stacks_out) for event in stack]


df_frame_numbers = pd.DataFrame(data={"frame.number": frame_numbers, "stack_idx": stack_numbers, "applicant_id": applicant_ids,"activity_id": activity_ids, "mail_id":mail_ids})

# Merge Activity Name from ground truth frame to event sequences for evaluation
merged_df = df_frame_numbers.merge(df_gt[["activity_name","start","bp_id"]], how="left",left_on="frame.number", right_on="start").drop(columns="start")

merged_df[["activity_name","bp_id"]] = merged_df.groupby("stack_idx")[["activity_name","bp_id"]].ffill()
#merged_df["activity_name"] = merged_df.groupby("stack_idx")["bp_id"].ffill()

# Merge with filtered interleaved test data
merged_df = df_test.merge(merged_df, on="frame.number")

In [38]:
merged_df.groupby("stack_idx").agg({"applicant_id": "unique", "activity_id": "unique","mail_id": "unique","bp_id": "unique","frame.number": ("min","max")})

Unnamed: 0_level_0,applicant_id,activity_id,mail_id,bp_id,frame.number,frame.number
Unnamed: 0_level_1,unique,unique,unique,unique,min,max
stack_idx,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,"[None, 1224]",[None],[None],[1224.0],17,325
1,"[None, 1224]",[None],"[None, 7950]",[1224.0],356,1192
2,"[None, 1225]",[None],[None],[1225.0],1212,1520
3,"[None, 1225]",[None],"[None, 7954]",[1225.0],1582,2336
4,"[None, 1226]",[None],[None],[1226.0],2354,2664
5,"[None, 1226]",[None],"[None, 7958]",[1226.0],2708,4461
6,"[None, 1225]","[None, 732]","[None, 7960]",[1225.0],3057,4871
7,"[None, 1227]",[None],[None],[1227.0],4467,4881
8,"[None, 1227]",[None],"[None, 7964]",[1227.0],4939,6164
9,"[None, 1225]","[None, 732]","[None, 7965, 7967]",[1225.0],5606,6859


In [39]:
df_gt

Unnamed: 0,activity_name,start,end,actual_end,overlapping_activities,bp_id,overlapping_bps,classification,Multi Class Classification,Single Class Classification,probability
0,GenerateJobApplicationActivity,17,356,325,[],1224,[],GenerateJobApplicationActivity,TRUE,True,0.994192
1,ResumeReviewActivity,356,1212,1192,[],1224,[],ResumeReviewActivity,TRUE,True,0.993149
2,GenerateJobApplicationActivity,1212,1582,1520,[],1225,[],GenerateJobApplicationActivity,TRUE,True,0.994192
3,ResumeReviewActivity,1582,2354,2336,[],1225,[],ResumeReviewActivity,TRUE,True,0.993329
4,GenerateJobApplicationActivity,2354,2708,2664,[],1226,[],GenerateJobApplicationActivity,TRUE,True,0.994192
5,ResumeReviewActivity,2708,4467,4461,[ScheduleAnInterviewActivityCall 1225],1226,[1225],"ResumeReviewActivity, ScheduleAnInterviewActiv...",TRUE,True,0.211112
6,ScheduleAnInterviewActivityCall,3057,4881,4871,"[ResumeReviewActivity 1226, GenerateJobApplica...",1225,"[1226, 1227]","ResumeReviewActivity, ScheduleAnInterviewActiv...",TRUE,True,0.021792
7,GenerateJobApplicationActivity,4467,4939,4881,[ScheduleAnInterviewActivityCall 1225],1227,[1225],GenerateJobApplicationActivity,FALSE,True,0.986619
8,ResumeReviewActivity,4939,6205,6164,[PerformAnInterviewCall 1225],1227,[1225],ResumeReviewActivity,FALSE,True,0.83989
9,PerformAnInterviewCall,5606,6879,6859,[ResumeReviewActivity 1227],1225,[1227],PerformAnInterviewCall,FALSE,True,0.962106


# Dump Models for Memory Evaluation