In [10]:
import ast
import re
from event_loop.preprocessing.dataframe import *

import metrics
import numpy as np
import pandas as pd


%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prerequisuites

## Activity Action Model
Train a Model with the task of classifying Start, End and NoAction events in the interleaved data. 
Training is done during a "warmup" phase with generated training data. 

### Load Data

In [11]:
# HR data in data/Train/R1 is missing frame.number. We take another (already filtered) dataset and apply our feature extraction to this one
df_train_in = pd.read_csv('../../data_v3/hr_extended_features.csv', converters={"MessageAttributes": ast.literal_eval})

In [12]:
# This is the Interleaved Data Set for our pipeline
df_il_in = pd.read_csv('../../data/HR-INTERLEAVED/R1/R1.csv', converters={"MessageAttributes": ast.literal_eval})

In [13]:
df_train_in

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,FileName,BusinessActivity,frame.number,sniff_time_str,sniff_time,InstanceNumber,synthetic_sniff_time,...,file_data,IsStartActivity,IsEndActivity,ActivityAction,IsEndBP,IsStartBP,selective_filter_data,origin_request_method,origin_selective_filter_data,is_archive_applicant
0,0,0,0,GenerateJobApplicationActivity_1.pcap,GenerateJobApplicationActivity,26,2020-05-12 09:57:23.762710,2020-05-12 09:57:23.762710,1,2020-05-13 10:07:18.092710,...,['version'],,0.0,Activity Start,False,True,version,,,False
1,1,1,1,GenerateJobApplicationActivity_1.pcap,GenerateJobApplicationActivity,127,2020-05-12 09:57:28.092427,2020-05-12 09:57:28.092427,1,2020-05-13 22:08:59.592427,...,"['server_version', '12.0-20190820', 'server_ve...",0.0,0.0,NoAction,False,False,,version,version,False
2,2,2,2,GenerateJobApplicationActivity_1.pcap,GenerateJobApplicationActivity,135,2020-05-12 09:57:28.094494,2020-05-12 09:57:28.094494,1,2020-05-13 22:09:20.264494,...,"['authenticate', 'odoo01', 'dana.wireless@gmai...",0.0,0.0,NoAction,False,False,,,,False
3,3,3,3,GenerateJobApplicationActivity_1.pcap,GenerateJobApplicationActivity,192,2020-05-12 09:57:28.580005,2020-05-12 09:57:28.580005,1,2020-05-13 23:30:15.860005,...,,0.0,0.0,NoAction,False,False,,,,False
4,4,4,4,GenerateJobApplicationActivity_1.pcap,GenerateJobApplicationActivity,216,2020-05-12 09:57:28.729685,2020-05-12 09:57:28.729685,1,2020-05-13 23:55:12.809685,...,['6'],0.0,0.0,NoAction,False,False,IsNumber,authenticate,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33068,33068,11958,11958,ResumeReviewActivity_250.pcap,ResumeReviewActivity,729,2020-05-14 00:17:45.346366,2020-05-14 00:17:45.346366,250,2020-05-14 04:15:16.416366,...,,0.0,0.0,NoAction,False,False,,,,False
33069,33069,11959,11959,ResumeReviewActivity_250.pcap,ResumeReviewActivity,766,2020-05-14 00:17:45.361497,2020-05-14 00:17:45.361497,250,2020-05-14 04:17:47.741497,...,['1'],0.0,0.0,NoAction,False,False,IsNumber,execute_kw,hr.applicant_write_stage_id_2,True
33070,33070,11960,11960,ResumeReviewActivity_250.pcap,ResumeReviewActivity,774,2020-05-14 00:17:45.363203,2020-05-14 00:17:45.363203,250,2020-05-14 04:18:04.803203,...,"['execute_kw', 'odoo01', '6', '123456789', 'hr...",0.0,0.0,NoAction,False,False,hr.applicant_archive_applicant,,,False
33071,33071,11961,11961,ResumeReviewActivity_250.pcap,ResumeReviewActivity,840,2020-05-14 00:17:45.427437,2020-05-14 00:17:45.427437,250,2020-05-14 04:28:47.207437,...,,0.0,0.0,NoAction,False,False,,,,False


### Preprocessing

In [14]:
# data is at R1 Level. Apply filter and feature extraction
df_train = pre_process(df_train_in)

df_test = pre_process(df_il_in)

In [15]:
# Load start and end events from ground truth data.
# Tag according frames in interleaved data for testing
df_gt = pd.read_csv("../../data_v3/hr_ground_truth.csv")

start_indices = df_gt["start"].tolist()
end_indices = df_gt["actual_end"].tolist()

df_test["ActivityAction"] = df_test["frame.number"].apply(lambda x: "Activity Start" if x in start_indices else
("Activity End" if x in end_indices else "NoAction"))

In [16]:
# ------------ OPTIONAL ---------------
# TODO Duplicate with Activity Model - move down and delete
# Form sequences in training data by grouping
df_train = df_train.sort_values(by=["InstanceNumber", "BusinessActivity", "frame.number"])
df_train["SequenceNumber"] = df_train.groupby(["BusinessActivity", "InstanceNumber"]).ngroup()
df_train["SequenceNumber"] -= df_train['SequenceNumber'].min()

# check sequence length of training data
df_train["SequenceNumber"].value_counts()

SequenceNumber
13     132
21     115
6      115
1      115
5       96
      ... 
404     15
158     15
157     15
25      15
154     14
Name: count, Length: 933, dtype: int64

In [17]:
df_train

Unnamed: 0,BusinessActivity,InstanceNumber,frame.number,synthetic_sniff_time,event_with_roles,request_method_call,file_data,pgsql.query,pgsql.target,selective_file_data,origin_method,origin_file_data,SequenceNumber
0,GenerateJobApplicationActivity,1,26,2020-05-13 10:07:18.092710,End Point (HR Manager)->Odoo Application:[Http...,version,[],,,version,,,25
1,GenerateJobApplicationActivity,1,127,2020-05-13 22:08:59.592427,Odoo Application->End Point (HR Manager):[Http...,server_version,"[server_version, 12.0-20190820, server_version...",,,server_version,version,version,25
2,GenerateJobApplicationActivity,1,135,2020-05-13 22:09:20.264494,End Point (HR Manager)->Odoo Application:[Http...,authenticate,"[odoo01, dana.wireless@gmail.com, 123456789]",,,,,,25
3,GenerateJobApplicationActivity,1,192,2020-05-13 23:30:15.860005,Odoo Application->db Server/Mail Server:[Pgsql...,,,"INSERT INTO ""res_users_log"" (""id"", ""create_uid...",res_users_log,,,,25
4,GenerateJobApplicationActivity,1,216,2020-05-13 23:55:12.809685,Odoo Application->End Point (HR Manager):[Http...,IsNumber,[6],,,IsNumber,authenticate,,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33068,ResumeReviewActivity,250,729,2020-05-14 04:15:16.416366,Odoo Application->db Server/Mail Server:[Pgsql...,,,"UPDATE ""mail_message_res_partner_needaction_re...",mail_message_res_partner_needaction_rel,,,,728
33069,ResumeReviewActivity,250,766,2020-05-14 04:17:47.741497,Odoo Application->End Point (HR Manager):[Http...,IsNumber,[1],,,IsNumber,execute_kw,hr.applicant_write_stage_id_2,728
33070,ResumeReviewActivity,250,774,2020-05-14 04:18:04.803203,End Point (HR Manager)->Odoo Application:[Http...,execute_kw,"[odoo01, 6, 123456789, hr.applicant, archive_a...",,,hr.applicant_archive_applicant,,,728
33071,ResumeReviewActivity,250,840,2020-05-14 04:28:47.207437,Odoo Application->db Server/Mail Server:[Pgsql...,,,"UPDATE ""hr_applicant"" SET ""active""=false,""writ...",hr_applicant,,,,728


In [18]:
def mark_start_end(df):
    # Mark start event of each BusinessActivity Instance
    df["activityStart"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount() == 0
    # Mark end event of each Business Activity Instance
    df["activityEnd"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount(ascending=False) == 0
    # Merge start and end columns to form labels
    df["ActivityAction"] = df.apply(lambda row: "Activity Start" if row["activityStart"] else (
        "Activity End" if row["activityEnd"] else 'NoAction'), axis=1)

    return df.drop(["activityStart", 'activityEnd'], axis=1)


df_train = mark_start_end(df_train)

In [19]:
cols = ["event_with_roles", "request_method_call", "selective_file_data", 
        "origin_method","origin_file_data"]


In [20]:
def dict_to_features(dict):
    return [[{**d, "bias": 1.0}] for d in dict]


def extract_labels(labels):
    return [[y] for y in labels]

In [21]:
# exclude from training data 
df_train_filt = df_train[~df_train["SequenceNumber"].isin([128])]


In [22]:
train_features = df_train_filt[cols].to_dict("records")
train_features = dict_to_features(train_features)
train_labels = extract_labels(df_train_filt["ActivityAction"])

In [23]:
test_features = df_test[cols].to_dict("records")
test_features = dict_to_features(test_features)
test_labels = extract_labels(df_test["ActivityAction"])

In [24]:
from sklearn.preprocessing import LabelEncoder

event_with_roles_encoder = LabelEncoder().fit(df_train_filt["event_with_roles"])
method_call_encoder = LabelEncoder().fit(df_train_filt["request_method_call"])
file_data_encoder = LabelEncoder().fit(df_train_filt["selective_file_data"])

In [25]:
def label_encode_DF(df): 
    event_with_roles_LE = event_with_roles_encoder.transform(df["event_with_roles"])
    request_method_call_LE = method_call_encoder.transform(df["request_method_call"])
    selective_file_data_LE = file_data_encoder.transform(df["selective_file_data"])
    origin_method_LE = method_call_encoder.transform(df["origin_method"])
    origin_file_data_LE = file_data_encoder.transform(df["origin_file_data"])
    
    return pd.DataFrame({"event_with_roles": event_with_roles_LE, "request_method_call": request_method_call_LE, "selective_file_data": selective_file_data_LE, "origin_method":origin_method_LE, "origin_file_data":origin_file_data_LE})



In [26]:
train_features_LE = label_encode_DF(df_train_filt)
test_features_LE = label_encode_DF(df_test)

In [27]:
train_labels_list = df_train_filt["ActivityAction"].values
test_labels_list = df_test["ActivityAction"].values

### Model Training

#### CRF

In [28]:
# optional Train Test split for evaluation on training data
# In prod case, we train on 100% training data and evaluate on interleaved data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.3, random_state=42)

In [29]:
%%time
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    max_iterations=200,
    c1=0.1,
    c2=0.01,
    all_possible_transitions=True
    #all_possible_transitions=True
)
crf.fit(train_features, train_labels)

CPU times: user 1.15 s, sys: 6.06 ms, total: 1.16 s
Wall time: 1.17 s


#### NB 

In [30]:
X_train, X_test, y_train, y_test = train_test_split(train_features_LE, train_labels_list, test_size=0.3, random_state=42)

In [31]:
from sklearn.naive_bayes import CategoricalNB

clf = CategoricalNB()

In [32]:
clf.fit(X_train, y_train)

In [33]:
from sklearn.model_selection import RandomizedSearchCV
import scipy
from sklearn.metrics import make_scorer, f1_score

clf = CategoricalNB()
params_space = {
    "alpha": scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(f1_score, average="macro")

rs = RandomizedSearchCV(clf, params_space, cv = 3, verbose =1,n_jobs=-1, n_iter=300, scoring = f1_scorer)

rs.fit(train_features_LE, train_labels_list)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [34]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
clf = rs.best_estimator_

best params: {'alpha': 8.507183650312988e-05}
best CV score: 0.9617850152106532


In [35]:
from sklearn.metrics import classification_report

def evaluate(model, X, y): 
    pred = model.predict(X)
    print(classification_report(y, pred))
    
#print("------  Test Data  ------")
#evaluate(clf, X_test, y_test)
print("------ INTERLEAVED ------")
evaluate(clf, test_features_LE, test_labels_list)

------ INTERLEAVED ------
                precision    recall  f1-score   support

  Activity End       0.80      1.00      0.89        37
Activity Start       1.00      1.00      1.00        37
      NoAction       1.00      0.99      1.00      1239

      accuracy                           0.99      1313
     macro avg       0.93      1.00      0.96      1313
  weighted avg       0.99      0.99      0.99      1313


#### XGBoost

In [36]:
df_train_cat = df_train_filt[cols].astype('category')

In [37]:
from xgboost import XGBClassifier

bst = XGBClassifier(n_estimators=10, max_depth=3, learning_rate=1, objective='binary:logistic',enable_categorical= True)

LE = LabelEncoder()
bst.fit(df_train_cat, LE.fit_transform(train_labels_list))

In [38]:
evaluate(bst, test_features_LE, LE.transform(test_labels_list))

              precision    recall  f1-score   support

           0       0.71      0.14      0.23        37
           1       1.00      1.00      1.00        37
           2       0.97      1.00      0.99      1239

    accuracy                           0.97      1313
   macro avg       0.90      0.71      0.74      1313
weighted avg       0.97      0.97      0.97      1313


### Optimization

In [39]:
from sklearn.metrics import make_scorer
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics

# define fixed parameters and parameters to search
crf2 = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=200, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='macro', labels=np.unique(test_labels))

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=5, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=150, 
                        scoring=f1_scorer)
#rs.fit(train_features, train_labels)

#crf = rs.best_estimator_

### Evaluation

#### CRF

In [40]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn_crfsuite import metrics


def flatten(xss):
    return [x for xs in xss for x in xs]


def evaluate(model, x, y_true):
    y_pred = model.predict(x)
    print(metrics.flat_f1_score(y_true, y_pred, average='macro', labels=model.classes_))
    print(metrics.flat_classification_report(y_true, y_pred, model.classes_))
    [print(label, "\n", matrix) for matrix, label in
     zip(multilabel_confusion_matrix(flatten(y_true), flatten(y_pred), labels=model.classes_), model.classes_)]


In [41]:
evaluate(crf, test_features, test_labels)

0.9608303595410767
                precision    recall  f1-score   support

Activity Start       1.00      1.00      1.00        37
      NoAction       1.00      1.00      1.00      1239
  Activity End       0.94      0.84      0.89        37

      accuracy                           0.99      1313
     macro avg       0.98      0.95      0.96      1313
  weighted avg       0.99      0.99      0.99      1313

Activity Start 
 [[1276    0]
 [   0   37]]
NoAction 
 [[  68    6]
 [   2 1237]]
Activity End 
 [[1274    2]
 [   6   31]]


In [42]:
pred = crf.predict(test_features)

In [43]:
pred_mg = crf.predict_marginals(test_features)

In [44]:
#margs = [pred_mg[i] for i in wrong_pred_idx]
columns = pred_mg[0][0].keys()
flat_margs = [[entry[column] for column in columns] for sublist in pred_mg for entry in sublist]
df_margs = pd.DataFrame(flat_margs, columns=columns)

In [45]:
#df_eval = pd.DataFrame([(pred[i], test_labels[i], df_test.iloc[i]["frame.number"] ) for i in wrong_pred_idx],columns = ["predicted","true","frame.number"],)

df_eval = pd.DataFrame({"predicted": pred, "true": test_labels, "frame.number":df_test["frame.number"]}).reset_index(drop=True)

df_eval = pd.concat([df_eval, df_margs], axis = 1)

df_eval["pred_true"] = df_eval["predicted"] == df_eval["true"]



In [46]:
from scipy.stats import entropy

entropy_cols = ['Activity Start', 'NoAction', 'Activity End']

# Calculate entropy for each row using the specified columns
df_eval["entropy"] = df_eval[entropy_cols].apply(entropy, axis=1)


In [47]:
df_eval[~ df_eval["pred_true"]].sort_values(by='entropy', ascending=False)

Unnamed: 0,predicted,true,frame.number,Activity Start,NoAction,Activity End,pred_true,entropy
126,[NoAction],[Activity End],2336,0.000312,0.53365,0.466037,False,0.693477
403,[NoAction],[Activity End],8044,0.000312,0.53365,0.466037,False,0.693477
471,[NoAction],[Activity End],9329,0.000312,0.53365,0.466037,False,0.693477
749,[NoAction],[Activity End],15350,0.000312,0.53365,0.466037,False,0.693477
983,[NoAction],[Activity End],20665,0.000312,0.53365,0.466037,False,0.693477
1013,[NoAction],[Activity End],21651,0.000192,0.711912,0.287896,False,0.602024
313,[Activity End],[NoAction],6266,0.000212,0.259278,0.74051,False,0.574246
1214,[Activity End],[NoAction],26182,0.000212,0.259278,0.74051,False,0.574246


In [48]:
eval_cols = ["event_with_roles","pgsql.query", "request_method_call", "selective_file_data", 
        "origin_method","origin_file_data"]

df_eval = df_eval.merge(df_test[["frame.number", *eval_cols]], how="left",left_on="frame.number", right_on="frame.number")

We observe a high entropy > 0.5 for all wrong classifications

-> Apply fallback model for this cases

#### NB

In [49]:
pred = clf.predict(test_features_LE)

In [50]:
pred_mg = clf.predict_proba(test_features_LE)

In [51]:
df_margs = pd.DataFrame(pred_mg, columns=clf.classes_)

In [52]:
df_margs

Unnamed: 0,Activity End,Activity Start,NoAction
0,7.130387e-36,1.000000e+00,5.785738e-18
1,1.960372e-23,6.924403e-16,1.000000e+00
2,4.109396e-34,4.864234e-13,1.000000e+00
3,1.547944e-35,1.857871e-21,1.000000e+00
4,1.172857e-11,1.386913e-20,1.000000e+00
...,...,...,...
1308,6.979605e-36,8.377050e-22,1.000000e+00
1309,8.841824e-03,1.199641e-29,9.911582e-01
1310,2.001431e-32,2.402154e-18,1.000000e+00
1311,6.463639e-36,7.757779e-22,1.000000e+00


In [53]:
#df_eval = pd.DataFrame([(pred[i], test_labels[i], df_test.iloc[i]["frame.number"] ) for i in wrong_pred_idx],columns = ["predicted","true","frame.number"],)

df_eval = pd.DataFrame({"predicted": pred, "true": test_labels_list, "frame.number":df_test["frame.number"]}).reset_index(drop=True)

df_eval = pd.concat([df_eval, df_margs], axis = 1)

df_eval["pred_true"] = df_eval["predicted"] == df_eval["true"]

In [54]:
from scipy.stats import entropy

entropy_cols = ['Activity Start', 'NoAction', 'Activity End']

# Calculate entropy for each row using the specified columns
df_eval["entropy"] = df_eval[entropy_cols].apply(entropy, axis=1)


In [55]:
df_eval.sort_values(by='entropy', ascending=False)

Unnamed: 0,predicted,true,frame.number,Activity End,Activity Start,NoAction,pred_true,entropy
1131,NoAction,NoAction,24180,0.032334,3.185643e-25,9.676665e-01,True,1.427625e-01
1202,NoAction,NoAction,25739,0.032334,3.185643e-25,9.676665e-01,True,1.427625e-01
656,NoAction,NoAction,13602,0.014758,1.453987e-25,9.852424e-01,True,7.686629e-02
1150,NoAction,NoAction,24632,0.014758,1.453987e-25,9.852424e-01,True,7.686629e-02
896,NoAction,NoAction,18997,0.014758,1.453987e-25,9.852424e-01,True,7.686629e-02
...,...,...,...,...,...,...,...,...
226,Activity End,Activity End,4461,1.000000,2.815668e-29,4.329327e-25,True,2.428903e-23
1267,Activity End,Activity End,27222,1.000000,2.815668e-29,4.329327e-25,True,2.428903e-23
311,Activity End,Activity End,6164,1.000000,2.815668e-29,4.329327e-25,True,2.428903e-23
64,Activity End,Activity End,1192,1.000000,2.815668e-29,4.329327e-25,True,2.428903e-23


In [56]:
eval_cols = ["event_with_roles","pgsql.query", "request_method_call", "selective_file_data", 
        "origin_method","origin_file_data"]

df_eval = df_eval.merge(df_test[["frame.number", *eval_cols]], how="left",left_on="frame.number", right_on="frame.number")

#### XGBoost

In [57]:
test_features_LE

Unnamed: 0,event_with_roles,request_method_call,selective_file_data,origin_method,origin_file_data
0,0,7,28,0,0
1,4,6,26,7,28
2,0,2,0,0,0
3,21,0,0,0,0
4,4,1,1,2,0
...,...,...,...,...,...
1308,28,0,0,0,0
1309,5,1,1,3,14
1310,3,3,3,0,0
1311,22,0,0,0,0


In [58]:
pred = bst.predict(test_features_LE)
pred = LE.inverse_transform(pred)
pred_mg = bst.predict_proba(test_features_LE)

df_margs = pd.DataFrame(pred_mg, columns=LE.classes_)

In [59]:
test_features_LE.values[0]

array([ 0,  7, 28,  0,  0])

In [60]:
test_features_LE

Unnamed: 0,event_with_roles,request_method_call,selective_file_data,origin_method,origin_file_data
0,0,7,28,0,0
1,4,6,26,7,28
2,0,2,0,0,0
3,21,0,0,0,0
4,4,1,1,2,0
...,...,...,...,...,...
1308,28,0,0,0,0
1309,5,1,1,3,14
1310,3,3,3,0,0
1311,22,0,0,0,0


In [61]:
pred_mg = bst.predict_proba([test_features_LE.iloc[0].values.tolist()])

In [62]:
#df_eval = pd.DataFrame([(pred[i], test_labels[i], df_test.iloc[i]["frame.number"] ) for i in wrong_pred_idx],columns = ["predicted","true","frame.number"],)

df_eval = pd.DataFrame({"predicted": pred, "true": test_labels_list, "frame.number":df_test["frame.number"]}).reset_index(drop=True)

df_eval = pd.concat([df_eval, df_margs], axis = 1)

df_eval["pred_true"] = df_eval["predicted"] == df_eval["true"]

In [63]:
from scipy.stats import entropy

entropy_cols = ['Activity Start', 'NoAction', 'Activity End']

# Calculate entropy for each row using the specified columns
df_eval["entropy"] = df_eval[entropy_cols].apply(entropy, axis=1)


In [64]:
df_eval.sort_values(by='entropy', ascending=False)

Unnamed: 0,predicted,true,frame.number,Activity End,Activity Start,NoAction,pred_true,entropy
774,NoAction,NoAction,16179,0.213764,0.121911,0.664325,True,0.858069
1057,NoAction,NoAction,22720,0.213764,0.121911,0.664325,True,0.858069
1123,NoAction,NoAction,23977,0.213764,0.121911,0.664325,True,0.858069
358,NoAction,NoAction,7293,0.213764,0.121911,0.664325,True,0.858069
405,NoAction,NoAction,8093,0.213764,0.121911,0.664325,True,0.858069
...,...,...,...,...,...,...,...,...
75,NoAction,NoAction,1440,0.000017,0.000015,0.999968,True,0.000382
719,NoAction,NoAction,14969,0.000017,0.000015,0.999968,True,0.000382
1270,Activity Start,Activity Start,27306,0.000006,0.999978,0.000016,True,0.000267
750,Activity Start,Activity Start,15377,0.000006,0.999978,0.000016,True,0.000267


In [65]:
# XGBoost Entropy Cutoff -> 0.6

## Activity Classifier

In [66]:
def sequence_by_activities(data, seq_data):
    return [data[seq_data == i] for i in range(seq_data.max())]

In [67]:
feature_cols = ["event_with_roles", "request_method_call", "selective_file_data", 
        "origin_method","origin_file_data"]


In [68]:
# List of dataframes each containing one activity sequence
train_activity_sequences = sequence_by_activities(df_train, df_train["SequenceNumber"])

In [69]:
# Sequences without window features

def dict_to_feature_sequence(dict):
    return [{**d, "bias": 1.0} for d in dict]

def df_to_features(df):
    return dict_to_feature_sequence(df.to_dict("records"))

train_features_seq = [df_to_features(df[feature_cols]) for df in train_activity_sequences]
train_labels_seq = [df["BusinessActivity"].values for df in train_activity_sequences]

In [70]:
# Single Events no window features 

def dict_to_feature(dict):
    return [[{**d, "bias": 1.0}] for d in dict]

def extract_labels(labels):
    return [[y] for y in labels]

train_features = dict_to_feature(df_train[feature_cols].to_dict("records"))
train_labels = extract_labels(df_train["BusinessActivity"])

In [71]:
# Single Events w. window features

# Apply sequencing - flatten later

def seq2features(seq, bw, fw): 
    return [event2features(seq, i, bw, fw) for i in range(len(seq))]

def event2features(seq, i, bw, fw):
    features = {"bias": 1.0}
    
    features.update({
        f"0:{k}": v for k,v in seq[i].items()
    })
    
    for j in range(1, bw+1): 
        index = i-j
        if index >= 0: 
            features.update({
                f"-{j}:{k}": v for k,v in seq[index].items()
            })
        else: 
            features.update({
                 f"-{j}:{k}": "NoMessage" for k,_ in seq[i].items()
            })
        
    for j in range(1,fw+1): 
        index = i + j
        if index < len(seq): 
             features.update({
                f"+{j}:{k}": v for k,v in seq[index].items()
            })
        else: 
            features.update({
                 f"+{j}:{k}": "NoMessage" for k,_ in seq[i].items()
            })
            
    return features

train_features_seq_window = [seq2features(seq[feature_cols].to_dict("records"), 10,10) for seq in train_activity_sequences]
train_labels_seq_window = [seq["BusinessActivity"] for seq in train_activity_sequences]

In [72]:
def flatten_and_encapsulate(list_of_list):
    return [[item] for sublist in list_of_list for item in sublist]

X_train = flatten_and_encapsulate(train_features_seq_window)
y_train = flatten_and_encapsulate(train_labels_seq_window)

In [73]:
%%time
import sklearn_crfsuite

activity_classifier= sklearn_crfsuite.CRF(
    max_iterations=200,
    c1=0.1,
    c2=0.01,
    all_possible_transitions=True
    #all_possible_transitions=True
)
activity_classifier.fit(X_train, y_train)

CPU times: user 13 s, sys: 81.9 ms, total: 13.1 s
Wall time: 13.2 s


In [74]:
def confidence_weighted_majority_voting(predictions):
    """
    Perform confidence-weighted majority voting on each sublist of predictions.

    :param predictions: A list of dictionaries where each dictionary contains predictions and their confidences.
    :return: A list of majority voted predictions for each sublist.
    """
    majority_voted_predictions = []
    for sublist in predictions:
        if not sublist:
            # If the sublist is empty, append None to the majority voted predictions
            majority_voted_predictions.append(None)
        else:
            # Initialize variables to store cumulative confidences for each prediction
            cumulative_confidences = {label: 0.0 for label in sublist[0].keys()}
            
            # Calculate cumulative confidences for each prediction across all dictionaries in the sublist
            for prediction_dict in sublist:
                for label, confidence in prediction_dict.items():
                    cumulative_confidences[label] += confidence
            
            # Find the prediction with the maximum cumulative confidence
            majority_voted_prediction = max(cumulative_confidences, key=cumulative_confidences.get)
            majority_voted_predictions.append(majority_voted_prediction)

    return majority_voted_predictions

## Activity Model
The activity model utilises multiple sliding windows over the training data for pattern matching



In [75]:
from sklearn.preprocessing import LabelEncoder
from numpy.lib.stride_tricks import sliding_window_view


def get_unique_sequences(seq_data):
    # Convert each array to a tuple and create a set of tuples
    array_set = set(tuple(arr) for arr in seq_data)

    # Convert the set of tuples back to a list of NumPy arrays
    return [np.array(arr) for arr in array_set]


df_train["joined"] = df_train["event_with_roles"] + df_train["selective_file_data"]

# Label Encode Training Data 
le = LabelEncoder()
df_train["joined_LE"] = le.fit_transform(df_train["joined"])

# Mark groups of Instance Number and BusinessActivity with sequence numbers
df_train = df_train.sort_values(by=["InstanceNumber", "BusinessActivity", "frame.number"])
df_train["SequenceNumber"] = df_train.groupby(["BusinessActivity", "InstanceNumber"]).ngroup()
# Align Sequence Numbers so that they start at 0
df_train["SequenceNumber"] -= df_train['SequenceNumber'].min()

# Divides dataframe into arrays according to to Sequence Data Indicator
data_joined_LE = sequence_by_activities(df_train["joined_LE"], df_train["SequenceNumber"])

unique_sequences = get_unique_sequences(data_joined_LE)

print(f"Reduced the number of sequences from {len(data_joined_LE)} to {len(unique_sequences)} unique ones")

def get_activity_model_data(max_window_length):
    return [np.concatenate([sliding_window_view(seq, i) for seq in unique_sequences], axis=0) for i in
                       range(max_window_length)]
    

# form sliding window sequences of Size N for Training Data 
#activity_model_data = get_activity_model_data(4)

Reduced the number of sequences from 932 to 26 unique ones


# Action Loop

Main loop. Gets raw R1 data as input. 
Applies filtering, activity action and sequence classification

In [76]:
records = df_il_in.to_dict("records")

In [77]:
from event_loop.event import Event


def get_max_from_dict(d: dict):
    return max(d, key= lambda k: d[k])
    

def classify_event(event: Event): 
    #margs = crf.predict_marginals_single([event.to_features()])[0]
    margs = bst.predict_proba([event.to_feature_list()])
    
    pred = get_max_from_dict(margs)
    
    e = entropy([p for p in margs.values()])
    
    #print(f"{event.frame_number} {pred} {margs[pred]:.3f} {e:.3f},")
    
    
    true_val = df_eval[df_eval["frame.number"] == event.frame_number]["true"].iloc[0][0]
    #print(true_val)
    
    # Change to entropy of prediction
    if e > ENTROPY_THRESHOLD: 
    #if pred != true_val:
        # If pred is wrong we have two options for "wrong classifications" 
        # 1 -> We have No Action predicted although the stack should end here 
        # Idea 1: If the stack did not change after N events, emit it. 
        
        # Mark the confidence on the event.
        event.confidence = False 


        # 2 -> We have End predicted although the stack should continue. 
        if pred != true_val:
            print(f"False {event.frame_number} {pred} {margs[pred]:.3f} {e:.3f} should be {true_val}")
       
        event.activity_action = pred
            
    else: 
        event.confidence = True
        event.activity_action = pred

In [78]:
from event_loop.stack import Stack


def search_stack_for_request_frame(frame_number):
    for index, stack in enumerate(stacks):
        if stack.contains_request_frame(frame_number): 
            return index
    return -1

def search_window_for_sequence(seq): 
    """
    Check for pattern matches with the training data and return the count
    :param seq: array_like
                sequence of events
            
    :return: number of occurences of seq in training data
    """
    return np.sum(np.all(activity_model_data[len(seq)] == seq, axis = 1))


def classify_by_train_sequences(event: Event,n : int, exclude_indices: list[int]): 
    # search for existing stacks in training data 
    sequences = [le.transform([ e.to_activity_model_string() for e in stack]+[event.to_activity_model_string()]) for stack in stacks]  
    
    # loop to max 2 elements down
    for i in range(n, 1, -1):   
        res = [search_window_for_sequence(seq[-i:]) if j not in exclude_indices else -1 for j,seq in enumerate(sequences)]
        
        max_res = max(res)
        max_res_count = res.count(max_res)
        idx = np.argmax(res)
        
        if max_res > 0: 
            #print("res:",res, max_res, max_res_count, "->", idx)
            return idx
        
    return -1

def search_stream_index(event: Event, exclude_indices: list[int]) -> int: 
    indices = [i for i,stack in enumerate(stacks) if stack.contains_stream_index(event.stream_index) and i not in exclude_indices]

    if len(indices) == 1: 
        return indices[0]
    else:
        return -1
    

def check_stack_attributes(stacks: list[Stack], event: Event, exclude_indices: list[int]) -> int:
    for key, value in event.attributes.items():
        if key in PTP_ATTRIBUTES and value:
            indices = [i for i, stack in enumerate(stacks) if stack.contains_attribute(key,value) and i not in exclude_indices]
            print()
            
            if len(indices) ==1: 
                print("MATCH", indices)
                # we have a clear match -> return idx
                return indices[0]

    return -1

def check_stack_attributes_case_id(stacks: list[Stack], event: Event, exclude_indices: list[int]) -> int:
    for key, value in event.attributes.items():
        if key in PTP_ATTRIBUTES and value:
            indices = [i for i, stack in enumerate(stacks) if stack.case_id == Stack.case_id_from_attribute(key, value)]
            
            if len(indices) ==1: 
                # we have a clear match -> return idx
                return indices[0]

    return -1


def exclude_stacks_by_attribute(stacks: list[Stack], event: Event, stacks_out: list[Stack]) -> list[int]: 
    
    exclude_indices = []

        
    for key, value in event.attributes.items():
        if key in PTP_ATTRIBUTES and value:
            # exlucde all stacks that have a different attribute 
            exclude_indices.extend(i for i, stack in enumerate(stacks) if stack.has_attribute(key) and not stack.contains_attribute(key, value))
    return exclude_indices

def exclude_stacks_by_attribute_case_id(stacks: list[Stack], event: Event, stacks_out: list[Stack]) -> list[int]: 
    
    exclude_indices = []

    for key, value in event.attributes.items():
        if key in PTP_ATTRIBUTES and value:
            # exclude all stacks that have a different attribute 
            # 
            for i, stack in enumerate(stacks): 
                event_case_id = Stack.case_id_from_attribute(key, value)
                if event_case_id and stack.case_id and event_case_id != stack.case_id: 
                    exclude_indices.append(i)
            #exclude_indices.extend(i for i, stack in enumerate(stacks) if Stack.case_id_from_attribute(key, value) != stack.case_id and stack.case_id)
    return exclude_indices

In [79]:
from event_loop.preprocessing.event import keep_event
%autoreload 2



# Parameter
EVENT_LOOP_CUTOFF_NO_ACTION = 3
EVENT_LOOP_CUTOFF_END_EVENT = 3
ENTROPY_THRESHOLD = 0.4 #0.5
MAX_WINDOW_SIZE = 10
VERBOSE = False
SETTING = "HR"

# init variables
event_buffer: list[Event] = []
attribute_buffer: list[dict] = []
stacks: list[Stack] = []
stacks_out: list[Stack] = []
event_loop_index = 0


# TODO Change config to something like this:
config_dict = {
    "PTP": {
        "1to1" : ["applicant_id", "activity_id"],
        "1toN": ["mail_id"]
    }
}

HR_ATTRIBUTES = ["applicant_id", "activity_id"]
PTP_ATTRIBUTES = ["sale_order_id", "sale_order_line_id","purchase_requisition_id","purchase_requisition_line_id",]


activity_model_data = get_activity_model_data(MAX_WINDOW_SIZE)


for i, event_data in enumerate(records):

    # Filter Event Stream
    if not keep_event(event_data):
        # skip event in loop
        continue
    
    # count every not filtered event for event loop index
    event_loop_index += 1

    # Extract Features and generate Event Object
    event = Event(event_data, event_loop_index, event_buffer, SETTING)
    event_buffer.append(event)
    
    # Activity Action Classification
    classify_event(event)
    activity_action = event.activity_action
    
    # Activity Matching
    if activity_action == "Activity Start": 
        print(f"Add new stack {event.frame_number}")
        stacks.append(Stack(SETTING,event))
        
    if activity_action == "NoAction": 
        if len(stacks) == 1: 
            if VERBOSE: print(f"Add Between Event {event.frame_number} to only stack")
            stacks[0].append_event(event)
        elif event.origin_request_frame: 
            idx = search_stack_for_request_frame(event.origin_request_frame)
            if VERBOSE: print(f"Add Between Event {event.frame_number} by request frame\t{idx}")
            stacks[idx].append_event(event)
        else: 
            if VERBOSE: print("Classify Between Event", event.frame_number)
            # Check attributes of each stack
            
            # we can filter out stacks that already have attributes different to the event
            exclude_indices =  exclude_stacks_by_attribute(stacks, event, stacks_out)
    
            stack_index:int = check_stack_attributes(stacks, event, exclude_indices)
                    
            if stack_index == -1:        
                stack_index = classify_by_train_sequences(event, 4, exclude_indices)
            
            # for elements that are not matchable based on 2 sequences we fall back to stream index
            if stack_index == -1: 
                stack_index = search_stream_index(event, exclude_indices)    
            
            # fallback - no match add to first stack
            if stack_index == -1:
                res = next((i for i in range(len(stacks)) if i not in exclude_indices and stacks[i].confidence),-1)
                if VERBOSE: print("NO NO MATCH", res, exclude_indices)
                stack_index = res
                
            stacks[stack_index].append_event(event)
        
    if activity_action == "Activity End":
        
        stack_index = search_stack_for_request_frame(event.origin_request_frame)
        if VERBOSE : print("Search by request frame", stack_index)
        stacks[stack_index].append_event(event)
        
        #if not event.confidence: 
        #check_pop_idx = idx
        #else:
        if event.confidence: 
            if len(stacks) > 1: 
                if VERBOSE: print("POP Confident Stack")
                stack = stacks.pop(stack_index)
                stacks_out.append(stack)
            else: 
                event.confidence = False
     

    # Loop through all currently open stacks
    for idx, stack in enumerate(stacks):
        last_event = stack[-1]
        # check for non-confident "No Action" Classifications. These could be "Activity End" Instead
        if not last_event.confidence and last_event.activity_action == "NoAction":
            # If a stack has not been continued for N event loops 
            if event_loop_index - last_event.event_loop_index > EVENT_LOOP_CUTOFF_NO_ACTION: 
                stacks.pop(idx)
                stacks_out.append(stack)
                if VERBOSE: print("POP Unsure No Action Event", event_loop_index, last_event.event_loop_index)
                
    for idx, stack in enumerate(stacks): 
        last_event = stack.events[-1]
        if not last_event.confidence and last_event.activity_action == "Activity End": 
            if event_loop_index - last_event.event_loop_index > EVENT_LOOP_CUTOFF_END_EVENT: 
            
                # we are now sure to pop the stack. 
                if VERBOSE: print("POP Unsure Activity End Stack", event_loop_index, last_event.event_loop_index)
                stacks.pop(idx)
                stacks_out.append(stack)  
                
# pop all stacks that are still left
for stack in stacks: 
    stacks_out.append(stack)  


XGBoostError: [14:11:50] /Users/runner/work/xgboost/xgboost/src/c_api/../data/array_interface.h:492: Unicode-8 is not supported.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x00000002b23d4994 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x00000002b23e6aec xgboost::ArrayInterface<2, false>::AssignType(xgboost::StringView) + 1272
  [bt] (2) 3   libxgboost.dylib                    0x00000002b23e625c xgboost::ArrayInterface<2, false>::Initialize(std::__1::map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, xgboost::Json, std::__1::less<void>, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const, xgboost::Json>>> const&) + 156
  [bt] (3) 4   libxgboost.dylib                    0x00000002b24d571c xgboost::data::ArrayAdapter::ArrayAdapter(xgboost::StringView) + 148
  [bt] (4) 5   libxgboost.dylib                    0x00000002b24d552c xgboost::data::DMatrixProxy::SetArrayData(xgboost::StringView) + 76
  [bt] (5) 6   libxgboost.dylib                    0x00000002b23f8a58 XGBoosterPredictFromDense + 388
  [bt] (6) 7   libffi.dylib                        0x0000000194e9f050 ffi_call_SYSV + 80
  [bt] (7) 8   libffi.dylib                        0x0000000194ea7adc ffi_call_int + 1208
  [bt] (8) 9   _ctypes.cpython-39-darwin.so        0x0000000100af33ec PyInit__ctypes + 25288



In [None]:
# classify stacks
# TODO Move into Model
def confidence_weighted_majority_voting(predictions):
    """
    Perform confidence-weighted majority voting on each sublist of predictions.

    :param predictions: A list of dictionaries where each dictionary contains predictions and their confidences.
    :return: A list of majority voted predictions for each sublist.
    """    

    # Initialize variables to store cumulative confidences for each prediction
    cumulative_confidences = {label: 0.0 for label in predictions[0][0].keys()}
    
    # Calculate cumulative confidences for each prediction across all dictionaries in the sublist
    for prediction_dict in predictions:
        for label, confidence in prediction_dict[0].items():
            cumulative_confidences[label] += confidence
    
    # Find the prediction with the maximum cumulative confidence
    return max(cumulative_confidences, key=cumulative_confidences.get)

 
 
 
    
def classify_stack(stack: Stack):
    seq = seq2features([event.to_features() for event in stack], 10,10)
    pred = activity_classifier.predict_marginals([[ele] for ele in seq])
    pred_cwmv = confidence_weighted_majority_voting(pred)
    return pred_cwmv
    
stack_predictions = [classify_stack(stack )for stack in stacks_out]


In [None]:
print(f"Contained {len(stacks)} stack in queue")
if len(stacks)> 0: 
    print([[e.frame_number for e in stack ] for stack in stacks])

start = [stack[0].frame_number for stack in stacks_out]
end = [stack[-1].frame_number for stack in stacks_out]

res_df = pd.DataFrame({"start_pred":start, "end_pred":end})

eval_df = df_gt[["start", "actual_end"]].merge(res_df,how="left", left_on ="start", right_on = "start_pred").fillna(-1).astype(int)
eval_df["pred_true"] = eval_df["actual_end"] == eval_df["end_pred"]

display(eval_df)
print(f"Accuracy of matching start and end sequences: {eval_df['pred_true'].mean()}")
print(f"Overall matching accuracy: {0.5 + eval_df['pred_true'].mean()/2}")

In [None]:
# Function to check if intervals overlap
def intervals_overlap(row, df):
    overlapping_names = []
    overlapping_bps = set()
    for index, other_row in df.iterrows():
        if row.name != index and row['start'] <= other_row['actual_end'] and row['actual_end'] >= other_row['start']:
            overlapping_names.append(f"{other_row['activity_name']} {other_row['bp_id']}")
            overlapping_bps.add(other_row['bp_id'])
    return overlapping_names, list(overlapping_bps)

df_gt[["overlapping_activities", "overlapping_bps"]] = df_gt.apply(intervals_overlap, axis=1, df = df_gt, result_type="expand")

In [None]:
# Create dataframe with mapping of frame numbers to event stacks
frame_numbers = [event.frame_number for idx,stack in enumerate(stacks_out) for event in stack]
stack_numbers = [idx for idx,stack in enumerate(stacks_out) for event in stack]
applicant_ids = [event.attributes["applicant_id"] for idx,stack in enumerate(stacks_out) for event in stack]
activity_ids = [event.attributes["activity_id"] for idx,stack in enumerate(stacks_out) for event in stack]
mail_ids = [event.attributes["mail_id"] for idx,stack in enumerate(stacks_out) for event in stack]
sniff_time =  [event.sniff_time for idx,stack in enumerate(stacks_out) for event in stack]
case_id = [stack.case_id["id"]  if stack.case_id else -1 for idx, stack in enumerate(stacks_out) for event in stack]

df_frame_numbers = pd.DataFrame(data={"frame.number": frame_numbers, "sniff_time": sniff_time, "stack_idx": stack_numbers, "applicant_id": applicant_ids,"activity_id": activity_ids, "mail_id":mail_ids,"case_id": case_id})

# Merge Activity Name from ground truth frame to event sequences for evaluation
merged_df = df_frame_numbers.merge(df_gt[["activity_name","start","bp_id"]], how="left",left_on="frame.number", right_on="start").drop(columns="start")

merged_df[["activity_name","bp_id"]] = merged_df.groupby("stack_idx")[["activity_name","bp_id"]].ffill()
#merged_df["activity_name"] = merged_df.groupby("stack_idx")["bp_id"].ffill()

# Merge with filtered interleaved test data
merged_df = df_test.merge(merged_df, on="frame.number")

In [None]:
unique_no_nan = lambda x: list(filter(None, pd.unique(x)))
first_unique = lambda x: unique_no_nan(x)[0]

def compare_values(x,y):
    # Multi index and casting magic - I just want to compare the bp_ids lol
    x = int(x[0])
    y = int(y[0])

    return x == y


res = merged_df.groupby("stack_idx").agg(applicant_id = ("applicant_id", unique_no_nan),activity_id=("activity_id", unique_no_nan), mail_id=("mail_id", unique_no_nan),case_id=("case_id", first_unique),bp_id=("bp_id", unique_no_nan),frame_number_min=("frame.number","min"),frame_number_max =  ("frame.number","max"),sniff_time_min=("sniff_time","min"),sniff_time_max=("sniff_time","min"), activity_name=("activity_name", lambda x: x.head(1)))
res["stack_prediction"] = stack_predictions
# Apply the custom function to compare 'sale_order_line_id' and 'sale_order_line_id_case_id'
res["bp_true"] = res.apply(lambda x: compare_values(x["applicant_id"], x["bp_id"]), axis = 1)
res["activity_true"] = res["activity_name"] ==  res["stack_prediction"]
#res.loc["Mean","bp_true"] = res["bp_true"].mean()
#res.loc["Mean","activity_true"] = res["activity_true"].mean()

In [None]:
res["bp_id_1"] = res["bp_id"].apply(first_unique)

In [None]:
res

In [None]:
res[["case_id","bp_id_1"]].value_counts()

-> 1 to 1 Matching of Case_id and Bp_id -> 1.0 is correct

In [None]:
res["activity_true"].mean()

In [None]:
res["bp_true"].mean()

In [None]:
out = res.sort_values(by= "sniff_time_min")[["sniff_time_min","stack_prediction","case_id"]].reset_index(drop=True)
out.columns = ["timestamp", "activity", "case_id"]

In [None]:
out.to_csv("../../data_v3/out/hr_xes_out.csv", index = False)