# Activity Classifier

Goal
: Predict the activity action of a sequence of events



In [1]:
import pandas as pd
import ast

In [2]:
df_train_in = pd.read_csv('../../data/VALID/R1/R1.csv', converters={"MessageAttributes": ast.literal_eval})
df_test =pd.read_csv('../../data_v3/ptp_emitted_events_test.csv')

In [3]:
from event_loop.preprocessing.dataframe import pre_process

df_train = pre_process(df_train_in)

## Preprocessing

In [4]:
# Mark groups of Instance Number and BusinessActivity with sequence numbers
df_train = df_train.sort_values(by=["InstanceNumber", "BusinessActivity", "frame.number"])
df_train["SequenceNumber"] = df_train.groupby(["BusinessActivity", "InstanceNumber"]).ngroup()
# Align Sequence Numbers so that they start at 0
df_train["SequenceNumber"] -= df_train['SequenceNumber'].min()

In [5]:
def sequence_by_activities(data, seq_data):
    return [data[seq_data == i] for i in range(seq_data.max())]

In [6]:
feature_cols = ["event_with_roles", "request_method_call", "selective_file_data", 
        "origin_method","origin_file_data"]


In [7]:
# List of dataframes each containing one activity sequence
train_activity_sequences = sequence_by_activities(df_train, df_train["SequenceNumber"])

test_activity_sequences = sequence_by_activities(df_test, df_test["stack_idx"])

In [8]:
df_test

Unnamed: 0.1,Unnamed: 0,BusinessActivity,InstanceNumber,frame.number,synthetic_sniff_time,event_with_roles,request_method_call,file_data,pgsql.query,pgsql.target,...,ActivityAction,stack_idx,sale_order_id,sale_order_line_id,purchase_requisition_id,purchase_requisition_line_id,purchase_order_id,case_id,activity_name,bp_id
0,0,order_to_cash_interleaved_10cases,2022,96,2022-08-04 21:07:16.297676,End Point (Employee)->Odoo Application:[HttpRe...,version,[],,,...,Activity Start,0,,,,,,0,CreatePurchaseRequest,399.0
1,1,order_to_cash_interleaved_10cases,2022,137,2022-08-04 21:52:26.138633,Odoo Application->End Point (Employee):[HttpRe...,server_version,"['server_version', '12.0-20190820', 'server_ve...",,,...,NoAction,0,,,,,,0,CreatePurchaseRequest,399.0
2,2,order_to_cash_interleaved_10cases,2022,145,2022-08-04 21:52:45.170536,End Point (Employee)->Odoo Application:[HttpRe...,authenticate,"['odoo01', 'user.suername@company.com', 'PWD12...",,,...,NoAction,0,,,,,,0,CreatePurchaseRequest,399.0
3,3,order_to_cash_interleaved_10cases,2022,209,2022-08-04 22:49:06.248610,Odoo Application->db Server/Mail Server:[Pgsql...,,,"INSERT INTO ""res_users_log"" (""id"", ""create_uid...",res_users_log,...,NoAction,0,,,,,,0,CreatePurchaseRequest,399.0
4,4,order_to_cash_interleaved_10cases,2022,232,2022-08-04 23:08:15.473521,Odoo Application->End Point (Employee):[HttpRe...,IsNumber,['2'],,,...,NoAction,0,,,,,,0,CreatePurchaseRequest,399.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3904,3904,order_to_cash_interleaved_10cases,2022,112133,2022-08-21 23:39:41.657434,Odoo Application->db Server/Mail Server:[Pgsql...,,,"INSERT INTO ""account_payment"" (""id"", ""create_u...",account_payment,...,NoAction,59,,,,,,8,SubmitPayment,407.0
3905,3905,order_to_cash_interleaved_10cases,2022,112135,2022-08-21 23:40:17.501018,Odoo Application->db Server/Mail Server:[Pgsql...,,,"INSERT INTO ""mail_followers"" (""id"", ""partner_i...",mail_followers,...,NoAction,59,,,,,,8,SubmitPayment,407.0
3906,3906,order_to_cash_interleaved_10cases,2022,112137,2022-08-21 23:40:24.921760,Odoo Application->db Server/Mail Server:[Pgsql...,,,INSERT INTO mail_followers_mail_message_subtyp...,mail_followers_mail_message_subtype_rel,...,NoAction,59,,,,,,8,SubmitPayment,407.0
3907,3907,order_to_cash_interleaved_10cases,2022,112175,2022-08-21 23:43:05.357802,Odoo Application->db Server/Mail Server:[Pgsql...,,,"INSERT INTO ""mail_message"" (""id"", ""create_uid""...",mail_message,...,NoAction,59,,,,,,8,SubmitPayment,407.0


In [9]:
# We train a crf with linked / not linked features on the sequential data and evaluate on the test data

In [10]:
# Sequences without window features

def dict_to_feature_sequence(dict):
    return [{**d, "bias": 1.0} for d in dict]

def df_to_features(df):
    return dict_to_feature_sequence(df.to_dict("records"))

train_features_seq = [df_to_features(df[feature_cols]) for df in train_activity_sequences]
train_labels_seq = [df["BusinessActivity"].values for df in train_activity_sequences]


test_features_seq = [df_to_features(df[feature_cols]) for df in test_activity_sequences]
test_labels_seq = [df["activity_name"].values for df in test_activity_sequences]

In [11]:
# Single Events no window features 

def dict_to_feature(dict):
    return [[{**d, "bias": 1.0}] for d in dict]

def extract_labels(labels):
    return [[y] for y in labels]

train_features = dict_to_feature(df_train[feature_cols].to_dict("records"))
train_labels = extract_labels(df_train["BusinessActivity"])

test_features = dict_to_feature(df_test[feature_cols].to_dict("records"))
test_labels = extract_labels(df_test["activity_name"])

In [12]:
# Single Events w. window features

# Apply sequencing - flatten later

def seq2features(seq, bw, fw): 
    return [event2features(seq, i, bw, fw) for i in range(len(seq))]

def event2features(seq, i, bw, fw):
    features = {"bias": 1.0}
    
    features.update({
        f"0:{k}": v for k,v in seq[i].items()
    })
    
    for j in range(1, bw+1): 
        index = i-j
        if index >= 0: 
            features.update({
                f"-{j}:{k}": v for k,v in seq[index].items()
            })
        else: 
            features.update({
                 f"-{j}:{k}": "NoMessage" for k,_ in seq[i].items()
            })
        
    for j in range(1,fw+1): 
        index = i + j
        if index < len(seq): 
             features.update({
                f"+{j}:{k}": v for k,v in seq[index].items()
            })
        else: 
            features.update({
                 f"+{j}:{k}": "NoMessage" for k,_ in seq[i].items()
            })
            
    return features

seq2features(train_activity_sequences[0][feature_cols].to_dict("records"), 5, 5)

[{'bias': 1.0,
  '0:event_with_roles': 'End Point (Procurement)->Odoo Application:[HttpRequest:POST /xmlrpc/2/common HTTP/1.1\\r\\n]',
  '0:request_method_call': 'version',
  '0:selective_file_data': 'version',
  '0:origin_method': '',
  '0:origin_file_data': '',
  '-1:event_with_roles': 'NoMessage',
  '-1:request_method_call': 'NoMessage',
  '-1:selective_file_data': 'NoMessage',
  '-1:origin_method': 'NoMessage',
  '-1:origin_file_data': 'NoMessage',
  '-2:event_with_roles': 'NoMessage',
  '-2:request_method_call': 'NoMessage',
  '-2:selective_file_data': 'NoMessage',
  '-2:origin_method': 'NoMessage',
  '-2:origin_file_data': 'NoMessage',
  '-3:event_with_roles': 'NoMessage',
  '-3:request_method_call': 'NoMessage',
  '-3:selective_file_data': 'NoMessage',
  '-3:origin_method': 'NoMessage',
  '-3:origin_file_data': 'NoMessage',
  '-4:event_with_roles': 'NoMessage',
  '-4:request_method_call': 'NoMessage',
  '-4:selective_file_data': 'NoMessage',
  '-4:origin_method': 'NoMessage',
  

In [13]:
train_features_seq_window = [seq2features(seq[feature_cols].to_dict("records"), 10,10) for seq in train_activity_sequences]
train_labels_seq_window = [seq["BusinessActivity"] for seq in train_activity_sequences]

In [14]:
test_features_seq_window = [seq2features(seq[feature_cols].to_dict("records"), 10,5) for seq in test_activity_sequences]
test_labels_seq_window = [seq["activity_name"].tolist() for seq in test_activity_sequences]

In [15]:
from sklearn.model_selection import train_test_split

APPLY_SEQUENCES = False
EVAL_ON_IL = False

if APPLY_SEQUENCES: 
    train_features = train_features_seq
    test_features = test_features_seq
    train_labels = train_labels_seq
    test_labels = test_labels_seq

    
else: 
    X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.3, random_state=42)

In [16]:
X_train = train_features_seq_window
X_test = test_features_seq_window
y_train = train_labels_seq_window
y_test = test_labels_seq_window

In [17]:
def flatten(list_of_list): 
    return [item for sublist in list_of_list for item in sublist]

def flatten_and_encapsulate(list_of_list):
    return [[item] for sublist in list_of_list for item in sublist]


X_train = flatten_and_encapsulate(train_features_seq_window)
X_test_seq = test_features_seq_window
X_test = flatten_and_encapsulate(test_features_seq_window)
y_train = flatten_and_encapsulate(train_labels_seq_window)
y_test_seq = test_labels_seq_window
y_test = flatten_and_encapsulate(test_labels_seq_window)

# Model Training

In [18]:
%%time
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    max_iterations=200,
    c1=0.1,
    c2=0.01,
    all_possible_transitions=True
    #all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 10.2 s, sys: 112 ms, total: 10.3 s
Wall time: 10.5 s


In [19]:
from sklearn.metrics import make_scorer
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics
import numpy as np

# define fixed parameters and parameters to search
crf2 = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='macro', labels=np.unique(test_labels))

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=5, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
#rs.fit(X_train, y_train)

#crf = rs.best_estimator_

In [20]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn_crfsuite import metrics


def flatten(xss):
    return [x for xs in xss for x in xs]


def evaluate(model, x, y_true):
    y_pred = model.predict(x)
    print(metrics.flat_f1_score(y_true, y_pred, average='macro', labels=model.classes_))
    print(metrics.flat_classification_report(y_true, y_pred, model.classes_))
    [print(label, "\n", matrix) for matrix, label in
     zip(multilabel_confusion_matrix(flatten(y_true), flatten(y_pred), labels=model.classes_), model.classes_)]


In [33]:
y_pred = crf.predict(X_test)

In [22]:
X_test[0]

[{'bias': 1.0,
  '0:event_with_roles': 'End Point (Employee)->Odoo Application:[HttpRequest:POST /xmlrpc/2/common HTTP/1.1\\r\\n]',
  '0:request_method_call': 'version',
  '0:selective_file_data': 'version',
  '0:origin_method': nan,
  '0:origin_file_data': nan,
  '-1:event_with_roles': 'NoMessage',
  '-1:request_method_call': 'NoMessage',
  '-1:selective_file_data': 'NoMessage',
  '-1:origin_method': 'NoMessage',
  '-1:origin_file_data': 'NoMessage',
  '-2:event_with_roles': 'NoMessage',
  '-2:request_method_call': 'NoMessage',
  '-2:selective_file_data': 'NoMessage',
  '-2:origin_method': 'NoMessage',
  '-2:origin_file_data': 'NoMessage',
  '-3:event_with_roles': 'NoMessage',
  '-3:request_method_call': 'NoMessage',
  '-3:selective_file_data': 'NoMessage',
  '-3:origin_method': 'NoMessage',
  '-3:origin_file_data': 'NoMessage',
  '-4:event_with_roles': 'NoMessage',
  '-4:request_method_call': 'NoMessage',
  '-4:selective_file_data': 'NoMessage',
  '-4:origin_method': 'NoMessage',
  '

In [23]:
crf.predict([X_test[0]])

[['CreatePurchaseRequest']]

In [24]:
# Try Majority Voting

In [25]:
pred_margs = crf.predict_marginals(X_test_seq)
pred = crf.predict(X_test_seq)

In [26]:
def confidence_weighted_majority_voting(predictions):
    """
    Perform confidence-weighted majority voting on each sublist of predictions.

    :param predictions: A list of dictionaries where each dictionary contains predictions and their confidences.
    :return: A list of majority voted predictions for each sublist.
    """
    majority_voted_predictions = []
    for sublist in predictions:
        if not sublist:
            # If the sublist is empty, append None to the majority voted predictions
            majority_voted_predictions.append(None)
        else:
            # Initialize variables to store cumulative confidences for each prediction
            cumulative_confidences = {label: 0.0 for label in sublist[0].keys()}
            
            # Calculate cumulative confidences for each prediction across all dictionaries in the sublist
            for prediction_dict in sublist:
                for label, confidence in prediction_dict.items():
                    cumulative_confidences[label] += confidence
            
            # Find the prediction with the maximum cumulative confidence
            majority_voted_prediction = max(cumulative_confidences, key=cumulative_confidences.get)
            majority_voted_predictions.append(majority_voted_prediction)

    return majority_voted_predictions

cwmv_pred = confidence_weighted_majority_voting(pred_margs)

In [27]:
def majority_voting(predictions):
    """
    Perform majority voting on each sublist of predictions.

    :param predictions: A list of lists where each sublist contains predictions.
    :return: A list of majority voted predictions for each sublist.
    """
    majority_voted_predictions = []
    for sublist in predictions:
        if not sublist:
            # If the sublist is empty, append None to the majority voted predictions
            majority_voted_predictions.append(None)
        else:
            # Count the occurrences of each prediction in the sublist
            counts = {}
            for prediction in sublist:
                if prediction in counts:
                    counts[prediction] += 1
                else:
                    counts[prediction] = 1
            
            # Find the prediction with the maximum count (mode)
            majority_voted_prediction = max(counts, key=counts.get)
            majority_voted_predictions.append(majority_voted_prediction)

    return majority_voted_predictions

mv_pred = majority_voting(pred)
mv_true = majority_voting(test_labels_seq_window)

In [28]:
from sklearn.metrics import classification_report
import numpy as np

print(classification_report(mv_true, mv_pred, zero_division=0.0))

                       precision    recall  f1-score   support

         BidSelection       0.62      0.80      0.70        10
  CreateCallForTender       1.00      0.80      0.89        10
  CreatePurchaseOrder       0.00      0.00      0.00         8
CreatePurchaseRequest       1.00      1.00      1.00         9
            CreateRfq       0.47      0.80      0.59        10
         ReceiveGoods       1.00      0.88      0.93         8
        SubmitPayment       0.62      0.71      0.67         7

             accuracy                           0.73        62
            macro avg       0.67      0.71      0.68        62
         weighted avg       0.68      0.73      0.69        62


In [29]:
print(classification_report(mv_true, cwmv_pred, zero_division=0.0))

                       precision    recall  f1-score   support

         BidSelection       0.62      0.80      0.70        10
  CreateCallForTender       1.00      0.80      0.89        10
  CreatePurchaseOrder       1.00      0.12      0.22         8
CreatePurchaseRequest       1.00      1.00      1.00         9
            CreateRfq       0.50      0.80      0.62        10
         ReceiveGoods       1.00      0.88      0.93         8
        SubmitPayment       0.62      0.71      0.67         7

             accuracy                           0.74        62
            macro avg       0.82      0.73      0.72        62
         weighted avg       0.81      0.74      0.72        62


In [30]:
[print(label, "\n", matrix) for matrix, label in
     zip(multilabel_confusion_matrix(mv_true, cwmv_pred, labels=crf.classes_), crf.classes_)]

BidSelection 
 [[47  5]
 [ 2  8]]
CreateCallForTender 
 [[52  0]
 [ 2  8]]
CreatePurchaseOrder 
 [[54  0]
 [ 7  1]]
CreatePurchaseRequest 
 [[53  0]
 [ 0  9]]
CreateRfq 
 [[44  8]
 [ 2  8]]
ReceiveGoods 
 [[54  0]
 [ 1  7]]
SubmitPayment 
 [[52  3]
 [ 2  5]]


[None, None, None, None, None, None, None]

In [31]:
df_eval = pd.DataFrame({"true": mv_true, "pred": cwmv_pred})
df_eval["pred_true"] = df_eval["true"] == df_eval["pred"] 