In [1]:
import numpy as np
import pandas as pd
import scipy.stats
import sklearn_crfsuite
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import metrics

# Create Evaluation Data Set 

In [2]:
df_test_in = pd.read_csv('../../data_v3/hr_il_tagged.csv')

df_train_in = pd.read_csv('../../data_v3/hr_extended_features.csv')

df_test_in.rename(columns={'real_activity_action': "ActivityAction"}, inplace=True)

df_gt = pd.read_csv('../../data_v3/hr_ground_truth.csv')

In [3]:
df_test_in = pd.read_csv('../../data_v3/ptp_il_tagged.csv')

df_train_in = pd.read_csv('../../data_v3/ptp_extended_features.csv')

df_test_in.rename(columns={'real_activity_action': "ActivityAction"}, inplace=True)

df_gt = pd.read_csv('../../data_v3/ptp_ground_truth.csv')

In [4]:
#df_gt = pd.read_csv('../../data_v3/hr_ground_truth.csv')
#df_test["ActivityAction"]=np.select([df_test["frame.number"].isin(df_gt["start"].values),df_test["frame.number"].isin(df_gt["actual_end"].values)], ["Action Start", "Action End"],default="NoAction", )

1. Sort Values by Sniff Time
2. Validate Start and End Events after Sorting
    - Group by Instance Number and Business Activity
    - Check if frame.number is sequential 
    - Check if start activity and end activity are at beginning / end of the sequence
3. Features
    - Event with roles, request_method_call, selective_filter_data, origin_selective_filter_data
    - 

# 1 Preprocessing
1.1 Sort train set by sniff time
1.2 Form sequences
1.3 Validate sequences for ascending frame.number
1.4 Check for correct position of start and end events of each sequence

In [5]:
df_train_in

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,FileName,BusinessActivity,InstanceNumber,sniff_time,frame.number,synthetic_sniff_time,synthetic_sniff_time_str,session_generalized,...,writeoff_label,payment_term_id,request_method_call,starting_frame_number,selective_filter_data,origin_request_method,origin_selective_filter_data,IsStartActivity,IsEndActivity,ActivityAction
0,0,0,CreatePurchaseRequest_1.pcap,CreatePurchaseRequest,1,2020-12-21 08:08:44.637118,119,2020-12-22 11:39:44.237118,2020-12-22 11:39:44.237118,192.168.11.2-192.168.11.9 (1),...,,,version,,version,,,True,False,Activity Start
1,1,1,CreatePurchaseRequest_1.pcap,CreatePurchaseRequest,1,2020-12-21 08:08:44.688344,153,2020-12-22 11:48:16.548344,2020-12-22 11:48:16.548344,192.168.11.2-192.168.11.9 (1),...,,,server_version,119.0,,version,version,False,False,NoAction
2,2,2,CreatePurchaseRequest_1.pcap,CreatePurchaseRequest,1,2020-12-21 08:08:44.690577,161,2020-12-22 11:48:38.880577,2020-12-22 11:48:38.880577,192.168.11.2-192.168.11.9 (2),...,,,authenticate,,,,,False,False,NoAction
3,3,3,CreatePurchaseRequest_1.pcap,CreatePurchaseRequest,1,2020-12-21 08:08:44.985813,224,2020-12-22 12:37:51.535813,2020-12-22 12:37:51.535813,192.168.11.1-192.168.11.2 (7),...,,,,,,,,False,False,NoAction
4,4,4,CreatePurchaseRequest_1.pcap,CreatePurchaseRequest,1,2020-12-21 08:08:44.994598,245,2020-12-22 12:39:19.394598,2020-12-22 12:39:19.394598,192.168.11.2-192.168.11.9 (2),...,,,2,161.0,IsNumber,authenticate,,False,False,NoAction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25951,25951,25822,SubmitPayment_67.pcap,SubmitPayment,67,2020-12-22 10:19:04.470693,2776,2020-12-23 18:58:19.450693,2020-12-23 18:58:19.450693,192.168.11.1-192.168.11.2 (6),...,Write-Off,,,,,,,False,False,NoAction
25952,25952,25823,SubmitPayment_67.pcap,SubmitPayment,67,2020-12-22 10:19:04.472504,2778,2020-12-23 18:58:37.562504,2020-12-23 18:58:37.562504,192.168.11.1-192.168.11.2 (6),...,,,,,,,,False,False,NoAction
25953,25953,25824,SubmitPayment_67.pcap,SubmitPayment,67,2020-12-22 10:19:04.473377,2780,2020-12-23 18:58:46.293377,2020-12-23 18:58:46.293377,192.168.11.1-192.168.11.2 (6),...,,,,,,,,False,False,NoAction
25954,25954,25825,SubmitPayment_67.pcap,SubmitPayment,67,2020-12-22 10:19:04.498656,2838,2020-12-23 19:02:59.108656,2020-12-23 19:02:59.108656,192.168.11.1-192.168.11.2 (6),...,,,,,,,,False,False,NoAction


In [6]:
cols = ["event_with_roles", "request_method_call", "selective_filter_data", "origin_selective_filter_data","origin_request_method", "InstanceNumber",
        "BusinessActivity","ActivityAction"]

df_train = df_train_in.sort_values(by=['sniff_time'])[cols]
df_test = df_test_in[cols]

In [7]:
#df_train["ActivityLabel"] = df_train["ActivityAction"] + " "+df_train["BusinessActivity"]

In [8]:
# repalce request method call number 
def replace_numeric(value):
    return '' if pd.to_numeric(value, errors='coerce') == value else value

df_train["request_method_call"][pd.notnull(pd.to_numeric(df_train["request_method_call"], errors='coerce'))] = ""


In [9]:
# form sequences of length 20 
def reshape_sequence(data, seq_length):
    data = data.values
    num_seq = data.shape[0] // seq_length
    return data[:num_seq * seq_length].reshape((num_seq, seq_length, data.shape[1]))


train_seq = reshape_sequence(df_train, 1)
test_seq = reshape_sequence(df_test, 1)

In [10]:
df_train["selective_filter_data"].value_counts(dropna=False)

selective_filter_data
NaN                                       21745
IsNumber                                   1773
version                                     408
purchase.order_create                       208
purchase.order.line_create                  206
product.product_search_read                 134
purchase.order_search_read                   93
purchase.requisition.line_create             67
sale.order_create                            67
res.partner_search_read                      67
sale.order.line_create                       67
purchase.requisition_write_in_progress       67
purchase.requisition_write_open              67
purchase.requisition_write_ongoing           67
purchase.requisition_create                  66
purchase.order_write                         63
sale.order_write_done                        47
stock.picking_button_validate                47
account.payment_create                       47
account.invoice.line_create                  47
account.invoice_cr

In [11]:
df_train

Unnamed: 0,event_with_roles,request_method_call,selective_filter_data,origin_selective_filter_data,origin_request_method,InstanceNumber,BusinessActivity,ActivityAction
0,End Point (Employee)->Odoo Application: [HttpR...,version,version,,,1,CreatePurchaseRequest,Activity Start
1,Odoo Application->End Point (Employee): [HttpR...,server_version,,version,version,1,CreatePurchaseRequest,NoAction
2,End Point (Employee)->Odoo Application: [HttpR...,authenticate,,,,1,CreatePurchaseRequest,NoAction
3,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,1,CreatePurchaseRequest,NoAction
4,Odoo Application->End Point (Employee): [HttpR...,,IsNumber,,authenticate,1,CreatePurchaseRequest,NoAction
...,...,...,...,...,...,...,...,...
25951,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,67,SubmitPayment,NoAction
25952,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,67,SubmitPayment,NoAction
25953,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,67,SubmitPayment,NoAction
25954,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,67,SubmitPayment,NoAction


In [12]:
# build feature vectors

def seq2features(seq, bw, fw, feature_list):
    return [event2features(seq, i, bw, fw, feature_list) for i in range(len(seq))]



def event2features(seq, i, bw, fw, feature_list):
    # event features (position i in sequence)
    features = {
        feature_list[j]: seq[i, j] for j in range(len(feature_list))
    }

    ## backward features 1...bw

    for k in range(1,bw+1): 
        index = i - k
        if index >= 0: 
            features.update({
                f"-{k}:{feature_list[j]}" : seq[index,j] for j in range(len(feature_list))
            })

    ## forward features 1...fw

    for k in range(1, fw+1): 
        index = i + k
        if index < len(seq): 
            features.update({
                f"+{k}:{feature_list[j]}" : seq[index, j] for j in range(len(feature_list))
            })

    ## remove all empty String features        

    # features = {key: value for key, value in features.items() if value != ''}
    features.update({"bias": 1.0})
    return features


feature_list = [ "event_with_roles","request_method_call", "selective_file_data","origin_request_method", "origin_file_data"]
fw = 0
bw = 0

train_features = [seq2features(s, bw, fw, feature_list) for s in train_seq]
train_labels = train_seq[:, :, -1]
train_labels_activity = train_seq[:,:,-1]

test_features = [seq2features(s, bw, fw, feature_list) for s in test_seq]
test_labels = test_seq[:, :, -1]


In [13]:
train_features

[[{'event_with_roles': 'End Point (Employee)->Odoo Application: [HttpRequest:POST /xmlrpc/2/common HTTP/1.1\\r\\n]',
   'request_method_call': 'version',
   'selective_file_data': 'version',
   'origin_request_method': nan,
   'origin_file_data': nan,
   'bias': 1.0}],
 [{'event_with_roles': 'Odoo Application->End Point (Employee): [HttpResponse:HTTP/1.0 200 OK\\r\\n]',
   'request_method_call': 'server_version',
   'selective_file_data': nan,
   'origin_request_method': 'version',
   'origin_file_data': 'version',
   'bias': 1.0}],
 [{'event_with_roles': 'End Point (Employee)->Odoo Application: [HttpRequest:POST /xmlrpc/2/common HTTP/1.1\\r\\n]',
   'request_method_call': 'authenticate',
   'selective_file_data': nan,
   'origin_request_method': nan,
   'origin_file_data': nan,
   'bias': 1.0}],
 [{'event_with_roles': "Odoo Application->db Server/Mail Server: [PgsqlRequest:Simple query:INSERT:['res_users_log']]",
   'request_method_call': nan,
   'selective_file_data': nan,
   'origin

In [14]:
X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels_activity, test_size=0.3, random_state=42)

### Train CRF Model

In [15]:
%%time
crf = sklearn_crfsuite.CRF(
    max_iterations=200,
    c1=0.1,
    c2=0.01,
    all_possible_transitions=True
    #all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 461 ms, sys: 8.38 ms, total: 469 ms
Wall time: 481 ms


In [16]:
def flatten(xss):
    return [x for xs in xss for x in xs]


def evaluate(model, x, y_true):
    y_pred = model.predict(x)
    print(metrics.flat_f1_score(y_true, y_pred, average='macro', labels=model.classes_))
    print(metrics.flat_classification_report(y_true, y_pred, model.classes_))
    print(multilabel_confusion_matrix(flatten(y_true), flatten(y_pred)))


evaluate(crf, X_test, y_test)

0.9207747787496299
                precision    recall  f1-score   support

      NoAction       0.99      1.00      1.00      7534
  Activity End       0.94      0.64      0.77       132
Activity Start       1.00      1.00      1.00       121

      accuracy                           0.99      7787
     macro avg       0.98      0.88      0.92      7787
  weighted avg       0.99      0.99      0.99      7787

[[[7650    5]
  [  47   85]]

 [[7666    0]
  [   0  121]]

 [[ 206   47]
  [   5 7529]]]


In [17]:
evaluate(crf, test_features, test_labels)

0.8487744079767344
                precision    recall  f1-score   support

      NoAction       0.99      1.00      0.99      3591
  Activity End       1.00      0.38      0.55        63
Activity Start       1.00      1.00      1.00        63

      accuracy                           0.99      3717
     macro avg       1.00      0.79      0.85      3717
  weighted avg       0.99      0.99      0.99      3717

[[[3654    0]
  [  39   24]]

 [[3654    0]
  [   0   63]]

 [[  87   39]
  [   0 3591]]]


In [18]:
df_test

Unnamed: 0,event_with_roles,request_method_call,selective_filter_data,origin_selective_filter_data,origin_request_method,InstanceNumber,BusinessActivity,ActivityAction
0,End Point (Employee)->Odoo Application: [HttpR...,version,version,,,2022,order_to_cash_interleaved_10cases,Activity Start
1,Odoo Application->End Point (Employee): [HttpR...,server_version,,version,version,2022,order_to_cash_interleaved_10cases,NoAction
2,End Point (Employee)->Odoo Application: [HttpR...,authenticate,,,,2022,order_to_cash_interleaved_10cases,NoAction
3,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,2022,order_to_cash_interleaved_10cases,NoAction
4,Odoo Application->End Point (Employee): [HttpR...,2,IsNumber,,authenticate,2022,order_to_cash_interleaved_10cases,NoAction
...,...,...,...,...,...,...,...,...
3712,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,2022,order_to_cash_interleaved_10cases,NoAction
3713,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,2022,order_to_cash_interleaved_10cases,NoAction
3714,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,2022,order_to_cash_interleaved_10cases,NoAction
3715,Odoo Application->db Server/Mail Server: [Pgsq...,,,,,2022,order_to_cash_interleaved_10cases,NoAction


In [19]:
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='macro', labels=np.unique(test_labels))

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits



KeyboardInterrupt



In [20]:
crf = rs.best_estimator_
evaluate(crf, X_test, y_test)
evaluate(crf, test_features, test_labels)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

In [21]:
evaluate(crf, test_features, test_labels)

AttributeError: 'NoneType' object has no attribute 'tag'

In [22]:
from collections import Counter


def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))


print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common())


Top likely transitions:


In [23]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))


print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:

Top negative:


# Categorical NB

In [26]:
train_array = np.array([list(item[0].values()) for item in train_features])[:,:-1]
test_array = np.array([list(item[0].values()) for item in test_features])[:,:-1]

train_array[train_array == "nan"] = ""
test_array[test_array == "nan"] = ""

# replace numeric values in method call
train_array[:,1] = np.where(np.logical_not(np.char.isdigit(train_array[:,1])), train_array[:,1], "")
test_array[:,1] = np.where(np.logical_not(np.char.isdigit(test_array[:,1])), test_array[:,1], "")

In [27]:
from sklearn.preprocessing import LabelEncoder

# Label Encode Data
label_encoders = [LabelEncoder() for _ in range(train_array.shape[1]-1)]

for i, encoder in enumerate(label_encoders):
    train_array[:,i] = encoder.fit_transform(train_array[:,i])
    test_array[:,i] = encoder.transform(test_array[:,i])
    
# Encode origin selective filter data same as selective filter data
train_array[:,-1] = label_encoders[-1].transform(train_array[:,-1])
test_array[:,-1] = label_encoders[-1].transform(test_array[:,-1])


ValueError: y contains previously unseen labels: 'product.product_search_read_name'

In [29]:
train_labels = train_labels.ravel()
test_labels = test_labels.ravel()

In [30]:
train_labels

array(['Activity Start', 'NoAction', 'NoAction', ..., 'NoAction',
       'NoAction', 'Activity End'], dtype=object)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_array, train_labels, test_size=0.3, random_state=42)

In [None]:
from sklearn.naive_bayes import CategoricalNB

clf = CategoricalNB()

In [None]:
clf.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import scipy
from sklearn.metrics import make_scorer, f1_score

clf = CategoricalNB()
params_space = {
    "alpha": scipy.stats.expon(scale=0.05)   
}

f1_scorer = make_scorer(f1_score, average="macro")

rs = RandomizedSearchCV(clf, params_space, cv = 3, verbose =1,n_jobs=-1, n_iter=300, scoring = f1_scorer)

rs.fit(X_train, y_train)

In [None]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
clf = rs.best_estimator_

In [None]:
from sklearn.metrics import classification_report

def evaluate(model, X, y): 
    pred = model.predict(X)
    print(classification_report(y, pred))
    
print("------  Test Data  ------")
evaluate(clf, X_test, y_test)
print("------ INTERLEAVED ------")
evaluate(clf, test_array, test_labels)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(random_state=42)

In [None]:
clf.fit(X_train, y_train)

In [None]:
evaluate(clf, X_test, y_test)

In [None]:
evaluate(clf, test_array, test_labels)

In [None]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import scipy
from sklearn.metrics import make_scorer, f1_score

clf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}

f1_scorer = make_scorer(f1_score, average="macro")

rs = GridSearchCV(clf, param_grid, cv = 3, verbose =1,n_jobs=-1, scoring = f1_scorer)

rs.fit(X_train, y_train)

In [None]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
clf = rs.best_estimator_

In [None]:
print("------  Test Data  ------")
evaluate(clf, X_test, y_test)
print("------ INTERLEAVED ------")
evaluate(clf, test_array, test_labels)