In [167]:
import re

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pandas as pd

# surpress future warning related to SkLearn
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

from numpy.lib.stride_tricks import sliding_window_view

from sklearn.metrics import classification_report, f1_score, multilabel_confusion_matrix

from sklearn.model_selection import train_test_split


In [168]:
df_train = pd.read_csv('../../data_v2/ptp_extended_features.csv')
df_test = pd.read_csv('../../data_v2/p2p_r4_il_og_mod.csv')

  df_train = pd.read_csv('../../data_v2/ptp_extended_features.csv')


In [169]:
# Mark start event of each BusinessActivity Instance
df_train["activityStart"] = df_train.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount() == 0
# Mark end event of each Business Activity Instance
df_train["activityEnd"] = df_train.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount(ascending=False) == 0
# Merge start and end columns to form labels
df_train["task_position"] = df_train.apply(lambda row: "position_start" if row["activityStart"] else ("position_end" if row["activityEnd"] else 'position_between'), axis=1)

df_train = df_train.drop(["activityStart", 'activityEnd'], axis=1)

In [170]:
# Form Sequences by Grouping
df_train = df_train.sort_values(by=["InstanceNumber", "BusinessActivity", "frame.number"])
df_train["SequenceNumber"] = df_train.groupby(["BusinessActivity", "InstanceNumber"]).ngroup()
df_train["SequenceNumber"] -= df_train['SequenceNumber'].min()

In [243]:
df_test[df_test["InstanceNumber"] == 37]

Unnamed: 0.1,Unnamed: 0,InstanceNumber,BusinessActivity,frame.number,event_with_roles,method_call,selective_file_data,origin_method,origin_file_data,method_call_2,task_position,file_data_str,request_method_call,selective_filter_data,origin_selective_filter_data


In [173]:
df_test[["request_method_call","selective_filter_data","origin_selective_filter_data"]] = df_test[["method_call", "selective_file_data","origin_file_data"]]

Generate Train Sequences and Labels

In [179]:
# Divides dataframe into arrays according to to Sequence Data Indicator
def sequence_by_activities(data, seq_data): 
    return [data[seq_data == i].values for i in range(seq_data.max())]


cols = ["event_with_roles","request_method_call","selective_filter_data","origin_selective_filter_data","task_position"]

train_data = sequence_by_activities(df_train[cols], df_train["SequenceNumber"])
train_labels = [seq[:,-1].tolist() for seq in train_data]

Generate Interleaved Sequences and Lables

In [182]:
# Reshapes the Dataframe into Sequences of Length N
def reshape_sequences(data, seq_length):
    data = data.values
    num_seq = data.shape[0] // seq_length
    return data[:num_seq*seq_length].reshape((num_seq, seq_length, data.shape[1]))

# Fits a Sliding window over the dataframe
def reshape_sliding_window(data, window_shape):
    return np.squeeze(sliding_window_view(data,window_shape),axis =1)

#test_data = reshape_sequences(df_test[cols], 15)
test_data = reshape_sliding_window(df_test[cols], (15, len(cols)))
test_labels = test_data[:,:,-1].tolist()

Extract Features

In [231]:
def seq2features(seq, bw, fw ,feature_list): 
    # seq [0] = Position in Sequence
    # seq [0,4] = Feature 4 an Position 0 
    return [event2features(seq,i, bw, fw, feature_list) for i in range(len(seq))]


def event2features(seq, i,bw, fw, feature_list): 
    
    features = {"bias": 1.0}
    # event features 
    features.update( {
        f"0:{feature_list[j]}": seq[i,j] for j in range(len(feature_list))        
    })
    
    # backward features 1...bw
    for k in range(1,bw+1): 
        index = i - k
        if index >= 0: 
            features.update({
                f"-{k}:{feature_list[j]}" : seq[index,j] for j in range(len(feature_list))
            })
        else: 
             features.update({
                f"-{k}:{feature_list[j]}" : "NoMessage" for j in range(len(feature_list))
            })
              
    # forward features 1...fw
    for k in range(1, fw+1): 
        index = i + k
        if index < len(seq): 
            features.update({
                f"+{k}:{feature_list[j]}" : seq[index, j] for j in range(len(feature_list))
            })
        else: 
            features.update({
                f"+{k}:{feature_list[j]}": "NoMessage" for j in range(len(feature_list))
            })
            
    # remove all empty String features        
    #features = {key: value for key, value in features.items() if value != ''}
    return features

In [232]:
features = ["event_with_roles","request_method_call","selective_filter_data","origin_selective_filter_data"]
bw = 5
fw = 10
train_features = [seq2features(s, bw,fw, features) for s in train_data]
test_features = [seq2features(s, bw,fw,features) for s in test_data]

In [233]:
train_features[0]

[{'bias': 1.0,
  '0:event_with_roles': 'End Point (Procurement)->Odoo Application: [HttpRequest:POST /xmlrpc/2/common HTTP/1.1\\r\\n]',
  '0:request_method_call': 'version',
  '0:selective_filter_data': 'version',
  '0:origin_selective_filter_data': nan,
  '-1:event_with_roles': 'NoMessage',
  '-1:request_method_call': 'NoMessage',
  '-1:selective_filter_data': 'NoMessage',
  '-1:origin_selective_filter_data': 'NoMessage',
  '-2:event_with_roles': 'NoMessage',
  '-2:request_method_call': 'NoMessage',
  '-2:selective_filter_data': 'NoMessage',
  '-2:origin_selective_filter_data': 'NoMessage',
  '-3:event_with_roles': 'NoMessage',
  '-3:request_method_call': 'NoMessage',
  '-3:selective_filter_data': 'NoMessage',
  '-3:origin_selective_filter_data': 'NoMessage',
  '-4:event_with_roles': 'NoMessage',
  '-4:request_method_call': 'NoMessage',
  '-4:selective_filter_data': 'NoMessage',
  '-4:origin_selective_filter_data': 'NoMessage',
  '-5:event_with_roles': 'NoMessage',
  '-5:request_metho

In [234]:
X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.3, random_state=42, shuffle=True)

In [235]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=200, 
    #all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 1.07 s, sys: 49.2 ms, total: 1.12 s
Wall time: 1.32 s


In [236]:
def flatten(xss): 
    return [x for xs in xss for x in xs]

def evaluate(model, x, y_true): 
    y_pred = model.predict(x)
    print(metrics.flat_f1_score(y_true, y_pred, average='macro', labels = model.classes_))
    print(metrics.flat_classification_report(y_true, y_pred, model.classes_))
    print(multilabel_confusion_matrix(flatten(y_true), flatten(y_pred)))
    
evaluate(crf, X_test, y_test)
evaluate(crf, test_features, test_labels) 

1.0
                  precision    recall  f1-score   support

  position_start       1.00      1.00      1.00       138
position_between       1.00      1.00      1.00      6596
    position_end       1.00      1.00      1.00       121

        accuracy                           1.00      6855
       macro avg       1.00      1.00      1.00      6855
    weighted avg       1.00      1.00      1.00      6855

[[[ 259    0]
  [   0 6596]]

 [[6734    0]
  [   0  121]]

 [[6717    0]
  [   0  138]]]
0.3245144389491864
                  precision    recall  f1-score   support

  position_start       0.02      0.08      0.03       931
position_between       0.97      0.87      0.91     53683
    position_end       0.02      0.07      0.03       931

        accuracy                           0.84     55545
       macro avg       0.33      0.34      0.32     55545
    weighted avg       0.93      0.84      0.88     55545

[[[  262  1600]
  [ 7152 46531]]

 [[50974  3640]
  [  868    63]]

 

Optimize

In [237]:
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer



# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='macro', labels=np.unique(test_labels))

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [238]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.7628286625214997, 'c2': 0.014420346241095425}
best CV score: 1.0
model size: 0.01M


In [239]:
crf = rs.best_estimator_
evaluate(crf, X_test, y_test)
evaluate(crf, test_features, test_labels)

1.0
                  precision    recall  f1-score   support

  position_start       1.00      1.00      1.00       138
position_between       1.00      1.00      1.00      6596
    position_end       1.00      1.00      1.00       121

        accuracy                           1.00      6855
       macro avg       1.00      1.00      1.00      6855
    weighted avg       1.00      1.00      1.00      6855

[[[ 259    0]
  [   0 6596]]

 [[6734    0]
  [   0  121]]

 [[6717    0]
  [   0  138]]]
0.32275653603430093
                  precision    recall  f1-score   support

  position_start       0.02      0.07      0.03       931
position_between       0.97      0.87      0.91     53683
    position_end       0.02      0.07      0.03       931

        accuracy                           0.84     55545
       macro avg       0.33      0.33      0.32     55545
    weighted avg       0.93      0.84      0.88     55545

[[[  250  1612]
  [ 7156 46527]]

 [[50974  3640]
  [  868    63]]



In [240]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common())


Top likely transitions:
position_between -> position_between 6.206357
position_between -> position_end 2.714864
position_end -> position_between -0.726853


In [241]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
3.227225 position_start -1:event_with_roles:NoMessage
3.227225 position_start -1:request_method_call:NoMessage
3.227225 position_start -1:selective_filter_data:NoMessage
3.227225 position_start -1:origin_selective_filter_data:NoMessage
2.195751 position_end +1:event_with_roles:NoMessage
2.195751 position_end +1:request_method_call:NoMessage
2.195751 position_end +1:selective_filter_data:NoMessage
2.195751 position_end +1:origin_selective_filter_data:NoMessage
0.035105 position_between +3:selective_filter_data:IsNumber
0.034442 position_between +4:request_method_call:execute_kw
0.010474 position_between -1:request_method_call:version
0.010474 position_between -1:selective_filter_data:version
0.005720 position_between 0:request_method_call:server_version
-0.132205 position_between 0:event_with_roles:Odoo Application->db Server/Mail Server: [PgsqlRequest:Simple query:UPDATE:['fetchmail_server']]

Top negative:
3.227225 position_start -1:event_with_roles:NoMessage
3.227225 po