In [1]:
# Check out additional features for extraction

In [2]:
import ast

import pandas as pd

%load_ext autoreload

In [3]:
pd.set_option('display.max_colwidth', None)

In [4]:
df_features = pd.read_csv('../../data_v3/ptp_il_tagged.csv')
df_test = pd.read_csv('../../data_v3/ptp_emitted_events_test.csv')

In [5]:
df_gt = pd.read_csv("../../data_v3/ptp_ground_truth.csv")

In [6]:
# HR data in data/Train/R1 is missing frame.number. We take another (already filtered) dataset and apply our feature extraction to this one
df_train_in = pd.read_csv('../../data/VALID/R1/R1.csv', converters={"MessageAttributes": ast.literal_eval})
# This is the Interleaved Data Set for our pipeline
df_il_in = pd.read_csv('../../data/PTP-INTERLEAVED/R1/R1.csv', converters={"MessageAttributes": ast.literal_eval})

In [7]:
from event_loop.preprocessing.dataframe import pre_process

df_train_pp = pre_process(df_train_in)
df_il_pp = pre_process(df_il_in)

In [8]:
def mark_start_end(df):
    # Mark start event of each BusinessActivity Instance
    df["activityStart"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount() == 0
    # Mark end event of each Business Activity Instance
    df["activityEnd"] = df.groupby(["BusinessActivity", "InstanceNumber", ]).cumcount(ascending=False) == 0
    # Merge start and end columns to form labels
    df["ActivityAction"] = df.apply(lambda row: "Activity Start" if row["activityStart"] else (
        "Activity End" if row["activityEnd"] else 'NoAction'), axis=1)

    return df.drop(["activityStart", 'activityEnd'], axis=1)

df_train_pp = mark_start_end(df_train_pp)

In [9]:
df_train_pp["pgsql.query_0"] = df_train_pp["pgsql.query"].str.split(" ").str[0]
df_il_pp["pgsql.query_0"]= df_il_pp["pgsql.query"].str.split(" ").str[0]

In [10]:
from event_loop.preprocessing.event_attributes import *

%autoreload 2


df_train_pp["sale_order_id"] = df_train_pp.apply(lambda x: extract_sale_order_id(x["pgsql.query"], x["pgsql.query_0"], x["pgsql.target"], x["file_data"], x["selective_file_data"], x["origin_file_data"]), axis=1)
df_train_pp["sale_order_line_id"] = df_train_pp.apply(lambda x: extract_sale_order_line_id(x["pgsql.query"], x["pgsql.query_0"], x["pgsql.target"], x["file_data"], x["selective_file_data"], x["origin_file_data"]), axis=1)
df_train_pp["purchase_requisition_id"] = df_train_pp.apply(lambda x: extract_purchase_requisition_id(x["pgsql.query"], x["pgsql.query_0"], x["pgsql.target"], x["file_data"], x["selective_file_data"], x["origin_file_data"]), axis=1)
df_train_pp["purchase_requisition_line_id"] = df_train_pp.apply(lambda x: extract_purchase_requisition_line_id(x["pgsql.query"], x["pgsql.query_0"], x["pgsql.target"], x["file_data"], x["selective_file_data"], x["origin_file_data"]), axis=1)
df_train_pp["purchase_order_id"] = df_train_pp.apply(lambda x: extract_purchase_order_id(x["pgsql.query"], x["pgsql.query_0"], x["pgsql.target"], x["file_data"], x["selective_file_data"], x["origin_file_data"]), axis=1)
df_train_pp["purchase_order_line_id"] = df_train_pp.apply(lambda x: extract_purchase_order_line_id(x["pgsql.query"], x["pgsql.query_0"], x["pgsql.target"], x["file_data"], x["selective_file_data"], x["origin_file_data"]), axis=1)

In [11]:
df_train_pp["file_data_str"] = df_train_pp["file_data"].apply(lambda x: ", ".join(x))

In [12]:
q = "58"

#cond1 = df_train_pp[['sale_order_id', 'sale_order_line_id']].isnull().all(axis=1) 
cond2 = (df_train_pp["file_data_str"].str.contains(q))| (df_train_pp["pgsql.query"].str.contains(q))
cond3 = df_train_pp["InstanceNumber"] == 1
cond4 = df_train_pp["BusinessActivity"] == "ReceiveGoods"

df_train_pp[cond3 & cond4][["InstanceNumber","pgsql.query","file_data","origin_file_data","pgsql.target","sale_order_id","sale_order_line_id","purchase_requisition_id","purchase_requisition_line_id"]] 

Unnamed: 0,InstanceNumber,pgsql.query,file_data,origin_file_data,pgsql.target,sale_order_id,sale_order_line_id,purchase_requisition_id,purchase_requisition_line_id
2797505,1,,[],,,,,,
2797560,1,,"[server_version, 12.0-20190820, server_version_info, 12, 0, 0, final, 0, server_serie, 12.0, protocol_version, 1]",version,,,,,
2797564,1,,"[odoo01, user.suername@company.com, PWD1234]",,,,,,
2797666,1,"INSERT INTO ""res_users_log"" (""id"", ""create_uid"", ""create_date"", ""write_uid"", ""write_date"") VALUES (nextval('res_users_log_id_seq'), 2, (now() at time zone 'UTC'), 2, (now() at time zone 'UTC')) RETURNING id",,,res_users_log,,,,
2797736,1,,[2],,,,,,
2797740,1,,"[odoo01, 2, PWD1234, purchase.order, search_read, id, =, 152, fields, picking_ids, limit, 1]",,,,,,
2797945,1,,"[id, 152, picking_ids, 55]",purchase.order_search_read,,,,,
2797949,1,,"[odoo01, 2, PWD1234, stock.picking, search_read, id, =, 55, fields, move_ids_without_package, limit, 1]",,,,,,
2798250,1,,"[id, 55, move_ids_without_package, 65]",stock.picking_search_read,,,,,
2798254,1,,"[odoo01, 2, PWD1234, stock.move, search_read, id, =, 65, fields, move_line_ids, limit, 1]",,,,,,


In [18]:
unique_no_nan = lambda x: list(filter(None, pd.unique(x)))

df_train_pp.groupby(["InstanceNumber"]).agg(
    {"sale_order_id": unique_no_nan, "sale_order_line_id": unique_no_nan, "purchase_requisition_id": unique_no_nan,
     "purchase_requisition_line_id": unique_no_nan, "purchase_order_id": unique_no_nan,
     'purchase_order_line_id': unique_no_nan})

Unnamed: 0_level_0,sale_order_id,sale_order_line_id,purchase_requisition_id,purchase_requisition_line_id,purchase_order_id,purchase_order_line_id
InstanceNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,[94],[118],[58],[58],"[152, 152, 153, 153, 154, 154, 153, 154]","[163, 164, 165, 164, 165]"
2,[95],[119],[59],[59],"[155, 155, 156, 156]","[166, 167]"
3,[96],[120],[60],[60],"[157, 157, 158, 158]","[168, 169]"
4,[97],[121],[61],[61],"[159, 159, 160, 160]","[170, 171]"
5,[98],[122],[62],[62],"[161, 161, 162, 162, 163, 163, 161, 162]","[172, 173, 174, 172, 173]"
...,...,...,...,...,...,...
63,[156],[180],[120],[120],"[346, 346, 347, 347]","[357, 358]"
64,[157],[181],[121],[121],"[348, 348, 349, 349, 350, 350, 351, 351, 349, 350, 351]","[359, 360, 361, 362, 360, 361, 362]"
65,[158],[182],[122],[122],"[352, 352, 353, 353, 354, 354]","[363, 364, 365]"
66,[159],[183],[123],[123],"[355, 355, 356, 356, 357, 357, 358, 358]","[366, 367, 368, 369]"


In [14]:
unique_no_nan = lambda x: list(filter(None, pd.unique(x)))

df_train_pp.groupby(["InstanceNumber","BusinessActivity"]).agg(
    {"sale_order_id": unique_no_nan, "sale_order_line_id": unique_no_nan, "purchase_requisition_id": unique_no_nan,
     "purchase_requisition_line_id": unique_no_nan, "purchase_order_id": unique_no_nan,
     'purchase_order_line_id': unique_no_nan})

Unnamed: 0_level_0,Unnamed: 1_level_0,sale_order_id,sale_order_line_id,purchase_requisition_id,purchase_requisition_line_id,purchase_order_id,purchase_order_line_id
InstanceNumber,BusinessActivity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,BidSelection,[],[],[58],[],[],[]
1,CreateCallForTender,[94],[],[58],[58],[],[]
1,CreatePurchaseOrder,[94],[118],[58],[],"[152, 152, 153, 154, 154, 153]","[163, 165, 164, 164, 165]"
1,CreatePurchaseRequest,[94],[118],[],[],[],[]
1,CreateRfq,[94],[],[58],[],"[152, 152, 153, 153, 154, 154]","[163, 164, 165]"
...,...,...,...,...,...,...,...
67,CreatePurchaseOrder,[160],[184],[124],[],"[360, 360, 359]","[371, 370]"
67,CreatePurchaseRequest,[160],[184],[],[],[],[]
67,CreateRfq,[160],[],[124],[],"[359, 359, 360, 360]","[370, 371]"
67,ReceiveGoods,[],[],[],[],[360],[371]


In [16]:
unique_no_nan = lambda x: list(filter(None, pd.unique(x)))

df_il_pp.groupby(["InstanceNumber", "BusinessActivity"]).agg(
    {"sale_order_id": unique_no_nan, "sale_order_line_id": unique_no_nan, "purchase_requisition_id": unique_no_nan,
     "purchase_requisition_line_id": unique_no_nan, "purchase_order_id": unique_no_nan,
     'purchase_order_line_id': unique_no_nan})

KeyError: "Column(s) ['purchase_order_id', 'purchase_order_line_id', 'purchase_requisition_id', 'purchase_requisition_line_id', 'sale_order_id', 'sale_order_line_id'] do not exist"

In [276]:
df_gt

Unnamed: 0,activity_name,start,end,actual_end,overlapping_activities,bp_id,overlapping_bps,classification,Multi Class Classification,Single Class Classification,probability
0,CreatePurchaseRequest,96,1367,1322,[],399,[],,,,
1,CreateCallForTender,1367,1940,1887,[],399,[],,,,
2,CreatePurchaseRequest,1940,2818,2793,[],400,[],,,,
3,CreateRfq,2818,6289,15871,[],399,[],,,,
4,CreateCallForTender,5563,6178,5965,[],400,[],,,,
...,...,...,...,...,...,...,...,...,...,...,...
58,SubmitPayment,100724,104625,104454,[],406,[],,,,
59,CreatePurchaseOrder,101210,105998,105925,[],408,[],,,,
60,SubmitPayment,106266,108727,108703,[],407,[],,,,
61,ReceiveGoods,108727,109758,109696,[],408,[],,,,
