# Introduction
This notebook contains the code enabling transformation of an OCEL 2.0 log file to tEKG in batch mode.

In [1]:
import pm4py
import pandas as pd
import numpy as np
import time

from datetime import datetime
from itertools import product

# Setup 
This section sets the initial variables and sets the environment.

In [2]:
# The notebook is set to transform the running example. 
# To transform other logs, you only need to set the log name properly using the experiment_name variable. It works if the log is in jsonocel format. 
# If your log is in another format, please refine the file_path variable. In this case, you also need to modify the pm4py.read.read_ocel2_json function in the next block to use the correct PM4Py function to read the correct format.

experiment_name = 'runningExample-course'
file_path = './ocel2/'+experiment_name+'.jsonocel'

experiment_path = './experiments/batch_'+experiment_name+'.json'

In [3]:
ocel = pm4py.read.read_ocel2_json(file_path)

In [4]:
# legend
# -------------------------------------------------------------------------------------------------
# Any number corresponds to the algorithm 1 line number in the paper
# The section within [] is within the loop given by the number before the open bracket.
# The section within () is within the if statement given by the number before the open parenthesis.

lbl_log = 'LOG'
lbl_class = 'CLASS'
lbl_event = 'EVENT'
lbl_entity =  'ENTITY'
lbl_snapshot = 'SNAPSHOT'
lbl_derived = 'DERIVED'
lbl_has = 'HAS'
lbl_observed = 'OBSERVED'
lbl_rel = 'REL'
lbl_corr='CORR'
lbl_df = 'DF'

lbl_meta_node_log = 'node:Log'  # lines 4-6
lbl_meta_node_class = 'node:Class' # lines 7-9
lbl_meta_node_event = 'node:Event' # lines 10[11-12]
lbl_meta_node_entity = 'node:Entity' # lines 15[16-17]
lbl_meta_node_snapshot = 'node:Snapshot' # lines 15[18,19[20-21]]

# lbl_meta_node_reified = 'node:Reified' # lines 34[35-37] includes the two following
lbl_meta_node_entity_reified = 'node:Reified_Entity' 
lbl_meta_node_snapshot_reified = 'node:Reified_Snapshot'

lbl_meta_rel_log_has_event = 'rel:has' #lines 10[13]
lbl_meta_rel_event_observed_class = 'rel:observed' #lines 10[14]
lbl_meta_rel_entity_snapshot_snapshot = 'rel:snapshot'  # lines 15[18,19[22]]
lbl_meta_rel_snapshot_rel_update_snapshot = 'rel:rel:SnapshotUpdate'  # lines 15[18,23[24(25)]]
lbl_meta_rel_entity_rel_entity = 'rel:rel:Entity' # lines 27[28]
lbl_meta_rel_snapshot_rel_snapshot = 'rel:rel:Snapshot' # lines 27[29[30,31[32(33)]]]

lbl_meta_rel_derived = 'rel:derived' # lines 34[38-39]


lbl_meta_rel_event_corr = 'rel:corr' # 
lbl_meta_rel_event_corr_entity = 'rel:corr:Entity' # lines 40[41]
lbl_meta_rel_event_corr_entity_reified = 'rel:corr:ReifiedEntity' # lines 40[42[43]]

lbl_meta_rel_event_corr_snapshot  = 'rel:corr:Snapshot' # lines 40[44,45[46(47)]]
lbl_meta_rel_event_corr_snapshot_reified  ='rel:corr:ReifiedSnapshot' # lines 40[44,45[46(48[49])]]

lbl_meta_rel_event_df_entity_event  ='rel_Event-df[entity]->Event'
lbl_meta_rel_event_df_snapshot_event  ='rel_Event-df[snapshot]->Event'
lbl_meta_rel_event_df_event='rel:df' # line 50

In [5]:
def processData(func, __element_label, __element_type=None, __filter=None):
    global meta_time, meta_data, df_data
    
    if __element_type==None:
        __element_type = __element_label

    start_time = time.time()
    #body (develop)
    df_tmp = func(__element_label, __element_type, __filter)
    # end
    elapsed_time = time.time() - start_time
    meta_time[__element_type] = elapsed_time

    #footer (fixed)
    df_tmp['__label']=__element_label
    df_tmp['__filter_type'] = __element_type
    
    meta_data[__element_type] = df_tmp.columns

    if df_data is None:
        df_data = df_tmp.drop_duplicates()
    else:
        df_data = pd.concat([df_data, df_tmp], axis=0, ignore_index=True)

In [6]:
meta_data = {}
meta_time = {}

df_data = None

test_mode = False

In [7]:
def test(lbl):
    if test_mode and (lbl in meta_data.keys()):
        return df_data[df_data.__filter_type==lbl][meta_data[lbl]]
    else:
        return None

# Log node
This section adds a node to tEKG for the log.

In [8]:
def f(__element_label, __element_type, __filter=None):
    return pd.DataFrame(['log name'], columns=['ID'])

processData(f, lbl_log, lbl_meta_node_log)

In [9]:
test(lbl_meta_node_log)

# Class nodes
This section adds nodes to tEKG for classes representing event types in OCEL.

In [10]:
def f(__element_label, __element_type, __filter=None):
    df_tmp = pd.DataFrame(ocel.events[ocel.event_activity].unique(), columns=['ID'])
    return df_tmp

processData(f, lbl_class , lbl_meta_node_class)

In [11]:
test(lbl_meta_node_class)

# Event nodes
This section adds nodes to tEKG for events corresponding to events in OCEL.

In [12]:
def f(__element_label, __element_type, __filter=None):
    df_tmp = ocel.events
    df_tmp = df_tmp.rename(columns={ocel.event_id_column: "EventID", ocel.event_timestamp: "timestamp", ocel.event_activity: "Activity"}, errors="raise")
    return df_tmp

processData(f, lbl_event, lbl_meta_node_event)

In [13]:
test(lbl_meta_node_event)

# Entity nodes
This section adds nodes to tEKG for entities corresponding to objects in OCEL.

In [14]:
def f(__element_label, __element_type, __filter=None):
    df_tmp = ocel.objects
    df_tmp = df_tmp.rename(columns={ocel.object_id_column: "ID", ocel.object_type_column: "EntityType"}, errors="raise")
    return df_tmp

processData(f, lbl_entity, lbl_meta_node_entity) 

In [15]:
test(lbl_meta_node_entity)

# Snapshot nodes
This section adds nodes to tEKG for snapshots that materialize OCEL objects when their value has changed.

In [16]:
def f(__element_label, __element_type, __filter=None):
    df_init = df_data[df_data['__filter_type']==__filter][meta_data[__filter]]
    df_init['timestamp'] = datetime(1970, 1, 1, 0, 0, 0, tzinfo=pd.Timestamp.utcnow().tzinfo)
    
    df_updates = ocel.object_changes.rename(columns={ocel.object_id_column: "ID", ocel.object_type_column: "EntityType", ocel.event_timestamp: "timestamp"}, errors="raise")
    
    df_tmp = pd.concat([df_init,df_updates], ignore_index=True).sort_values(['ID', 'timestamp'])
    df_tmp = df_tmp.groupby('ID').apply(lambda group: group.ffill())
    df_tmp = df_tmp.drop(columns=[ocel.changed_field])
    df_tmp['ENTITY_ID'] = df_tmp['ID']
    df_tmp['ID'] =  '(' + df_tmp['ID'] + ',' + df_tmp['timestamp'].astype('str') + ')' # setting snapshots id
    df_tmp = df_tmp.reset_index(drop=True)
    return df_tmp

processData(f, lbl_snapshot, lbl_meta_node_snapshot, lbl_meta_node_entity) 

  df_tmp = df_tmp.groupby('ID').apply(lambda group: group.ffill())
  df_tmp = df_tmp.groupby('ID').apply(lambda group: group.ffill())


In [17]:
test(lbl_meta_node_snapshot)

# has edges
This section adds edges labeled "has" to tEKG to connect the Log node to Event nodes.

In [18]:
def f(__element_label, __element_type, __filter=None):
    log_indx = df_data[df_data['__filter_type']==__filter[0]].index
    entity_indx = df_data[df_data['__filter_type']==__filter[1]].index
    df_tmp = pd.DataFrame(list(product(log_indx, entity_indx)), columns=[":START_ID",":END_ID"])
    return df_tmp

processData(f, lbl_has, lbl_meta_rel_log_has_event, (lbl_meta_node_log, lbl_meta_node_event))

In [19]:
test(lbl_meta_rel_log_has_event)

# observed edges
This section adds edges labeled "observed" to tEKG to connect Event nodes to Class nodes.

In [20]:
def f(__element_label, __element_type, __filter=None):
    df_events = df_data[df_data['__filter_type']==__filter[0]][['Activity']].reset_index()
    df_classes = df_data[df_data['__filter_type']==__filter[1]][['ID']].reset_index()
    df_tmp = df_classes.merge(df_events, left_on='ID', right_on='Activity')[['index_x', 'index_y']].rename(columns={'index_y':":START_ID", 'index_x':":END_ID"})
    return df_tmp

processData(f, lbl_observed, lbl_meta_rel_event_observed_class, (lbl_meta_node_event, lbl_meta_node_class))

In [21]:
test(lbl_meta_rel_event_observed_class)

# rel edges (Entity2Entity)
This section adds edges labeled "rel" to tEKG to connect Entity nodes to Entity nodes.

In [22]:
def f(__element_label, __element_type, __filter=None):
    df1 = df_data[df_data['__filter_type']==__filter][['ID', 'EntityType']].reset_index().rename(columns={'ID':'source_ID', 'EntityType':'source_EntityType'})
    df2 = df_data[df_data['__filter_type']==__filter][['ID', 'EntityType']].reset_index().rename(columns={'ID':'target_ID', 'EntityType':'target_EntityType'})
    
    df_tmp = df1\
        .merge(ocel.o2o, left_on='source_ID', right_on=ocel.object_id_column)[['index', 'source_ID', 'source_EntityType', 'ocel:qualifier', 'ocel:oid_2']].rename(columns={'index': ':START_ID', ocel.object_id_column+'_2':'target_ID'})\
        .merge(df2, on='target_ID').rename(columns={'index': ':END_ID'})
    
    df_tmp['EntityType'] = '(' + df_tmp['source_EntityType'] + ',' + df_tmp['target_EntityType'] + ')'
    df_tmp = df_tmp.rename(columns={'ocel:qualifier':'qual'})
    return df_tmp

processData(f, lbl_rel, lbl_meta_rel_entity_rel_entity, lbl_meta_node_entity)

In [23]:
test(lbl_meta_rel_entity_rel_entity)

# snapshot edges
This section adds edges labeled "snapshot" to tEKG to connect Entity nodes to Snapshot nodes.

In [24]:
def f(__element_label, __element_type, __filter=None):
    df_entity = df_data[df_data['__filter_type']==__filter[0]][list(meta_data[__filter[0]])].reset_index()
    df_entity_instance = df_data[df_data['__filter_type']==__filter[1]][list(meta_data[__filter[1]])].reset_index()
    df_tmp = df_entity_instance.merge(df_entity, left_on=['EntityType', 'ENTITY_ID'], right_on=['EntityType', 'ID'])[['index_x', 'index_y', 'timestamp', 'ENTITY_ID', 'EntityType']].rename(columns={'index_y':':START_ID', 'index_x':':END_ID', 'ENTITY_ID': 'source_ID', 'timestamp': 'source_timestamp'})
    df_tmp['target_ID'] = df_tmp['source_ID']
    return df_tmp

processData(f, lbl_snapshot, lbl_meta_rel_entity_snapshot_snapshot, (lbl_meta_node_entity, lbl_meta_node_snapshot))

In [25]:
test(lbl_meta_rel_entity_snapshot_snapshot)

# rel edges (update)
This section adds edges labeled "rel" to tEKG to connect Snapshot nodes to Snapshot nodes. These edges show the lifecycle of an object through time, during which the value of the object's properties changed over time. 

In [26]:
def f(__element_label, __element_type, __filter=None):
    df_tmp = df_data[df_data['__filter_type']==__filter][list(meta_data[__filter])].reset_index().sort_values(by=['ENTITY_ID', 'timestamp'])
    df_tmp['previous_index'] = df_tmp.groupby('ENTITY_ID')['index'].shift()
    df_tmp['source_timestamp'] = df_tmp.groupby('ENTITY_ID')['timestamp'].shift()
    df_tmp = df_tmp[['previous_index', 'index', 'timestamp', 'source_timestamp', 'ENTITY_ID', 'EntityType']].rename(columns={'previous_index': ':START_ID', 'index': ':END_ID', 'timestamp':'target_timestamp'})
    df_tmp = df_tmp[~df_tmp[':START_ID'].isna()]
    df_tmp['qual'] = 'update'

    df_tmp['source_ID'] =  '(' + df_tmp['ENTITY_ID'] + ',' + df_tmp['source_timestamp'].astype('str') + ')' # setting snapshots id
    df_tmp['target_ID'] =  '(' + df_tmp['ENTITY_ID'] + ',' + df_tmp['target_timestamp'].astype('str') + ')' # setting snapshots id
    # df_tmp = df_tmp.rename(columns={'ID': 'source_ID'})
    # df_tmp['target_ID'] = df_tmp['source_ID']
    df_tmp['source_EntityType'] = df_tmp['target_EntityType'] = df_tmp['EntityType']
    return df_tmp

processData(f, lbl_rel, lbl_meta_rel_snapshot_rel_update_snapshot, lbl_meta_node_snapshot)

In [27]:
test(lbl_meta_rel_snapshot_rel_update_snapshot)

# rel edges (Snapshot2Snapshot)
This section adds edges labeled "rel" to tEKG to connect Snapshot nodes to Snapshot nodes.

In [28]:
def f(__element_label, __element_type, __filter=None):
    df_tmp_o2o_rel = df_data[df_data['__filter_type']==__filter[0]][list(meta_data[__filter[0]])][[':START_ID', ':END_ID', 'qual']].rename(columns={':START_ID': 'o1', ':END_ID': 'o2'})
    df_tmp_o1_snapshot_rel = df_data[df_data['__filter_type']==__filter[1]][list(meta_data[__filter[1]])][[':START_ID', ':END_ID']].rename(columns={':START_ID': 'o1', ':END_ID': 'oi1'})
    df_tmp_o2_snapshot_rel = df_data[df_data['__filter_type']==__filter[1]][list(meta_data[__filter[1]])][[':START_ID', ':END_ID']].rename(columns={':START_ID': 'o2', ':END_ID': 'oi2'})
    
    df_tmp_o1 = df_data[df_data['__filter_type']==__filter[2]][list(meta_data[__filter[2]])]
    df_tmp_o1['oi1'] = df_tmp_o1.index
    df_tmp_o1 = df_tmp_o1[['oi1', 'timestamp', 'ID', 'EntityType']].rename(columns={'ID':'source_ID', 'EntityType':'source_EntityType'})
    
    df_tmp_o2 = df_data[df_data['__filter_type']==__filter[2]][list(meta_data[__filter[2]])]
    df_tmp_o2['oi2'] = df_tmp_o1.index
    df_tmp_o2 = df_tmp_o2[['oi2', 'timestamp', 'ID', 'EntityType']].rename(columns={'ID':'target_ID', 'EntityType':'target_EntityType'})
    
    
    df_tmp = df_tmp_o1_snapshot_rel.merge(df_tmp_o2o_rel, on='o1').merge(df_tmp_o1, on='oi1').merge(df_tmp_o2_snapshot_rel, on='o2').merge(df_tmp_o2, on='oi2').rename(columns={'timestamp_x':'timestamp_oi1', 'timestamp_y':'timestamp_oi2'})
    df_tmp = df_tmp[df_tmp['timestamp_oi1']>=df_tmp['timestamp_oi2']].sort_values(by=['oi1', 'o1', 'o2', 'timestamp_oi2'], ascending=False).groupby(['oi1', 'o1', 'o2']).first().reset_index()
    
    df_tmp = df_tmp.rename(columns={'oi1':':START_ID', 'oi2':':END_ID', 'timestamp_oi1':'source_timestamp', 'timestamp_oi2':'target_timestamp'})[[':START_ID', ':END_ID', 'qual', 'source_ID', 'source_timestamp', 'target_ID', 'target_timestamp', 'source_EntityType', 'target_EntityType']]
    
    
    df_tmp['EntityType'] = '(' + df_tmp['source_EntityType'] + ',' + df_tmp['target_EntityType'] + ')'
    return df_tmp

processData(f, lbl_rel, lbl_meta_rel_snapshot_rel_snapshot, (lbl_meta_rel_entity_rel_entity, lbl_meta_rel_entity_snapshot_snapshot, lbl_meta_node_snapshot))

In [29]:
test(lbl_meta_rel_snapshot_rel_snapshot)

# Entity nodes (reified)
This section adds nodes labeled "Entity" to tEKG for the reified ones.

In [30]:
def f(__element_label, __element_type, __filter=None):
    df_tmp = df_data[df_data['__filter_type']==__filter][meta_data[__filter]]
    df_tmp['ID'] = '(' + df_tmp['source_ID'] + ',' + df_tmp['target_ID'] + ')'
    df_tmp['EntityType'] = '(' + df_tmp['source_EntityType'] + ',' + df_tmp['target_EntityType'] + ')'
    df_tmp = df_tmp.rename(columns={'ocel:qualifier':'qual', ':START_ID':'__source_idx', ':END_ID': '__target_idx'})
    return df_tmp

processData(f, lbl_entity , lbl_meta_node_entity_reified, lbl_meta_rel_entity_rel_entity)

In [31]:
test(lbl_meta_node_entity_reified)

# Snapshot nodes (reified)
This section adds nodes labeled "Snapshot" to tEKG for the reified ones.

In [32]:
def f(__element_label, __element_type, __filter=None):
    cols = list(set(meta_data[__filter[0]]) or set(meta_data[__filter[1]]))
    df_tmp = df_data[df_data['__filter_type'].isin([__filter[0], __filter[1]])][cols].rename(columns={':START_ID':'__source_idx', ':END_ID':'__target_idx'}).reset_index(drop=True)
    df_tmp['ID'] = '(' + df_tmp['source_ID'] + ',' + df_tmp['target_ID'] + ')'
    df_tmp = df_tmp.rename(columns={'ocel:qualifier':'qual'})
    return df_tmp

processData(f, lbl_snapshot , lbl_meta_node_snapshot_reified , (lbl_meta_rel_snapshot_rel_update_snapshot , lbl_meta_rel_snapshot_rel_snapshot))

In [33]:
test(lbl_meta_node_snapshot_reified )

# derived edges
This section adds edges labeled "derived" to tEKG to connect reified nodes (either Snapshot or Entity) to their corresponding Snapshot or Entity nodes.

In [34]:
def f(__element_label, __element_type, __filter=None):
    df_tmp = df_data[df_data['__filter_type'].isin([__filter[0], __filter[1]])][['__source_idx', '__target_idx']]
    df_tmp['index'] = df_tmp.index
    df_tmp = pd.concat([df_tmp[['index', '__source_idx']].rename(columns={'index':':START_ID', '__source_idx':':END_ID'}), df_tmp[['index', '__target_idx']].rename(columns={'index':':START_ID', '__target_idx':':END_ID'})], ignore_index=True).drop_duplicates().reset_index(drop=True)
    df_tmp = df_tmp.rename(columns={'ocel:qualifier':'qual'})
    return df_tmp

processData(f, lbl_derived , lbl_meta_rel_derived, (lbl_meta_node_entity_reified, lbl_meta_node_snapshot_reified ))

In [35]:
test(lbl_meta_rel_derived)

# corr edges
This section adds edges labeled "corr" to tEKG. 

## corr (Entity)
This sub-section adds edges labeled "corr" to tEKG to connect Event nodes to Entity nodes.

In [36]:
def f(__element_label, __element_type, __filter=None):
    df_e2o = ocel.relations

    df_events = df_data[df_data['__filter_type']==__filter[0]][meta_data[__filter[0]]]
    df_events['event_indx'] = df_events.index
    df_e2o = df_e2o.merge(df_events, left_on='ocel:eid', right_on='EventID')[['event_indx', 'ocel:eid', 'ocel:oid', 'ocel:timestamp']].drop_duplicates()
    
    df_ent_ins = df_data[df_data['__filter_type']==__filter[1]][meta_data[__filter[1]]]
    df_ent_ins['ent_indx'] = df_ent_ins.index
    
    df_tmp = df_e2o.merge(df_ent_ins, left_on='ocel:oid', right_on='ID')[['event_indx', 'ent_indx', 'ocel:timestamp', 'EntityType', 'ID']].drop_duplicates()
    df_tmp = df_tmp.sort_values(by=['event_indx', 'ent_indx', 'ocel:timestamp']) # 'ocel:timestamp' is the event time
    df_tmp = df_tmp.groupby(['event_indx', 'ID', 'ocel:timestamp']).last().reset_index()[['event_indx', 'ent_indx', 'EntityType', 'ID']].drop_duplicates()
    df_tmp = df_tmp.rename(columns={'event_indx': ':START_ID', 'ent_indx':':END_ID'})
    
    df_tmp['source_ID'] = df_tmp['target_ID'] = df_tmp['ID']
    return df_tmp

processData(f, lbl_corr , lbl_meta_rel_event_corr_entity, (lbl_meta_node_event , lbl_meta_node_entity ))

In [37]:
test(lbl_meta_rel_event_corr_entity)

## corr (reified Entity)
This sub-section adds edges labeled "corr" to tEKG to connect Event nodes to reified Entity nodes.

In [38]:
def f(__element_label, __element_type, __filter=None):
    df_reified = df_data[df_data['__filter_type']==__filter[0]][meta_data[__filter[0]]]
    df_reified['rei_indx'] = df_reified.index
    df_reified = df_reified
    
    df_corr = df_data[df_data['__filter_type']==__filter[1]][meta_data[__filter[1]]]
    df_corr = df_corr[[':START_ID', ':END_ID']]
    
    df1 = df_corr.merge(df_reified, left_on=':END_ID', right_on='__source_idx')[[':START_ID', 'rei_indx', 'EntityType', 'ID', 'source_ID', 'target_ID']].drop_duplicates()
    df2 = df_corr.merge(df_reified, left_on=':END_ID', right_on='__target_idx')[[':START_ID', 'rei_indx', 'EntityType', 'ID', 'source_ID', 'target_ID']].drop_duplicates()
    
    df_tmp = pd.concat([df1, df2], ignore_index=True).drop_duplicates()
    
    df_tmp = df_tmp.rename(columns={'rei_indx':':END_ID'})
    return df_tmp

processData(f, lbl_corr , lbl_meta_rel_event_corr_entity_reified, (lbl_meta_node_entity_reified, lbl_meta_rel_event_corr_entity))

In [39]:
test(lbl_meta_rel_event_corr_entity_reified)

## corr (Snapshot)
This sub-section adds edges labeled "corr" to tEKG to connect Event nodes to Snapshot nodes.

In [40]:
def f(__element_label, __element_type, __filter=None):
    df_e2o = ocel.relations

    df_events = df_data[df_data['__filter_type']==__filter[0]][meta_data[__filter[0]]]
    df_events['event_indx'] = df_events.index
    df_e2o = df_e2o.merge(df_events, left_on='ocel:eid', right_on='EventID')[['event_indx', 'ocel:eid', 'ocel:oid', 'ocel:timestamp']].drop_duplicates()
    
    df_ent_ins = df_data[df_data['__filter_type']==__filter[1]][meta_data[__filter[1]]]
    df_ent_ins['ent_indx'] = df_ent_ins.index
    
    df_tmp = df_e2o.merge(df_ent_ins, left_on='ocel:oid', right_on='ENTITY_ID')[['event_indx', 'ent_indx', 'ocel:timestamp', 'timestamp', 'EntityType', 'ENTITY_ID', 'ID']].drop_duplicates()
    df_tmp = df_tmp[df_tmp['ocel:timestamp']>=df_tmp['timestamp']].sort_values(by=['event_indx', 'ent_indx', 'ocel:timestamp', 'timestamp']) # 'ocel:timestamp' is the event time
    df_tmp = df_tmp.groupby(['event_indx', 'ENTITY_ID', 'ocel:timestamp']).last().reset_index()[['event_indx', 'ent_indx', 'EntityType', 'ID']].drop_duplicates()
    df_tmp = df_tmp.rename(columns={'event_indx': ':START_ID', 'ent_indx':':END_ID'})
    
    df_tmp['source_ID'] = df_tmp['target_ID'] = df_tmp['ID']
    return df_tmp

processData(f, lbl_corr , lbl_meta_rel_event_corr_snapshot, (lbl_meta_node_event , lbl_meta_node_snapshot))

In [41]:
test(lbl_meta_rel_event_corr_snapshot)

## corr (reified Snapshot)
This sub-section adds edges labeled "corr" to tEKG to connect Event nodes to reified Snapshot nodes.

In [42]:
def f(__element_label, __element_type, __filter=None):
    df_reified = df_data[df_data['__filter_type']==__filter[0]][meta_data[__filter[0]]]
    df_reified['rei_indx'] = df_reified.index
    df_reified = df_reified
    
    df_corr = df_data[df_data['__filter_type']==__filter[1]][meta_data[__filter[1]]]
    df_corr = df_corr[[':START_ID', ':END_ID']]
    
    df1 = df_corr.merge(df_reified, left_on=':END_ID', right_on='__source_idx')[[':START_ID', 'rei_indx', 'EntityType', 'ID', 'source_ID', 'target_ID']].drop_duplicates()
    df2 = df_corr.merge(df_reified, left_on=':END_ID', right_on='__target_idx')[[':START_ID', 'rei_indx', 'EntityType', 'ID', 'source_ID', 'target_ID']].drop_duplicates()
    
    df_tmp = pd.concat([df1, df2], ignore_index=True).drop_duplicates()
    
    df_tmp = df_tmp.rename(columns={'rei_indx':':END_ID'})
    return df_tmp

processData(f, lbl_corr , lbl_meta_rel_event_corr_snapshot_reified, (lbl_meta_node_snapshot_reified, lbl_meta_rel_event_corr_snapshot  ))

In [43]:
test(lbl_meta_rel_event_corr_snapshot_reified)

# df edges
This section adds edges labeled "df" to tEKG to connect Events nodes to Entity or Snapshot nodes.

In [44]:
def f(__element_label, __element_type, __filter=None):
    def filter_dataframe_original(group):
        group['flag_original'] = (
                (
                    (group['Entity_ID']==group['Entity_Source_ID'])
                    &
                    (group['Entity_ID']==group['Entity_Target_ID'])
                )
            )
        return group

    def filter_dataframe_newinfo(group):
            group['flag_newinfo'] = (
                    (
                        (~group['Entity_Source_ID'].isin(group['Entity_ID']))
                        &
                        (~group['Entity_Target_ID'].isin(group['Entity_ID']))
                    )
                )
            return group
    
    df_events = df_data[df_data['__filter_type']==__filter[0]][meta_data[__filter[0]]]
    df_events['event_indx'] = df_events.index

    df_tmp = df_data[df_data['__filter_type'].isin(__filter[1])][[':END_ID', ':START_ID', 'EntityType', 'ID', 'source_ID', 'target_ID']]
    df_tmp = df_tmp.merge(df_events, left_on=':START_ID', right_on='event_indx').drop_duplicates()
    
    # group events based on correlated entity ids ordered by time to find next event
    df_tmp = df_tmp.sort_values([':END_ID', 'timestamp'])
    df_tmp['next_ev_id'] = df_tmp.groupby(':END_ID')[':START_ID'].shift(-1)
    df_tmp = df_tmp[~df_tmp['next_ev_id'].isna()].drop_duplicates()
    
    # creating dfs, 
    # we keep source event's timestamp temporarily so that we can later identify their orders when deleting parallel dfs
    df_tmp = df_tmp[[':START_ID', 'next_ev_id', 'EntityType', 'ID', 'source_ID', 'target_ID', 'timestamp']].rename(columns={'next_ev_id':':END_ID', 'ID':'Entity_ID', 'source_ID':'Entity_Source_ID', 'target_ID':'Entity_Target_ID'})
    df_tmp = df_tmp.sort_values([':START_ID', ':END_ID'])

    # start compiling parallel dfs    
    df_tmp = df_tmp.groupby([':START_ID', ':END_ID']).apply(filter_dataframe_original).reset_index(drop=True)
    df_tmp = df_tmp.groupby([':START_ID', ':END_ID']).apply(filter_dataframe_newinfo).reset_index(drop=True)
    
    df_tmp['flag_keep'] = df_tmp['flag_original'] | df_tmp['flag_newinfo']
    
    df_tmp = df_tmp.sort_values(['Entity_ID', 'timestamp'])
    df_tmp['prev_keep_flg'] = df_tmp.groupby('Entity_ID')['flag_keep'].shift(1)
    df_tmp['next_keep_flg'] = df_tmp.groupby('Entity_ID')['flag_keep'].shift(-1)
    
    df_tmp['flag_keep'] = df_tmp['flag_keep'] | (df_tmp['prev_keep_flg'] & df_tmp['next_keep_flg'])
    df_tmp = df_tmp[df_tmp['flag_keep']]
    df_tmp = df_tmp[[':START_ID', ':END_ID', 'EntityType', 'Entity_ID', 'Entity_Source_ID', 'Entity_Target_ID']]
    df_tmp.reset_index(drop=True, inplace=True)
    return df_tmp



processData(f, lbl_df , lbl_meta_rel_event_df_entity_event, (lbl_meta_node_event , [lbl_meta_rel_event_corr_entity , lbl_meta_rel_event_corr_entity_reified ]))
processData(f, lbl_df , lbl_meta_rel_event_df_snapshot_event, (lbl_meta_node_event, [lbl_meta_rel_event_corr_snapshot , lbl_meta_rel_event_corr_snapshot_reified  ]))

  df_tmp = df_tmp.groupby([':START_ID', ':END_ID']).apply(filter_dataframe_original).reset_index(drop=True)
  df_tmp = df_tmp.groupby([':START_ID', ':END_ID']).apply(filter_dataframe_newinfo).reset_index(drop=True)
  df_tmp = df_tmp.groupby([':START_ID', ':END_ID']).apply(filter_dataframe_original).reset_index(drop=True)
  df_tmp = df_tmp.groupby([':START_ID', ':END_ID']).apply(filter_dataframe_newinfo).reset_index(drop=True)


In [45]:
test(lbl_meta_rel_event_df_entity_event)

# Conclude

In [46]:
# This is to measure how long it took to add df edges by summing sub-measures.
meta_time[lbl_meta_rel_event_df_event ] = meta_time[lbl_meta_rel_event_df_entity_event] + meta_time[lbl_meta_rel_event_df_snapshot_event]

# Exporting files

In [47]:
import csv

In [48]:
import os, shutil
folder = './export'
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
    except Exception as e:
        print('Failed to delete %s. Reason: %s' % (file_path, e))

In [49]:
for k in meta_data.keys():
    if k.startswith('node'):
        cols = [c for c in meta_data[k] if not c.startswith('_')]
        cols.append(':LABEL')
        cols.append('id:ID')
        
        df_tmp = df_data[df_data.__filter_type==k][meta_data[k]]
        df_tmp = df_tmp.rename(columns={'__label':':LABEL'})
        df_tmp['id:ID'] = pd.to_numeric(df_tmp.index)
        df_tmp = df_tmp[cols]
        
        df_tmp.to_csv('./export/'+k.replace(':', '_')+'.csv', index=False, quoting=csv.QUOTE_ALL)

In [50]:
rel_nodes = []
rel_cols = []

for k in meta_data.keys():
    if k.startswith('rel'):
        rel_nodes.append(k)
        rel_cols += list(meta_data[k])

rel_cols = list(set(rel_cols))

In [51]:
cols = [c for c in rel_cols if not c.startswith('_')]
cols.append(':TYPE')


df_tmp = df_data[df_data.__filter_type.isin(rel_nodes)]
df_tmp = df_tmp.rename(columns={'__label':':TYPE'})
df_tmp[':START_ID'] = df_tmp[':START_ID'].astype(int)
df_tmp[':END_ID'] = df_tmp[':END_ID'].astype(int)
df_tmp = df_tmp[cols]

df_tmp.to_csv('./export/relations.csv', index=False, quoting=csv.QUOTE_ALL)

# Exporting meta data

In [52]:
import json

with open(experiment_path, "w") as fp:
    json.dump(meta_time, fp)