# Imports and Setup

In [38]:
import pm4py
import numpy as np
import pandas as pd
import pdb
import os
import yatter
from ruamel.yaml import YAML
import kglab
import re

In [5]:
## Directory layout and derived file paths
data_dir = '../data'
output_dir = '../output'
config_dir = '../config'

# Make sure every working directory exists before anything reads or writes it.
for directory in (data_dir, output_dir, config_dir):
    os.makedirs(directory, exist_ok=True)

# Files used by the rest of the notebook, each rooted in one of the directories above.
mapping_file = f'{data_dir}/sample_mapping.yaml'
rml_output_path = f'{output_dir}/sample_mapping_rml.ttl'
kg_config_path = f'{config_dir}/sample_kg_config.yaml'

In [6]:
## KGLab materialization config
# INI-style text with a single data-source section ([test]) whose `mappings`
# entry points at the RML file path.
config = f"""
[test]
mappings={rml_output_path}
"""
with open(kg_config_path, mode='w') as config_file:
    config_file.write(config)

# Prefix -> IRI pairs handed to the knowledge graph constructor.
namespaces = dict([
    ('ex:', "http://example.com/"),
    ('on:', "https://stl.mie.utoronto.ca/ontologies/spm/"),
])

In [7]:
def load_df_from_log(log_path):
  """
  Load an event log file into a dataframe.

  Args:
    log_path: Path (str or os.PathLike) of the log file. The format is chosen
      from the file extension, case-insensitively: '.xes' / '.xes.gz' are read
      through pm4py, '.csv' is read with pandas.

  Returns:
    The dataframe produced by `pm4py.convert_to_dataframe` (XES) or
    `pd.read_csv` (CSV).

  Raises:
    ValueError: If the extension is none of '.xes', '.xes.gz' or '.csv'.
  """
  path_str = os.fspath(log_path)
  lowered = path_str.lower()

  if lowered.endswith(('.xes', '.xes.gz')):
    log = pm4py.read_xes(path_str)
    return pm4py.convert_to_dataframe(log)

  if lowered.endswith('.csv'):
    return pd.read_csv(path_str)

  # An unsupported extension used to fall through to `return df` with `df`
  # never assigned (UnboundLocalError); fail with an explicit message instead.
  raise ValueError(
      f"Unsupported log format for {path_str!r}: expected .xes, .xes.gz or .csv"
  )

In [8]:
# view the sample event log
# Build the path from data_dir (defined with the other paths) instead of
# repeating the '../data' literal, so relocating the data directory takes one edit.
log_df = load_df_from_log(data_dir + '/sample_log.csv')
log_df.head()

Unnamed: 0,caseID,activityID,eventID,timestamp,resourceID
0,case_0,activity_A,event_0,2016-01-01 09:00:00.000000+00:00,user_1
1,case_0,activity_A,event_1,2016-01-01 09:15:00.000000+00:00,user_1
2,case_0,activity_C,event_2,2016-01-01 09:35:00.000000+00:00,user_1
3,case_1,activity_A,event_3,2016-01-02 09:00:00.000000+00:00,user_1
4,case_1,activity_B,event_4,2016-01-02 09:00:00.000000+00:00,user_0


In [9]:
## convert YARRRML to RML
yaml = YAML(typ='safe', pure=True)

# Read the YARRRML mapping inside a context manager so the handle is closed
# even if parsing fails (the bare open() call used to leak it).
with open(mapping_file) as yarrrml_file:
    yarrrml_content = yaml.load(yarrrml_file)

# Translate the parsed YARRRML document into RML text.
rml_content = yatter.translate(yarrrml_content)

# Write the RML next to the other outputs; `with` guarantees flush and close.
with open(rml_output_path, 'w') as rml_file:
    rml_file.write(rml_content)

2024-07-29 20:09:21,473 | INFO: Translating YARRRML mapping to [R2]RML
2024-07-29 20:09:21,474 | INFO: RML content is created!
2024-07-29 20:09:21,486 | INFO: Mapping has been syntactically validated.
2024-07-29 20:09:21,488 | INFO: Translation has finished successfully.


In [10]:
# init knowledge graph
kg = kglab.KnowledgeGraph(name="event-log-sample", namespaces=namespaces)
# create instances from mapping
# Use the config file this notebook writes to kg_config_path; the previous
# hard-coded 'config.ini' only worked if such a file happened to exist in the
# working directory, which breaks a fresh Restart & Run All.
kg.materialize(kg_config_path)
# save rdf instances
kg.save_rdf(output_dir + '/sample_log_instances.ttl')

2024-07-29 20:09:21,518 | DEBUG: CONFIGURATION: {'output_file': 'knowledge-graph', 'na_values': ',nan', 'safe_percent_encoding': '', 'read_parsed_mappings_path': '', 'write_parsed_mappings_path': '', 'mapping_partitioning': 'PARTIAL-AGGREGATIONS', 'logging_file': '', 'oracle_client_lib_dir': '', 'oracle_client_config_dir': '', 'udfs': '', 'output_dir': '', 'output_format': 'N-TRIPLES', 'only_printable_chars': 'no', 'infer_sql_datatypes': 'no', 'logging_level': 'INFO', 'number_of_processes': '24'}
2024-07-29 20:09:21,519 | DEBUG: DATA SOURCE `test`: {'mappings': '../output/sample_mapping_rml.ttl'}
2024-07-29 20:09:22,212 | INFO: 8 mapping rules retrieved.
2024-07-29 20:09:22,222 | DEBUG: All predicate maps are constant-valued, invariant subset is not enforced.
2024-07-29 20:09:22,227 | DEBUG: All graph maps are constant-valued, invariant subset is not enforced.
2024-07-29 20:09:22,231 | INFO: Mapping partition with 8 groups generated.
2024-07-29 20:09:22,233 | INFO: Maximum number of ru

In [86]:
## convert from RDF A-Box to PSL A-Box
# Start from an empty array; this cell and the following ones append facts to it.
ABox = np.array([])

## Query1: Simple unary predicates
df = kg.query_as_df(sparql="SELECT ?s ?o WHERE {?s a ?o}")


def _unary_fact(row):
    """Format one (?s, ?o) typing row as `Type(individual)`."""
    type_name = re.sub(r'.*:', '', row['o'])       # keep what follows the last ':'
    individual = re.sub(r'.*/|>$', '', row['s'])   # keep what follows the last '/', drop a trailing '>'
    return type_name + '(' + individual + ')'


unary_preds = df.apply(_unary_fact, axis=1).values
ABox = np.concatenate((ABox, unary_preds), axis=0)

In [87]:
## Query 2: Timepoints
# NOTE(review): the `ns1:` prefix is not declared in this notebook's `namespaces`;
# presumably it is auto-generated by the RDF library — confirm it is bound on a fresh kernel.
df = kg.query_as_df(sparql="SELECT ?s ?t WHERE {?s ns1:hasRecordedTime ?t}")
unique_timestamps = df['t'].unique()

# create timestamp mapping: the i-th value in sorted order gets the id ts_i
sorted_timestamps = sorted(unique_timestamps)
timestamp_mapping = {timestamp: f'ts_{i}' for i, timestamp in enumerate(sorted_timestamps)}

# apply mapping
df['new_t'] = df['t'].map(timestamp_mapping)

# create ordering relations over timestamps
# Take the id order from the sorted original timestamps. Re-sorting the ids as
# strings is lexicographic ('ts_10' < 'ts_2'), which produced wrong `before`
# chains as soon as there were more than ten distinct timepoints.
unique_mapped_timestamps = [timestamp_mapping[timestamp] for timestamp in sorted_timestamps]
timestamp_pairs = list(zip(unique_mapped_timestamps, unique_mapped_timestamps[1:]))

# before(ts_i,ts_i+1) for each pair of consecutive timepoints
before_relations = [f'before({t1},{t2})' for t1, t2 in timestamp_pairs]

timestamp_preds = [f'timepoint({t})' for t in unique_mapped_timestamps]

# hasRecordedTime(<event>, <ts id>): the subject is reduced to the text after its last '/'
event_timings = df.apply(lambda x: 'hasRecordedTime({}, {})'.format(re.sub(r".*/|>$", '', x["s"]), x["new_t"]), axis=1).values

ABox = np.concatenate((ABox, timestamp_preds, event_timings, before_relations), axis=0)

In [88]:
## Query 3: Other binary relations
# Every triple except rdf:type and hasRecordedTime, which the previous queries cover.
df = kg.query_as_df(sparql="SELECT ?s ?p ?o WHERE {?s ?p ?o . FILTER (?p != rdf:type && ?p != ns1:hasRecordedTime)}")


def _binary_fact(row):
    """Format one (?s, ?p, ?o) row as `predicate(subject, object)`."""
    predicate = re.sub(r".*:", "", row["p"])
    subject = re.sub(r".*/|>$", "", row["s"])
    target = re.sub(r".*/|>$", "", row["o"])
    return f'{predicate}({subject}, {target})'


binary_relations = df.apply(_binary_fact, axis=1).values

ABox = np.concatenate((ABox, binary_relations), axis=0)

In [89]:
## Lastly, add process instance relations
df = kg.query_as_df(sparql="SELECT ?s WHERE {?s a ns1:Event}")
# Link every event returned by the query to the single process instance P1.
# (The stray mid-cell `process_instance` expression was removed: only the last
# expression of a cell is displayed, so it had no effect.)
process_instance = df.apply(lambda x: f'hasProcess({re.sub(r".*/|>$", "", x["s"])}, P1)', axis=1).values

ABox = np.concatenate((ABox, process_instance), axis=0)

In [91]:
# Display the accumulated ABox facts (the array built up by the cells above)
ABox

array(['Resource(user_0)', 'Resource(user_1)', 'Resource(user_2)',
       'Event(event_3)', 'Event(event_6)', 'Event(event_1)',
       'Event(event_2)', 'Event(event_9)', 'Event(event_7)',
       'Event(event_4)', 'Event(event_5)', 'Event(event_8)',
       'Event(event_0)', 'Case(case_0)', 'Case(case_2)', 'Case(case_1)',
       'Activity(activity_C)', 'Activity(activity_B)',
       'Activity(activity_A)', 'Activity(activity_D)', 'timepoint(ts_0)',
       'timepoint(ts_1)', 'timepoint(ts_2)', 'timepoint(ts_3)',
       'timepoint(ts_4)', 'timepoint(ts_5)', 'timepoint(ts_6)',
       'hasRecordedTime(event_3, ts_4)', 'hasRecordedTime(event_4, ts_4)',
       'hasRecordedTime(event_8, ts_2)', 'hasRecordedTime(event_1, ts_2)',
       'hasRecordedTime(event_2, ts_3)', 'hasRecordedTime(event_7, ts_1)',
       'hasRecordedTime(event_6, ts_0)', 'hasRecordedTime(event_0, ts_0)',
       'hasRecordedTime(event_5, ts_5)', 'hasRecordedTime(event_9, ts_6)',
       'before(ts_0,ts_1)', 'before(ts_1,ts_2

In [92]:
# Persist the ABox, one fact per line
abox_path = output_dir + '/sample_log_ABox.clif'
with open(abox_path, 'w') as f:
    f.writelines("%s\n" % item for item in ABox)