In [1]:
%reload_ext autoreload
%autoreload
import sys
sys.path.append("../src")

In [2]:
import os
import pandas as pd
import json
import csv
from tqdm import tqdm

from sapsam import parser, constants
from sapsam.SignavioConventionsChecker import bp_conventions_checker, syntax_checker

# Read data

In order to create our CSV file, let's first read in the data of all models in the dataset.

In [3]:
# The parsed models are only needed to build conventions.csv, so loading is
# skipped when that file already exists.
conventions_csv = constants.DATA_CONVENTIONS / 'conventions.csv'
if not os.path.exists(conventions_csv):
    df = parser.parse_model()
    display(df.head())

# Creating CSV file

To perform the API request for Best Practices (BP) conventions, we need the guideline ID of any workspace. Make sure you are allowed to access this workspace with your credentials.

To find the guideline ID, one way is to log on to the workspace, perform a manual BP check on a diagram with the mouse, and inspect the payload of the 'mgeditorchecker' HTTP request. The guideline ID can be found in the payload.

In [4]:
# Guideline ID passed to the BP conventions check; it is workspace-specific,
# so replace it with the ID of a workspace your credentials can access.
guideline_id = "4551c2229baa4c79a151b5a0cc1010d2"

The following iteration extracts the JSON data of each model, runs the syntax and BP checks through their respective API endpoints, and appends the result to `conventions.csv` for further processing.
> Note: the Signavio API currently allows 50 API calls per minute. Each model needs two calls (one syntax check and one BP check), so about 25 models are appended per minute.

In [5]:
# Build conventions.csv once. An existing file is treated as a finished run and
# is left untouched, mirroring the guard used when loading the models.
conventions_path = constants.DATA_CONVENTIONS / 'conventions.csv'
if not os.path.exists(conventions_path):
    # makedirs(exist_ok=True) instead of mkdir: the output directory may already
    # exist without the CSV in it (mkdir would raise FileExistsError), and any
    # missing parent directories are created as well.
    os.makedirs(constants.DATA_CONVENTIONS, exist_ok=True)
    # Keep only the rows whose namespace is the bpmn2.0 stencil set.
    df = df[df['namespace'] == 'http://b3mn.org/stencilset/bpmn2.0#']

    with open(conventions_path, mode='a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['Model ID', 'Name', 'Syntax Errors', 'BP Violations Count'])
        # The file is opened in append mode, so the header is written only
        # while the file is still empty.
        if file.tell() == 0:
            writer.writeheader()
        print('Running conventions...')

        # One row per model: run both checkers and append their results.
        for model_id in tqdm(df.index.unique()):
            model_data = df.loc[model_id]
            name = model_data['name']
            model_json = model_data['model_json']
            syntax_errors = syntax_checker(model_json)
            violations_count = bp_conventions_checker(name, model_id, guideline_id, model_json)
            writer.writerow({'Model ID': model_id, 'Name': name, 'Syntax Errors': syntax_errors, 'BP Violations Count': violations_count})

# Data analysis

Now that we have our file, let's parse and inspect the data.

In [None]:
# Parse the check results into a DataFrame (presumably read back from
# conventions.csv by the project's parser — verify) and preview the first rows.
df_conv = parser.parse_conventions()
df_conv.head()

Let's see how many models have either syntax or BP errors in them.

In [7]:
def has_error(json_str):
    """Tell whether a checker result reports at least one error.

    Parameters
    ----------
    json_str : str | dict | None | float
        The checker result, normally a JSON document encoded as a string.
        An already-decoded dict is accepted as well. Missing values
        (``None``, a float NaN as produced by pandas for empty CSV cells,
        or a blank string) are treated as "no errors".

    Returns
    -------
    bool
        True when the decoded document has a non-empty ``'errors'`` entry,
        False otherwise.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank string is not valid JSON; malformed data is not
        silently ignored.
    """
    if json_str is None:
        return False
    # NaN is the only float that differs from itself; pandas yields it for empty cells.
    if isinstance(json_str, float) and json_str != json_str:
        return False
    if isinstance(json_str, str) and not json_str.strip():
        return False

    json_data = json_str if isinstance(json_str, dict) else json.loads(json_str)
    return bool(json_data.get('errors'))

In [8]:
# Flag every model per check type, then count the flagged rows.
syntax_flags = df_conv['Syntax Errors'].apply(has_error)
syntax_error_count = syntax_flags.sum()
bp_flags = df_conv['BP Violations Count'].apply(has_error)
bp_error_count = bp_flags.sum()

for label, count in (('syntax', syntax_error_count), ('BP', bp_error_count)):
    print(f"Number of models with at least one {label} error: {count}")

Number of models with at least one syntax error: 0
Number of models with at least one BP error: 1
