In [None]:
import pm4py
import importer
from helpers import sollmodell_helpers, warnings_off, data_prep
from main_repair import MainRepair
import pandas as pd
import math

In [None]:
# Silence warnings for the rest of the notebook via the project helper.
# NOTE(review): which warning categories are suppressed is decided inside
# helpers.warnings_off and is not visible here — confirm before relying on it.
warnings_off.turn_off_warnings()

In [None]:
# Path of the event-log input file; intentionally left empty here and has to be
# filled in before the notebook can run.
input_path = ""
# Load the event log through the project-local importer.
# Presumably this yields a pandas DataFrame (the notebook indexes it by column
# and groups it further down) — confirm in importer.
log = importer.read_from_input_file(input_path)

In [None]:
# Material ID to analyse. It is compared against log['material_id'] to select
# the events of one material. Empty placeholder — set a real ID before running.
matnr = ''

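kunde1: Fix the datatype of `material_id` — numeric IDs (integers, and floats such as `123.0`) are converted to strings so that they can be compared with `matnr`; missing values are left unchanged.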

In [None]:
def _material_id_to_str(value):
    """Return a numeric material ID as a string; leave everything else as is.

    Integers become their decimal string, non-NaN floats are cast to int first
    (so 123.0 becomes '123'), and NaN, strings and any other objects are
    returned unchanged.

    The type checks use pandas' scalar introspection instead of exact
    ``type(x) == ...`` comparisons, so numpy scalar types (e.g. ``np.int64``,
    ``np.float32``) are handled the same way as the built-in ``int``/``float``.
    Booleans are not treated as integers.
    """
    if pd.api.types.is_integer(value):
        return str(value)
    if pd.api.types.is_float(value) and not math.isnan(value):
        return str(int(value))
    return value


log['material_id'] = log['material_id'].apply(_material_id_to_str)

In [None]:
# Per material: number of distinct cases and of distinct activities, ordered by
# case count (descending) and restricted to materials with at least
# 8 distinct activities and at least 100 distinct cases.
df_to_print = (
    log.groupby('material_id')
    .agg({'case:concept:name': 'nunique', 'concept:name': 'nunique'})
    .sort_values(by='case:concept:name', ascending=False)
    .loc[lambda counts: (counts['concept:name'] >= 8) & (counts['case:concept:name'] >= 100)]
)

# Lift the display limits temporarily so the whole table is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df_to_print)

In [None]:
# Keep only the events of the selected material.
# The explicit copy makes log_matnr an independent frame: a column is assigned
# to it later in this notebook (the target KPI), and writing into a filtered
# selection of `log` would otherwise be a chained-assignment write.
log_matnr = log[log['material_id'] == matnr].copy()

In [None]:
# Stop early when the chosen material ID matched no rows of the log.
assert len(log_matnr) > 0, "Material ID not in data."

### Soll-Modell

In [None]:
# Build the generated "Soll" (planned) log for the selected material with the
# project helper. It is used as input for Petri-net discovery afterwards.
# NOTE(review): how the log is constructed is defined in sollmodell_helpers
# and not visible here.
generierter_soll_log = sollmodell_helpers.create_soll_agrelnr(log_matnr)

In [None]:
# Discover the planned reference model (Petri net plus initial and final
# marking) from the generated Soll log with the inductive miner.
soll_m, soll_im, soll_fm = pm4py.discover_petri_net_inductive(generierter_soll_log)
# Pass the markings as well so the rendering shows where the net starts and
# ends, the same way the variant-based model is rendered in this notebook.
pm4py.view_petri_net(soll_m, soll_im, soll_fm)

In [None]:
# Events without a timestamp are excluded from discovery. Filtering first and
# copying the result gives an independent frame without calling the
# __deepcopy__ dunder directly or duplicating rows that are dropped anyway.
log_matnr_for_soll_m2 = log_matnr[log_matnr['time:timestamp'].notna()].copy()

# Discover a reference model from the observed log; noise_threshold=0.8 is
# handed to the inductive miner as its noise-filtering setting.
soll_m2, soll_im2, soll_fm2 = pm4py.discover_petri_net_inductive(log_matnr_for_soll_m2,
                                                                 noise_threshold=0.8)
# Render the net together with its initial and final marking.
pm4py.view_petri_net(soll_m2, soll_im2, soll_fm2)

### Soll-Modell Version 3: most common variants

In [None]:
# Build a third reference model from the timestamp-filtered log with the
# project helper and unpack it into net, initial marking and final marking.
# NOTE(review): variants_cover_pct=0.3 presumably means "use the most common
# variants until 30% of the cases are covered" — confirm in sollmodell_helpers.
soll_m3, soll_im3, soll_fm3 = sollmodell_helpers.create_soll_modell_by_variants(log_matnr_for_soll_m2,
                                                                                variants_cover_pct=0.3)

# Render the net together with its initial and final marking.
pm4py.view_petri_net(soll_m3, soll_im3, soll_fm3)

### Remove Outliers in KPI values

The OEE aspect `Performance` contains outliers, i.e. values that are far higher than 1, even though Performance is defined as the ratio of processing duration to planned processing duration. It is expected to be around 1, but faulty or incorrectly tracked processing durations, or errors in planning, produce outlier values that are much higher.

Outlier handling: set all `Performance` values that are > 5 to 1. We do not want to drop the outlier rows, since each process step is part of a trace, and for model repair we either incorporate a whole trace into the model or not at all. Therefore, each process step can be a valuable part of a trace even if the Performance of this very step is an outlier and is set to 1 here.

In [None]:
# Neutralise KPI outliers with the project helper, using 5 as the threshold.
# NOTE(review): according to the notebook text, Performance values above the
# threshold are replaced (set to 1) rather than dropped — the actual behaviour
# is implemented in data_prep and should be confirmed there.
log_matnr = data_prep.remove_outliers_in_kpi_values(log_matnr, outliers_threshold = 5)

### Calculate target KPI

In [None]:
# Name of the target KPI column.
target_value_col = 'OEE'
# OEE per event row = Quality * Performance * Availability.
log_matnr[target_value_col] = (
    log_matnr['Quality']
    .mul(log_matnr['Performance'])
    .mul(log_matnr['Availability'])
)

Group to get KPI values per trace (case:concept:name)

In [None]:
# One row per case: number of events plus the mean of the target KPI and of
# each of its three factors.
grouped_for_kpi = (
    log_matnr
    .groupby('case:concept:name')
    .agg({
        'concept:name': 'count',
        **dict.fromkeys([target_value_col, 'Quality', 'Performance', 'Availability'], 'mean'),
    })
    .reset_index()
)

Create categorical target values by grouping into either **higher or lower than mean**

In [None]:
# KPI columns of interest: the target KPI followed by the three factors it is
# computed from.
target_cols = [target_value_col, 'Quality', 'Performance', 'Availability']

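After the following merge, column `OEE` holds the mean OEE per case (`case:concept:name`); the original per-activity values are kept in the columns with the `_activity` suffix (e.g. `OEE_activity`).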

In [None]:
# Attach the per-case KPI means to every event row. Columns present on both
# sides keep their name for the per-case value, while the event-level original
# receives the '_activity' suffix.
log_matnr = pd.merge(
    log_matnr,
    grouped_for_kpi[['case:concept:name', *target_cols]],
    how='inner',
    on='case:concept:name',
    suffixes=('_activity', ''),
)

In [None]:
# Lookup table: case id -> per-case value of the target KPI.
kpi_dict = {
    case_id: kpi_value
    for case_id, kpi_value in zip(grouped_for_kpi['case:concept:name'],
                                  grouped_for_kpi[target_value_col])
}

Compute candidate thresholds for the target KPI (mean, median, 25% and 75% quantiles) and pick one of them as the satisfactory threshold (preliminary)

In [None]:
# Candidate thresholds taken from the distribution of the per-case target KPI.
thresholds_dict = {
    'mean': grouped_for_kpi[target_value_col].mean(),
    'median': grouped_for_kpi[target_value_col].median(),
    'q25': grouped_for_kpi[target_value_col].quantile(q=0.25),
    'q75': grouped_for_kpi[target_value_col].quantile(q=0.75)
}

# Report for each candidate how many cases reach it. The comparison is
# inclusive (>=), so cases lying exactly on the threshold are counted and the
# printed label says "at or above" rather than "greater than".
for name, value in thresholds_dict.items():
    n_cases_at_or_above_threshold = int((grouped_for_kpi[target_value_col] >= value).sum())
    print(f"{name}-value: {value} with {n_cases_at_or_above_threshold} cases at or above {name}-threshold")

In [None]:
# Threshold from which a case counts as satisfactory: the 25% quantile of the
# per-case target KPI (thresholds_dict['q75'] would be the stricter choice).
satisfactory_threshold = thresholds_dict['q25']
print(satisfactory_threshold)
# Shown as cell output: (rows, columns) of the per-case table restricted to
# the cases at or above the threshold.
grouped_for_kpi.loc[grouped_for_kpi[target_value_col] >= satisfactory_threshold].shape

In [None]:
# Discard events that have no timestamp.
# NOTE(review): the per-case KPI values were aggregated before this filter, so
# they still include the rows dropped here — confirm that this is intended.
log_matnr = log_matnr.dropna(subset=['time:timestamp'])

### Set up MainRepairer object

In [None]:
# Candidate reference models as (label, net, initial marking, final marking).
soll_model_1 = ('Soll Modell PLANNED', soll_m, soll_im, soll_fm)
soll_model_2 = ('Soll Modell DISCOVERED', soll_m2, soll_im2, soll_fm2)

# Only the discovered model is processed; add soll_model_1 to the list to run
# the planned model as well.
for model_label, net, initial_marking, final_marking in [soll_model_2]:
    print(model_label)
    # MainRepair gets the event log, the reference model, the KPI value of each
    # case and the threshold(s) from which a KPI value counts as satisfactory.
    repairer = MainRepair(
        log_matnr,
        net,
        initial_marking,
        final_marking,
        target_KPI_values_per_case=kpi_dict,
        satisfactory_values=[satisfactory_threshold],
    )

    repairer.main()

    repairer.print_conformant_kpi_values()