In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm_notebook

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

# My packages
from source import parse_mxml as pm
from source import log_representation as lr
from source import plots as plts
from source import drift_detection as dd
from source import drift_localization as dl
from source import offline_streaming_clustering as off_sc
from sklearn.cluster import KMeans
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed, parallel_backend

import random
random.seed(42)

import os
import glob

import gc
gc.enable()

from scipy.spatial import distance
from sklearn.base import clone as sk_clone 

from copy import deepcopy

In [2]:
def insensitive_glob(pattern):
    """
    Glob for 'pattern' ignoring the case of its letters.

    Every alphabetic character is expanded into a two-character class
    (e.g. 'a' becomes '[aA]'); any other character is kept untouched.
    Returns the list of matching paths given by glob.glob.
    """
    pieces = []
    for char in pattern:
        if char.isalpha():
            pieces.append('[%s%s]' % (char.lower(), char.upper()))
        else:
            pieces.append(char)
    return glob.glob(''.join(pieces))

def if_any(string, lista):
    """Return True when at least one value of 'lista' occurs inside 'string'."""
    return any(item in string for item in lista)

In [3]:
# List log files: every "*k.mxml" file (any letter case) one folder below
# the dataset directory, leaving out paths that contain "2.5" and
# rewriting any '..\' into '../'
logs = [
    path.replace('..\\', '../')
    for path in insensitive_glob("../process_mining_datasets/*/*k.MXML")
    if "2.5" not in path
]

In [4]:
# Lookup tables mapping the string keys used in the experiment
# configurations to the objects consumed by the methods below
objects = {
    # Clustering models, keyed as "kmeans__k=<number of clusters>"
    "model": {
        "kmeans__k=%d" % n_clusters: KMeans(n_clusters=n_clusters, random_state=42)
        for n_clusters in (6, 3, 2)
    },

    # Trace representations: each callable passes the parsed log through
    # lr.get_traces_as_tokens and hands the result to the matching
    # lr.get_*_representation function
    "representation": {
        "binary": lambda log: lr.get_binary_representation(
            lr.get_traces_as_tokens(log)
        ),
        "frequency": lambda log: lr.get_frequency_representation(
            lr.get_traces_as_tokens(log)
        ),
        "transitions": lambda log: lr.get_binary_transitions_representation(
            lr.get_traces_as_tokens(log)
        ),
    }
}

In [5]:
# Change patterns and the representations that support them.
# Each list extends the previous one with additional patterns.

binary_support = ["cb", "cf", "cm", "rp", "pm", "fr"]
frequency_support = [*binary_support, "lp", "cp"]
transitions_support = [*frequency_support, "pl", "cd", "sw"]

### Pipeline Offline Clustering

In [6]:
# Root folder where the clustering pipeline writes (and looks up) its
# '.pickle.gzip' result files, one sub-folder per change pattern
OUTPUT_PATH = "../LoanApplications_Offline/"

In [7]:
def _parse_log_path(log_path):
    """
    Extract the change pattern folder name and the event log size from a
    log path such as '../process_mining_datasets/fr/fr5k.mxml'.

    Both '/' and '\\' are accepted as path separators, so paths listed on
    Windows and on POSIX systems are parsed the same way.

    Returns:
    --------
        (cd_name, log_size): name of the folder that holds the log file and
            the size encoded in the file name (e.g. 'fr5k' -> 5000)
    """
    parts = log_path.replace("\\", "/").split("/")
    if len(parts) < 2:
        raise ValueError(
            "Expected a path like '<dir>/<change_pattern>/<log_file>', got %r" % log_path
        )

    cd_name = parts[-2]
    log_name = os.path.splitext(parts[-1])[0]

    # What is left after removing the pattern name is the size, e.g. "5k"
    size_text = log_name.replace(cd_name, "").replace("new_", "").replace("k", "")
    # round() guards against float artifacts such as 4349.999... for "4.35k"
    log_size = int(round(float(size_text) * 1000))

    return cd_name, log_size


def read_file_and_run_clustering_pipeline(args, return_result=False):
    """
    Read an event log file, represent it into a feature vector space and
    run the trace clustering method over windows. This method outputs results
    as gzip pickle files into the "OUTPUT_PATH" folder, or returns the result
    as DataFrame when return_result = True.

    Results already present in "OUTPUT_PATH" are reused instead of being
    recomputed. The cache file name is built from the change pattern, log
    size, model, representation, window size and sliding window flag;
    'sliding_step' is NOT part of it, so runs that differ only in
    'sliding_step' share the same file.

    Parameters:
    -----------
        args (dict): Dictionary with the parameters and the log_file path
            requiring the following keys:
                example = {
                    'log': <PATH TO LOG_FILE>,
                    'representation': <KEY TO REPRESENTATIONS IN 'objects'>,
                    'parameters': [{
                        'model': <KEY TO MODEL IN 'objects'>,
                        'sliding_window': <WHETHER TO USE SLIDING WINDOW>,
                        'window_size': <SIZE OF TRACE WINDOW TO USE>,
                        'sliding_step': <STEP OF SLIDING WINDOW>
                    }]
                }
        return_result (bool): Whether to return the result as DataFrame.
            When True, only the FIRST entry of args['parameters'] is
            processed and nothing is written to disk.

    Returns:
    --------
        The result of the first parameter entry when return_result is True
        (loaded from the cache file if it exists), otherwise None.
    """
    # Parse change pattern name and size of the event log from the path
    cd_name, log_size = _parse_log_path(args["log"])

    # String to identify results when exporting files
    tipo_mudanca = cd_name.replace("new_", "")
    output_dir = OUTPUT_PATH + tipo_mudanca + '/'

    # Read log and apply trace representation technique. The representation
    # is built inside the 'with' block so the file handle is always released.
    with open(args["log"]) as log_file:
        df = objects["representation"][args["representation"]](
            pm.all_prep(log_file)
        )

    for p in args["parameters"]:
        cached_info = "_".join([
            tipo_mudanca,
            str(log_size),
            p["model"],
            args["representation"],
            str(p["window_size"]),
            str(p["sliding_window"])
        ])

        print(cached_info)

        # If already exists, return if needed
        file_to_export = output_dir + cached_info + '.pickle.gzip'
        if os.path.exists(file_to_export):
            if return_result:
                return pd.read_pickle(file_to_export, compression='gzip')
            continue

        # If file does not exist, run trace clustering step and export file.
        # The model is cloned so the shared instance in 'objects' stays unfitted.
        all_metrics = off_sc.run_offline_clustering_window(
            sk_clone(objects["model"][p["model"]]),
            p["window_size"],
            df,
            p["sliding_window"],
            sliding_step=p['sliding_step']
        )

        if return_result:
            return all_metrics

        # exist_ok tolerates the folder already being there (also when another
        # worker creates it first) while still surfacing real OS errors
        os.makedirs(output_dir, exist_ok=True)
        all_metrics.to_pickle(file_to_export, compression="gzip")

        gc.collect()

#### Run pipeline for specific case(s)

In [8]:
# Single run: binary representation of the 'fr' 5k log clustered with the
# 'kmeans__k=3' model over non-sliding windows of size 125.
# With return_result=True the metrics DataFrame is returned (or loaded from
# an existing cache file) instead of being written to OUTPUT_PATH.
all_metrics = read_file_and_run_clustering_pipeline({
    'log': '../process_mining_datasets\\fr\\fr5k.mxml',
    'representation': 'binary',
    'parameters': [{
        'model': 'kmeans__k=3', 
        'sliding_window': False,
        'window_size': 125,
        'sliding_step': 1
    }]
}, return_result=True)

fr_5000_kmeans__k=3_binary_125_False


In [9]:
# Preview the first two rows of the clustering metrics
all_metrics.head(2)

Unnamed: 0_level_0,k,Silhouette,DBi,centroids,avg_dist_between_centroids,std_dist_between_centroids,volume_list,radius_list,dist_intra_cluster_list,skewness_list,...,total_MSE,avg_MSE,count_non_zero_MSE,diff_avg_dist_between_centroids,diff_std_dist_between_centroids,diff_volume,diff_radius,diff_dist_intra_cluster,diff_skewness,diff_cluster_std
i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
125,2,0.723323,0.697017,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0,...",2.494916,0.346735,"[56, 38, 31]","[1.2374368670764582, 1.4872495948362938, 1.354...","[0.3149839298012803, 0.9139844893551741, 1.052...","[0.26546593660094936, -0.5953856692612958, -0....",...,,,,,,,,,,
250,2,0.735449,0.677818,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0,...",2.502506,0.340888,"[62, 32, 31]","[1.2089244968673232, 1.4856080405005891, 1.460...","[0.3567318187477347, 0.9033037741280492, 1.009...","[0.25675899072182007, -0.5898857386498104, -0....",...,0.018007,0.000948,4.0,0.00759,-0.005847,"[6, -6, 0]",0.004051,0.001236,3.5e-05,6.85761e-08


### Run experiments with several parameter combinations

In [10]:
# Trace clustering parameters
grid_parameters = list(ParameterGrid({
    "sliding_window": [False],
    "window_size": [75, 100, 125, 150, 175, 200],
    'sliding_step': [1],
    "model": ['kmeans__k=6', 'kmeans__k=3', 'kmeans__k=2']
}))

# Trace vector representations: each representation is paired only with
# the logs whose path contains one of its supported change patterns
grid_logs = list(ParameterGrid([
    {
        "log": [x for x in logs if if_any(x, supported_patterns)],
        "representation": [representation]
    }
    for representation, supported_patterns in (
        ('binary', binary_support),
        ('frequency', frequency_support),
        ('transitions', transitions_support),
    )
]))

# Combining all parameters: every (log, representation) pair carries the
# full list of clustering parameters
combs = [
    {**log_comb, 'parameters': grid_parameters}
    for log_comb in grid_logs
]

len(combs), len(grid_parameters), len(combs) * len(grid_parameters) 

(75, 18, 1350)

#### Run in parallel

In [None]:
# Run the clustering pipeline for every combination using 3 parallel jobs.
# return_result keeps its default (False), so each call writes its results
# under OUTPUT_PATH and final_resp only collects None values.
final_resp = Parallel(n_jobs=3)(
    delayed(read_file_and_run_clustering_pipeline)(comb) for comb in tqdm_notebook(combs)
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))

### Detection Pipeline

In [11]:
# Drift detection parameters (3 * 4 * 4 = 48 combinations)
drift_config = list(ParameterGrid({
    "rolling_window": [3, 4, 5],
    "std_tolerance": [1.25, 1.5, 1.75, 2],
    "min_tol": [0.01, 0.007, 0.005, 0.003],
}))

In [12]:
# List all files obtained after the trace clustering pipeline:
# every '*.pickle.gzip' file found one folder level below OUTPUT_PATH

clusterizacoes = glob.glob(OUTPUT_PATH + "*/*.pickle.gzip")

In [14]:
# Combinations to run: each clustering output file is paired with
# the full list of drift detection configurations

combs_deteccao = [
    {'input': clustering_file, 'combinations': drift_config}
    for clustering_file in clusterizacoes
]

In [19]:
# The detection results are written to a separate folder: this part of
# the input path is swapped for the new prefix when building output paths

OLD_prefix = "LoanApplications_Offline"
NEW_prefix = OLD_prefix + "__DETECTION"

In [31]:
def drift_detect_pipeline(args, return_results=False):
    """
        Runs the drift detection method based on the output from the trace
        clustering pipeline for different combination of parameters and every
        feature from tracking the trace clustering evolution. The
        outputs are written into a new folder named by the NEW_prefix
        variable as gzip pickle files.

        The information about the experiment (change pattern, log size,
        model, representation, window size and sliding window flag) is
        parsed from the input file name, which must follow the pattern
        produced by the clustering pipeline, e.g.
        'cb_10000_kmeans__k=2_binary_100_False.pickle.gzip'.
        The 'args' dictionary itself is not modified.

        Parameters:
        -----------
            args (dict): Dictionary with the parameters and the input path
                requiring the following keys:
                    example = {
                        'input': <PATH TO '.pickle.gzip' CLUSTERING OUTPUT>,
                        'combinations': [{
                            'rolling_window': <PASSED TO dd.detect_concept_drift>,
                            'std_tolerance': <PASSED TO dd.detect_concept_drift>,
                            'min_tol': <PASSED TO dd.detect_concept_drift>
                        }]
                    }
            return_results (bool): Whether to return the results as
                DataFrame. When True, only the FIRST entry of
                args['combinations'] is processed and returned.

        Returns:
        --------
            The results of the first combination when return_results is
            True, otherwise None.
    """
    # Read file
    all_metrics = pd.read_pickle(args["input"], compression='gzip')

    # Parse information from file name. Both separators are handled so the
    # parsing works for paths listed on Windows ('\\') and on POSIX ('/').
    file_name = args["input"].replace("\\", "/").split("/")[-1]
    tokens = file_name.replace(".pickle.gzip", "").split('_')

    # 'kmeans__k=2' splits into three tokens ('kmeans', '', 'k=2'), hence 2:5
    file_info = {
        'input': args['input'],
        'tipo_mudanca': tokens[0],
        'log_size': int(tokens[1]),
        'model': "_".join(tokens[2:5]),
        'representation': "_".join(tokens[5:-2]),
        'window_size': tokens[-2],
        'sliding_window': tokens[-1]
    }

    # Run detection for every combination of parameter
    for combination in args['combinations']:
        c = deepcopy(combination)
        c.update(file_info)

        result = __drift_detect_pipeline(all_metrics, c, return_results)

        # Only the first combination is evaluated when results are requested
        if return_results:
            return result
    

def __drift_detect_pipeline(all_metrics, args, return_results=False):
    """
        Runs the drift detection for a single combination of parameters over
        every tracked measure of 'all_metrics' and exports the results as a
        gzip pickle file.

        The output file is placed in a folder named after the input file
        (with OLD_prefix replaced by NEW_prefix) and named after the
        detection parameters. If that file already exists the detection is
        skipped.

        Parameters:
        -----------
            all_metrics (DataFrame): Output of the trace clustering pipeline
            args (dict): Detection parameters ('rolling_window',
                'std_tolerance', 'min_tol') plus the experiment information
                ('input', 'log_size', 'window_size', ...)
            return_results (bool): Whether to return the results as DataFrame

        Returns:
        --------
            DataFrame with one row per measure (the content of 'args', the
            measure name and the values given by dd.get_metrics) when
            return_results is True, otherwise None.
    """
    # Normalise separators once so the folder that is created and the file
    # that is written always point to the same location
    base_name = args["input"].replace("\\", "/").replace(".pickle.gzip", "")
    base_name = base_name.replace(OLD_prefix, NEW_prefix)

    # Create string with parameters to identify file
    to_string = [
        str(args["rolling_window"]),
        str(args["std_tolerance"]).replace(".", "-"),
        str(args["min_tol"]).replace(".", "-")
    ]

    final_name = base_name + "/" + "_".join(to_string) + ".pickle.gzip"

    if os.path.isfile(final_name):
        if return_results:
            return pd.read_pickle(final_name, compression='gzip')
        print("Already exists")
        return

    # exist_ok tolerates an existing folder but still surfaces real OS errors
    os.makedirs(base_name, exist_ok=True)

    # True drifts are placed at every 10% of the log
    drift_step = args['log_size'] // 10
    y_true = list(range(drift_step, args['log_size'], drift_step))

    # Runs the drift detection for every feature
    results = []
    for col in all_metrics.select_dtypes(include=np.number).columns:
        # Skip the number of clusters and the "diff*" measures
        # (except for "diff_centroids")
        tracked = (col != "k" and not col.startswith("diff")) or col == "diff_centroids"
        if not tracked:
            continue

        detected_drifts, _ = dd.detect_concept_drift(
            all_metrics,
            col,
            args["rolling_window"],
            args["std_tolerance"],
            args["min_tol"]
        )

        # Calculate classification metrics
        metrics_results = dd.get_metrics(
            detected_drifts,
            y_true,
            int(args["window_size"])
        )

        r = deepcopy(args)
        r["measure"] = col
        r.update(metrics_results)
        results.append(r)

        gc.collect()

    # Export as file
    results_df = pd.DataFrame(results)
    results_df.to_pickle(final_name, compression="gzip")

    if return_results:
        return results_df

In [29]:
# Single detection run over one clustering output file.
# NOTE(review): with return_results=True the pipeline returns right after
# the first entry of 'combinations', so the min_tol=0.02 configuration
# below is never evaluated by this call.
detection_results = drift_detect_pipeline({
    'input': '../LoanApplications_Offline\\cb\\cb_10000_kmeans__k=2_binary_100_False.pickle.gzip',
    'combinations': [{
       'min_tol': 0.01,
       'rolling_window': 3,
       'std_tolerance': 1.25
    }, {
       'min_tol': 0.02,
       'rolling_window': 3,
       'std_tolerance': 1.25
    }]
}, return_results=True)

In [38]:
# Show the two rows (measures) with the highest F1
detection_results.sort_values('F1', ascending=False).head(2)

Unnamed: 0,min_tol,rolling_window,std_tolerance,input,tipo_mudanca,log_size,model,representation,window_size,sliding_window,measure,Precision,Recall,F1,Delay,Correct_Predictions,Support,Drifts_Found,Resp
10,0.01,3,1.25,../LoanApplications_Offline\cb\cb_10000_kmeans...,cb,10000,kmeans__k=2,binary,100,False,avg_cluster_std,1.0,1.0,1.0,100.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1]",9,"[1100, 2100, 3100, 4100, 5100, 6100, 7100, 810...","[1000, 2000, 3000, 4000, 5000, 6000, 7000, 800..."
16,0.01,3,1.25,../LoanApplications_Offline\cb\cb_10000_kmeans...,cb,10000,kmeans__k=2,binary,100,False,count_non_zero_MSE,0.8,0.888889,0.842105,150.0,"[1, 1, 1, 1, 0, 1, 1, 1, 1]",8,"[1100, 2200, 3100, 4200, 4900, 5500, 6200, 710...","[1000, 2000, 3000, 4000, 5000, 6000, 7000, 800..."


#### Run in parallel

In [None]:
# Run the detection pipeline for every clustering output file using 3
# parallel jobs. return_results keeps its default (False), so the results
# are only written to disk and final_resp collects None values.
final_resp = Parallel(n_jobs=3)(
    delayed(drift_detect_pipeline)(comb_d) for comb_d in tqdm_notebook(combs_deteccao)
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=54.0), HTML(value='')))