In [1]:
# wget --no-check-certificate -r 'https://docs.google.com/uc?export=download&id=14UBHX6GTW_4YiyjNJB3EEq7Xb83AjuaK' -O process_mining_datasets.zip

import pandas as pd
# pd.set_option("max_columns", 200)
import numpy as np
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

# Meus pacotes
import parse_mxml as pm
import log_representation as lr
import plots as plts
import model_utils as mu
import drift_detection as dd
import offline_streaming_clustering as off_sc

from scipy.spatial import distance
from sklearn.base import clone as sk_clone 
import traceback 

from copy import deepcopy
import random
random.seed(42)
import os
import re
import warnings
warnings.filterwarnings("ignore")

import glob

from sklearn.cluster import KMeans, AgglomerativeClustering

import gc
gc.enable()

# # # # # # # # # # #
# LOAN APPLICATIONS #
# # # # # # # # # # #
aliases = {
    'Loan__application_received': 'START',
    'Appraise_property': 'A',
    'Approve_application': 'B',
    'Assess_eligibility': 'C',
    'Assess_loan_risk': 'D',
    'Cancel_application': 'E',
    'Check__application__form_completeness': 'F',
    'Check_credit_history': 'G',
    'Check_if_home_insurance_quote_is_requested': 'H',
    'Prepare_acceptance_pack': 'I',
    'Receive_updated_application': 'J',
    'Reject_application': 'K',
    'Return_application_back_to_applicant': 'L',
    'Send_acceptance_pack': 'M',
    'Send_home_insurance_quote': 'N',
    'Verify_repayment_agreement': 'O',
    'Loan__application_approved': 'END_A',
    'Loan_application_rejected': 'END_R',
    'Loan__application_canceled': 'END_C',
}

inv_aliases = {v: k for k, v in aliases.items()}

In [2]:
def insensitive_glob(pattern):
    def either(c):
        return '[%s%s]' % (c.lower(), c.upper()) if c.isalpha() else c
    return glob.glob(''.join(map(either, pattern)))

In [3]:
logs = insensitive_glob("../process_mining_datasets/*/*k.MXML")

In [4]:
logs = [x for x in logs if "2.5" not in x]

In [5]:
objects = {
    "model": {
        "kmeans__k=8": KMeans(n_clusters=8, random_state=42, n_jobs=6),
        "kmeans__k=6": KMeans(n_clusters=6, random_state=42, n_jobs=6),
        "kmeans__k=3": KMeans(n_clusters=3, random_state=42, n_jobs=3),
        "agglomerative__k=8_linkage=ward": AgglomerativeClustering(n_clusters=8, linkage='ward'),
        "agglomerative__k=6_linkage=ward": AgglomerativeClustering(n_clusters=6, linkage='ward'),
        "agglomerative__k=3_linkage=ward": AgglomerativeClustering(n_clusters=3, linkage='ward'),
    },
    
    "representation": {
        "binary": lambda x: lr.get_binary_representation(lr.get_traces_as_tokens(x)),
        "frequency": lambda x: lr.get_frequency_representation(lr.get_traces_as_tokens(x)),
        "tfidf": lambda x: lr.get_tfidf_representation(lr.get_traces_as_tokens(x)),
        "transitions": lambda x: lr.get_binary_transitions_representation(lr.get_traces_as_tokens(x)),
        "positions": lambda x: lr.get_positions_representation(lr.get_traces_as_tokens(x)),
        "positions_min_max": lambda x: lr.get_min_max_positions_representation(lr.get_traces_as_tokens(x)),
        "positions_avg": lambda x: lr.get_positions_representation(lr.get_traces_as_tokens(x), include_cols=["_avg"]),
    }
}

In [6]:
objects["model"].keys()

dict_keys(['kmeans__k=8', 'kmeans__k=6', 'kmeans__k=3', 'agglomerative__k=8_linkage=ward', 'agglomerative__k=6_linkage=ward', 'agglomerative__k=3_linkage=ward'])

In [7]:
config = {
    "log": [x for x in logs if "2.5" not in x],
    "representation": ['binary', 'frequency', 'tfidf', 'transitions', 'positions', 'positions_min_max', 'positions_avg'],
    "sliding_window": [False, True],
    "window_size": [100, 150, 200, 250],
    "model": [
        'kmeans__k=6', 'kmeans__k=3', 'agglomerative__k=6_linkage=ward', 'agglomerative__k=3_linkage=ward'
    ]
}

In [8]:
config = {
    "log": [x for x in logs if "2.5" not in x],
    "representation": ['binary', 'frequency', 'tfidf', 'transitions', 'positions', 'positions_min_max', 'positions_avg'],
    "sliding_window": [False, True],
    "window_size": [100, 150, 200, 250],
    "model": [
        'kmeans__k=8', 'agglomerative__k=8_linkage=ward'
    ]
}

In [9]:
from sklearn.model_selection import ParameterGrid
combinations = list(ParameterGrid(config))
len(combinations)

6048

In [10]:
def run_pipeline(args):
    try:
        r = {}
        
        # Tratamento do caminho do arquivo para estruturar nomes
        # e tamanho do log
        split = args["log"].split("/")

        cd_name = split[2]
        log_name = split[3][:-5]

        log_size = log_name.replace(cd_name, "")
        log_size = int(float(log_size.replace("k", "")) * 1000)

        # Monta o vetor de resposta
        y_true = [x for x in range(int(log_size/10), log_size, int(log_size/10))]

        # String para identificação da memória cache
        cached_info = "_".join([
            log_name, str(log_size), args["model"], args["representation"], str(args["window_size"]), str(args["sliding_window"])
        ])

        print(cached_info, end="\r")
        
        # Lê o log e aplica a representação
        log_read = pm.all_prep(args["log"], aliases)
        df = objects["representation"][args["representation"]](
            log_read
        )


        run_df, measures_df = off_sc.run_offline_clustering_window(
            sk_clone(objects["model"][args["model"]]),
            args["window_size"],
            df,
            args["sliding_window"]
        )

        all_metrics = run_df.join(measures_df)
        
        all_metrics.to_csv("../offline_window_clustering_results/" + cached_info + ".csv")
        
        test = pd.read_csv("../offline_window_clustering_results/" + cached_info + ".csv", index_col=0)
        assert test.shape == all_metrics.shape

    except Exception as e:
        traceback.print_exc()

In [83]:
run_pipeline(combinations[20])

centroids000_kmeans__k=8_tfidf_100_True
[[1.         0.         1.         1.         0.         0.
  0.         1.69555007 1.         1.         0.         0.
  0.         1.69555007 0.         0.         0.         0.
  1.        ]
 [1.         0.         1.         1.         0.         0.
  0.         1.69555007 2.         1.         0.         0.
  3.33821227 1.69555007 3.33821227 0.         0.         0.
  1.        ]
 [1.         0.         1.         1.         2.38509508 0.
  2.38509508 0.         1.         1.         1.69075006 1.69075006
  0.         0.         0.         0.         2.25772526 1.69075006
  1.        ]
 [1.         2.38270083 1.         1.         0.         2.38270083
  0.         0.         1.         1.         1.69075006 1.69075006
  0.         0.         0.         2.52831886 0.         1.69075006
  1.        ]
 [1.         0.         1.         1.         2.38509508 0.
  2.38509508 0.         1.         1.         1.69075006 1.69075006
  0.         0. 

In [3]:
from joblib import Parallel, delayed, parallel_backend
from tqdm import tqdm

In [4]:
final_resp = Parallel(n_jobs=40)(
    delayed(run_pipeline)(comb) for comb in tqdm(combinations)
)

NameError: name 'combinations' is not defined

In [12]:
test = pd.read_csv("../offline_window_clustering_results/sw7.5k_7500_kmeans__k=6_transitions_200_True.csv", index_col=0)

## Drift

In [25]:
drift_config = [
    {
        "input": os.listdir("../offline_window_clustering_results"),
        "detection_type": ["tradicional", "cumulative"],
        "rolling_window": [3, 4],
        "std_tolerance": [1.25, 1.5, 1.75, 2, 2.25],
    },
    {
        "input": os.listdir("../offline_window_clustering_results"),
        "detection_type": ["exponential"],
        "rolling_window": [3, 4],
        "std_tolerance": [1.25, 1.5, 1.75, 2, 2.25],
        "smooth_factor": [0.2, 0.4, 0.6, 0.8]
    }
]

In [26]:
def my_split(s):
    return list(filter(None, re.split(r'(\d+)', s)))

def parse_name_info(name):
    return {
        "tipo_mudanca": my_split(name[0])[0],
        "log_size": int(name[1]),
        "representation": name[-3],
        "window_size": int(name[-2]),
        "model": "_".join(name[2:-3]),
        "sliding_window": name[-1].split(".")[0]
    }

In [27]:
parse_name_info(drift_config[0]["input"][1].split("_"))

{'tipo_mudanca': 'sw',
 'log_size': 10000,
 'representation': 'binary',
 'window_size': 250,
 'model': 'kmeans__k=3',
 'sliding_window': 'True'}

In [28]:
from sklearn.model_selection import ParameterGrid
combinations = list(ParameterGrid(drift_config))
len(combinations)

1069080

In [29]:
def drift_detect_pipeline(args):
    all_metrics = pd.read_csv("../offline_window_clustering_results/" + args["input"], index_col=0)
    
    info = parse_name_info(args["input"].split("_"))
    # print(info)
    # Monta o vetor de resposta
    y_true = [x for x in range(int(info["log_size"]/10), info["log_size"], int(info["log_size"]/10))]
    
    results = []
    for col in all_metrics.select_dtypes(include=np.number).columns:
        if (col not in ["k"] and not col.startswith("diff") ) or col in ["diff_centroids"]:
            r = deepcopy(info)
            r["measure"] = col

            # print(col)
            # print(all_metrics[col].dtype)

            if args["detection_type"] == "tradicional":
                detected_drifts, extra = dd.detect_concept_drift(
                    all_metrics, 
                    col,
                    args["rolling_window"],
                    args["std_tolerance"]
                )

            if args["detection_type"] == "cumulative":
                detected_drifts, extra = dd.cumulative_detect_concept_drift(
                    all_metrics, 
                    col,
                    args["rolling_window"],
                    args["std_tolerance"]
                )


            if args["detection_type"] == "exponential":
                detected_drifts, extra = dd.exponential_smooth_detect_concept_drift(
                    all_metrics,
                    col,
                    args["rolling_window"],
                    args["std_tolerance"],
                    args["smooth_factor"]
                )


            metrics_results = dd.get_metrics(
                detected_drifts,
                y_true,
                info["window_size"]
            )


            r.update(args)
            r.update(metrics_results)

            results.append(r)

            gc.collect()
    
    
    to_string = [args["detection_type"], str(args["rolling_window"]), str(args["std_tolerance"]).replace(".", "-")]
    if "smooth_factor" in args:
        to_string += [str(args["smooth_factor"]).replace(".", "-")]
    
    base_name = args["input"].replace(".csv", "")
    
    try:
        os.makedirs("../drift_detection_results/" + base_name)
    except:
        pass
    
    pd.DataFrame(results).to_csv(
        "../drift_detection_results/" + base_name + "/" + "_".join(to_string) + ".csv",
        sep=";"
    )
    # print(col, len(results))

In [None]:
final_resp = Parallel(n_jobs=46)(
    delayed(drift_detect_pipeline)(comb) for comb in tqdm(combinations)
)


  0%|          | 0/1069080 [00:00<?, ?it/s][A
  0%|          | 46/1069080 [00:00<5:20:24, 55.61it/s][A
  0%|          | 92/1069080 [00:10<23:19:06, 12.73it/s][A

In [22]:
import traceback
traceback.print_last()

joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
    r = call_item()
  File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 608, in __call__
    return self.func(*args, **kwargs)
  File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/parallel.py", line 256, in __call__
    for func, args, kwargs in self.items]
  File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/parallel.py", line 256, in <listcomp>
    for func, args, kwargs in self.items]
  File "<ipython-input-17-4fc0d33e3375>", line 48, in drift_detect_pipeline
  File "/home/ubuntu/.jupyter/experimentos_mestrado/drift_detection.py", line 4