# Loading the datasets

In [None]:
from glob import glob

def templates_from_csv(filename):
    """Parse a template CSV file into lists of constant text fragments.

    For every row, the first comma-separated column is dropped. In the
    remainder, double quotes and ``<*>`` markers both act as separators;
    only the fragments containing at least one ASCII letter are kept, with
    their surrounding whitespace removed.

    Returns one list of fragments per row, excluding the first row of the
    file (presumably the CSV header).
    """
    letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    with open(filename, 'r') as f:
        rows = f.readlines()

    parsed_rows = []
    for row in rows:
        # Everything after the first comma (empty string when there is none).
        remainder = row.partition(',')[2]
        fragments = remainder.replace('"', "<*>").split('<*>')
        parsed_rows.append([
            fragment.strip()
            for fragment in fragments
            if any(letter in fragment for letter in letters)
        ])
    return parsed_rows[1:]


def get_cluster_id(line, templates_uniques):
    """Return the index of the template that best matches ``line``.

    A template matches when every one of its fragments is a substring of
    ``line``. When several templates match, the one whose fragment list
    contains all fragments of every other matching template is preferred;
    if no such template exists, the matching template with the largest
    total fragment length wins (the first one on ties).

    Args:
        line: the log line to classify.
        templates_uniques: list of templates, each a list of text fragments.

    Returns:
        The position of the selected template in ``templates_uniques``.

    Raises:
        RuntimeError: if no template matches ``line``.
    """
    candidates = [
        cluster_id
        for cluster_id, fragments in enumerate(templates_uniques)
        if all(fragment in line for fragment in fragments)
    ]

    if not candidates:
        # Imported locally: the module never imports pprint at file level.
        from pprint import pprint
        pprint(templates_uniques)
        raise RuntimeError(
            "Could not identify cluster of this line:\n%s\n%s" %
            (line, [templates_uniques[c] for c in candidates])
        )

    if len(candidates) == 1:
        return candidates[0]

    # Check EVERY candidate for being the most specific template before
    # falling back; the fallback must only run once the whole loop failed.
    for candidate in candidates:
        if all(
            fragment in templates_uniques[candidate]
            for other in candidates if other != candidate
            for fragment in templates_uniques[other]
        ):
            return candidate

    # No candidate subsumes the others: pick the longest one. max() returns
    # the first maximal element, so ties go to the lowest cluster id.
    return max(
        candidates,
        key=lambda c: sum(len(fragment) for fragment in templates_uniques[c]),
    )



def load_data(root_log_path, load_modified=False):
    """Load every dataset found under ``root_log_path``.

    Template files are located with the glob pattern
    ``<root>*/*log_templates[_modified].csv``; the dataset name is the part
    of the file name before ``_2k``. For each dataset the matching
    ``<root><name>/<name>_2k.log`` file is read and every line is passed
    through ``get_cluster_id`` as an integrity check (it raises when a line
    matches no template).

    Returns a ``(templates, logs)`` pair of dicts keyed by dataset name.
    """
    suffix = "_modified" if load_modified else ""
    pattern = root_log_path + "*/*log_templates%s.csv" % suffix

    templates = {}
    logs = {}
    for template_filename in glob(pattern):
        basename = template_filename.rsplit("/", 1)[-1]
        log_name = basename.partition("_2k")[0]

        print("Unpacking %s dataset ..." % log_name, end='')
        templates[log_name] = templates_from_csv(template_filename)
        print("done")

        print("Checking %s integrity ...." % log_name, end='')
        log_path = root_log_path + log_name + "/" + log_name + "_2k.log"
        with open(log_path) as f:
            logs[log_name] = f.read().splitlines()
        for line in logs[log_name]:
            get_cluster_id(line, templates[log_name])
        print("OK")
        print("")
    return templates, logs

In [None]:
# Root directory of the datasets. It is concatenated directly with glob
# patterns and dataset names by load_data, hence the trailing "/".
ROOT_LOG_PATH = "./logs/"

# Alternative: load the unmodified template files instead.
# templates_dict, logs_dict = load_data(ROOT_LOG_PATH)
templates_dict, logs_dict = load_data(ROOT_LOG_PATH, load_modified=True)

# Loading the pattern collections

In [None]:
import json

def load_pattern_collection(filename):
    """Read the JSON file ``filename`` and return its decoded content."""
    with open(filename, 'r') as json_file:
        return json.load(json_file)


def to_logmine_params(pattern_collection: dict):
    """Format a name -> regex mapping as logmine variable arguments.

    Each entry becomes the string ``"<name>:/regex/"`` (the double quotes
    are part of the returned string), with every ``[`` and ``]`` of the
    regex prefixed by a backslash.

    Args:
        pattern_collection: mapping from variable name to regex string.

    Returns:
        A list with one formatted string per entry, in mapping order.
    """
    params = []
    for name, pattern in pattern_collection.items():
        # Explicit "\\[" / "\\]": the bare "\[" spelling is an invalid
        # escape sequence that newer Python versions warn about.
        escaped = pattern.replace("[", "\\[").replace("]", "\\]")
        params.append('"<%s>:/%s/"' % (name, escaped))
    return params


# Pattern collections (name -> regex mappings) loaded from the JSON
# parameter files when this cell runs.
BASIC_COLLECTION = load_pattern_collection("./parameters/basic_collection.json")
SPECIFIC_COLLECTION = load_pattern_collection("./parameters/specific_collection.json")

# Metrics

In [None]:
from collections import defaultdict
from sklearn.metrics import adjusted_rand_score


def get_ground_truth_as_list_of_sets(lines, templates):
    """Group line indices by their ground-truth cluster.

    Each line is classified with ``get_cluster_id``; the result is a list of
    sets of line indices, one set per cluster id encountered, ordered by
    first appearance.
    """
    members_by_cluster = defaultdict(set)
    cluster_ids = (get_cluster_id(line, templates) for line in lines)
    for line_index, cluster_id in enumerate(cluster_ids):
        members_by_cluster[cluster_id].add(line_index)
    return list(members_by_cluster.values())


def get_ground_truth_as_list(lines, templates):
    """Return the ground-truth cluster id of each line, in line order."""
    cluster_ids = []
    for line in lines:
        cluster_ids.append(get_cluster_id(line, templates))
    return cluster_ids


def get_parsing_accuracy(obtained_clusters, lines, templates):
    """Return the fraction of lines that sit in an exactly-correct cluster.

    An obtained cluster counts only when its set of line indices is equal to
    one of the ground-truth clusters; the sizes of those clusters are summed
    and divided by the number of lines.

    Args:
        obtained_clusters: iterable of clusters, each an iterable of line
            indices.
        lines: the log lines that were clustered.
        templates: ground-truth templates, as accepted by ``get_cluster_id``.

    Returns:
        A float in [0, 1]; ``0.0`` when ``lines`` is empty (nothing was
        parsed correctly, and this avoids a division by zero).
    """
    if not lines:
        return 0.0

    # frozensets are hashable, so membership is a hash lookup instead of a
    # linear scan over the list of ground-truth sets.
    ground_truth_clusters = {
        frozenset(cluster)
        for cluster in get_ground_truth_as_list_of_sets(lines, templates)
    }
    correctly_parsed = sum(
        len(cluster)
        for cluster in map(frozenset, obtained_clusters)
        if cluster in ground_truth_clusters
    )
    return correctly_parsed / len(lines)


def get_rand_index(obtained_clusters, lines, templates):
    """Return the adjusted Rand index between obtained and true clusters.

    The obtained clustering is converted to one label per line (the index of
    the cluster that lists the line; when a line appears in several
    clusters the last one wins). Lines that appear in no cluster are all
    given one extra, otherwise unused label.

    Args:
        obtained_clusters: iterable of clusters, each an iterable of line
            indices.
        lines: the log lines that were clustered.
        templates: ground-truth templates, as accepted by ``get_cluster_id``.

    Returns:
        The value of ``adjusted_rand_score`` for the two labelings.
    """
    ground_truth_labels = get_ground_truth_as_list(lines, templates)

    predicted_labels = [None] * len(lines)
    num_clusters = 0
    for cluster_id, cluster in enumerate(obtained_clusters):
        num_clusters = cluster_id + 1
        for line_index in cluster:
            predicted_labels[line_index] = cluster_id

    # Cluster ids are 0..num_clusters-1, so num_clusters is always a free
    # label -- even when no line was assigned at all, a case in which taking
    # max() over the assigned labels would fail on an empty collection.
    predicted_labels = [
        num_clusters if label is None else label
        for label in predicted_labels
    ]
    return adjusted_rand_score(ground_truth_labels, predicted_labels)


# Keys used in the result dictionaries produced by the evaluate_* helpers.
TIME = "time"
PA = "parsing accuracy"
ARI = "adjusted rand index"
NUM_CLUSTERS = "number of clusters"


# Metric name -> function(obtained_clusters, lines, templates).
METRICS = {
    PA: get_parsing_accuracy,
    ARI: get_rand_index,
}

# Logmine

In [None]:
from time import time
import subprocess

# Directory containing the `logmine` executable run by logmine_clustering.
LOGMINE_REPO_PATH = "../logmine"


def logmine_clustering(
        logmine_repo_path: str,
        file_path: str,
        k1: float = None,
        k2: float = None,
        max_dist: float = None,
        logmine_regexps: list = None,
        verbose=False
):
    """Run the logmine executable on a log file and parse its clusters.

    The command line is ``<logmine_repo_path>/logmine`` followed by the
    optional ``-k1``, ``-k2`` and ``-m`` settings, ``-i 1``, the file path
    and, when regexps are given, ``-v`` plus the regexps. The output is
    expected to hold one cluster per line, as whitespace-separated integers
    (presumably line indices -- this depends on the logmine build in use).

    Args:
        logmine_repo_path: directory containing the ``logmine`` executable.
        file_path: path of the log file to cluster.
        k1, k2, max_dist: optional values forwarded as ``-k1``, ``-k2`` and
            ``-m``; omitted from the command line when ``None``.
        logmine_regexps: optional list of variable definitions passed after
            ``-v``; ``None`` or an empty list adds nothing.
        verbose: when true, print the raw logmine output.

    Returns:
        A list of clusters, each a list of ints.

    Raises:
        subprocess.CalledProcessError: if logmine exits with a non-zero code.
        ValueError: if an output token is not an integer.
    """
    call_args = [logmine_repo_path + "/logmine"]
    for flag, value in (("-k1", k1), ("-k2", k2), ("-m", max_dist)):
        if value is not None:
            call_args += [flag, str(value)]
    call_args += ["-i", "1", file_path]

    # Truthiness test: the default is None, on which len() would raise.
    if logmine_regexps:
        call_args += ["-v", *logmine_regexps]

    logmine_output = subprocess.check_output(call_args).decode('utf-8')
    if verbose:
        print(logmine_output)

    # split() without argument tolerates repeated/trailing spaces; blank
    # lines carry no cluster and are skipped.
    return [
        [int(token) for token in line.split()]
        for line in logmine_output.splitlines()
        if line.strip()
    ]

def evaluate_logmine_clustering(
        logmine_repo_path,
        file_path,
        ground_truth_templates,
        metrics,
        k1=None,
        k2=None,
        max_dist=None,
        logmine_regexps=None,
):
    """Cluster a log file with logmine and score the result.

    Runs ``logmine_clustering`` (timing that call only), re-reads the log
    file and evaluates every function of ``metrics`` as
    ``metric(clusters, lines, ground_truth_templates)``.

    Returns a dict holding one entry per metric name plus the ``TIME``
    (seconds) and ``NUM_CLUSTERS`` entries.
    """
    regexps = [] if logmine_regexps is None else logmine_regexps

    started_at = time()
    clusters_as_list = logmine_clustering(
        logmine_repo_path, file_path, k1, k2, max_dist, regexps
    )
    computation_time = time() - started_at

    with open(file_path, 'r') as f:
        lines = f.read().splitlines()

    results = {}
    for name, metric in metrics.items():
        results[name] = metric(clusters_as_list, lines, ground_truth_templates)
    results[TIME] = computation_time
    results[NUM_CLUSTERS] = len(clusters_as_list)
    return results


In [None]:
# Dataset evaluated by this cell; it selects both the log file and the
# ground-truth templates.
LOG_NAME = "Proxifier"
# NOTE: this initial value is overwritten by the sweep loop below, which
# reuses MAX_DIST as its loop variable.
MAX_DIST = 0.025

file_path = ROOT_LOG_PATH + LOG_NAME + "/" + LOG_NAME + "_2k.log"

# Sweep the max_dist setting of logmine and print the metrics for each value.
for MAX_DIST in [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22, 0.24, 0.26, 0.28, 0.3, 0.32, 0.36, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]:
    print(MAX_DIST, evaluate_logmine_clustering(
        LOGMINE_REPO_PATH,
        file_path,
        templates_dict[LOG_NAME],
        METRICS,
        max_dist=MAX_DIST,
        logmine_regexps=to_logmine_params(BASIC_COLLECTION)
    ))

# Drain

In [None]:
import configparser

# Default location of the Drain3 ini file that is rewritten before each run.
PATH_TO_DRAIN_CONFIG = "./drain3config.ini"


def write_drain_config_file(
    map_name_re,
    extra_delimiters: list = ["_"],
    sim_th: float = 0.4,
    depth: int = 4,
    max_children: int = 100,
    max_clusters: int = 1024,
    path_to_config: str = PATH_TO_DRAIN_CONFIG,
):
    """Update the Drain3 ini file with the given masking and Drain settings.

    The existing file is read first so unrelated options are preserved; a
    missing file or missing ``MASKING`` / ``DRAIN`` sections are created.

    Args:
        map_name_re: mapping from mask name to regex. Written as a JSON list
            of ``{"regex_pattern", "mask_with"}`` objects; the entry named
            ``"any"`` is skipped.
        extra_delimiters: extra token delimiters, written to the ``DRAIN``
            section as a Python list literal. ``None`` is written as an
            empty list. The default list is never mutated.
        sim_th, depth, max_children, max_clusters: written as strings to the
            ``DRAIN`` section under the same names.
        path_to_config: path of the ini file to read and overwrite.
    """
    config = configparser.ConfigParser()
    # read() silently ignores a missing file and leaves the parser empty.
    config.read(path_to_config)

    for section in ("MASKING", "DRAIN"):
        if not config.has_section(section):
            config.add_section(section)

    config["MASKING"]["masking"] = json.dumps([
        {"regex_pattern": pattern, "mask_with": name}
        for name, pattern in map_name_re.items()
        if name != "any"
    ])

    delimiters = [] if extra_delimiters is None else list(extra_delimiters)
    config["DRAIN"]["extra_delimiters"] = repr(delimiters)
    config["DRAIN"]["sim_th"] = str(sim_th)
    config["DRAIN"]["depth"] = str(depth)
    config["DRAIN"]["max_children"] = str(max_children)
    config["DRAIN"]["max_clusters"] = str(max_clusters)

    with open(path_to_config, "w") as configfile:
        config.write(configfile)
# Drain3 is optional: when it cannot be imported, a placeholder
# drain_clustering with the same signature is defined instead.
try:
    from drain3 import TemplateMiner
    from drain3.template_miner_config import TemplateMinerConfig
except ImportError:
    print("Warning: could not load Drain")

    def drain_clustering(
            log_file_path: str,
            map_name_re,
            extra_delimiters: list = ["_"],
            sim_th: float = 0.4,
            depth: int = 4,
            max_children: int = 100,
            max_clusters: int = 1024,
            path_to_config: str = PATH_TO_DRAIN_CONFIG,
            show_clusters=False
    ):
        """Placeholder used when Drain3 is not installed.

        Raises:
            ImportError: always. Returning ``None`` would only make callers
                fail later with an unrelated error.
        """
        raise ImportError(
            "Error! Drain is not installed !!(https://github.com/IBM/Drain3)"
        )
else:
    def drain_clustering(
            log_file_path: str,
            map_name_re,
            extra_delimiters: list = ["_"],
            sim_th: float = 0.4,
            depth: int = 4,
            max_children: int = 100,
            max_clusters: int = 1024,
            path_to_config: str = PATH_TO_DRAIN_CONFIG,
            show_clusters=False
    ):
        """Cluster the lines of a log file with Drain3.

        The settings are first written to ``path_to_config`` with
        ``write_drain_config_file``; a ``TemplateMiner`` is then built from
        that file and fed every line of ``log_file_path`` in order.

        Args:
            log_file_path: path of the log file to cluster.
            map_name_re, extra_delimiters, sim_th, depth, max_children,
                max_clusters, path_to_config: forwarded unchanged to
                ``write_drain_config_file``.
            show_clusters: when true, print every cluster held by the miner
                after all lines were processed. Accepted here because
                ``evaluate_drain_clustering`` always passes it.

        Returns:
            A ``defaultdict(list)`` mapping each Drain cluster id to the
            list of (0-based) indices of the lines assigned to it.
        """
        write_drain_config_file(
            map_name_re,
            extra_delimiters,
            sim_th,
            depth,
            max_children,
            max_clusters,
            path_to_config
        )

        config = TemplateMinerConfig()
        config.load(path_to_config)
        config.profiling_enabled = False
        template_miner = TemplateMiner(config=config)

        with open(log_file_path, 'r') as f:
            lines = f.readlines()
        clusters_as_dict = defaultdict(list)
        for i, line in enumerate(lines):
            cluster_id = template_miner.add_log_message(line)["cluster_id"]
            clusters_as_dict[cluster_id].append(i)

        if show_clusters:
            for cluster in template_miner.drain.clusters:
                print(cluster)
        return clusters_as_dict
        

def evaluate_drain_clustering(
    log_file_path: str,
    ground_truth_templates,
    metrics,
    map_name_re,
    extra_delimiters: list = ["_"],
    sim_th: float = 0.4,
    depth: int = 4,
    max_children: int = 100,
    max_clusters: int = 1024,
    path_to_config: str = PATH_TO_DRAIN_CONFIG,
    show_clusters=False
):
    """Cluster a log file with Drain and score the result.

    Calls ``drain_clustering`` with the given settings, turns its dict of
    clusters into a list, then re-reads the log file and evaluates every
    function of ``metrics`` as
    ``metric(clusters, lines, ground_truth_templates)``.

    Returns a dict holding one entry per metric name plus the ``TIME``
    (seconds, covering the clustering call and the list conversion) and
    ``NUM_CLUSTERS`` entries.
    """
    started_at = time()
    clusters_as_dict = drain_clustering(
        log_file_path,
        map_name_re,
        extra_delimiters,
        sim_th,
        depth,
        max_children,
        max_clusters,
        path_to_config,
        show_clusters=show_clusters,
    )
    clusters_as_list = list(clusters_as_dict.values())
    computation_time = time() - started_at

    with open(log_file_path, 'r') as f:
        lines = f.readlines()

    results = {}
    for name, metric in metrics.items():
        results[name] = metric(clusters_as_list, lines, ground_truth_templates)
    results[TIME] = computation_time
    results[NUM_CLUSTERS] = len(clusters_as_list)
    return results

In [None]:
# Grid sweep over Drain's similarity threshold and tree depth; prints the
# metrics for every combination. Uses the file_path and LOG_NAME globals
# set by the logmine cell.
for DRAIN_SIM_TH in [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]:
    for DRAIN_DEPTH in [3, 4, 5, 6]:
        print(DRAIN_SIM_TH, DRAIN_DEPTH, evaluate_drain_clustering(
            file_path,
            templates_dict[LOG_NAME],
            METRICS,
            BASIC_COLLECTION,
            sim_th=DRAIN_SIM_TH,
            depth=DRAIN_DEPTH
        ))