In [87]:
import itertools
from os import listdir
from os.path import isfile, join, exists, dirname
import sys
import os

import pandas as pd

from logai.dataloader.data_loader import FileDataLoader
from logai.dataloader.data_loader import FileConfig
from logai.information_extraction.log_parser import LogParser, LogParserConfig
from logai.algorithms.parsing_algo.drain import DrainParams
from logai.preprocess.preprocess import Preprocessor, PreprocessorConfig
from logai.dataloader.data_model import LogRecordObject
from logai.utils import constants

In [88]:
# const


# Minimum acceptable similarity between the patterns of one cluster and that
# cluster's base pattern. A cluster whose lowest similarity falls below this
# value is reported and recorded in `non_qualified_clusters`.
SIM_THRE = 0.6

# functions

def read_log(filepath):
    """Load one legacy log export into a two-column DataFrame.

    The file's base name (the text before the first ".") is used to build the
    marker ",<name>`". Only lines containing that marker are kept: the log
    text is everything from the first marker to the end of the line with all
    marker occurrences removed, and the cluster label is the text before the
    line's first comma.

    Args:
        filepath: Path of the text file to read.

    Returns:
        pd.DataFrame with string columns "logline" (whitespace-stripped) and
        "cluster_label". Lines without the marker, or with nothing after it,
        are skipped.
    """
    name = os.path.basename(filepath).split(".")[0]
    pattern = ",{}`".format(name)

    loglines = []
    cluster_labels = []
    # The context manager closes the file; no explicit close() is required.
    with open(filepath, "r") as f:
        for line in f:
            start_index = line.find(pattern)
            if start_index == -1:
                continue
            # Remove only the line terminator. Slicing with [:-1] would also
            # eat the last real character of a final line that does not end
            # with a newline.
            logline = line[start_index:].rstrip("\n").replace(pattern, "")
            if logline:
                loglines.append(logline.strip())
                cluster_labels.append(line.split(",")[0])

    return pd.DataFrame(list(zip(loglines, cluster_labels)),
                        columns=["logline", "cluster_label"])

# Read a CSV export from the relabelled datasets (sf_datasets_new_labels)
def read_new_logs(path):
    """Read a CSV export and return its raw log text with cluster labels.

    Args:
        path: Location of a CSV file whose first row is the header and which
            contains at least the columns "_raw" and "cluster_label".

    Returns:
        pd.DataFrame with two columns: "logline" (the values of "_raw") and
        "cluster_label".
    """
    frame = pd.read_csv(path, header=0, low_memory=False)
    selected = frame[['_raw', 'cluster_label']]
    return selected.rename(columns={'_raw': 'logline'})


# Levenshtein distance
import numpy as np
def levenshteinDistanceDP(token1, token2):
    """Edit distance between two sequences, treating "*" as a wildcard.

    Classic dynamic-programming Levenshtein distance in which insertions,
    deletions and substitutions each cost 1. An element equal to "*" on
    either side matches whatever element it is aligned with at no cost.

    Args:
        token1: First sequence (a list of tokens or a string).
        token2: Second sequence (a list of tokens or a string).

    Returns:
        The distance as a NumPy float (the table is built with np.zeros).
    """
    rows = len(token1) + 1
    cols = len(token2) + 1
    distances = np.zeros((rows, cols))

    # Boundary: aligning a prefix with the empty sequence costs its length.
    distances[:, 0] = np.arange(rows)
    distances[0, :] = np.arange(cols)

    for r, left in enumerate(token1, start=1):
        for c, right in enumerate(token2, start=1):
            if left == right or left == "*" or right == "*":
                # Match (or wildcard): carry the diagonal cost over unchanged.
                distances[r, c] = distances[r - 1, c - 1]
                continue
            # Mismatch: one more than the cheapest neighbouring alignment.
            distances[r, c] = 1 + min(
                distances[r, c - 1],
                distances[r - 1, c],
                distances[r - 1, c - 1],
            )

    return distances[rows - 1][cols - 1]

def calc_distance(base, p):
    """Compare two patterns token by token.

    Both strings are split on single spaces and compared with
    levenshteinDistanceDP.

    Args:
        base: Reference pattern string.
        p: Pattern string to compare against the reference.

    Returns:
        A tuple (distance, similarity) where similarity is
        (longest - distance) / longest and `longest` is the token count of
        the longer pattern.
    """
    base_tokens, other_tokens = base.split(" "), p.split(" ")
    longest = max(len(base_tokens), len(other_tokens))
    dis = levenshteinDistanceDP(base_tokens, other_tokens)
    return dis, (longest - dis) * 1.0 / longest

def get_sim_table(parsed_loglines: pd.Series, lrt, cluster):
    """Score every distinct pattern of a cluster against its base pattern.

    The base pattern is the first entry produced by value_counts (sorted by
    descending share), i.e. the row labelled 0 after reset_index.

    Args:
        parsed_loglines: Series of parsed log patterns for one cluster.
        lrt: Value copied into the "lrt" column of every output row.
        cluster: Value copied into the "cluster" column of every output row.

    Returns:
        A tuple (table, base_pattern). `table` has one row per distinct
        pattern with the columns "lrt", "cluster", "portion", "distance",
        "similarity", "pattern" and "base_pattern".
    """
    pattern_counts = parsed_loglines.value_counts(normalize=True, sort=True, ascending=False).reset_index()
    pattern_counts.columns = ["pattern", "portion"]
    pattern_counts = pattern_counts.sort_values(by="portion", ascending=False)
    # Label-based lookup: row 0 is the head of the value_counts output,
    # wherever the sort above happened to place it.
    base_pattern = pattern_counts.loc[0, "pattern"]

    similarity_table = [
        [lrt, cluster, portion, *calc_distance(base_pattern, pattern), pattern, base_pattern]
        for pattern, portion in zip(pattern_counts["pattern"], pattern_counts["portion"])
    ]
    res = pd.DataFrame(
        similarity_table,
        columns=["lrt", "cluster", "portion", "distance", "similarity", "pattern", "base_pattern"],
    )
    return res, base_pattern

from logai.utils.functions import get_parameter_list


# Read a CSV export from the relabelled datasets (sf_datasets_new_labels)
def read_new_logs(path):
    """Read a CSV export and return its raw log text with cluster labels.

    NOTE(review): this cell defines read_new_logs a second time; this later
    definition is the one in effect once the cell has run. Consider keeping
    a single copy.

    Args:
        path: Location of a CSV file whose first row is the header and which
            contains at least the columns "_raw" and "cluster_label".

    Returns:
        pd.DataFrame with two columns: "logline" (the values of "_raw") and
        "cluster_label".
    """
    column_map = {'_raw': 'logline', 'cluster_label': 'cluster_label'}
    table = pd.read_csv(path, header=0, low_memory=False)
    return table[list(column_map)].rename(columns=column_map)

def parse_logs(loglines, cluster_label, lrt):
    """Parse one cluster's log lines with Drain and tag the result.

    A fresh LogParser is configured with the 'drain' algorithm
    (sim_th=0.1, no extra delimiters) on every call.

    Args:
        loglines: pandas object holding the log lines; missing values are
            dropped before parsing.
        cluster_label: Value stored in the result's "cluster_label" column.
        lrt: Value stored in the result's "lrt" column.

    Returns:
        The object returned by LogParser.parse with the two columns above
        added to it.
    """
    parser = LogParser(
        LogParserConfig(
            parsing_algorithm='drain',
            parsing_algo_params=DrainParams(sim_th=0.1, extra_delimiters=[]),
        )
    )
    parsed_result = parser.parse(loglines.dropna())
    parsed_result['cluster_label'] = cluster_label
    parsed_result['lrt'] = lrt
    return parsed_result




def get_parameter_list_2(row):
    """Extract the parameter values of a log line given its parsed pattern.

    The raw line and the pattern are split on whitespace and walked in
    parallel:
      * identical tokens close the parameter collected so far;
      * a "*" in the pattern captures the current line token; two "*" in a
        row start a new parameter instead of extending the previous one;
      * any other mismatch is absorbed into the parameter being collected
        while the pattern position stays put.
    A parameter still open when the walk ends also receives every line token
    that was not consumed.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            constants.LOGLINE_NAME and constants.PARSED_LOGLINE_NAME.

    Returns:
        List of parameter strings (tokens joined with single spaces); empty
        when either field is not a string.
    """
    params = []
    if not (isinstance(row[constants.LOGLINE_NAME], str)
            and isinstance(row[constants.PARSED_LOGLINE_NAME], str)):
        return params

    line_tokens = row[constants.LOGLINE_NAME].split()
    pattern_tokens = row[constants.PARSED_LOGLINE_NAME].split()
    n_line = len(line_tokens)
    n_pattern = len(pattern_tokens)

    pending = []                # tokens of the parameter being collected
    line_pos = 0
    pattern_pos = 0
    after_wildcard = False      # True when the previous pattern token was "*"

    while line_pos < n_line and pattern_pos < n_pattern:
        token = line_tokens[line_pos]
        slot = pattern_tokens[pattern_pos]

        if token == slot:
            # Literal match: whatever was being collected is complete.
            if pending:
                params.append(" ".join(pending))
                pending = []
            after_wildcard = False
            pattern_pos += 1
        elif slot == "*":
            if after_wildcard:
                # Back-to-back wildcards are separate parameters.
                params.append(" ".join(pending))
                pending = [token]
            else:
                pending.append(token)
            after_wildcard = True
            pattern_pos += 1
        else:
            # Extra line token: fold it into the current parameter and retry
            # the same pattern token against the next line token.
            pending.append(token)
        line_pos += 1

    if pending:
        # The slice is empty once the whole line has been consumed.
        params.append(" ".join(pending + line_tokens[line_pos:]))
    return params



In [89]:
# Debugging aids: referenced only by the commented-out filters in the loops below.
TARGET_CLUSTER = 13
TARGET_LRT = 'ailtn'

# NOTE(review): absolute, machine-specific locations; point these at your own checkout.
OLD_DIR = "/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets/"
DIR = "/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets_new_labels/"

old_file_list = os.listdir(OLD_DIR)
file_list = os.listdir(DIR)

# Log types whose data is taken from DIR; their files in OLD_DIR are ignored.
new_labels = ["ailtn", "mqdbg", "mqfrm"]

# Entries that are not regular files (e.g. sub-directories) are skipped because
# the readers used below can only open files.
old_path = [
    os.path.join(OLD_DIR, f)
    for f in old_file_list
    if f.split(".")[0] not in new_labels and isfile(os.path.join(OLD_DIR, f))
]
new_path = [os.path.join(DIR, f) for f in file_list if isfile(os.path.join(DIR, f))]

paths = old_path + new_path


def _is_excluded_label(label):
    """Return True when `label` is the -1 cluster label, as a number or as text.

    read_log() yields labels as strings while read_new_logs() yields whatever
    pandas parsed from the CSV, so the comparison is done numerically.
    """
    try:
        return float(label) == -1
    except (TypeError, ValueError):
        return False


parsing_res = []
non_qualified_clusters = []
# One snapshot per parsed cluster, concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call.
parsed_frames = []

for path in paths:
    file = path.split("/")[-1]
    is_new_label_file = "sf_datasets_new_labels" in path
    lrt = file.split('_')[0] if is_new_label_file else file.split(".")[0]
    # if lrt != TARGET_LRT:
    #     continue
    print(lrt)
    print(path)
    logs = read_new_logs(path) if is_new_label_file else read_log(path)

    logrecord = LogRecordObject(body=logs['logline'], attributes=logs[['cluster_label']])
    custom_delimeter_regex = [r"`+|\s+"]
    preprocessor = Preprocessor(PreprocessorConfig(custom_delimiters_regex=custom_delimeter_regex))
    preprocessed_loglines = preprocessor.clean_log(logrecord.body)
    index_groups = preprocessor.group_log_index(logrecord.attributes, by=['cluster_label'])

    # Walk label and member indices together so no row is looked up by label
    # in one place and by position in another.
    for cluster_label, indices in zip(index_groups['cluster_label'], index_groups['group_index']):
        # if cluster_label != TARGET_CLUSTER:
        #     continue
        if _is_excluded_label(cluster_label):
            continue
        if len(indices) == 1:
            # A single line gives the parser nothing to generalise over.
            continue

        loglines_in_group = preprocessed_loglines.iloc[indices]
        parsed_result = parse_logs(loglines_in_group, cluster_label, lrt)
        # Snapshot now: parsed_result may be rewritten below, and the exported
        # map must keep the parser's original output.
        parsed_frames.append(parsed_result.copy())

        uniq_patterns = parsed_result[constants.PARSED_LOGLINE_NAME].unique()
        num_p = len(uniq_patterns)

        if num_p > 1:
            similarity_table, base_pattern = get_sim_table(parsed_result[constants.PARSED_LOGLINE_NAME], lrt, cluster_label)
            min_similarity = min(similarity_table['similarity'])
            if min_similarity < SIM_THRE:
                print("similarity is lower than threshold: {}".format(SIM_THRE))
                non_qualified_clusters.append((lrt, cluster_label, min_similarity))
        else:
            base_pattern = uniq_patterns[0]

        if "*" in base_pattern:
            # Force every line of the cluster onto the base pattern and
            # re-derive its parameters against that single pattern.
            parsed_result[constants.PARSED_LOGLINE_NAME] = base_pattern
            parsed_result[constants.PARAMETER_LIST_NAME] = parsed_result.apply(get_parameter_list_2, axis=1)

        # Transpose the per-line parameter lists into per-position value sets;
        # shorter lists are padded with None.
        para_list = [
            set(values)
            for values in itertools.zip_longest(*parsed_result[constants.PARAMETER_LIST_NAME], fillvalue=None)
        ]

        ps_res = {
            "lrt": lrt,
            "cluster_label": cluster_label,
            "base_pattern": base_pattern,
            "parameter_list": para_list
        }
        parsing_res.append(ps_res)

logline_map = pd.concat(parsed_frames) if parsed_frames else pd.DataFrame()


mlmul
/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets/mlmul.csv
ksgen
/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets/ksgen.csv
augen
/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets/augen.csv
s
/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets/s.csv
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower than threshold: 0.6
similarity is lower 

In [90]:
# NOTE(review): absolute, machine-specific output location.
Res_DIR = "/Users/qcheng/workspace/gitsoma/logai/logai/results/pattern_discovery/"
# Create the folder up front so a missing directory cannot abort the export
# after the processing has already finished.
os.makedirs(Res_DIR, exist_ok=True)

# Parser output for every processed cluster, one row per log line.
logline_map.to_csv(os.path.join(Res_DIR, "logline_map.csv"))

# One record per cluster: base pattern plus the value sets of its parameters.
res_file = os.path.join(Res_DIR, "cluster_patterns.json")
pd.DataFrame.from_dict(parsing_res).to_json(res_file, orient='records')

# Clusters whose lowest pattern similarity fell below SIM_THRE.
pd.DataFrame(non_qualified_clusters, columns=["lrt", "cluster", "min_sim"]).to_csv(os.path.join(Res_DIR, "non_qualified_clusters.csv"), index=False, header=True)