In [1]:
from os import listdir
from os.path import isfile, join, exists, dirname
import sys
import os

import pandas as pd

from logai.dataloader.data_loader import FileDataLoader
from logai.dataloader.data_loader import DataLoaderConfig
from logai.information_extraction.log_parser import LogParser, LogParserConfig
from logai.algorithms.parsing_algo.drain import DrainParams
from logai.preprocess.preprocess import Preprocessor, PreprocessorConfig
from logai.dataloader.data_model import LogRecordObject

In [2]:
# functions

def read_log(name):
    filepath = os.path.join(DIR, "{}.csv".format(name))

    with open(filepath, "r") as f:
        loglines = []
        cluster_labels = []
        pattern = ",{}`".format(name)
        for l in f.readlines():
            start_index = l.find(",{}`".format(name))
            if start_index != -1:
                logline = l[start_index:-1].replace(pattern, "")
                if logline:
                    loglines.append(logline.strip())
                    cluster_labels.append(l.split(",")[0])
    f.close()
    return loglines, cluster_labels

# read new ailtn
def read_new_ailtn():
    path = '/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets_new_labels/ailtn_with_label.csv'
    df = pd.read_csv(path, header=0)
    return df['_raw'], df['cluster_label']

# Levenshtein distance
import numpy as np
def levenshteinDistanceDP(token1, token2):
    distances = np.zeros((len(token1) + 1, len(token2) + 1))

    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1

    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2

    a = 0
    b = 0
    c = 0

    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if token1[t1-1] == token2[t2-1] \
                    or token1[t1-1] == "*" or token2[t2-1] == "*":
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]

                if a <= b and a <= c:
                    distances[t1][t2] = a + 1
                elif b <= a and b <= c:
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1

    return distances[len(token1)][len(token2)]

def calc_distance(base, p):
    b_token = base.split(" ")
    p_token = p.split(" ")

    l = max(len(b_token), len(p_token))
    dis = levenshteinDistanceDP(b_token, p_token)
    sim = (l - dis) * 1.0 / l
    return dis, sim

def get_sim_table(parsed_loglines: pd.Series, lrt, cluster):
    pattern_counts = parsed_loglines.value_counts(normalize=True, sort=True, ascending=False).reset_index()
    pattern_counts.columns = ["pattern", "portion"]
    pattern_counts.sort_values(by="portion", ascending=False, inplace=True)
    base_pattern = pattern_counts["pattern"][0]

    similarity_table = []
    for index, row in pattern_counts.iterrows():
        pattern = row["pattern"]
        portion = row["portion"]
        dis, sim = calc_distance(base_pattern, pattern)
        similarity_table.append([lrt, cluster, portion, dis, sim, pattern, base_pattern])
    res = pd.DataFrame.from_dict(similarity_table)
    res.columns=["lrt", "cluster", "portion", "distance", "similarity", "pattern", "base_pattern"]
    return res, base_pattern

from logai.utils.functions import get_parameter_list


# read new ailtn
def read_new_logs(path):
    df = pd.read_csv(path, header=0, low_memory=False)
    res = df[['_raw', 'cluster_label']]
    res.columns = ['logline', 'cluster_label']
    return res

def parse_logs(loglines, cluster_label, lrt):
    drain_config = DrainParams(sim_th=0.1,                       extra_delimiters=[])
    log_parser_config = LogParserConfig(parsing_algorithm='drain',                              parsing_algo_params=drain_config)
    parser = LogParser(log_parser_config)
    parsed_result = parser.parse(loglines.dropna())
    parsed_result['cluster_label'] = cluster_label
    parsed_result['lrt'] = lrt
    return parsed_result


def cal_similarity_for_lrt(logs:pd.DataFrame, lrt):

    logrecord = LogRecordObject(body=logs['logline'], attributes=logs[['cluster_label']])
    similarity = pd.DataFrame()
    custom_delimeter_regex = [r"`+|\s+"]
    preprocessor = Preprocessor(PreprocessorConfig(custom_delimiters_regex=custom_delimeter_regex))
    preprocessed_loglines = preprocessor.clean_log(logrecord.body)
    index_groups = preprocessor.group_log_index(logrecord.attributes, by=['cluster_label'])


    for i in index_groups.index:
        cluster_label = index_groups['cluster_label'].iloc[i]
        indices = index_groups['group_index'][i]
        if index_groups['cluster_label'].iloc[i] == -1:
            continue
        if len(indices) == 1:
            continue
        loglines_in_group = preprocessed_loglines.iloc[indices]
        parsed_result = parse_logs(loglines_in_group, cluster_label, lrt)

        uniq_patterns = parsed_result['parsed_logline'].unique()
        num_p = len(uniq_patterns)

        if num_p > 1:
            similarity_table, base_pattern = get_sim_table(parsed_result["parsed_logline"], lrt, cluster_label)
            similarity = similarity.append(similarity_table)
            parsed_result['parsed_logline'] = base_pattern
            parsed_result['parameter_list'] = parsed_result.apply(get_parameter_list, axis=1)

        para_list = parsed_result['parameter_list'].to_list()

    return similarity, parsed_result, base_pattern, para_list


In [3]:
# Data layer
# Load log data and store all data in @LogRecordObject. Currently only implemented FileDataLoader.

# Please change the filepath correspondingly.
# I've put the ./data dir in .gitignore to avoid checking in data unexpectedly

#File Configuration

filepath = "/Users/qcheng/workspace/gitsoma/logai/logai/data/default_data/dbscan_clustering_clean - dbscan_clustering_clean.csv"
log_type = 'csv'
dimensions = {'attributes': ['cluster_label'],
              'body': ['logline']}
custom_delimeter_regex = [r"`+|\s+"]

file_config = DataLoaderConfig(
    filepath=filepath,
    log_type='csv',
    dimensions=dimensions,
    custom_delimeter_regex=custom_delimeter_regex,
    header=0
)

dataloader = FileDataLoader(file_config)
logrecord = dataloader.load_data()
# Preprocess
# Do customer rules to initially parse the loglines. Add custom delimeters in a regex
# Group log records by any attributes. Return grouped log index so follow up process can handle them separately.

preprocessor = Preprocessor(PreprocessorConfig(custom_delimiters_regex=custom_delimeter_regex))
preprocessed_loglines = preprocessor.clean_log(logrecord.body)

#bucket loglines into groups.
index_groups = preprocessor.group_log_index(attributes=logrecord.attributes, by=['cluster_label'])

# Information Extraction
drain_config = DrainParams(sim_th=0.4,
                           extra_delimiters=[])

log_parser_config = LogParserConfig(parsing_algorithm='drain',
                                    parsing_algo_params=drain_config)

# to_parse = preprocessed_loglines['logline']

# parser = LogParser(log_parser_config)
# parsed_result = parser.parse(to_parse)
# num_patterns = dict()
#
# tree_path = '/Users/qcheng/workspace/gitsoma/logai/logai/results/pattern_discovery/tree/'
#
#
# for i in index_groups.index[:500]:
#     indices = index_groups['group_index'][i]
#     loglines_in_group = preprocessed_loglines.iloc[indices]
#     parser = LogParser(log_parser_config)
#     parsed_result = parser.parse(loglines_in_group.dropna()[dimensions['body'][0]])
#     longest_log_length = max([len(l) for l in parsed_result['logline']])
#     uniq_patterns = parsed_result['parsed_logline'].unique()
#     num_p = len(uniq_patterns)
#     longest_p_length = max([len(p.split(" ")) for p in uniq_patterns])
#     num_patterns[index_groups['cluster_label'].iloc[i]] = [num_p, longest_p_length, longest_log_length]

    #write results to file
    # parsed_result.to_csv('../results/pattern_discovery_for_cluster_{}.csv'.format(index_groups['cluster_label'][i]))

    # write generated tree to file
    # write_path = join(tree_path, '{}.txt'.format(index_groups['cluster_label'][i]))
    #
    # if parser.parser.clusters_counter > 1:
    #     if not exists(dirname(write_path)):
    #         try:
    #             os.makedirs(dirname(write_path))
    #         except OSError as exc: # Guard against race condition
    #             if exc.errno != exc.errno.EEXIST:
    #                 raise
    #     with open(write_path, 'w+') as f:
    #         parser.parser.print_tree(max_clusters=100, file=f)
    #         f.close()



TypeError: __init__() got an unexpected keyword argument 'custom_delimeter_regex'

In [4]:


DIR = "/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets/"

Res_DIR = "/Users/qcheng/workspace/gitsoma/logai/logai/results/pattern_discovery/"

files = os.listdir(DIR)

log_names = [f.split('.')[0] for f in files]

num_patterns = dict()

similarity = pd.DataFrame()

for name in log_names:
    log, cluster_label = read_log(name)

    logrecord = LogRecordObject(body=pd.Series(log, name='logline'), attributes=pd.DataFrame(cluster_label, columns=['cluster_label']))

    custom_delimeter_regex = [r"`+|\s+"]

    preprocessor = Preprocessor(PreprocessorConfig(custom_delimiters_regex=custom_delimeter_regex))
    preprocessed_loglines = preprocessor.clean_log(logrecord.body)
    index_groups = preprocessor.group_log_index(logrecord.attributes, by=['cluster_label'])
    drain_config = DrainParams(sim_th=0.1,
                               extra_delimiters=[])

    log_parser_config = LogParserConfig(parsing_algorithm='drain',
                                        parsing_algo_params=drain_config)

    for i in index_groups.index:
        cluster_label = index_groups['cluster_label'].iloc[i]
        indices = index_groups['group_index'][i]
        if index_groups['cluster_label'].iloc[i] == -1:
            continue
        if len(indices) == 1:
            continue
        loglines_in_group = preprocessed_loglines.iloc[indices]
        parser = LogParser(log_parser_config)
        parsed_result = parser.parse(loglines_in_group.dropna())
        longest_log_length = max([len(l) for l in parsed_result['logline']])
        uniq_patterns = parsed_result['parsed_logline'].unique()
        num_p = len(uniq_patterns)
        longest_p_length = max([len(p.split(" ")) for p in uniq_patterns])
        num_patterns["{}:{}".format(name, index_groups['cluster_label'].iloc[i])] = [name, index_groups['cluster_label'].iloc[i], parsed_result.shape[0], num_p, longest_p_length, longest_log_length]

        if num_p > 1:
            f_path = os.path.join(Res_DIR, "multiple_pattern_clusters_pattern_level")
            if not os.path.exists(f_path):
                os.makedirs(f_path)
            p_path = os.path.join(Res_DIR, "token_simlarity")
            if not os.path.exists(p_path):
                os.makedirs(p_path)

            for pid in range(len(uniq_patterns)):
                res_df = parsed_result[parsed_result['parsed_logline'] == uniq_patterns[pid]]
                res = res_df['logline'].append(res_df.head(1)['parsed_logline'])
                res.str.split(" ", expand = True).to_csv(os.path.join(f_path, "lrt_{}_cluster_{}_pattern_{}.csv".format(name, index_groups['cluster_label'].iloc[i], pid)), index=False, header=False)
            #parsed_result.to_csv(os.path.join(f_path, "lrt_{}_cluster_{}.csv".format(name, index_groups['cluster_label'].iloc[i])))

            similarity_table = get_sim_table(parsed_result["parsed_logline"], name, cluster_label)
            similarity = similarity.append(similarity_table)

            # similarity_table.to_csv(os.path.join(p_path, "similarity_ltr_{}_cluster_{}.csv".format(name, index_groups['cluster_label'].iloc[i])))

        else:
            f_path = os.path.join(Res_DIR, "single_pattern_clusters_pattern_level")
            if not os.path.exists(f_path):
                os.makedirs(f_path)
            for pid in range(len(uniq_patterns)):
                res_df = parsed_result[parsed_result['parsed_logline'] == uniq_patterns[pid]]
                res = res_df['logline'].append(res_df.head(1)['parsed_logline'])
                # res.str.split(" ", expand = True).to_csv(os.path.join(f_path, "lrt_{}_cluster_{}_pattern_{}.csv".format(name, index_groups['cluster_label'].iloc[i], pid)), index=False, header=False)


            #parsed_result.to_csv(os.path.join(f_path, "lrt_{}_cluster_{}.csv".format(name, index_groups['cluster_label'].iloc[i])))


AttributeError: 'tuple' object has no attribute 'iloc'

In [None]:
#similarity.to_csv(os.path.join(Res_DIR, "similarity.csv"))
similarity.head(10)

In [None]:
# Get patterns

# df = pd.DataFrame.from_dict(num_patterns, orient="index")
# df.columns=["log_record_type", "cluster_labels", "n_logline", "n_patterns", "longest_p_length", "longest_log_length"]
# count_table = df.n_patterns.groupby(df['log_record_type']).value_counts().rename("cluster_counts")
# count_total = df.n_patterns.groupby(df['log_record_type']).count().rename("total_clusters")
# count_logline = df.n_logline.groupby(df['log_record_type']).sum('n_logline').rename('logline_counts')
# count_df = pd.DataFrame(count_table)
#
# total = count_df.join(pd.DataFrame(count_total).join(pd.DataFrame(count_logline)))
# total['n_patterns'] =[i[1] for i in total.index]
# total['log_record_type'] = [i[0] for i in total.index]
#
# total["ratio"] = total['cluster_counts'] / total['total_clusters']
# total.to_csv("/Users/qcheng/workspace/gitsoma/logai/logai/results/pattern_discovery/pattern_stats_exclude_single.csv")

# single_pattern = total.loc[(total["n_patterns"] == 1)].drop(["n_patterns", "log_record_type"], axis=1)
# summary = single_pattern.append(single_pattern.sum().rename(("Total", 1)))
# summary["ratio"] = summary['cluster_counts'] / summary['total_clusters']
# summary.to_csv("/Users/qcheng/workspace/gitsoma/logai/logai/results/pattern_discovery/single_pattern_stats.csv")

In [None]:
count, bin = np.histogram(similarity[similarity['similarity'] < 1.0]['similarity'], bins=10, range=[0,1])
ratio = count / sum(count)
print(ratio)

ratio[-1]+ratio[-2]+ratio[-3]+ratio[-4]+ratio[-5]

In [None]:
similarity[similarity['similarity'] < 1.0]['similarity'].plot.hist(bins=50, range=[0,1])



In [None]:
similarity[similarity['similarity'] < 0.6].to_csv(os.path.join(Res_DIR, "less_similar.csv"))


In [None]:
less_df = similarity[similarity['similarity'] < 0.6]

f_path = os.path.join(Res_DIR, "most_unsimilar_patterns")
if not os.path.exists(f_path):
    os.makedirs(f_path)

for ind, row in less_df.iterrows():
    p_token = row['pattern'].split(" ")
    b_token = row['base_pattern'].split(" ")
    fname = "{}_p_{}.csv".format(row['cluster'], ind)
    wfp = os.path.join(f_path, fname)
    with open(wfp, "w") as f:
        f.write(",".join(b_token))
        f.write("\n")
        f.write(",".join(p_token))

        f.close()

In [None]:
base = "* * 0 0 * * * 232.3.3 INFO X *"
p = "* * 0 0 * * * 232.3.3 INFO X *"

calc_distance(base, p)

In [None]:
similarity[similarity['lrt'].str.contains('ailtn')]['similarity'].plot.hist(bins=50, range=[0,1])

In [None]:
similarity.loc[(similarity['lrt'].str.contains('ailtn')) & (similarity['similarity'] < 0.8)]['similarity']

In [None]:
similarity.groupby(by=['lrt']).mean()[['distance', 'similarity']].to_csv(os.path.join(Res_DIR, "similarity_by_lrt.csv"))

In [None]:
similarity.groupby(by=['lrt']).mean()[['distance', 'similarity']]

In [None]:
DIR = "/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets_new_labels/"

file_list = os.listdir(DIR)

similarity = pd.DataFrame()
parsing_res = pd.DataFrame()

for file in file_list[:1]:
    path = os.path.join(DIR, file)
    name = file.split('_')[0]
    print(name)
    print(path)
    logs = read_new_logs(path)
    sim, res, base_pattern, para_list = cal_similarity_for_lrt(logs, name)
    similarity = similarity.append(sim)
    parsing_res = parsing_res.append(res)

In [None]:
parsing_res.head(10)

In [None]:
res['parameter_list'].to_list()