In [1]:
import sys
import getopt
from math import cos, sqrt
import regex as re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from Levenshtein import seqratio
from tqdm import tqdm


# Default value of `sim_thres` in `clusterize`: a log joins an existing cluster
# only when its similarity to that cluster's reference is strictly greater
# than this value.
DEFAULT_MIN_SIM_THRESHOLD = 0.8


# Patterns for the variable fields replaced by `clean_log`, compiled once.
# They are raw strings so the backslashes reach the regex engine untouched,
# and they use lazy `.*?` so every match stops at the first closing delimiter
# instead of running to the last one on the line.
_USER_RE = re.compile(r'</.*?>')
_TOKEN_RE = re.compile(r'\[token:.*?\]')
_MULT_SURLS_RE = re.compile(r'\[SURL: \[.*?\]\]')
_SURL_RE = re.compile(r'\[SURL: .*?\]')


def clean_log(log):
    '''
    Cleans a log message by replacing its variable fields with placeholders

    Replacements, applied in this order:
        `</...>`           -> `USER`
        `[token:...]`      -> `TOKEN`
        `[SURL: [...]]`    -> `MULT_SURLS`
        `[SURL: ...]`      -> `SURL`

    Each field is matched up to its first closing delimiter, so several
    fields on the same line are replaced one by one and the text between
    them is kept.

    Returns the cleaned log string
    '''
    log = _USER_RE.sub('USER', log)
    log = _TOKEN_RE.sub('TOKEN', log)
    # The bracketed SURL list has to go first: the single SURL pattern would
    # otherwise match the start of the list and leave a dangling `]`.
    log = _MULT_SURLS_RE.sub('MULT_SURLS', log)
    return _SURL_RE.sub('SURL', log)


def clusterize(df, sim_thres=DEFAULT_MIN_SIM_THRESHOLD):
    '''
    Clusterizes the dataframe based on the similarity of logs to the reference of each cluster

    Every entry of the `message` column is cleaned with `clean_log` and
    scored with `seqratio` against the cleaned reference log of each existing
    cluster. The log joins the best-scoring cluster when that score is
    strictly greater than `sim_thres`; otherwise it starts a new cluster and
    its cleaned text becomes that cluster's reference.

    Parameters:
        df: dataframe with a `message` column. It is modified in place: an
            integer `cluster` column with the assigned cluster id is added
            (or overwritten).
        sim_thres: score a log has to exceed to join an existing cluster.

    Returns the dataframe and the list of clusters. Each cluster is a dict
    with the keys `id`, `ref` (cleaned reference log) and `logs` (original
    messages assigned to it); a cluster's id is its position in the list.
    '''
    clusters = []
    # Ids are collected positionally and written in one assignment at the
    # end. That way an empty dataframe still gets its `cluster` column and
    # the result does not depend on the index labels being unique.
    assigned_ids = []
    for log in tqdm(df['message']):
        cleaned_log = clean_log(log)

        best_clust = None
        if clusters:
            similarities = [seqratio(cleaned_log, cluster['ref'])
                            for cluster in clusters]
            candidate = int(np.argmax(similarities))
            if similarities[candidate] > sim_thres:
                best_clust = candidate

        if best_clust is None:
            # No cluster is similar enough (or none exists yet): open a new
            # one whose id is its position in `clusters`.
            best_clust = len(clusters)
            clusters.append(
                {'id': best_clust, 'ref': cleaned_log, 'logs': [log]})
        else:
            clusters[best_clust]['logs'].append(log)

        assigned_ids.append(clusters[best_clust]['id'])

    df['cluster'] = np.asarray(assigned_ids, dtype=int)
    return df, clusters


def add_value_labels(ax, reference_logs=None, spacing=5):
    '''
    Writes the occurrence value over each bar

    If a list (or tuple) of reference logs is provided, the entry at the
    same position as the bar is also written over it, rotated by 90 degrees.
    Any other value (e.g. `None` or `False`) disables the log labels. Bars
    without a matching entry only get their value label.

    Parameters:
        ax: axes whose `patches` are annotated.
        reference_logs: optional labels, one per patch, in patch order.
        spacing: distance in points between a bar and its value label; the
            log label is placed at four times that distance.
    '''
    has_logs = isinstance(reference_logs, (list, tuple))

    for position, rect in enumerate(ax.patches):
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2

        # Negative bars get their labels below the bar instead of above it
        if y_value < 0:
            vert_spacing = -spacing
            vert_alignment = 'top'
        else:
            vert_spacing = spacing
            vert_alignment = 'bottom'

        # Create value annotation
        ax.annotate("{:.0f}".format(y_value), (x_value, y_value),
                    xytext=(0, vert_spacing), textcoords="offset points",
                    ha='center', va=vert_alignment)

        # Create log annotation, skipping bars the list has no entry for
        if has_logs and position < len(reference_logs):
            ax.annotate(reference_logs[position], (x_value, y_value),
                        xytext=(0, vert_spacing * 4),
                        textcoords="offset points", ha='center',
                        va=vert_alignment, rotation=90, fontsize='xx-small')


def plot_clusters(unique_clusters, labels=None, skip_single=False):
    '''
    Plot the cluster size bar graph
    If list of label is passed prints it over each bar

    Parameters:
        unique_clusters: Series of occurrences indexed by cluster id; one bar
            is drawn per entry, in the order of the Series.
        labels: optional list holding the reference log of each cluster,
            indexed by cluster id. Any non-list value disables the log
            labels.
        skip_single: when true, clusters with a single occurrence are not
            plotted.

    Shows the figure and returns it
    '''
    fig, ax = plt.subplots()

    if skip_single:
        clst_occurrence = unique_clusters[unique_clusters > 1]
    else:
        clst_occurrence = unique_clusters

    clst_label = clst_occurrence.index.values
    positions = range(len(clst_label))
    ax.bar(positions, clst_occurrence)
    ax.set_xticks(positions)
    ax.set_xticklabels(clst_label)
    ax.set_xlabel('cluster')
    ax.set_ylabel('occurrences')

    # Bars follow the order (and filtering) of `clst_occurrence`, which is
    # generally not cluster id order, while `add_value_labels` picks labels
    # by bar position. Look each bar's label up by its cluster id first so
    # every bar is annotated with its own reference log.
    if isinstance(labels, list):
        bar_labels = [labels[clst_id] for clst_id in clst_label]
    else:
        bar_labels = labels
    add_value_labels(ax, bar_labels)

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    plt.show()
    return fig

To run the clustering, set `inputfile` and `outputfile` to the input and output paths, choose a suitable `similarity_threshold`, and run the following cell.

In [1]:
# Path of the CSV file to cluster; `clusterize` reads its `message` column
inputfile = ''
# Path the clustered dataframe is written to as CSV; leave empty to skip
outputfile = ''
similarity_threshold = DEFAULT_MIN_SIM_THRESHOLD

df = pd.read_csv(inputfile)

# Adds a `cluster` column to `df` and returns the list of clusters
df, cluster_dict = clusterize(df, similarity_threshold)

# `outputfile` used to be ignored; write the result out when a path is given
if outputfile:
    df.to_csv(outputfile, index=False)


To plot the result, run the following cell. Set `reference_logs` to `False` if you do not want the reference log shown above each bar.

In [None]:
# Number of logs assigned to each cluster id
unique_clusters = df['cluster'].value_counts()
# Cleaned reference log of every cluster, in the order of `cluster_dict`
reference_logs = [cluster['ref'] for cluster in cluster_dict]

fig = plot_clusters(unique_clusters, labels=reference_logs, skip_single=True)