In [1]:
!pip install python-Levenshtein



In [2]:
# Mount Google Drive under /content/gdrive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [1]:
import sys
import getopt
from time import time
from math import cos, sqrt
import regex as re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import Levenshtein as lv
from tqdm import tqdm
from collections import Counter

# Similarity a log must exceed (strictly) against a cluster's reference
# message in order to be assigned to that cluster.
DEFAULT_MIN_SIM_THRESHOLD = 0.7

# Patterns for the variable parts of a log line; clean_log() replaces each
# match with a fixed placeholder.
# Raw strings are used so that `\w`, `\d` and `\[` reach the regex engine
# untouched instead of being parsed as (invalid) string escape sequences.
USER = re.compile(r'<.*>')  # greedy: first '<' up to the last '>' on the line
# Five dash-separated groups of 8-4-4-4-12 word characters.
REQUEST = re.compile(
    r'((\w|\d){8})-((\w|\d){4})-((\w|\d){4})-((\w|\d){4})-((\w|\d){12})')
TOKEN = re.compile(r'\[token:.*\]')  # greedy: '[token:' up to the last ']'
# 'srm:' / '/' followed by the shortest run that ends before ']', a space or
# the end of the string.
SURL = re.compile(r'srm:.+?(?=]| |$)')
PATH = re.compile(r'/.+?(?=]| |$)')


def clean_log(log):
    '''
    Replace the variable parts of a log message with fixed placeholders.

    The substitutions are applied one after the other, in this order:
    USER, TOKEN, REQUEST (as 'REQ_ID'), SURL, PATH. Each one works on the
    output of the previous one.

    Returns the message with every match replaced.
    '''
    substitutions = (
        (USER, 'USER'),
        (TOKEN, 'TOKEN'),
        (REQUEST, 'REQ_ID'),
        (SURL, 'SURL'),
        (PATH, 'PATH'),
    )
    cleaned = log
    for pattern, placeholder in substitutions:
        cleaned = pattern.sub(placeholder, cleaned)
    return cleaned


def similarity(str1, str2, method='levenshtein'):
    '''
    Compute the similarity between two strings.

    str1, str2: the values to compare
    method: name of the similarity measure. Only 'levenshtein' is supported;
            the historical misspelling 'lenvenshtein' is accepted as an alias
            so that existing callers keep working.

    Returns the value of Levenshtein.seqratio(str1, str2).

    Raises ValueError for an unknown method, instead of silently returning
    None.
    '''
    if method in ('levenshtein', 'lenvenshtein'):
        return lv.seqratio(str1, str2)
    raise ValueError('Unknown similarity method: {!r}'.format(method))


def clusterize(df, sim_thres=DEFAULT_MIN_SIM_THRESHOLD):
    '''
    Clusterizes the dataframe in a single pass, comparing each log with the
    reference message of every existing cluster.

    A row joins the most similar cluster when that similarity is strictly
    greater than sim_thres; otherwise the row becomes the reference of a new
    cluster. Similarity is computed by similarity() (Levenshtein ratio).

    df: DataFrame with 'datetime' and 'cleaned_message' columns. It is
        modified in place: an integer 'cluster' column is added.
    sim_thres: similarity threshold for joining an existing cluster.

    Returns (df, clusters, time_trend) where
      clusters   is a list of dicts {'id', 'ref', 'count'}, one per cluster;
      time_trend is a list of [elapsed seconds, number of clusters so far,
                 row index], with one entry for each newly created cluster.
    '''
    t0 = time()
    clusters = []
    time_trend = []
    # Labels are collected here and written to the frame once at the end:
    # this also works for an empty frame and for repeated index labels.
    labels = []
    for row in tqdm(df.itertuples(), total=len(df)):
        cleaned_log = row.cleaned_message
        # Rows without a timestamp all share one fixed reference text
        # (presumably Java stack-trace lines - TODO confirm).
        if pd.isnull(row.datetime):
            cleaned_log = 'JAVA_ERROR'

        if clusters:
            # The (misspelled) method key is the one similarity() recognises.
            similarities = [
                similarity(cleaned_log, cluster['ref'], 'lenvenshtein')
                for cluster in clusters]
            best_clust = int(np.argmax(similarities))
            if similarities[best_clust] > sim_thres:
                clusters[best_clust]['count'] += 1
                labels.append(clusters[best_clust]['id'])
                continue

        # No cluster yet, or none similar enough: open a new one. Ids are
        # assigned sequentially, so the next id is the current cluster count.
        new_id = len(clusters)
        clusters.append({'id': new_id, 'ref': cleaned_log, 'count': 1})
        labels.append(new_id)
        time_trend.append([time() - t0, new_id + 1, row.Index])

    df['cluster'] = np.array(labels, dtype=int)
    return df, clusters, time_trend


def add_value_labels(ax, reference_logs, spacing=5):
    '''
    Annotates every bar of the axis with its height (no decimals).

    When reference_logs is a list, the entry at the bar's position is also
    written, rotated by 90 degrees, four times further away from the bar.
    Any other value (e.g. None) disables the log labels.

    spacing is the label offset in points; for negative bars the labels are
    placed below the bar instead of above it.
    '''
    show_logs = isinstance(reference_logs, (list,))

    for bar_index, bar in enumerate(ax.patches):
        height = bar.get_height()
        centre_x = bar.get_x() + bar.get_width() / 2

        # Labels go above positive bars and below negative ones
        if height < 0:
            offset, anchor = spacing * -1, 'top'
        else:
            offset, anchor = spacing, 'bottom'

        # Occurrence value
        ax.annotate("{:.0f}".format(height), (centre_x, height),
                    xytext=(0, offset), textcoords="offset points",
                    ha='center', va=anchor)

        # Reference log, if requested
        if show_logs:
            ax.annotate(reference_logs[bar_index], (centre_x, height),
                        xytext=(0, offset * 4), textcoords="offset points",
                        ha='center', va=anchor, rotation=90,
                        fontsize='xx-small')


def plot_clusters(cluster_array, write_ref=False, skip_single=False):
    '''
    Plot the cluster size bar graph

    cluster_array: sequence of rows [cluster id, occurrences, reference log]
    write_ref: if True, the reference log of each cluster is written over
               its bar
    skip_single: if True, clusters with a single occurrence are left out

    Shows the figure and returns it.
    '''
    # Filter whole rows before splitting them into columns, so that ids,
    # occurrences and reference logs always stay aligned with the bars.
    if skip_single:
        rows = [row for row in cluster_array if row[1] > 1]
    else:
        rows = list(cluster_array)

    ids = [row[0] for row in rows]
    occurrence = [row[1] for row in rows]

    if write_ref is True:
        reference_log = [row[2] for row in rows]
    else:
        reference_log = None

    fig, ax = plt.subplots()
    positions = range(len(ids))
    ax.bar(positions, occurrence)
    ax.set_xticks(positions)
    ax.set_xticklabels(ids)
    ax.set_xlabel('cluster')
    ax.set_ylabel('occurrences')

    add_value_labels(ax, reference_log)

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    plt.show()

    return fig

To run the clustering, set `inputpath`, `outputpath` and `filelist` in the cell below (and change `DEFAULT_MIN_SIM_THRESHOLD` if you need a different similarity threshold), then run the cell.

In [4]:
import os

# Machine-specific locations: edit them to point at your own data.
inputpath = 'C:/Users/simor/Desktop/output/'
outputpath = 'C:/Users/simor/Desktop/output/output/'

filelist = [
    '2019-05-25-storm-backend.log.csv',
    '2019-05-26-storm-backend.log.csv',
    '2019-05-27-storm-backend.log.csv',
]

# Make sure the output folder exists before starting: clustering one file can
# take more than an hour and the results are only written at the end.
os.makedirs(outputpath, exist_ok=True)

for filename in filelist:
    print('Loading: ' + filename)
    inputfile = os.path.join(inputpath, filename)
    df = pd.read_csv(inputfile)
    print('Loaded ' + str(len(df.index)) + ' lines')

    print('Cleaning log')
    df['cleaned_message'] = df['message'].apply(clean_log)
    print('Logs cleaned, started clustering')

    df, cluster_dict, time_trend = clusterize(df, DEFAULT_MIN_SIM_THRESHOLD)

    print('Clustered. Saving to ' + outputpath)
    # NOTE: compression='zip' produces a ZIP archive even though the file
    # name keeps the '.csv' extension.
    df.to_csv(os.path.join(outputpath, 'clustered-' + filename),
              compression='zip')

    print('Saved log csv in '+ 'clustered-' + filename)
    # One row per cluster: id, occurrences, reference log. Written through
    # pandas so that reference logs containing commas are quoted and the
    # table stays a valid three-column CSV.
    array = [[dic['id'], dic['count'], dic['ref']] for dic in cluster_dict]
    pd.DataFrame(array).to_csv(
        os.path.join(outputpath, 'cluster_table-' + filename),
        header=False, index=False)
    print('Saved clusters csv in '+ 'cluster_table-' + filename)

    np.savetxt(os.path.join(outputpath, 'time_trend-' + filename),
               time_trend, delimiter=',')
    print('Saved time trend csv in '+ 'time_trend-' + filename)


Loading: 2019-05-25-storm-backend.log.csv
Loaded 2996214 lines
Cleaning log
Logs cleaned, started clustering


2996214it [1:06:45, 748.02it/s]


Clustered. Saving to C:/Users/simor/Desktop/output/output/
Saved log csv in clustered-2019-05-25-storm-backend.log.csv
Saved clusters csv in cluster_table-2019-05-25-storm-backend.log.csv
Saved time trend csv in time_trend-2019-05-25-storm-backend.log.csv
Loading: 2019-05-26-storm-backend.log.csv
Loaded 3117686 lines
Cleaning log
Logs cleaned, started clustering


3117686it [56:13, 924.10it/s] 


Clustered. Saving to C:/Users/simor/Desktop/output/output/
Saved log csv in clustered-2019-05-26-storm-backend.log.csv
Saved clusters csv in cluster_table-2019-05-26-storm-backend.log.csv
Saved time trend csv in time_trend-2019-05-26-storm-backend.log.csv
Loading: 2019-05-27-storm-backend.log.csv
Loaded 4134166 lines
Cleaning log
Logs cleaned, started clustering


4134166it [1:56:52, 589.55it/s]


Clustered. Saving to C:/Users/simor/Desktop/output/output/
Saved log csv in clustered-2019-05-27-storm-backend.log.csv
Saved clusters csv in cluster_table-2019-05-27-storm-backend.log.csv
Saved time trend csv in time_trend-2019-05-27-storm-backend.log.csv


To plot the result, run the following cell. Set `write_ref` to `False` if you don't want the reference log written over each bar.

In [0]:
# `array` is the cluster table built inside the processing loop, so this plots
# the clusters of the last file that was processed.
fig = plot_clusters(array, write_ref=False, skip_single=False)