In [1]:
import enum
import dateutil
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import matplotlib

# Logs comparison

This is "use case b" from the list of planned use cases:
1. (a). **Trends monitoring**. User specifies log fields to monitor and specifies their min/max/alert levels. Tool makes prognosis and form summary for these log fields (and also for system load, e.g. number of messages per hour). The prognosis is based on previous dynamics, previous/future working days, holidays, and other events (like maintenance windows).
1. (b). **Logs comparison**. User compares current logs with the previous ones (e.g. from previous release). User selects log fields to analyze. Tool highlights high-level differences, like number of messages, differences in prev/next hops, maybe different trends of field values.
1. (c). **Anomalies in logs**. Tool tries to find messages, which don’t look similar to most of others (for example, less than 1%). One more case: cluster log messages, if we see several types of them.
1. (d). **Automatic fault detection**. Tool automatically finds and highlight failures, basing on HTTP codes and, probably, other fields.
1. (e). **Failure patterns**. Using the data from automatic failure detection module, tool tries to find any pattern in failures, like occurring only on 5th time after connection setup, also it tries to find precursors to failure (certain messages or values, which appear before it happens)

## Service functions

In [12]:
FieldBaseType = enum.Enum('FieldBaseType', 'float integer categorical date string')

class FieldType_General:

    def __init__(self, data):
        if not isinstance(data, np.ndarray) or len(data.shape) != 1:
            raise ValueError(f"Expected 1-d numpy array, got: {type(data)}")
        self.data = data


class IntOrFloatMixin:

    def find_big_differences(self, other_field):
        BIG_DIFF_THR = 2  # We suppose that big diffs shold be greater than BIG_DIFF_THR*diff.mean()
        RARE_DIFF_THR = 0.1  # We suppose that rare diffs should occur less than RARE_DIFF_THR*len(data)
        MAX_THR_CHANGE_ITERS = 10  # No more than 10 iterations to find optimal threshold

        other_data = other_field.data
        if len(self.data) != len(other_data):
            raise VelueError(f"Lengths are not equal: {len(self.data)} and {len(other_data)}")
        diff = np.abs(self.data - other_data)
        if np.isclose(diff.mean(), 0) or diff.max() <= BIG_DIFF_THR*diff.mean():
            return np.array([], dtype=int)
        diff_vals = np.unique(diff)
        if len(diff_vals) < 2:
            return np.array([], dtype=int)
        biggest_thr = diff_vals[-2]  # one value before the maximum

        big_thr = BIG_DIFF_THR*diff.mean()
        if big_thr >= biggest_thr:
            return np.array([], dtype=int)
        found = False
        for thr in np.linspace(big_thr, biggest_thr, num=MAX_THR_CHANGE_ITERS):
            if len(np.where(diff > thr)) < RARE_DIFF_THR*len(diff):
                found = True
                break
        if found:
            times = np.argwhere(diff > thr)  # thr is still defined after loop
        else:
            times = np.array([], dtype=int)
        return times


class FieldType_Float(FieldType_General, IntOrFloatMixin):
    BASE_TYPE = FieldBaseType.float


class FieldType_Int(FieldType_General, IntOrFloatMixin):
    BASE_TYPE = FieldBaseType.integer


class FieldType_Cat(FieldType_General):
    BASE_TYPE = FieldBaseType.categorical

    def compare_categories(self, other_field):
        other_data = other_field.data
        # TODO: implementation is missing


class FieldType_Date(FieldType_General):
    BASE_TYPE = FieldBaseType.date

    def __init__(self, data):
        time_data = np.vectorize(lambda t: t.timestamp())(data)
        super().__init__(time_data)


class FieldType_Str(FieldType_General):
    BASE_TYPE = FieldBaseType.string


class FieldType_Resource(FieldType_Float):

    def __init__(self, data, low_val, high_val, low_warn_level, high_warn_level):
        super().__init__(data)
        self.low_val = low_val
        self.high_val = high_val
        self.low_warn_level = low_warn_level
        self.high_warn_level = high_warn_level


class FieldType_CPUUtilization(FieldType_Resource):

    def __init__(self, data, high_warn_level):
        super().__init__(data, 0, 100, None, high_warn_level)


class FieldType_RAMUtilization(FieldType_Resource):

    def __init__(self, data, low_warn_level, high_warn_level):
        super().__init__(data, 0, 100, low_warn_level, high_warn_level)


def create_field_object(field_s, name, verbose=True):
    obj = None
    if obj is None:
        if np.issubdtype(field_s.dtype, np.floating):
            obj = FieldType_Float(field_s.values)
        elif np.issubdtype(field_s.dtype, np.integer):
            # TODO: may be categorical?
            obj = FieldType_Int(field_s.values)
    if obj is None:
        # here we assume, that it is string, but it also can be categorical
        try:
            date_s = field_s.apply(dateutil.parser.parse)
            obj = FieldType_Date(date_s.values)
        except:
            pass
    if obj is None:
        try:
            float_s = field_s.apply(float)
            try:
                int_s = float_s.apply(int)
                obj = FieldType_Int(int_s.values)
            except:
                obj = FieldType_Float(float_s.values)
        except:
            pass
    if obj is None:
        if field_s.nunique() < 0.9*len(field_s):
            obj = FieldType_Cat(field_s.values)
        else:
            obj = FieldType_Str(field_s.values)
    if verbose:
        print(f"{name}: autodetected type is {obj.BASE_TYPE}")
    return obj


def align_base_field_types(field1, field2):
    # TODO: implementation is missing
    return field1, field2


def compare_fields(field1: FieldType_General, field2: FieldType_General):
    field1, field2 = align_base_field_types(field1, field2)
    comparison_res = {}
    if issubclass(type(field1), IntOrFloatMixin):
        comparison_res['big_difference_idxs'] = field1.find_big_differences(field2)
    return comparison_res

In [13]:
logs1_df = pd.read_csv('data/chatbroker.csv')
logs2_df = pd.read_csv('data/smsbroker.csv')

fields1 = {col: create_field_object(logs1_df[col], col, verbose=True) for col in logs1_df.columns.tolist()}
fields2 = {col: create_field_object(logs1_df[col], col, verbose=True) for col in logs2_df.columns.tolist()}

res = compare_fields(fields1['Line'], fields2['Line'])
print(res)

@timestamp: autodetected type is FieldBaseType.string
Class: autodetected type is FieldBaseType.categorical
Class.keyword: autodetected type is FieldBaseType.categorical
Device: autodetected type is FieldBaseType.categorical
Level: autodetected type is FieldBaseType.categorical
Line: autodetected type is FieldBaseType.integer
Message: autodetected type is FieldBaseType.string
Method: autodetected type is FieldBaseType.categorical
Pod: autodetected type is FieldBaseType.categorical
Service: autodetected type is FieldBaseType.categorical
Subscriber: autodetected type is FieldBaseType.categorical
Thread: autodetected type is FieldBaseType.categorical
_id: autodetected type is FieldBaseType.string
_index: autodetected type is FieldBaseType.categorical
_score: autodetected type is FieldBaseType.categorical
_type: autodetected type is FieldBaseType.categorical
@timestamp: autodetected type is FieldBaseType.string
Class: autodetected type is FieldBaseType.categorical
Class.keyword: autodetect

In [None]:
# Field by field comparison

In [None]:
# message statistics comparison