In [1]:
import pandas as pd
import numpy as np
import json
import datetime
import matplotlib.pyplot as plt

# Anomaly detection

This is "use case c" from the list of planned use cases:
1. (a). **Trends monitoring**. User specifies log fields to monitor and specifies their min/max/alert levels. Tool makes prognosis and form summary for these log fields (and also for system load, e.g. number of messages per hour). The prognosis is based on previous dynamics, previous/future working days, holidays, and other events (like maintenance windows).
1. (b). **Logs comparison**. User compares current logs with the previous ones (e.g. from previous release). User selects log fields to analyze. Tool highlights high-level differences, like number of messages, differences in prev/next hops, maybe different trends of field values.
1. (c). **Anomalies in logs**. Tool tries to find messages, which don’t look similar to most of others (for example, less than 1%). One more case: cluster log messages, if we see several types of them.
1. (d). **Automatic fault detection**. Tool automatically finds and highlight failures, basing on HTTP codes and, probably, other fields.
1. (e). **Failure patterns**. Using the data from automatic failure detection module, tool tries to find any pattern in failures, like occurring only on 5th time after connection setup, also it tries to find precursors to failure (certain messages or values, which appear before it happens)

**TODO**:
1. TBD

## Service Functions

In [2]:
def load_logs(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        logs = json.load(f)
    return logs

def find_event_time_anomaly(logs_dict, ts_win_address, win_size_sec):
    ts_vals = logs_dict
    for hop in ts_win_address.split('.'):
        new_ts_vals = []
        for msg in ts_vals:
            if hop not in msg:
                print(f"WARNING: didn't find \"{hop}\" in message {msg}")
            else:
                new_ts_vals.append(msg[hop])
        ts_vals = new_ts_vals

    ts_vals = sorted([
        datetime.datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ')
        for ts in ts_vals
    ])  # automatic ISO conversion with "fromisoformat()" is supported only from Python 3.11

    print(ts_vals)
    
    idx_freq = max(win_size_sec//2, 1)
    event_idx = pd.date_range(start=ts_vals[0], end=ts_vals[-1] + datetime.timedelta(seconds=1), freq=f'{idx_freq}s')
    event_cnt = []
    idx = 0
    prev_time = None
    for next_time in event_idx:
        if prev_time is None:
            prev_time = next_time
            continue
        cnt = 0
        while idx < len(ts_vals) and ts_vals[idx] >= prev_time and ts_vals[idx] < next_time:
            cnt += 1
            idx += 1
        event_cnt.append(cnt)
        prev_time = next_time
    ts_s = pd.Series(data=event_cnt, index=event_idx[:-1])
    # ts_df = pd.DataFrame(data={'event_time_sec': ts_vals})
    # event_s = ts_df.groupby('event_time_sec').size().sort_index()
    # freqs = event_s.rolling(window=f'{win_size_sec}s', min_periods=0).sum()
    freqs = ts_s.rolling(window=f'{win_size_sec}s', min_periods=0).sum()
    print(freqs)

    # Tukey's Fence
    k = 3  # to be sure that value is enough "far out"
    q1 = freqs.quantile(0.25)
    q3 = freqs.quantile(0.75)
    low_thr = q1 - k * (q3 - q1)
    high_thr = q1 + k * (q3 - q1)

    too_rare_events = freqs[freqs < low_thr].index.tolist()
    # too_rare_events = freqs[freqs > low_thr].index.tolist()  # remove
    too_frequent_events = freqs[freqs > high_thr].index.tolist()

    return too_rare_events, too_frequent_events

## Experiments

In [3]:
logs_dict = load_logs('data/elastic_logs_example_01.json')

In [4]:
win_size_sec = 3
too_rare_list, too_freq_list = find_event_time_anomaly(logs_dict, 'event.ingested', win_size_sec)
if len(too_rare_list) > 0:
    print("Anomaly found: too rare messages in the following time frames:")
    for win_start in too_rare_list:
        print(f"\t from {win_start} to {win_start + datetime.timedelta(seconds=win_size_sec)}")
if len(too_freq_list) > 0:
    print("Anomaly found: too frequent messages in the following time frames:")
    for win_start in too_freq_list:
        print(f"\t from {win_start} to {win_start + datetime.timedelta(seconds=win_size_sec)}")

[datetime.datetime(2024, 1, 27, 21, 11, 9), datetime.datetime(2024, 1, 27, 21, 11, 14), datetime.datetime(2024, 1, 27, 21, 11, 25), datetime.datetime(2024, 1, 27, 21, 11, 25), datetime.datetime(2024, 1, 27, 21, 17, 36), datetime.datetime(2024, 1, 27, 21, 17, 36), datetime.datetime(2024, 1, 27, 21, 17, 36), datetime.datetime(2024, 1, 27, 21, 17, 36), datetime.datetime(2024, 1, 27, 21, 17, 36), datetime.datetime(2024, 1, 27, 21, 17, 36)]
2024-01-27 21:11:09    1.0
2024-01-27 21:11:10    1.0
2024-01-27 21:11:11    1.0
2024-01-27 21:11:12    0.0
2024-01-27 21:11:13    0.0
                      ... 
2024-01-27 21:17:32    0.0
2024-01-27 21:17:33    0.0
2024-01-27 21:17:34    0.0
2024-01-27 21:17:35    0.0
2024-01-27 21:17:36    6.0
Freq: S, Length: 388, dtype: float64
Anomaly found: too frequent messages in the following time frames:
	 from 2024-01-27 21:11:09 to 2024-01-27 21:11:12
	 from 2024-01-27 21:11:10 to 2024-01-27 21:11:13
	 from 2024-01-27 21:11:11 to 2024-01-27 21:11:14
	 from 20