### Load the Data

In [1]:
import sys 
import os
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pathlib import Path

import cdt
cdt.SETTINGS.rpath = "/usr/bin/Rscript"

import pandas as pd
from src.features import preprocessing
import numpy as np

No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.


In [4]:
data_path = Path('/home/ralmasri/projects/Thesis/Domain-Guided-Monitoring/data/')
csv_path = data_path / "logs_aggregated_concurrent.csv"
size = -1
# A positive `size` selects a size-prefixed subset file; otherwise the full CSV is used.
if size > 0:
    subset_path = data_path / f"{size}_logs_aggregated_concurrent.csv"
else:
    subset_path = csv_path

huawei_config = preprocessing.HuaweiPreprocessorConfig()
huawei_config.aggregated_log_file = subset_path
preprocessor = preprocessing.ConcurrentAggregatedLogsPreprocessor(huawei_config)

# Alternative loader going through the preprocessor:
# huawei_df = preprocessor._load_log_only_data().fillna("")
# Load everything as strings, keep the first 50 rows of the file, then order them by time.
huawei_df = (
    pd.read_csv(data_path / "log_only_data.csv")
    .fillna("")
    .astype(str)
    .head(50)
    .sort_values(by='@timestamp')
    .reset_index(drop=True)
)

relevant_columns = [
    "Hostname",
    "log_level",
    "programname",
    "python_module",
    "http_status",
    "http_method",
    "@timestamp",
    "fine_log_cluster_template",
    "coarse_log_cluster_template",
    "url_cluster_template",
]
# Keep only the columns that are turned into events later on.
huawei_df = huawei_df.drop(
    columns=[col for col in huawei_df.columns if col not in relevant_columns]
)
huawei_df.head()

Unnamed: 0,Hostname,log_level,programname,python_module,http_status,http_method,@timestamp,fine_log_cluster_template,coarse_log_cluster_template,url_cluster_template
0,wally113,,placement-api-access,,200.0,GET,2019-11-19T17:00:05.000000000+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * traits
1,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19T17:00:48.255000000+01:00,get v20 networks tenant id * shared false http...,get v20 networks tenant id * shared false http...,
2,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19T17:00:48.274000000+01:00,get v20 networks shared true http 11 status 20...,get v20 networks shared true http 11 status 20...,
3,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19T17:01:50.119000000+01:00,get v20 ports fields binding 3ahost id fields ...,get v20 ports fields binding 3ahost id fields ...,
4,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19T17:01:50.210000000+01:00,get v20 ports tenant id * device id * http 11 ...,get v20 ports * id * device id * http 11 statu...,


In [5]:
import datetime
# The raw strings shown above carry nine fractional digits: %f consumes the first six
# (microseconds) and the literal "000" matches the remaining three.
date_format = '%Y-%m-%dT%H:%M:%S.%f000%z'
huawei_df['@timestamp'] = huawei_df['@timestamp'].map(
    lambda raw: datetime.datetime.strptime(raw, date_format)
)
type(huawei_df['@timestamp'][1])

pandas._libs.tslibs.timestamps.Timestamp

### Setup arguments

In [6]:
# The frame is sorted by '@timestamp', so the first and last rows bound the observed span.
min_dt = huawei_df['@timestamp'].iat[0].to_pydatetime()
max_dt = huawei_df['@timestamp'].iat[-1].to_pydatetime()
print(min_dt, max_dt, sep="\n")

2019-11-19 17:00:05+01:00
2019-11-19 17:03:41.786000+01:00


In [7]:
# Analysis window derived from the data instead of a hand-picked minute:
#   top_dt = start of the minute containing the first event,
#   end_dt = first whole minute strictly after the last event,
# so every event satisfies top_dt <= t < end_dt. Timedelta arithmetic also handles
# rollover at minute 59 / midnight. Override either bound to analyse a narrower range.
top_dt = min_dt.replace(second=0, microsecond=0)
end_dt = max_dt.replace(second=0, microsecond=0) + datetime.timedelta(minutes=1)
print(top_dt)
print(end_dt)

2019-11-19 17:00:00+01:00
2019-11-19 17:04:00+01:00


In [8]:
# `term` and `diff` are only needed when the time range is split into several "terms"
# that are processed one by one. When all data is analysed at once they are not required.
# # Length of one unit term used to construct a DAG
# term = datetime.timedelta(minutes=5)

# # Time offset between the starts of consecutive unit terms
# diff = datetime.timedelta(minutes=5)

# Bin size used to discretize the event timestamps for the G-square test
dur = datetime.timedelta(seconds=10)

# Sketch of how per-term arguments would be built (assumes the default "all" area;
# `w_top_dt` / `w_end_dt` would be the bounds of the whole range):
# l_args = []
# top_dt = w_top_dt
# while top_dt < w_end_dt:
#     end_dt = top_dt + term
#     l_args.append((top_dt, end_dt, dur))
#     top_dt = top_dt + diff

### Creating log2event

In [9]:
from collections import namedtuple
# An event definition is a (column name, column value) pair.
EvDef = namedtuple('EvDef', ['type', 'value'])


class EventDefinitionMap:
    """Two-way registry between integer event ids and ``EvDef`` definitions.

    ``_emap`` maps eid -> EvDef and ``_ermap`` maps EvDef -> eid. The time
    bounds passed to the constructor are stored but not used by the class.
    """

    def __init__(self, top_dt, end_dt):
        """Create an empty registry and remember the time bounds."""
        self.top_dt = top_dt
        self.end_dt = end_dt
        self._emap = {}   # eid -> EvDef
        self._ermap = {}  # EvDef -> eid

    def __len__(self):
        """Return the number of registered event definitions."""
        return len(self._emap)

    def _eids(self):
        """Return a view of all registered event ids."""
        return self._emap.keys()

    def _next_eid(self):
        """Return the smallest unused id at or above the current registry size."""
        candidate = len(self._emap)
        while candidate in self._emap:
            candidate += 1
        return candidate

    def get_evdef(self, eid):
        """Return the definition registered under ``eid`` (KeyError if unknown)."""
        return self._emap[eid]

    def get_eid(self, evdef):
        """Return the id registered for ``evdef`` (KeyError if unknown)."""
        return self._ermap[evdef]

    def process_row(self, columns, row):
        """Map every non-empty cell of ``row`` to an event id.

        The '@timestamp' column and cells equal to "" are skipped. A
        (column, value) pair seen for the first time is registered under a
        fresh id; known pairs reuse their id.

        Args:
            columns: Iterable of column names to inspect.
            row: Mapping-like object indexable by column name.

        Returns:
            List of event ids, in the order of ``columns``.
        """
        row_eids = []
        for column in columns:
            if column == '@timestamp':
                continue
            value = row[column]
            if value == "":
                continue

            evdef = EvDef(type=column, value=value)
            eid = self._ermap.get(evdef)
            if eid is None:
                # First occurrence of this definition: register it in both directions.
                eid = self._next_eid()
                self._emap[eid] = evdef
                self._ermap[evdef] = eid
            row_eids.append(eid)
        return row_eids
# Registry of event definitions for the analysis window.
evmap = EventDefinitionMap(top_dt, end_dt)
# Event id -> list of timestamps at which that event occurred.
evdict = dict()
            

In [10]:

# For every log row, register its events and record the row's timestamp under each event id.
for _, row in huawei_df.iterrows():
    row_eids = evmap.process_row(huawei_df.columns, row)
    for eid in row_eids:
        evdict.setdefault(eid, []).append(row['@timestamp'])

In [11]:
# Rows logged by the 'placement-api-access' program, to cross-check the recorded event timestamps.
huawei_df[huawei_df['programname'] == 'placement-api-access']

Unnamed: 0,Hostname,log_level,programname,python_module,http_status,http_method,@timestamp,fine_log_cluster_template,coarse_log_cluster_template,url_cluster_template
0,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:00:05+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * traits
6,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:02:05+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * allocations
7,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:02:09+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * allocations
8,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:02:30+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * inventories
10,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:03:05+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * allocations
11,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:03:11+01:00,19 nov 2019 * * * 0100 get resource providers ...,* nov 2019 * * * 0100 get resource providers i...,resource providers in tree *


In [12]:
# Spot-check event id 1: the timestamps recorded for it and the definition it maps to.
print(evdict[1])
print(evmap.get_evdef(1))

[Timestamp('2019-11-19 17:00:05+0100', tz='UTC+01:00'), Timestamp('2019-11-19 17:02:05+0100', tz='UTC+01:00'), Timestamp('2019-11-19 17:02:09+0100', tz='UTC+01:00'), Timestamp('2019-11-19 17:02:30+0100', tz='UTC+01:00'), Timestamp('2019-11-19 17:03:05+0100', tz='UTC+01:00'), Timestamp('2019-11-19 17:03:11+0100', tz='UTC+01:00')]
EvDef(type='programname', value='placement-api-access')


### Converting data to stats

In [13]:
def discretize(l_dt, l_label, method = "count", binarize = False):
    """Bin a collection of timestamps into the intervals delimited by ``l_label``.

    Bin ``i`` covers the half-open interval ``[l_label[i], l_label[i + 1])``.
    Values earlier than ``l_label[0]`` or not earlier than ``l_label[-1]``
    are ignored.

    Args:
        l_dt: Iterable of mutually comparable values (e.g. datetimes); it does
            not need to be sorted.
        l_label: Sequence of bin edges, expected in ascending order.
            ``len(l_label) - 1`` bins are produced.
        method: "count" (number of values per bin), "binary" (1 if the bin
            holds at least one value, else 0) or "datetime" (list of the
            values that fall into each bin).
        binarize: If true, ``method`` is overridden with "binary".

    Returns:
        A list with one entry per bin. It is all-empty (zeros or empty lists)
        when no value falls inside the labelled range, and ``[]`` when fewer
        than two labels are given.

    Raises:
        NotImplementedError: If ``method`` is not one of the supported names.
    """
    def return_empty(size, method):
        # Result for the "nothing to bin" case: ``size`` untouched bins.
        if method in ("count", "binary"):
            return [0] * size
        return [[] for _ in range(size)]

    def init_tempobj(method):
        # Fresh accumulator for a single bin.
        return [] if method == "datetime" else 0

    def update_tempobj(temp, new_dt, method):
        # Fold one value into the accumulator of the current bin.
        if method == "count":
            return temp + 1
        if method == "binary":
            return 1
        temp.append(new_dt)
        return temp

    if binarize:
        method = "binary"
    if method not in ("count", "binary", "datetime"):
        raise NotImplementedError(
            "Invalid method name ({0})".format(method))

    bin_num = len(l_label) - 1
    if bin_num < 1:
        # Fewer than two edges: there are no bins to fill.
        return []

    l_dt_temp = sorted(l_dt)
    n_dt = len(l_dt_temp)

    # Skip everything that lies before the first bin edge.
    pos = 0
    while pos < n_dt and l_dt_temp[pos] < l_label[0]:
        pos += 1
    if pos == n_dt:
        # Empty input, or every value precedes the labelled range.
        return return_empty(bin_num, method)

    ret = []
    for label_dt in l_label[1:]:
        temp = init_tempobj(method)
        # Consume all values below the upper edge of the current bin.
        while pos < n_dt and l_dt_temp[pos] < label_dt:
            temp = update_tempobj(temp, l_dt_temp[pos], method)
            pos += 1
        ret.append(temp)
    # Values at or after the last edge are intentionally left unconsumed.
    return ret

In [14]:
# Settings assuming the G-squared test is used: binary bins, no overlap between bins.
binarize = True
bin_overlap = datetime.timedelta(0)
def label(dt_range, duration):
    """Return the bin edges that split ``dt_range`` into steps of ``duration``.

    Args:
        dt_range: Pair ``(top_dt, end_dt)`` bounding the range.
        duration: Step added between consecutive edges.

    Returns:
        ``[top_dt, top_dt + duration, ...]`` for every edge strictly below
        ``end_dt``, followed by ``end_dt`` itself (so the last bin may be
        shorter than ``duration``).
    """
    start, stop = dt_range

    def _edges_below_stop():
        # Yield start, start + duration, ... while still strictly below stop.
        cursor = start
        while cursor < stop:
            yield cursor
            cursor = cursor + duration

    return [*_edges_below_stop(), stop]

def auto_discretize(l_ev, dur, labels):
    """Binarize the timestamps ``l_ev`` over the bins delimited by ``labels``.

    ``dur`` is accepted but not used here; the bin edges come from ``labels``.
    Returns the per-bin 0/1 list produced by ``discretize``.
    """
    return discretize(l_dt=l_ev, l_label=labels, binarize=True)

def event2stat(evdict, top_dt, end_dt):
    """Convert per-event timestamp lists into per-event time-bin occupancy lists.

    The range ``[top_dt, end_dt]`` is cut into bins of the module-level ``dur``
    and every event's timestamps are discretized over those bins.

    Args:
        evdict: Mapping eid -> list of timestamps at which the event occurred.
        top_dt: Start of the analysis window.
        end_dt: End of the analysis window.

    Returns:
        Mapping eid -> list with one entry per time bin, describing in which
        bins the event occurs. Events without timestamps are omitted.

    Raises:
        NotImplementedError: If the module-level ``bin_overlap`` is non-zero;
            only non-overlapping bins are supported.
    """
    d_stat = {}
    labels = label((top_dt, end_dt), dur)
    for eid, l_ev in evdict.items():
        if len(l_ev) == 0:  # Skip events that don't have timestamps (shouldn't be possible)
            continue

        if bin_overlap == datetime.timedelta(seconds = 0):
            val = auto_discretize(l_ev, dur, labels)
        else:
            # Previously `val` was left unbound here, surfacing as an UnboundLocalError.
            raise NotImplementedError(
                "Overlapping bins are not supported (bin_overlap must be zero)")
        if val is not None:
            d_stat[eid] = val
    return d_stat

In [18]:
data = event2stat(evdict, top_dt, end_dt)
# Stack the per-event bin lists in ascending eid order, then transpose so that
# each row is a time bin and each column an event id.
dm = np.array([data[eid] for eid in sorted(data)]).T
df = pd.DataFrame(dm)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Show the first input rows again, for comparison with the binned matrix above.
huawei_df.head()

Unnamed: 0,Hostname,log_level,programname,python_module,http_status,http_method,@timestamp,fine_log_cluster_template,coarse_log_cluster_template,url_cluster_template
0,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:00:05+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * traits
1,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19 17:00:48.255000+01:00,get v20 networks tenant id * shared false http...,get v20 networks tenant id * shared false http...,
2,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19 17:00:48.274000+01:00,get v20 networks shared true http 11 status 20...,get v20 networks shared true http 11 status 20...,
3,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19 17:01:50.119000+01:00,get v20 ports fields binding 3ahost id fields ...,get v20 ports fields binding 3ahost id fields ...,
4,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19 17:01:50.210000+01:00,get v20 ports tenant id * device id * http 11 ...,get v20 ports * id * device id * http 11 statu...,


In [24]:
evmap.get_evdef(1) # eid -> EvDef: the (type, value) definition registered under event id 1

EvDef(type='programname', value='placement-api-access')

In [25]:
evdict[1] # eid -> timestamps of every row in which event id 1 occurred

[Timestamp('2019-11-19 17:00:05+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:02:05+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:02:09+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:02:30+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:03:05+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:03:11+0100', tz='UTC+01:00')]

In [26]:
# Rows whose programname is 'placement-api-access'; their timestamps should match evdict[1].
huawei_df[huawei_df['programname'] == 'placement-api-access']

Unnamed: 0,Hostname,log_level,programname,python_module,http_status,http_method,@timestamp,fine_log_cluster_template,coarse_log_cluster_template,url_cluster_template
0,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:00:05+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * traits
6,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:02:05+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * allocations
7,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:02:09+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * allocations
8,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:02:30+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * inventories
10,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:03:05+01:00,19 nov 2019 17 * * 0100 get resource providers...,* nov 2019 * * * 0100 * * * * * * * * * * * ke...,resource providers * allocations
11,wally113,,placement-api-access,,200.0,GET,2019-11-19 17:03:11+01:00,19 nov 2019 * * * 0100 get resource providers ...,* nov 2019 * * * 0100 get resource providers i...,resource providers in tree *


In [30]:
# Bin edges for the analysis window; the first two edges show the bin start and width.
labels = label((top_dt, end_dt), dur)
print(labels[0])
print(labels[1])

2019-11-19 17:00:00+01:00
2019-11-19 17:00:10+01:00


In [31]:
# Re-derive the first and last event timestamps (same computation as in the setup section)
# to compare them against the bin edges.
min_dt = huawei_df['@timestamp'].iloc[0].to_pydatetime()
max_dt = huawei_df['@timestamp'].iloc[-1].to_pydatetime()
print(min_dt)
print(max_dt)

2019-11-19 17:00:05+01:00
2019-11-19 17:03:41.786000+01:00
