### Load the Data

In [31]:
import sys 
import os
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pathlib import Path

import cdt
cdt.SETTINGS.rpath = "/usr/bin/Rscript"

import pandas as pd
from src.features import preprocessing

In [32]:
data_path = Path('/home/ralmasri/projects/Thesis/Domain-Guided-Monitoring/data/')
csv_path = data_path / "logs_aggregated_concurrent.csv"
size = 50
subset_path = data_path / f"{size}_logs_aggregated_concurrent.csv" if size > 0 else csv_path

huawei_config = preprocessing.HuaweiPreprocessorConfig()
huawei_config.aggregated_log_file = subset_path
preprocessor = preprocessing.ConcurrentAggregatedLogsPreprocessor(huawei_config)
huawei_df = preprocessor._load_log_only_data().fillna("")
huawei_df = huawei_df.sort_values(by='@timestamp').reset_index(drop=True)

relevant_columns = [
            "Hostname",
            "log_level",
            "programname",
            "python_module",
            "http_status",
            "http_method",
            "@timestamp",
            "fine_log_cluster_template",
            "coarse_log_cluster_template",
            "url_cluster_template"
        ]
huawei_df.drop(labels=[x for x in huawei_df.columns if x not in relevant_columns], axis=1, inplace=True)
huawei_df.head()

Generating DRAIN clusters from log_df: 100%|██████████| 50/50 [00:00<00:00, 1938.70it/s]
Generating DRAIN clusters from log_df: 100%|██████████| 50/50 [00:00<00:00, 1071.93it/s]
Generating DRAIN clusters from log_df: 100%|██████████| 15/15 [00:00<00:00, 2758.93it/s]


Unnamed: 0,Hostname,log_level,programname,python_module,http_status,http_method,@timestamp,fine_log_cluster_template,coarse_log_cluster_template,url_cluster_template
0,wally113,,placement-api-access,,200.0,GET,2019-11-19T17:00:05.000000000+01:00,19 nov 2019 17 * * 0100 get resource providers...,19 nov 2019 17 * * 0100 get resource providers...,resource providers 45bac5db-7b40-4922-ae54-fe7...
1,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19T17:00:48.255000000+01:00,get v20 networks tenant id 99c2677b197747c9bd8...,get v20 networks tenant id 99c2677b197747c9bd8...,
2,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19T17:00:48.274000000+01:00,get v20 networks shared true http 11 status 20...,get v20 networks shared true http 11 status 20...,
3,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19T17:01:50.119000000+01:00,get v20 ports fields binding 3ahost id fields ...,get v20 ports fields binding 3ahost id fields ...,
4,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-19T17:01:50.210000000+01:00,get v20 ports tenant id 99c2677b197747c9bd8bc0...,get v20 ports tenant id 99c2677b197747c9bd8bc0...,


In [33]:
import datetime
date_format = '%Y-%m-%dT%H:%M:%S.%f000%z'
huawei_df['@timestamp'] = huawei_df['@timestamp'].apply(lambda x: datetime.datetime.strptime(x, date_format))
type(huawei_df['@timestamp'][1])

pandas._libs.tslibs.timestamps.Timestamp

In [34]:
# # Append the name of a value's column to itself
# for column in list(huawei_df.columns):
#     if column == '@timestamp':
#         continue
#     huawei_df[column].apply(lambda x: column + "#" + x.lower() if len(x) > 0 else "", inplace=True)
# huawei_df.head()

### Setup arguments

In [35]:
min_dt = huawei_df['@timestamp'].iloc[0].to_pydatetime()
max_dt = huawei_df['@timestamp'].iloc[-1].to_pydatetime()
print(min_dt)
print(max_dt)

2019-11-19 17:00:05+01:00
2019-11-19 17:03:41.786000+01:00


In [36]:
w_top_dt = datetime.datetime.combine(min_dt.date(), datetime.time())
w_end_dt = datetime.datetime.combine(max_dt.date(), datetime.time()) + datetime.timedelta(days = 1)
print(w_top_dt)
print(w_end_dt)

2019-11-19 00:00:00
2019-11-20 00:00:00


In [37]:
# Length of unit terms to construct DAG
term = datetime.timedelta(minutes=5)

# Length of time difference of unit terms
diff = datetime.timedelta(minutes=5)

# Bin size of discrete data for G square test
dur = datetime.timedelta(seconds=10)

# this is assuming that area is all as is default

l_args = []
top_dt = w_top_dt
while top_dt < w_end_dt:
    end_dt = top_dt + term
    l_args.append((top_dt, end_dt, dur))
    top_dt = top_dt + diff

### Creating log2event

In [47]:
from collections import namedtuple
EvDef = namedtuple('EvDef', ['type', 'value'])
evdict = {} # Event id -> list(datetime.datetime)
class EventDefinitionMap: # eid -> evdef
    def __init__(self, top_dt, end_dt):
        self.top_dt = top_dt
        self.end_dt = end_dt
        self._emap = {} # key : eid, val : evdef
        self._ermap = {} # key : evdef, val : eid

    def __len__(self):
        return len(self._emap)

    def _eids(self):
        return self._emap.keys()

    def _next_eid(self):
        eid = len(self._emap)
        while eid in self._emap:
            eid += 1
        else:
            return eid

    def process_row(self, columns, row):
        row_eids = []
        for column in columns:
                if column == '@timestamp':
                    continue
                value = row[column]
                if value == "":
                    continue
                d = {
                    "type": column,
                    "value": row[column],
                }

                evdef = EvDef(**d)

                if evdef in self._ermap:
                    row_eids.append(self._ermap[evdef])
                else:
                    eid = self._next_eid()
                    self._emap[eid] = evdef
                    self._ermap[evdef] = eid
                    row_eids.append(eid)
        return row_eids
evmap = EventDefinitionMap(top_dt=w_top_dt, end_dt=w_end_dt)
            

In [49]:

for _, row in huawei_df.iterrows():
    row_eids = evmap.process_row(huawei_df.columns, row)
    for eid in row_eids:
        if eid in evdict:
            evdict[eid].append(row['@timestamp'])
        else:
            evdict[eid] = []


[Timestamp('2019-11-19 17:02:05+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:02:09+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:02:30+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:03:05+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:03:11+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:00:05+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:02:05+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:02:09+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:02:30+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:03:05+0100', tz='UTC+01:00'),
 Timestamp('2019-11-19 17:03:11+0100', tz='UTC+01:00')]