In [8]:
import os
import sys

# Make the directory two levels above the current working directory
# importable, so the `src` package imported below can be resolved.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)


from pathlib import Path
from src.features import preprocessing
import pandas as pd
import numpy as np

In [9]:
# Create a default Huawei preprocessing configuration and build the
# concurrent aggregated-logs preprocessor from it.
huawei_config = preprocessing.HuaweiPreprocessorConfig()
sequence_preprocessor = preprocessing.ConcurrentAggregatedLogsPreprocessor(huawei_config)
# Display the aggregated log file location stored in the config.
huawei_config.aggregated_log_file

PosixPath('data/logs_aggregated_concurrent.csv')

In [17]:
# Columns used in the rest of this notebook: timestamp, payload and URL
# columns first, followed by the config's relevant aggregated-log columns.
print(
    [
        huawei_config.log_datetime_column_name,
        huawei_config.log_payload_column_name,
        huawei_config.url_column_name,
        *huawei_config.relevant_aggregated_log_columns,
    ]
)

['@timestamp', 'Payload', 'http_url', 'Hostname', 'log_level', 'programname', 'python_module', 'http_status', 'http_method']


In [21]:
# Number of leading CSV rows to inspect in this notebook.
SAMPLE_ROWS = 50

# Build the CSV location from the config's relative path and the root
# directory computed in the first cell (`module_path`), instead of hard-coding
# a machine-specific absolute path.
csv_path = Path(module_path) / huawei_config.aggregated_log_file

# `nrows` stops parsing after the sample instead of loading the whole file and
# discarding everything but the head. Missing values become empty strings and
# every column is converted to str; after that no NaN can remain, so no
# further NaN replacement is needed.
# NOTE(review): with `nrows`, pandas infers column dtypes from the sampled rows
# only, so the string form of numeric columns could differ from a full-file
# read (e.g. "200" vs "200.0") — confirm this is acceptable for the sample.
data_df = pd.read_csv(csv_path, nrows=SAMPLE_ROWS).fillna("").astype(str)

# Keep only the timestamp, payload and URL columns plus the config's relevant
# aggregated-log columns, in that order.
selected_columns = [
    huawei_config.log_datetime_column_name,
    huawei_config.log_payload_column_name,
    huawei_config.url_column_name,
    *huawei_config.relevant_aggregated_log_columns,
]
data_df = data_df[selected_columns]
data_df.columns

Index(['@timestamp', 'Payload', 'http_url', 'Hostname', 'log_level',
       'programname', 'python_module', 'http_status', 'http_method'],
      dtype='object')

In [27]:
# One-column frame holding each distinct payload once; the original row index
# of the first occurrence is preserved.
all_logs_df = (
    data_df[huawei_config.log_payload_column_name]
    .dropna()
    .drop_duplicates()
    .to_frame()
)
# Peek at the first payload. The column is addressed through the config rather
# than the literal 'Payload', consistent with how it was selected above.
all_logs_df[huawei_config.log_payload_column_name].iloc[0]

'130.149.249.127 "GET /v2.0/networks?shared=True HTTP/1.1" status: 200  len: 213 time: 0.0162041'

In [29]:
# Drain parameters for the fine-grained log clustering, taken from the config.
depth = huawei_config.fine_drain_log_depth
st = huawei_config.fine_drain_log_st

# All patterns are raw strings: sequences such as `\-`, `\.`, `\d` and `\s` are
# invalid escapes in ordinary string literals (DeprecationWarning since
# Python 3.6, SyntaxWarning since 3.12). The resulting pattern text is
# unchanged.
# Matches every character that is not alphanumeric, '-' or '.'.
request_drain_regex: str = r"[^a-zA-Z0-9\-\.]"

drain = preprocessing.Drain(
    preprocessing.DrainParameters(
        depth=depth,
        st=st,
        # (pattern, replacement) pairs.
        # NOTE(review): presumably applied to each payload in list order
        # before clustering — confirm against the Drain implementation.
        rex=[
            # Dotted-quad number groups (IPv4-shaped) with an optional leading
            # "/", optional ":port" and optional trailing ":" are removed.
            (r"(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)", ""),
            # Characters outside [a-zA-Z0-9-.] become a space.
            (request_drain_regex, " "),
            # Anything that is not a letter, digit, whitespace or ':' is removed.
            (r"[^a-zA-Z\d\s:]", ""),
        ],
    ),
    data_df=all_logs_df,
    data_df_column_name=huawei_config.log_payload_column_name,
)

# Cluster the payloads, drop duplicate result rows and index by `log_idx`.
drain_result_df = drain.load_data().drop_duplicates().set_index("log_idx")
drain_result_df[['cluster_template', 'cluster_path']].iloc[0]


Generating DRAIN clusters from log_df: 100%|██████████| 50/50 [00:00<00:00, 1880.97it/s]


cluster_template    get v20 networks shared true http 11 status 20...
cluster_path          13 get v20 networks shared true http * status *
Name: 0, dtype: object

In [31]:
prefix = "fine_"
template_column = prefix + "log_cluster_template"
path_column = prefix + "log_cluster_path"

# Step 1: attach the Drain results to the distinct payloads by index, then
# drop duplicate rows and discard the index.
payload_clusters_df = (
    all_logs_df.merge(
        drain_result_df,
        left_index=True,
        right_index=True,
        how="left",
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

# Step 2: bring the cluster columns back onto every log row via the payload
# text, give them the prefixed names, and drop the cluster id.
log_result_df = (
    data_df.merge(
        payload_clusters_df,
        on=huawei_config.log_payload_column_name,
        how="left",
    )
    .rename(
        columns={
            "cluster_template": template_column,
            "cluster_path": path_column,
        }
    )
    .drop(columns=["cluster_id"])
)

# Step 3: rows without a matched template get an empty string, and the column
# is converted to str.
log_result_df[template_column] = (
    log_result_df[template_column]
    .fillna("")
    .astype(str)
    .replace(np.nan, "", regex=True)
)
log_result_df.head(5)

Unnamed: 0,@timestamp,Payload,http_url,Hostname,log_level,programname,python_module,http_status,http_method,fine_log_cluster_template,fine_log_cluster_path
0,2019-11-19T17:00:48.274000000+01:00,"130.149.249.127 ""GET /v2.0/networks?shared=Tru...",,wally113,INFO,neutron-server,neutron.wsgi,,,get v20 networks shared true http 11 status 20...,13 get v20 networks shared true http * status *
1,2019-11-19T17:01:50.119000000+01:00,"130.149.249.127 ""GET /v2.0/ports?fields=bindin...",,wally113,INFO,neutron-server,neutron.wsgi,,,get v20 ports fields binding 3ahost id fields ...,22 get v20 ports fields binding * id fields bi...
2,2019-11-19T17:01:50.210000000+01:00,"130.149.249.127 ""GET /v2.0/ports?tenant_id=99c...",,wally113,INFO,neutron-server,neutron.wsgi,,,get v20 ports tenant id 99c2677b197747c9bd8bc0...,17 get v20 ports tenant id * device id *
3,2019-11-19T17:01:50.317000000+01:00,"130.149.249.127 ""GET /v2.0/networks?shared=Tru...",,wally113,INFO,neutron-server,neutron.wsgi,,,get v20 networks shared true http 11 status 20...,13 get v20 networks shared true http * status *
4,2019-11-19T17:02:52.398000000+01:00,"130.149.249.127 ""GET /v2.0/networks?tenant_id=...",,wally113,INFO,neutron-server,neutron.wsgi,,,get v20 networks tenant id 99c2677b197747c9bd8...,16 get v20 networks tenant id * shared false http
