### Preprocessing

In [None]:
import sys 
import os
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pathlib import Path

import cdt
cdt.SETTINGS.rpath = "/usr/bin/Rscript"
from cdt.causality.graph import PC

import pandas as pd
from src.features import preprocessing
import numpy as np
import networkx as nx
import time

In [None]:
# Hard-coded absolute data directory; adjust this path when running on another machine.
data_path = Path('/home/ralmasri/projects/Thesis/Domain-Guided-Monitoring/data/')
csv_path = data_path / "logs_aggregated_concurrent.csv"
# Build a preprocessor config and point its aggregated log file at the CSV above.
# The config must be fully set up before it is handed to the preprocessor.
huawei_config = preprocessing.HuaweiPreprocessorConfig()
huawei_config.aggregated_log_file = csv_path
preprocessor = preprocessing.ConcurrentAggregatedLogsPreprocessor(huawei_config)
# Load the log data and replace missing values with empty strings, so later cells can
# detect "no value" with a plain comparison against "".
huawei_df = preprocessor._load_log_only_data().fillna("")

In [None]:
# Nominal row counts of the samples that the methods are compared on.
df_sizes = [100, 1000, 10000, 50000]
# One prefix of the full frame per size (head(n) yields at most n rows).
huawei_dfs = list(map(huawei_df.head, df_sizes))
# Maps a method name to the list of knowledge frames it produced, one per sample.
results = dict()

In [None]:
# Log attributes considered when building the "attribute#value" vocabulary.
# "@timestamp" is listed here but skipped explicitly by the consumer of this list.
relevant_columns = [
    "Hostname",
    "log_level",
    "programname",
    "python_module",
    "http_status",
    "http_method",
    "@timestamp",
    "fine_log_cluster_template",
    "coarse_log_cluster_template",
    "url_cluster_template",
]

### Heuristic

In [None]:
from src.features import sequences
from collections import Counter

# Second preprocessor, built from the same config object as `preprocessor`.
causality_preprocessor = preprocessing.ConcurrentAggregatedLogsCausalityPreprocessor(huawei_config)
# Threshold that generate_heuristic_knowledge reads back through
# causality_preprocessor.config.min_causality.
# NOTE(review): this is assigned after the preprocessor was constructed, so it only takes
# effect if the preprocessor keeps a reference to huawei_config rather than a copy — confirm.
huawei_config.min_causality = 0.01

def collect_sequence_metadata(grouped_data):
    """Collect metadata for the ``"all_events"`` column of ``grouped_data``.

    A sequence transformer is obtained from
    ``sequences.load_sequence_transformer()`` on every call, and the result of
    its ``collect_metadata`` method is returned unchanged.
    """
    sequence_transformer = sequences.load_sequence_transformer()
    metadata = sequence_transformer.collect_metadata(grouped_data, "all_events")
    return metadata

def generate_heuristic_knowledge(huawei_df):
    """Build parent/child causality records using the counting heuristic.

    The counted causality comes from
    ``causality_preprocessor._generate_counted_causality``. For every source id,
    a target id is kept when its share among all targets counted for that source
    is strictly greater than ``causality_preprocessor.config.min_causality``.

    Args:
        huawei_df: Log DataFrame to derive the causality from.

    Returns:
        A de-duplicated ``pandas.DataFrame`` with the columns ``parent_id``,
        ``parent_name``, ``child_id`` and ``child_name``. The columns are
        present even when no record passes the threshold.
    """
    # The aggregation and metadata collection below do not feed into the result.
    # NOTE(review): kept because their side effects are not visible from this
    # notebook — drop these four lines if they turn out to be pure.
    log_only_data = huawei_df.copy()
    log_only_data["grouper"] = 1
    grouped_data = preprocessor._aggregate_per(log_only_data, aggregation_column="grouper")
    collect_sequence_metadata(grouped_data)

    config = causality_preprocessor.config
    # Named differently from the notebook-level `relevant_columns` list to avoid
    # shadowing it inside this function.
    heuristic_columns = {
        column
        for column in preprocessor.relevant_columns
        if not config.log_only_causality or "log" in column
    }
    counted_causality = causality_preprocessor._generate_counted_causality(
        huawei_df, heuristic_columns
    )

    causality_records = []
    for from_value, to_values in counted_causality.items():
        total_to_counts = len(to_values)
        for to_value, to_count in Counter(to_values).items():
            if to_count / total_to_counts > config.min_causality:
                causality_records.append(
                    {
                        "parent_id": from_value,
                        # Split only on the first "#" so that values which
                        # themselves contain "#" are not truncated.
                        # Presumably ids have the form "<attribute>#<value>".
                        "parent_name": from_value.split("#", 1)[1],
                        "child_id": to_value,
                        "child_name": to_value.split("#", 1)[1],
                    },
                )

    return (
        pd.DataFrame(
            causality_records,
            columns=["parent_id", "parent_name", "child_id", "child_name"],
        )
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [None]:
# Run the heuristic once per sample, keeping the results in the order of huawei_dfs.
results['heuristic'] = list(map(generate_heuristic_knowledge, huawei_dfs))

### My method

In [None]:
def _build_indicator_frame(huawei_df, columns):
    """Return a 0/1 indicator DataFrame with one ``"<attribute>#<value>"`` column per value.

    Values are converted to lower-cased strings before comparison, so the
    column label and the indicator always agree. Empty strings are treated as
    "no value" and get no column, and the ``"@timestamp"`` attribute is skipped.
    Columns appear in the order of ``columns``, then by first occurrence of
    each value, which keeps the layout deterministic between runs.
    """
    indicators = {}
    for column in columns:
        if column == '@timestamp':
            continue
        normalized = huawei_df[column].astype(str).str.lower()
        for value in normalized.unique():
            if value == "":
                continue
            indicators[column + "#" + value] = (normalized == value).astype(int)
    # Building from a dict names every column after its "<attribute>#<value>" key.
    return pd.DataFrame(indicators, index=huawei_df.index)


def generate_my_method_knoweldge(huawei_df):
    """Derive parent/child causality records with the PC algorithm.

    The notebook-level ``relevant_columns`` are one-hot encoded into
    ``"<attribute>#<value>"`` indicator columns. ``cdt``'s ``PC`` is run on
    them with the binary independence test, and every edge of the predicted
    graph becomes one record.

    Args:
        huawei_df: Log DataFrame containing the attributes listed in
            ``relevant_columns``.

    Returns:
        A de-duplicated ``pandas.DataFrame`` with the columns ``parent_id``,
        ``parent_name``, ``child_id`` and ``child_name``. It is empty when
        there is nothing to encode.
    """
    record_columns = ["parent_id", "parent_name", "child_id", "child_name"]
    alg_df = _build_indicator_frame(huawei_df, relevant_columns)
    if alg_df.empty:
        # No rows or no non-empty values: there is nothing to run PC on.
        return pd.DataFrame(columns=record_columns)

    obj = PC(CItest="binary", method_indep="binary")
    output = obj.predict(alg_df)

    causality_records = []
    for from_value, to_value in output.edges():
        try:
            # Split on the first "#" only: attribute names contain no "#",
            # but values (templates, URLs) may.
            parent_name = from_value.split("#", 1)[1]
            child_name = to_value.split("#", 1)[1]
        except (AttributeError, IndexError):
            # Best effort: report and skip nodes that are not "<attribute>#<value>" labels.
            print(f"Skipping edge with unexpected node label: {from_value!r} -> {to_value!r}")
            continue
        causality_records.append(
            {
                "parent_id": from_value,
                "parent_name": parent_name,
                "child_id": to_value,
                "child_name": child_name,
            },
        )
    return (
        pd.DataFrame(causality_records, columns=record_columns)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [None]:
results['my method'] = []
# Pair each sample with its nominal size. The loop variable deliberately has its own name
# so that the notebook-level `huawei_df` (the full frame) is not rebound to the last sample.
for sample_size, sample_df in zip(df_sizes, huawei_dfs):
    print(f"Calculating for size {sample_size}")
    # perf_counter is monotonic and high resolution, which suits elapsed-time measurement.
    start = time.perf_counter()
    results['my method'].append(generate_my_method_knoweldge(sample_df))
    print(time.perf_counter() - start)