In [1]:
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys

import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pandas import DataFrame
from pandas.core.dtypes import common as com
from core.loader import Loader

from models.model_wrapper import ModelWrapper

from tensorflow.python.client import device_lib

for device in device_lib.list_local_devices():
    print(device.physical_device_desc)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# TODO: Zkusit 3 stupně na NN, SVM a porovnat s LightGBM + XGBoost

# 4 x Malware
# 4x  Phishing
# + DGA (jeden stupen)

2025-05-04 21:42:37.962640: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-04 21:42:37.962672: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-04 21:42:37.963798: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04 21:42:37.969537: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



device: 0, name: NVIDIA GeForce RTX 3050 Ti Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


2025-05-04 21:42:41.943196: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-05-04 21:42:42.005860: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-05-04 21:42:42.008684: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [None]:
import pickle
import numpy as np
import joblib
from tqdm import tqdm
from sklearn.metrics import classification_report
from core.validator import ModelValidator, load_saved_split, load_train_split
from models.model_wrapper import ModelWrapper
from core.fpd_nn import FPDNeuralNetwork
from core.meta_nn import MetaNeuralClassifier  # <--- new import
from core.utils import safe_predict

# === Configuration ===
ARCHITECTURES = ["cnn", "XgBoost", "Lgbm", "feedforward","svm"]
VERSION = "v1.1"
MALICIOUS_LABEL = "phishing"
STAGE = 3
VERIFICATION = True
FPD_MODEL_PATH = "./models/fpd_saved_model"
META_MODEL_PATH = "./models/meta_nn_model"

# === Load train/test split ===
x_train, x_test, y_train, y_test, columns = load_train_split(STAGE, MALICIOUS_LABEL)

# Optionally, overwrite test set from saved split (e.g. for verification)
x_train_additional, y_test_additional = load_saved_split(STAGE, MALICIOUS_LABEL,folder="./data/", verification=VERIFICATION)



# Reduce training size for speed
x_train = x_train[:int(len(x_train) * 0.10)]
y_train = y_train[:int(len(y_train) * 0.10)]

# appedn x_train_additional to x_train
x_train = np.vstack((x_train, x_train_additional))
y_train = np.hstack((y_train, y_test_additional))

# apend to test set
x_test = np.vstack((x_test, x_train_additional))
y_test = np.hstack((y_test, y_test_additional))

# === Load and run base models ===
model_wrapper = ModelWrapper(model_dir="models")
train_preds = []
test_preds = []


def predict(model, x, architecture, label):
    y_pred = safe_predict(model, x, architecture, label, STAGE)
            
    return np.array(y_pred.flatten())

for arch in ARCHITECTURES:
    model = model_wrapper.load(
        arch_name=arch,
        label=MALICIOUS_LABEL,
        prefix=f"stage_{STAGE}",
        version=VERSION
    )
    train_preds.append(predict(model, x_train, arch, MALICIOUS_LABEL))
    test_preds.append(predict(model, x_test, arch, MALICIOUS_LABEL))
    
# === Prepare Meta Model Input ===
meta_input_train = np.hstack([
    np.vstack(train_preds).T,     # shape: (n_samples, n_models)
    x_train[:, :10]               # shape: (n_samples, 10)
])

meta_input_test = np.hstack([
    np.vstack(test_preds).T,
    x_test[:, :10]
])


# === Train Meta Neural Network ===
meta_nn = MetaNeuralClassifier()
meta_nn.fit(meta_input_train, y_train)

meta_nn.save(META_MODEL_PATH, "v1.1")

# === Train False Positive Detector ===
ensemble_train_preds = np.round(np.mean(train_preds, axis=0)).astype(int)
fpd_labels_train = ((ensemble_train_preds == 1) & (y_train == 0)).astype(int)

fpd_nn = FPDNeuralNetwork()
fpd_nn.fit(x_train, fpd_labels_train)
fpd_nn.save(FPD_MODEL_PATH, MALICIOUS_LABEL, STAGE)

meta_nn.load(META_MODEL_PATH, "v1.1")
raw_preds = meta_nn.predict(meta_input_test)

fpd_nn.load(FPD_MODEL_PATH, MALICIOUS_LABEL, STAGE)
corrected_preds = fpd_nn.correct_predictions(raw_preds, x_test)

# === Evaluation ===
print("\n=== Meta-NN without FPD ===")
print(classification_report(y_test, raw_preds, digits=4))

print("\n=== Meta-NN with FPD correction ===")
print(classification_report(y_test, corrected_preds, digits=4))

# === ModelValidator integration ===
final_model_wrapper = ModelWrapper(model_dir="models")
final_model_wrapper.predict = lambda x: corrected_preds

validator = ModelValidator(
    final_model_wrapper,
    x_test,
    y_test,
    arch_name="MetaNN+FPD_NN",
    label=MALICIOUS_LABEL,
    prefix=f"stage_{STAGE}",
    version=VERSION,
    verification=VERIFICATION
    
)
validator.evaluate_performance()



2025-05-05 11:24:06.717787: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-05 11:24:06.717816: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-05 11:24:06.733239: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-05 11:24:06.816037: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3 phishing ./data/
Columns: Index(['dns_has_dnskey', 'dns_A_count', 'dns_AAAA_count', 'dns_MX_count',
       'dns_NS_count', 'dns_TXT_count', 'dns_SOA_count', 'dns_CNAME_count',
       'dns_zone_level', 'dns_zone_digit_count',
       ...
       'rdap_ip_v4_count', 'rdap_ip_v6_count',
       'rdap_ip_shortest_v4_prefix_len', 'rdap_ip_longest_v4_prefix_len',
       'rdap_ip_shortest_v6_prefix_len', 'rdap_ip_longest_v6_prefix_len',
       'rdap_ip_avg_admin_name_len', 'rdap_ip_avg_admin_name_entropy',
       'rdap_ip_avg_admin_email_len', 'rdap_ip_avg_admin_email_entropy'],
      dtype='object', length=176)
3 phishing ./data/
📦 Loading model from models/cnn_stage_3_phishing_v1.1.keras


2025-05-05 11:24:41.028501: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-05-05 11:24:41.176700: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-05-05 11:24:41.179673: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

[CNN] X_test reshaped to (89098, 14, 14, 1) (side 14, padded by 20)


2025-05-05 11:24:43.472702: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902






[CNN] X_test reshaped to (98906, 14, 14, 1) (side 14, padded by 20)
📦 Loading model from models/XgBoost_stage_3_phishing_v1.1.xgb
📦 Loading model from models/Lgbm_stage_3_phishing_v1.1.pkl
📦 Loading model from models/feedforward_stage_3_phishing_v1.1.keras








📦 Loading model from models/svm_stage_3_phishing_v1.1.pkl
🔄 Detected SVM, using: scalers/phishing_svm_3_scaler.joblib scaler




🔄 Detected SVM, using: scalers/phishing_svm_3_scaler.joblib scaler




Epoch 1/15


2025-05-05 11:36:06.305896: I external/local_xla/xla/service/service.cc:168] XLA service 0x772368194190 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-05-05 11:36:06.305918: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Ti Laptop GPU, Compute Capability 8.6
2025-05-05 11:36:06.318755: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1746437766.414982  629130 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
saving fp detektor as fpd_scaler_phishing_3.joblib

=== Meta-NN without FPD ===
              precision    recall  f1-score   support

           0     0.9958    0.9989    0.9973     82367
           1     0.9943    0.9791    0.9866     16539

    accuracy                         0.9956     98906
   macro avg     0.9951    0.9890    0.9920     98906
weighted avg     0.9956    0.9956    0.9955     98906


=== Meta-NN with FPD correction ===
              precision    recall  f1-score   support

           0     0.9958    0.9990    0.9974     82367
           1     0.9950    0.9791    0.9870     16539

    accuracy                         0.9957     98906
   macro avg     0.9954    0.9890    0.9922     98906
weighted avg     0.9957    0.9957    0.9957     98906


🔍 Starting model evaluation...

📊 Saving evaluation metrics to .tex and CF matrix

📋 Evaluation 