# Load packages and import modules

In [8]:
# -*- coding: utf-8 -*-
import sys
import os

# Make the parent directory importable (presumably where the project packages
# imported by later cells live). The path is resolved against the kernel's
# current working directory.
SRC_ROOT = os.path.abspath('../')
# Guard the append so that re-running this cell does not add duplicate entries.
if SRC_ROOT not in sys.path:
    sys.path.append(SRC_ROOT)

# Load datasets 

In [None]:
from core.loader import Loader

# Parquet inputs grouped by class; paths are relative to the working directory.
benign_dataset_filenames = [
    '../parkets/benign/benign_2312_anonymized_HTML.parquet',
    '../parkets/benign/umbrella_benign_FINISHED_HTML.parquet',
]
malicious_dataset_filenames = ['../parkets/malware_2406_strict_HTML.parquet']

# CONFIGURATION

# Class names handed to the loader, and the integer each one is encoded as
# when the labels are mapped for training.
benign_label, malicious_label = "benign", "malware"
class_map = {benign_label: 0, malicious_label: 1}

# `subsample` is forwarded to Loader as-is; presumably the fraction of rows
# to keep — confirm against core.loader.
loader = Loader(
    benign_dataset_filenames,
    malicious_dataset_filenames,
    benign_label=benign_label,
    malicious_label=malicious_label,
    subsample=0.02,
)
df = loader.load()

# Split data into stages

In [6]:
from core.loader import Segmenter

# Column-prefix groups for the aggregated subsets, one group per stage.
# Each later stage lists the earlier stage's prefixes plus additional ones.
stage_1_prefixes = ["lex_"]
stage_2_prefixes = ["lex_", "dns_", "ip_", "geo_"]
stage_3_prefixes = ["lex_", "dns_", "ip_", "tls_", "geo_", "rdap_"]

aggregates = [stage_1_prefixes, stage_2_prefixes, stage_3_prefixes]

# Build the base subsets first, then the aggregated ones requested above,
# and keep the aggregated subsets for the training cells.
segmenter = Segmenter(df)
segmenter.create_base_subsets()
segmenter.create_aggregated_subsets(aggregates)
subset_dfs = segmenter.get_aggregated_subsets()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df["label"] = self.df["label"].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df["label"] = self.df["label"].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df["label"] = self.df["label"].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try

# Define the CNN model

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense, BatchNormalization, Activation, MaxPooling2D, Dropout

# Identifiers reused for the Keras model name, the scaler lookup, the saved
# model artifacts and the figure file names.
ARCH_NAME, VERSION = "cnn", "v1.1"
# Learning rate handed to the Adam optimizer.
LR = 2.3e-3

def build_cnn_net(input_shape=(28, 28, 1), dropout_conv=0.25, dropout_dense=0.5):
    """Assemble the (uncompiled) CNN used for binary classification.

    Layout: three 3x3 'same'-padded convolutions (32, 64, 128 filters), each
    followed by batch normalization and ReLU; 2x2 max-pooling plus dropout
    after the second and third convolution; two dense blocks (256 and 64
    units, each with batch normalization, ReLU and dropout); and a single
    sigmoid output unit.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        dropout_conv: Dropout rate applied after each max-pooling layer.
        dropout_dense: Dropout rate applied at the end of each dense block.

    Returns:
        A Keras ``Model`` named ``ARCH_NAME``.

    NOTE(review): the two 2x2 poolings each downsample the spatial
    dimensions, so very small inputs may not survive both — confirm the
    minimum side length used by callers.
    """

    def conv_unit(tensor, filters):
        # Convolution -> batch norm -> ReLU.
        tensor = Conv2D(filters, kernel_size=(3, 3), padding='same')(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    def pool_and_drop(tensor):
        # 2x2 max-pooling followed by the convolutional dropout rate.
        tensor = MaxPooling2D(pool_size=(2, 2))(tensor)
        return Dropout(dropout_conv)(tensor)

    def dense_unit(tensor, units):
        # Dense -> batch norm -> ReLU -> dropout.
        tensor = Dense(units)(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = Activation('relu')(tensor)
        return Dropout(dropout_dense)(tensor)

    net_input = Input(shape=input_shape)

    # Convolutional trunk (layers are created in the same order as before).
    hidden = conv_unit(net_input, 32)
    hidden = pool_and_drop(conv_unit(hidden, 64))
    hidden = pool_and_drop(conv_unit(hidden, 128))

    # Classifier head.
    hidden = Flatten()(hidden)
    hidden = dense_unit(hidden, 256)
    hidden = dense_unit(hidden, 64)
    prediction = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=net_input, outputs=prediction, name=ARCH_NAME)

# For each subset/stage train one model

In [None]:
import gc
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from models.model_wrapper import ModelWrapper


### Initialize GPU ###
# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth must be configured before the GPUs are initialized. When
    # this cell is re-run in a live kernel TensorFlow raises RuntimeError;
    # keep the existing configuration and continue instead of aborting.
    print(f"Could not enable GPU memory growth: {err}")


def next_perfect_square(n):
    """Return the smallest perfect square that is greater than or equal to ``n``.

    The value is computed in floating point (square root, rounded up, then
    squared) and converted to a Python ``int``.
    """
    root_ceiling = np.ceil(np.sqrt(n))
    squared = root_ceiling ** 2
    return int(squared)


# Initialize ModelWrapper and model histories
wrapper = ModelWrapper()
model_histories = []

# One model is trained per aggregated subset. `stage` is the subset's position
# in `subset_dfs` and is forwarded to the loader's scaler together with the
# architecture name.
for stage, (prefix, subset_df) in enumerate(subset_dfs.items()):

    print(f"\n🚀 Training CNN on '{prefix}' features…")

    # Encode the string labels as integers. `.map` silently turns any label
    # missing from class_map into NaN, so fail early with a clear message.
    labels = subset_df['label'].map(class_map)
    unmapped = labels.isna()
    if unmapped.any():
        unknown = subset_df.loc[unmapped.values, 'label'].unique().tolist()
        raise ValueError(
            f"Subset '{prefix}' contains labels missing from class_map: {unknown}"
        )

    features_df = loader.scale(subset_df.drop('label', axis=1), stage=stage, model=ARCH_NAME)

    # Zero-pad every feature vector up to the next perfect square so that it
    # can be reshaped into a square, single-channel grid for the Conv2D layers.
    features = features_df.values
    original_feature_size = features.shape[1]
    padded_size = next_perfect_square(original_feature_size)
    side_size = int(np.sqrt(padded_size))
    padding = padded_size - original_feature_size

    if padding > 0:
        features = np.pad(features, ((0, 0), (0, padding)), mode='constant', constant_values=0)

    X = features.reshape(-1, side_size, side_size, 1)

    # Stratified 80/20 split with a fixed seed.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, labels,
        test_size=0.2, random_state=42,
        shuffle=True, stratify=labels
    )

    model = build_cnn_net(input_shape=(side_size, side_size, 1))
    model.compile(
        optimizer=Adam(learning_rate=LR),
        loss='binary_crossentropy',
        metrics=['Precision', 'Recall', 'AUC']
    )

    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )

    # NOTE(review): the held-out split doubles as the validation set that
    # drives early stopping, so the validation metrics are not an independent
    # test estimate.
    # class_weight: samples of class 1 (malicious) are weighted 0.8 against
    # 1.0 for class 0 (benign).
    history = model.fit(
        X_train, Y_train,
        batch_size=512,
        epochs=30,
        validation_data=(X_test, Y_test),
        class_weight={0: 1.0, 1: 0.8},
        callbacks=[early_stopping]
    )

    # Kept for the plotting cell, which reads `.history` from this object.
    # NOTE(review): the History object may keep a reference to the model, which
    # would limit what the cleanup below can free — confirm.
    model_histories.append({"model_name": prefix, "history": history})

    # Save the model using ModelWrapper
    wrapper.save(model,
                 arch_name=ARCH_NAME,
                 label=malicious_label,
                 prefix=prefix,
                 version=VERSION)

    # After training cleanup: reset Keras state and drop the large per-stage
    # objects before the next stage allocates its own.
    K.clear_session()
    del model, history, X_train, X_test, Y_train, Y_test, X, features, labels
    gc.collect()


# Plot metrics for all models

In [None]:
import matplotlib.pyplot as plt

# Assume model_histories is a list of dicts, each with keys "model_name" and "history"
def get_metric(history, metric_name):
    """Return the per-epoch values recorded for ``metric_name``.

    An exact key match wins. Otherwise any key of the form
    ``<metric_name>_<number>`` is accepted (Keras may append such a numeric
    suffix, e.g. ``precision_1``, when it uniquifies metric names). If several
    suffixed keys exist, the one with the highest number is returned.

    Args:
        history: Mapping of metric name to per-epoch values.
        metric_name: Base metric name, e.g. ``'auc'`` or ``'val_precision'``.

    Returns:
        The sequence stored under the matching key.

    Raises:
        KeyError: If neither the exact name nor a numbered variant is present.
    """
    if metric_name in history:
        return history[metric_name]

    prefix = f"{metric_name}_"
    numbered = [
        key for key in history
        if isinstance(key, str)
        and key.startswith(prefix)
        and key[len(prefix):].isdecimal()
    ]
    if numbered:
        # Highest suffix first: keeps the previous preference of "_12" over "_2".
        latest = max(numbered, key=lambda key: int(key[len(prefix):]))
        return history[latest]

    raise KeyError(f"Metric {metric_name} not found in history.")

def safe_f1(p, r):
    """Harmonic mean of precision ``p`` and recall ``r``; 0 when their sum is not positive."""
    return 2 * (p * r) / (p + r) if (p + r) > 0 else 0


# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./figures', exist_ok=True)

# One figure per trained model: loss, AUC, precision, recall and F1 per epoch,
# training vs. validation, written to ./figures as a PNG.
for model_entry in model_histories:
    name = model_entry["model_name"]
    history = model_entry["history"].history  # Keras history dict: metric name -> per-epoch values

    epoch_losses = get_metric(history, 'loss')
    epoch_val_losses = get_metric(history, 'val_loss')
    epoch_aucs = get_metric(history, 'auc')
    epoch_val_aucs = get_metric(history, 'val_auc')
    epoch_precisions = get_metric(history, 'precision')
    epoch_val_precisions = get_metric(history, 'val_precision')
    epoch_recalls = get_metric(history, 'recall')
    epoch_val_recalls = get_metric(history, 'val_recall')

    # F1 is not tracked by Keras here; derive it per epoch from precision and recall.
    epoch_f1s = [safe_f1(p, r) for p, r in zip(epoch_precisions, epoch_recalls)]
    epoch_val_f1s = [safe_f1(p, r) for p, r in zip(epoch_val_precisions, epoch_val_recalls)]

    # (title, y-label / legend name, training series, validation series,
    #  training line format, extra kwargs for the training line)
    panels = [
        ('Loss', 'Loss', epoch_losses, epoch_val_losses, 'b--o', {}),
        ('AUC', 'AUC', epoch_aucs, epoch_val_aucs, '--o', {'color': '#ff7f0e'}),
        ('Precision', 'Precision', epoch_precisions, epoch_val_precisions, 'g--o', {}),
        ('Recall', 'Recall', epoch_recalls, epoch_val_recalls, 'c--o', {}),
        ('F1 Score', 'F1', epoch_f1s, epoch_val_f1s, 'm--o', {}),
    ]

    fig = plt.figure(figsize=(18, 10))
    for position, (title, metric_label, train_series, val_series, train_fmt, train_kwargs) in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 3, position)
        ax.plot(train_series, train_fmt, label=f'Training {metric_label}', **train_kwargs)
        ax.plot(val_series, 'r--o', label=f'Validation {metric_label}')
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric_label)
        ax.legend()
        ax.grid(True)

    fig.suptitle(f"Training Progress - {name}", fontsize=16, y=1.02)
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    fig.savefig(f'./figures/training_{ARCH_NAME}_{name}_{VERSION}.png', dpi=500, bbox_inches='tight', pad_inches=0.5)
    # Close explicitly so figures do not accumulate in memory across models.
    plt.close(fig)
