# Load packages and import modules

In [1]:
# -*- coding: utf-8 -*-
import sys
import os

# Make the parent of the notebook's working directory importable so the
# project packages imported in later cells can be found.
# NOTE(review): presumably this is the project's src folder (home of `core`
# and `models`) — confirm against the repository layout.
# The membership check keeps the cell idempotent: re-running it must not keep
# appending duplicate entries to sys.path.
_project_root = os.path.abspath('../')
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Load input datasets

In [3]:
from core.loader import Loader

# --- Input datasets ----------------------------------------------------------
# Parquet files, given relative to the notebook's working directory.
benign_dataset_filenames = [
    '../parkets/benign/benign_2312_anonymized_HTML.parquet',
    '../parkets/benign/umbrella_benign_FINISHED_HTML.parquet',
]
malicious_dataset_filenames = [
    '../parkets/malware_2406_strict_HTML.parquet',
]

# --- Configuration -----------------------------------------------------------
benign_label = "benign"
malicious_label = "malware"

# Integer encoding of the string labels; later cells map the `label` column
# through this dict to build the training target.
class_map = {benign_label: 0, malicious_label: 1}

# Value forwarded to Loader's `subsample` argument.
# NOTE(review): presumably the fraction of rows kept from the input files —
# confirm in core.loader before changing it.
SUBSAMPLE = 0.08

loader = Loader(
    benign_dataset_filenames,
    malicious_dataset_filenames,
    benign_label=benign_label,
    malicious_label=malicious_label,
    subsample=SUBSAMPLE,
)
df = loader.load()

# Split data into stages

In [4]:
from core.loader import Segmenter

# Define the aggregates that need to be created: one list of feature prefixes
# per stage. Each later list is a superset of the previous one
# (lex_ only -> + dns_/ip_/geo_ -> + tls_/rdap_).
# NOTE(review): presumably each prefix selects the columns whose names start
# with it — confirm in core.loader.Segmenter.

aggregates = [
    ["lex_"],
    ["lex_", "dns_", "ip_", "geo_"],
    ["lex_", "dns_", "ip_", "tls_", "geo_", "rdap_"],
]

# NOTE(review): the recorded output of this cell shows pandas
# SettingWithCopyWarning raised on `subset_df["label"] = self.df["label"].copy()`,
# apparently inside Segmenter; that has to be fixed in the project module
# (e.g. by assigning on an explicit copy), not in this notebook.
segmenter = Segmenter(df)
segmenter.create_base_subsets() # create base subsets
segmenter.create_aggregated_subsets(aggregates)
# Mapping of aggregate name -> DataFrame; the training cells iterate over its
# items and expect each frame to carry a 'label' column.
subset_dfs = segmenter.get_aggregated_subsets()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df["label"] = self.df["label"].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df["label"] = self.df["label"].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df["label"] = self.df["label"].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try

# For each subset/stage, train one XGBoost model

In [None]:
# Train one XGBoost classifier per feature subset (stage): split off a 10%
# test set, fit, evaluate, hand the model to ModelWrapper.save and write a
# SHAP summary plot to disk.
from xgboost import XGBClassifier
from models.model_wrapper import ModelWrapper
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
# Imported once per cell instead of on every loop iteration.
import shap
import matplotlib.pyplot as plt

ARCH_NAME = "XgBoost"
VERSION = "v1.2"

# Initialize ModelWrapper and model histories
model_wrapper = ModelWrapper(model_dir="models")
xgb_models = {}        # subset name -> fitted XGBClassifier
xgb_performance = {}   # subset name -> {'accuracy', 'classification_report'}

# Parameters already tuned on the full dataset
params = {
    "objective": "binary:logistic",
    # NOTE(review): "gpu_hist" needs a CUDA-enabled XGBoost build and is
    # deprecated since XGBoost 2.0 in favour of tree_method="hist" plus
    # device="cuda" — confirm the installed version before changing it.
    "tree_method": "gpu_hist",
    "sampling_method": "gradient_based",
    "max_depth": 12,
    "eta": 0.09787878787878787,
    "min_child_weight": 1,
    "subsample": 0.595959595959596,
    "alpha": 0,
    "gamma": 0.06060606060606061,
    "lambda": 2.0707070707070705,
    "max_delta_step": 0,
    "grow_policy": "depthwise",
    "max_bin": 512,
    "n_estimators": 600,
    #"scale_pos_weight": scale_pos_weight,
    "random_state": 42  # Set the seed for each run
}

for prefix, subset_df in subset_dfs.items():
    print(f"\n🚀 Training XGBoost on '{prefix}' features...")

    X = subset_df.drop('label', axis=1)
    y = subset_df['label'].map(class_map)

    # Series.map turns any label missing from class_map into NaN; fail loudly
    # here instead of passing NaN targets to the classifier.
    if y.isna().any():
        raise ValueError(f"Subset '{prefix}' contains labels that are not in class_map")

    # Same split parameters as the LightGBM cell so both models are scored on
    # the same rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42
    )

    xgb_model = XGBClassifier(**params)

    # Fit the model
    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    model_wrapper.save(xgb_model, arch_name=ARCH_NAME, label=malicious_label, prefix=f"{prefix}", version=VERSION)

    xgb_models[prefix] = xgb_model

    xgb_performance[prefix] = {
        'accuracy': acc,
        'classification_report': report,
    }

    print(f"✅ Accuracy for '{prefix}': {acc:.2f}")
    print(report)

    # SHAP analysis
    explainer = shap.Explainer(xgb_model, X_train)
    shap_values = explainer(X_test)
    # show=False: by default summary_plot calls plt.show(), after which the
    # figure is no longer current and savefig would write a blank image.
    shap.summary_plot(shap_values, X_test, feature_names=X.columns, show=False)
    plt.savefig(
        f"shap_summary_{ARCH_NAME}_{malicious_label}_{prefix}.png",
        bbox_inches="tight",  # keep long feature names from being clipped
    )
    plt.show()
    # Release the figure so figures do not pile up across loop iterations.
    plt.close()


🚀 Training XGBoost on 'lex_agg' features...
Saving as stage: stage_1
✅ Accuracy for 'lex_agg': 0.97
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     81671
           1       0.96      0.83      0.89     16404

    accuracy                           0.97     98075
   macro avg       0.96      0.91      0.93     98075
weighted avg       0.96      0.97      0.96     98075


🚀 Training XGBoost on 'lex_+dns_+ip_+geo_agg' features...
Saving as stage: stage_2
✅ Accuracy for 'lex_+dns_+ip_+geo_agg': 0.99
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     81671
           1       0.98      0.95      0.96     16404

    accuracy                           0.99     98075
   macro avg       0.98      0.97      0.98     98075
weighted avg       0.99      0.99      0.99     98075


🚀 Training XGBoost on 'lex_+dns_+ip_+tls_+geo_+rdap_agg' features...
Saving as stage: stage_3
âœ… Accu

In [None]:
# Train one LightGBM classifier per feature subset (stage): split off a 10%
# test set, fit, hand the model to ModelWrapper.save, evaluate and write a
# SHAP summary plot to disk.
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from models.model_wrapper import ModelWrapper
# Imported once per cell instead of on every loop iteration.
import shap
import matplotlib.pyplot as plt

ARCH_NAME = "Lgbm"
VERSION = "v1.2"


# Initialize ModelWrapper and model histories
model_wrapper = ModelWrapper(model_dir="models")
lgb_models = {}        # subset name -> fitted LGBMClassifier
lgb_performance = {}   # subset name -> {'accuracy', 'classification_report'}


# Iterate through each subset, train LightGBM, and evaluate it
for prefix, subset_df in subset_dfs.items():
    print(f"\n🚀 Training LightGBM on '{prefix}' features...")

    # Prepare data
    X = subset_df.drop('label', axis=1)
    y = subset_df['label']

    # map benign to 0 and malicious to 1
    y = y.map(class_map)
    # Series.map turns any label missing from class_map into NaN; fail loudly
    # here instead of passing NaN targets to the classifier.
    if y.isna().any():
        raise ValueError(f"Subset '{prefix}' contains labels that are not in class_map")

    # Same split parameters as the XGBoost cell so both models are scored on
    # the same rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Initialize and train LightGBM with fixed hyperparameters (no search is
    # performed in this cell).
    lgb_model = LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        n_estimators=250,
        # NOTE(review): in LightGBM, `subsample` (bagging_fraction) only takes
        # effect when `subsample_freq` > 0; it is left at its default here, so
        # this value is likely ignored — set `subsample_freq` if bagging is
        # actually intended.
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1
    )

    lgb_model.fit(X_train, y_train)

    # save the model
    model_wrapper.save(lgb_model, arch_name=ARCH_NAME, label=malicious_label, prefix=f"{prefix}", version=VERSION)

    # Store the model
    lgb_models[prefix] = lgb_model

    # Predict and evaluate the model
    y_pred = lgb_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Store performance
    lgb_performance[prefix] = {
        'accuracy': acc,
        'classification_report': report
    }

    # Print performance
    print(f"✅ LightGBM model accuracy for '{prefix}': {acc:.2f}")
    print(report)

    # SHAP analysis
    explainer = shap.Explainer(lgb_model, X_train)
    shap_values = explainer(X_test)
    # show=False: by default summary_plot calls plt.show(), after which the
    # figure is no longer current and savefig would write a blank image.
    shap.summary_plot(shap_values, X_test, feature_names=X.columns, show=False)
    plt.savefig(
        f"shap_summary_{ARCH_NAME}_{malicious_label}_{prefix}.png",
        bbox_inches="tight",  # keep long feature names from being clipped
    )
    plt.show()
    # Release the figure so figures do not pile up across loop iterations.
    plt.close()


🚀 Training LightGBM on 'lex_agg' features...
[LightGBM] [Info] Number of positive: 148021, number of negative: 734653
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.327186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4012
[LightGBM] [Info] Number of data points in the train set: 882674, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.167696 -> initscore=-1.602044
[LightGBM] [Info] Start training from score -1.602044
Saving as stage: stage_1
✅ LightGBM model accuracy for 'lex_agg': 0.95
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     81671
           1       0.94      0.76      0.84     16404

    accuracy                           0.95     98075
   macro avg       0.95      0.88      0.91     98075
weighted avg       0.95      0.95      0.95     98075


ðŸš€ Training LightGBM on 'lex_+dns_+ip_+geo_agg' fea