In [None]:
import optuna
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE

In [4]:
# Load training and testing datasets
data_train = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Training.csv")
data_test = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Testing.csv")

# Define features (X) and target variable (y) for training and testing sets
X_train = data_train.drop(['category', 'subcategory', 'proto', 'saddr', 'sport', 'daddr', 'dport', 'attack'], axis=1)
y_train = data_train['attack']

X_test = data_test.drop(['category', 'subcategory', 'proto', 'saddr', 'sport', 'daddr', 'dport', 'attack'], axis=1)
y_test = data_test['attack']

print("Training and Testing Datasets Prepared!")

Training and Testing Datasets Prepared!


In [5]:
# Apply SMOTE to balance the class distribution in the training dataset
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Display the class distribution after resampling
print(f"Original y_train distribution:\n{y_train.value_counts()}")
print(f"Resampled y_train distribution:\n{y_train_resampled.value_counts()}")

Original y_train distribution:
attack
1    2934447
0        370
Name: count, dtype: int64
Resampled y_train distribution:
attack
1    2934447
0    2934447
Name: count, dtype: int64


In [6]:
def objective(trial):
    """
    Objective function for Optuna's Bayesian optimization.
    Args:
        trial: Optuna trial object for sampling hyperparameters.
    Returns:
        Accuracy score on the test dataset.
    """
    # Define hyperparameter search space
    param_grid = {
        'objective': 'binary',  # Binary classification
        'metric': 'binary_logloss',  # Loss metric
        'verbosity': -1,  # Suppress training logs
        'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
        'random_state': 42,  # Ensure reproducibility
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 50)
    }

    # Train LightGBM model using hyperparameters sampled by Optuna
    model = lgb.LGBMClassifier(**param_grid)
    model.fit(
        X_train_resampled,
        y_train_resampled,
        eval_set=[(X_test, y_test)],
        eval_metric='logloss',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]  # Early stopping
    )

    # Predict on the testing dataset
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [7]:
# Create Optuna study to maximize accuracy
study = optuna.create_study(direction='maximize', study_name='lightgbm_training_testing')
study.optimize(objective, n_trials=50, timeout=3600)  # Run for 50 trials or 1 hour

# Print the best trial results
print("\nBest trial:")
print(f"  Value: {study.best_trial.value:.4f}")
print("  Params:")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

[I 2025-04-28 13:32:07,646] A new study created in memory with name: lightgbm_training_testing
[I 2025-04-28 13:32:39,437] Trial 0 finished with value: 1.0 and parameters: {'learning_rate': 0.1418693712348676, 'num_leaves': 93, 'max_depth': 12, 'min_child_samples': 75, 'subsample': 0.7884650721308273, 'colsample_bytree': 0.8596671965546114, 'reg_alpha': 0.02109289810592904, 'reg_lambda': 1.6552239859588088e-06, 'n_estimators': 209, 'feature_fraction': 0.8271006742871947, 'bagging_fraction': 0.761809685719689, 'min_child_weight': 43}. Best is trial 0 with value: 1.0.
[I 2025-04-28 13:32:53,635] Trial 1 finished with value: 1.0 and parameters: {'learning_rate': 0.25727120449686935, 'num_leaves': 130, 'max_depth': 15, 'min_child_samples': 22, 'subsample': 0.6105637551311074, 'colsample_bytree': 0.9459719485734974, 'reg_alpha': 0.03986722290052597, 'reg_lambda': 2.563269805806106e-05, 'n_estimators': 409, 'feature_fraction': 0.5675166985342001, 'bagging_fraction': 0.9062709329646854, 'min_


Best trial:
  Value: 1.0000
  Params:
    learning_rate: 0.1418693712348676
    num_leaves: 93
    max_depth: 12
    min_child_samples: 75
    subsample: 0.7884650721308273
    colsample_bytree: 0.8596671965546114
    reg_alpha: 0.02109289810592904
    reg_lambda: 1.6552239859588088e-06
    n_estimators: 209
    feature_fraction: 0.8271006742871947
    bagging_fraction: 0.761809685719689
    min_child_weight: 43


In [8]:
# Save the best hyperparameters and accuracy score to a file
output_file = "models_and_data/best_hyperparameters_lightgbm_training_testing.txt"
with open(output_file, "w") as f:
    f.write("Best Hyperparameters:\n")
    for key, value in study.best_trial.params.items():
        f.write(f"{key}: {value}\n")
    f.write(f"\nBest Accuracy: {study.best_trial.value:.4f}\n")

print(f"\nResults saved to {output_file}")


Results saved to models_and_data/best_hyperparameters_lightgbm_training_testing.txt
