# Hybrid Botnet Detection using GA and LightGBM


1. Import required packages
2. Load and prepare datasets
3. Preprocess data
   - Remove duplicates
   - Handle missing values
   - Handle infinite values
   - Drop single-value columns
4. Feature engineering and selection
   - Apply SMOTE for class balancing
   - Scale features
   - Use GA for feature selection
5. Train LightGBM model
   - K-fold cross validation
   - Performance evaluation
6. Save results and model

**Importing the needed packages**

In [16]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from deap import base, creator, tools, algorithms
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
import multiprocessing
from deap import base, creator, tools, algorithms


In [17]:
# 1. Load data
data_train = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Training.csv")
data_test = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Testing.csv")


In [18]:
# 2.Data Preprocessing
def preprocess(df):
    df = df.copy()
    df.drop_duplicates(inplace=True)
    numeric_cols = df.select_dtypes(include=np.number).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    non_numeric_cols = df.select_dtypes(exclude=np.number).columns
    for col in non_numeric_cols:
        df[col] = df[col].fillna(df[col].mode()[0])
    df.columns = df.columns.str.replace(' ', '')
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
 
    return df

data_train = preprocess(data_train)
data_test = preprocess(data_test)

In [19]:
# 3. Feature selection
columns_to_drop = [
    'subcategory', 'proto', 'saddr', 'sport', 'daddr', 'dport', 'attack',
    'starttime', 'ltime', 'sid', 'sessionid', 'tcprtt', 'synack', 'ackdat', 'state',
    'service', 'smac', 'dmac', 'trans_depth', 'response_body_len'
]
X_train = data_train.drop(columns=columns_to_drop, axis=1, errors='ignore')
y_train = data_train['category']
X_test = data_test.drop(columns=columns_to_drop, axis=1, errors='ignore')
y_test = data_test['category']

# Ensure all features are numeric for X_train and X_test
X_train = X_train.select_dtypes(include=[np.number])
X_test = X_test.select_dtypes(include=[np.number])

# Reset indices so that X_train and y_train are aligned
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Encode the categorical target
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))
print("Shapes after feature selection and index reset: ", X_train.shape, y_train.shape)

Label mapping: {'DDoS': np.int64(0), 'DoS': np.int64(1), 'Normal': np.int64(2), 'Reconnaissance': np.int64(3), 'Theft': np.int64(4)}
Shapes after feature selection and index reset:  (2934817, 11) (2934817,)


In [20]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("Shape after SMOTE:", X_train_resampled.shape, y_train_resampled.shape)

Shape after SMOTE: (7706575, 11) (7706575,)


In [21]:
# 5. Scale features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [22]:
# 6. Drop highly correlated features
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
corr_matrix = X_train_scaled_df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
X_train_scaled_df.drop(columns=to_drop, inplace=True)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_train.columns).drop(columns=to_drop, errors='ignore')
X_train_scaled = X_train_scaled_df.values
X_test_scaled = X_test_scaled_df.values
feature_names = X_train_scaled_df.columns


In [23]:
# LightGBM parameters for multiclass
lgb_params = {
    "objective": "multiclass",
    "num_class": len(le.classes_),
    "random_state": 42,
    "n_estimators": 100,
    "n_jobs": -1
}

In [None]:
# 7. Genetic Algorithm for Feature Selection
import multiprocessing
from deap import base, creator, tools, algorithms
import random

# --- Clean up previous DEAP creators (especially in Jupyter) ---
for name in ["FitnessMax", "Individual"]:
    if hasattr(creator, name):
        delattr(creator, name)

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
num_features = X_train_scaled.shape[1]
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# --- GA parameters ---
population_size = 40
n_generations = 15
crossover_prob = 0.7
mutation_prob = 0.15
mutation_indpb = 0.08

min_features = max(5, int(0.1 * num_features))
penalty_factor = 0.001

# --- Global cache to store fitness evaluations ---
fitness_cache = {}

def evaluate_individual(individual, X, y, min_features, n_splits=3):
    key = tuple(individual)
    if key in fitness_cache:
        return fitness_cache[key]
    selected = [i for i, bit in enumerate(individual) if bit]
    if len(selected) < min_features:
        fitness_cache[key] = (0.0,)
        return (0.0,)
    X_sel = X[:, selected]
    y_arr = np.array(y)
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in kf.split(X_sel):
        X_tr, X_val = X_sel[train_idx], X_sel[val_idx]
        y_tr, y_val = y_arr[train_idx], y_arr[val_idx]
        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(
            X_tr, y_tr,
            early_stopping_rounds=10,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        y_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, y_pred))
    mean_score = np.mean(scores)
    penalty = penalty_factor * (len(selected) / X.shape[1])
    fitness = mean_score - penalty
    fitness_cache[key] = (fitness,)
    return (fitness,)

toolbox.register("evaluate", evaluate_individual, X=X_train_scaled, y=y_train, min_features=min_features, n_splits=3)
toolbox.register("mate", tools.cxUniform, indpb=0.5)
toolbox.register("mutate", tools.mutFlipBit, indpb=mutation_indpb)
toolbox.register("select", tools.selTournament, tournsize=3)

# --- Parallelize evaluation with multiprocessing ---
with multiprocessing.Pool() as pool:
    toolbox.register("map", pool.map)
    population = toolbox.population(n=population_size)
    result, logbook = algorithms.eaSimple(
        population, toolbox,
        cxpb=crossover_prob,
        mutpb=mutation_prob,
        ngen=n_generations,
        verbose=True
    )

# --- Plot evolution: Fitness over generations ---
generations = logbook.select("gen")
max_fitness = logbook.select("max")
avg_fitness = logbook.select("avg")

plt.figure(figsize=(10, 5))
plt.plot(generations, max_fitness, label="Max Fitness")
plt.plot(generations, avg_fitness, label="Avg Fitness")
plt.xlabel("Generation")
plt.ylabel("Fitness")
plt.title("Fitness Evolution Over Generations")
plt.legend()
plt.show()

# --- Identify the best individual and the selected features ---
best_individual = tools.selBest(population, k=1)[0]
selected_feature_indices = [i for i, bit in enumerate(best_individual) if bit]
chosen_features = list(np.array(feature_names)[selected_feature_indices])

print(f"\nNumber of selected features: {len(chosen_features)}")
print("Selected Features:")
for i, feature in enumerate(chosen_features, 1):
    print(f"{i}. {feature}")


In [None]:
from sklearn.model_selection import StratifiedKFold

print("Training LightGBM with 5 stratified k-fold validation and early stopping...")

# Prepare data with selected features
X_selected = X_train_scaled[:, selected_feature_indices]
X_test_selected = X_test_scaled[:, selected_feature_indices]

fold_results = []
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_selected, y_train_resampled), 1):
    print(f"\nTraining fold {fold}/5...")

    X_train_fold = X_selected[train_idx]
    X_val_fold = X_selected[val_idx]
    y_train_fold = y_train_resampled.iloc[train_idx]
    y_val_fold = y_train_resampled.iloc[val_idx]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        early_stopping_rounds=20,
        verbose=False
    )

    y_pred = model.predict(X_val_fold)
    cm = confusion_matrix(y_val_fold, y_pred)
    metrics = {
        "Fold": fold,
        "Accuracy": accuracy_score(y_val_fold, y_pred),
        "Precision": precision_score(y_val_fold, y_pred, average='weighted'),
        "Recall": recall_score(y_val_fold, y_pred, average='weighted'),
        "F1": f1_score(y_val_fold, y_pred, average='weighted'),
        "Confusion_Matrix": cm
    }
    fold_results.append(metrics)

    print(f"Fold {fold} Results:")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1']:.4f}")
    print(f"Confusion Matrix:\n{cm}")

# Calculate average metrics
avg_metrics = {
    "Accuracy": np.mean([m["Accuracy"] for m in fold_results]),
    "Precision": np.mean([m["Precision"] for m in fold_results]),
    "Recall": np.mean([m["Recall"] for m in fold_results]),
    "F1": np.mean([m["F1"] for m in fold_results])
}

print("\nAverage Metrics Across All Folds:")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")

# Optional: Plot per-fold metrics
plt.figure(figsize=(8, 5))
plt.plot([m["Fold"] for m in fold_results], [m["Accuracy"] for m in fold_results], marker='o', label='Accuracy')
plt.plot([m["Fold"] for m in fold_results], [m["F1"] for m in fold_results], marker='o', label='F1 Score')
plt.xlabel("Fold")
plt.ylabel("Score")
plt.title("Per-Fold Metrics")
plt.legend()
plt.show()

# Evaluate on the held-out test set
final_model = lgb.LGBMClassifier(**lgb_params)
final_model.fit(X_selected, y_train_resampled)
y_pred_test = final_model.predict(X_test_selected)
test_cm = confusion_matrix(y_test, y_pred_test)
print("\nTest Set Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_test, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_test, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_test, average='weighted'):.4f}")
print(f"Confusion Matrix:\n{test_cm}")

Training LightGBM with 5 k-fold validation...

Training fold 1/5...
Fold 1 Results:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix:
[[    72      8]
 [     6 586878]]

Training fold 2/5...
Fold 2 Results:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix:
[[    63      8]
 [    10 586883]]

Training fold 3/5...
Fold 3 Results:
Accuracy: 0.9999
Precision: 0.9999
Recall: 0.9999
F1 Score: 0.9999
Confusion Matrix:
[[    48     30]
 [     9 586876]]

Training fold 4/5...
Fold 4 Results:
Accuracy: 1.0000
Precision: 0.9999
Recall: 1.0000
F1 Score: 0.9999
Confusion Matrix:
[[    49     18]
 [    11 586885]]

Training fold 5/5...
Fold 5 Results:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix:
[[    71      3]
 [    13 586876]]

Average Metrics Across All Folds:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1: 1.0000


In [None]:
# ...existing code...

import joblib

# Save the final LightGBM model to a file
joblib.dump(final_model, "final_lightgbm_model.joblib")
print("Final LightGBM model saved as 'final_lightgbm_model.joblib'.")

# To load the model later:
# loaded_model = joblib.load("final_lightgbm_model.joblib")
# y_pred_loaded = loaded_model.predict(X_test_selected)
# ...existing code...