# Hybrid Botnet Detection using GA and LightGBM


1. Import required packages
2. Load and prepare datasets
3. Preprocess data
   - Remove duplicates
   - Handle missing values
   - Handle infinite values
   - Drop single-value columns
4. Feature engineering and selection
   - Apply SMOTE for class balancing
   - Scale features
   - Use GA for feature selection
5. Train LightGBM model
   - K-fold cross validation
   - Performance evaluation
6. Save results and model

**Importing the needed packages**

In [114]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from deap import base, creator, tools, algorithms
import random
from joblib import dump
import os
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [115]:
data_test = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Testing.csv")
data_train = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Training.csv")

# Concatenate datasets
df = pd.concat([data_train, data_test], axis=0, ignore_index=True)
print("Dataset shapes:")
print(f"Training data: {data_train.shape}")
print(f"Testing data: {data_test.shape}")
print(f"Combined data: {df.shape}")

Dataset shapes:
Training data: (2934817, 19)
Testing data: (733705, 19)
Combined data: (3668522, 19)


In [116]:
#Data Preprocessing

# 1. Remove duplicate rows
df.drop_duplicates(inplace=True)
print("Duplicates removal: Done")

# 2. Handle missing values
numeric_cols = df.select_dtypes(include=np.number).columns
missing_values_before = df[numeric_cols].isnull().sum().sum()
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Handle non-numeric columns
non_numeric_cols = df.select_dtypes(exclude=np.number).columns
for col in non_numeric_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

missing_values_after = df[numeric_cols].isnull().sum().sum()
print(f"Missing values removed: {missing_values_before - missing_values_after}")

# 3. Clean column names
df.columns = df.columns.str.replace(' ', '')
print("Column names cleaned")

# 4. Handle infinite values
infinite_values_before = df.isin([np.inf, -np.inf]).sum().sum()
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

infinite_values_after = df.isin([np.inf, -np.inf]).sum().sum()
print(f"Infinite values handled: {infinite_values_before - infinite_values_after}")

# 5. Drop single-value columns
cols_to_drop = [col for col in df.columns if df[col].nunique() == 1]
df.drop(cols_to_drop, axis=1, inplace=True)
print(f"Dropped {len(cols_to_drop)} single-value columns")

Duplicates removal: Done
Missing values removed: 0
Column names cleaned
Infinite values handled: 0
Dropped 0 single-value columns


In [117]:
#SMOTE and Feature Preparation
# Remove unnecessary columns
# Modified feature preparation code
columns_to_drop = [
    # these are non numeric or categorical columns that are not useful for the model
    'category', 'subcategory', 'proto', 'saddr', 'sport', 'daddr', 'dport', 'attack',
    
    # Time-related columns (often not relevant for pattern detection)
    'starttime', 'ltime',
    
    # Session identifiers (unique per connection)
    'sid', 'sessionid',
    
    # Redundant or derived features
    'tcprtt', 'synack', 'ackdat',  # Often correlated with other timing features
    'state',  # Protocol state often captured by other features
    
    # Location or routing specific
    'service', 'smac', 'dmac',  # MAC addresses aren't relevant for botnet detection
    
    # Highly sparse or constant features
    'trans_depth', 'response_body_len'  # Often sparse in botnet traffic
]

X = df.drop(columns=columns_to_drop, axis=1, errors='ignore')
y = df['attack']

# Split data before SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_with_label = X_train.copy()
train_with_label['attack'] = y_train
test_with_label = X_test.copy()
test_with_label['attack'] = y_test

# Check for duplicate rows between train and test
duplicates = pd.merge(train_with_label, test_with_label, how='inner')
print(f"Number of duplicate rows between train and test: {len(duplicates)}")
if len(duplicates) == 0:
    print("No overlap between train and test sets.")
else:
    print("Warning: There is overlap between train and test sets!")

# ...after train_test_split, before SMOTE...

# Check correlation of each feature with the target
correlations = X_train.corrwith(y_train)
print("Feature correlations with target (attack):")
print(correlations.sort_values(ascending=False))

# Optionally, flag features with high correlation (e.g., > 0.95)
high_corr = correlations[correlations.abs() > 0.95]
if not high_corr.empty:
    print("Warning: The following features are highly correlated with the target and may cause leakage:")
    print(high_corr)
else:
    print("No features are highly correlated with the target.")
# ...existing code...

# Apply SMOTE to training data only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Show class distribution after SMOTE
print("Class distribution in 'attack' after SMOTE:")
print(y_train_resampled.value_counts())
print("Number of unique classes in 'attack' after SMOTE:", y_train_resampled.nunique())

# Resample 10% of the SMOTE data
from sklearn.model_selection import train_test_split
X_train_resampled_10, _, y_train_resampled_10, _ = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.7, random_state=42, stratify=y_train_resampled
)
print(f"Shape after 10% resample: {X_train_resampled_10.shape}, {y_train_resampled_10.shape}")
# Scale features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled_10)
X_test_scaled = scaler.transform(X_test)

# Use the 10% resampled labels
y_train_resampled = y_train_resampled_10

print("Data preparation completed:")
print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")

Number of duplicate rows between train and test: 0
No overlap between train and test sets.
Feature correlations with target (attack):
N_IN_Conn_P_DstIP    0.051966
N_IN_Conn_P_SrcIP    0.028058
seq                  0.016681
max                  0.015012
mean                 0.012895
stddev               0.011811
min                  0.005380
state_number         0.002169
drate               -0.001579
pkSeqID             -0.018452
srate               -0.091030
dtype: float64
No features are highly correlated with the target.
Class distribution in 'attack' after SMOTE:
attack
1    2934448
0    2934448
Name: count, dtype: int64
Number of unique classes in 'attack' after SMOTE: 2
Shape after 10% resample: (1760668, 11), (1760668,)
Data preparation completed:
Training set shape: (1760668, 11)
Test set shape: (733705, 11)


In [118]:
# Convert scaled data back to DataFrame for correlation analysis
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)

# Calculate Pearson correlation matrix
corr_matrix = X_train_scaled_df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print(f"Features to drop due to high correlation: {to_drop}")

# Drop highly correlated features
X_train_scaled_df.drop(columns=to_drop, inplace=True)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns).drop(columns=to_drop, errors='ignore')

# Update numpy arrays for GA and model training
X_train_scaled = X_train_scaled_df.values
X_test_scaled = X_test_scaled_df.values

# Update feature names for GA and later use
feature_names = X_train_scaled_df.columns

# List selected columns
selected_columns = list(feature_names)
print(f"Selected columns after dropping highly correlated ones: {selected_columns}")

Features to drop due to high correlation: ['max']
Selected columns after dropping highly correlated ones: ['pkSeqID', 'seq', 'stddev', 'N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP', 'drate', 'srate']


In [119]:
#Genetic Algorithm Feature Selection
# Clear existing DEAP creators
if 'FitnessMax' in creator.__dict__:
    del creator.FitnessMax
if 'Individual' in creator.__dict__:
    del creator.Individual

# Initialize GA components
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Setup toolbox
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, 
                 toolbox.attr_bool, n=X_train_scaled.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)


# Define fitness function
def evaluate_individual(individual, X, y, k=3, min_features=5):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) < min_features:
        return 0.0,
    
    X_selected = X[:, selected_features]
    y_array = np.array(y)  # <-- Add this line
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in kf.split(X_selected):
        X_train_fold = X_selected[train_idx]
        X_val_fold = X_selected[val_idx]
        y_train_fold = y_array[train_idx]   # <-- Use y_array
        y_val_fold = y_array[val_idx]       # <-- Use y_array
        
        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)
        scores.append(accuracy_score(y_val_fold, y_pred))
    
    feature_penalty = 0.0001 * len(selected_features)/X.shape[1]
    return np.mean(scores) - feature_penalty,
# ...existing code...

# Register GA operators
toolbox.register("evaluate", evaluate_individual, X=X_train_scaled, y=y_train_resampled)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

print("GA components initialized")

GA components initialized


In [120]:
# Initialize GA components
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Setup toolbox
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, 
                 toolbox.attr_bool, n=X_train_scaled.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Define LightGBM parameters
lgb_params = {
    'learning_rate': 0.01990086265614642,
    'num_leaves': 65,
    'max_depth': 7,
    'min_child_samples': 40,
    'subsample': 0.6971404971190901,
    'colsample_bytree': 0.8912957047621014,
    'reg_alpha': 1.1077475757773147e-05,
    'reg_lambda': 7.505137036788418,
    'n_estimators': 386,
    'feature_fraction': 0.6749863286377266,
    'bagging_fraction': 0.8742278952008886,
    'min_child_weight': 13,
    'random_state': 42,
    'verbosity': -1
    }

# Define fitness function
def evaluate_individual(individual, X, y, k=3):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:
        return 0.0,
    
    X_selected = X[:, selected_features]
    y_array = np.array(y)  # Always convert y to numpy array
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in kf.split(X_selected):
        X_train_fold = X_selected[train_idx]
        X_val_fold = X_selected[val_idx]
        y_train_fold = y_array[train_idx]
        y_val_fold = y_array[val_idx]
        
        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)
        scores.append(accuracy_score(y_val_fold, y_pred))
    
    feature_penalty = 0.001 * len(selected_features)/X.shape[1]
    return np.mean(scores) - feature_penalty,

# Register GA operators
toolbox.register("evaluate", evaluate_individual, X=X_train_scaled, y=y_train_resampled)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

print("GA components initialized")

GA components initialized




In [121]:
#Run GA and Store Selected Features
print("Running Genetic Algorithm for feature selection...")

# Set GA parameters
population_size = 20
n_generations = 10
crossover_prob = 0.8
mutation_prob = 0.1

# Create initial population
population = toolbox.population(n=population_size)

# Run GA
result, logbook = algorithms.eaSimple(population, toolbox,
                                      cxpb=crossover_prob,
                                      mutpb=mutation_prob,
                                      ngen=n_generations,
                                      verbose=True)

# Get best solution
best_individual = tools.selBest(population, k=1)[0]
selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]

# Get feature names
feature_names = X.columns
chosen_features = list(feature_names[selected_features])

print(f"\nNumber of selected features: {len(chosen_features)}")
print("\nSelected Features:")
for i, feature in enumerate(chosen_features, 1):
    print(f"{i}. {feature}")

# Save selected features
feature_file = "models_and_data/selected_features.txt"
with open(feature_file, "w") as file:
    file.write("Selected Features from Genetic Algorithm\n")
    file.write(f"\nTotal Features Selected: {len(chosen_features)} out of {len(feature_names)}\n")
    file.write("\nFeature List:\n")
    for i, feature in enumerate(chosen_features, 1):
        file.write(f"{i}. {feature}\n")

print(f"\nSelected features saved to {feature_file}")

Running Genetic Algorithm for feature selection...
gen	nevals
0  	20    
1  	16    
2  	16    
3  	20    
4  	17    
5  	18    
6  	16    
7  	16    
8  	16    
9  	18    
10 	18    

Number of selected features: 2

Selected Features:
1. pkSeqID
2. drate

Selected features saved to models_and_data/selected_features.txt


In [127]:
#LightGBM Training and Evaluation
print("Training LightGBM with 5 k-fold validation...")

# Prepare data with selected features
X_selected = X_train_scaled[:, selected_features]
X_test_selected = X_test_scaled[:, selected_features]

# Initialize metrics storage
fold_results = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ...existing code...
for fold, (train_idx, val_idx) in enumerate(kf.split(X_selected), 1):
    print(f"\nTraining fold {fold}/5...")

    # Split data
    X_train_fold = X_selected[train_idx]
    X_val_fold = X_selected[val_idx]
    y_train_fold = y_train_resampled.iloc[train_idx]  # <-- Use .iloc here
    y_val_fold = y_train_resampled.iloc[val_idx]      # <-- And here

    # Train model
    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(X_train_fold, y_train_fold)

    # Evaluate
    y_pred = model.predict(X_val_fold)
    metrics = {
        "Fold": fold,
        "Accuracy": accuracy_score(y_val_fold, y_pred),
        "Precision": precision_score(y_val_fold, y_pred, average='weighted'),
        "Recall": recall_score(y_val_fold, y_pred, average='weighted'),
        "F1": f1_score(y_val_fold, y_pred, average='weighted'),
        "Confusion_Matrix": confusion_matrix(y_val_fold, y_pred)
    }
    fold_results.append(metrics)

    print(f"Fold {fold} Results:")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1']:.4f}")
# ...existing code...

# Calculate average metrics
avg_metrics = {
    "Accuracy": np.mean([m["Accuracy"] for m in fold_results]),
    "Precision": np.mean([m["Precision"] for m in fold_results]),
    "Recall": np.mean([m["Recall"] for m in fold_results]),
    "F1": np.mean([m["F1"] for m in fold_results])
}

print("\nAverage Metrics Across All Folds:")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")

Training LightGBM with 5 k-fold validation...

Training fold 1/5...
Fold 1 Results:
Accuracy: 0.9990
Precision: 0.9990
Recall: 0.9990
F1 Score: 0.9990

Training fold 2/5...
Fold 2 Results:
Accuracy: 0.9989
Precision: 0.9989
Recall: 0.9989
F1 Score: 0.9989

Training fold 3/5...
Fold 3 Results:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Training fold 4/5...
Fold 4 Results:
Accuracy: 0.9990
Precision: 0.9990
Recall: 0.9990
F1 Score: 0.9990

Training fold 5/5...
Fold 5 Results:
Accuracy: 0.9990
Precision: 0.9990
Recall: 0.9990
F1 Score: 0.9990

Average Metrics Across All Folds:
Accuracy: 0.9992
Precision: 0.9992
Recall: 0.9992
F1: 0.9992


In [123]:
# Save Results and Model
# Train final model on all training data
final_model = lgb.LGBMClassifier(**lgb_params)
final_model.fit(X_selected, y_train_resampled)

# Evaluate on test set
y_pred_test = final_model.predict(X_test_selected)
test_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_test),
    "Precision": precision_score(y_test, y_pred_test, average='weighted'),
    "Recall": recall_score(y_test, y_pred_test, average='weighted'),
    "F1": f1_score(y_test, y_pred_test, average='weighted'),
    "Confusion_Matrix": confusion_matrix(y_test, y_pred_test)
}

# Save results
results_file = "models_and_data/final_results.txt"
with open(results_file, "w") as file:
    file.write("GA-LightGBM Results\n\n")
    
    file.write("Selected Features:\n")
    for i, feature in enumerate(chosen_features, 1):
        file.write(f"{i}. {feature}\n")
    
    file.write("\nCross-Validation Results:\n")
    for metric, value in avg_metrics.items():
        file.write(f"Average {metric}: {value:.4f}\n")
    
    file.write("\nTest Set Results:\n")
    for metric, value in test_metrics.items():
        if metric != "Confusion_Matrix":
            file.write(f"{metric}: {value:.4f}\n")
    file.write(f"\nConfusion Matrix:\n{test_metrics['Confusion_Matrix']}\n")

# Save model
model_file = "models_and_data/final_model.joblib"
dump(final_model, model_file)

print(f"\nResults saved to {results_file}")
print(f"Model saved to {model_file}")


Results saved to models_and_data/final_results.txt
Model saved to models_and_data/final_model.joblib


In [None]:
output_file = "models_and_data/performance_metrics.txt"
with open(output_file, "w") as file:
    file.write("Performance Metrics:\n")
    file.write(f"Average Accuracy: {avg_metrics['Accuracy']}\n")
    file.write(f"Average Precision: {avg_metrics['Precision']}\n")
    file.write(f"Average Recall: {avg_metrics['Recall']}\n")
    file.write(f"Average F1 Score: {avg_metrics['F1']}\n")
    file.write("\nResults for Each Fold:\n")
    for result in fold_results:
        file.write(f"Fold {result['Fold']}:\n")
        file.write(f"  Accuracy: {result['Accuracy']}\n")
        file.write(f"  Precision: {result['Precision']}\n")
        file.write(f"  Recall: {result['Recall']}\n")
        file.write(f"  F1 Score: {result['F1']}\n")
        file.write(f"  Confusion Matrix:\n{result['Confusion_Matrix']}\n\n")

print(f"Performance metrics saved to {output_file}")

Performance metrics saved to models_and_data/performance_metrics.txt


In [126]:
from joblib import dump

# Save the trained model
model_file = "models_and_data/final_model.joblib"
dump(final_model, model_file)

print(f"Model saved to {model_file}")

Model saved to models_and_data/final_model.joblib
