# Hybrid Botnet Detection using GA and LightGBM


1. Import required packages
2. Load and prepare datasets
3. Preprocess data
   - Remove duplicates
   - Handle missing values
   - Handle infinite values
   - Drop single-value columns
4. Feature engineering and selection
   - Apply SMOTE for class balancing
   - Scale features
   - Use GA for feature selection
5. Train LightGBM model
   - K-fold cross validation
   - Performance evaluation
6. Save results and model

**Importing the needed packages**

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report
)
from imblearn.over_sampling import SMOTE
from deap import base, creator, tools, algorithms
import random
from joblib import dump
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load datasets
data_test = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Testing.csv")
data_train = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Training.csv")

# Concatenate datasets
df = pd.concat([data_train, data_test], axis=0, ignore_index=True)

# Convert attack labels to multiclass
label_encoder = LabelEncoder()
print("Original attack distribution:")
print(df['attack'].value_counts())
df['attack_encoded'] = label_encoder.fit_transform(df['attack'])
print("\nEncoded attack distribution:")
print(df['attack_encoded'].value_counts())
print("\nAttack classes:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{i}: {label}")

print("\nDataset shapes:")
print(f"Training data: {data_train.shape}")
print(f"Testing data: {data_test.shape}")
print(f"Combined data: {df.shape}")

Original attack distribution:
attack
1    3668045
0        477
Name: count, dtype: int64

Encoded attack distribution:
attack_encoded
1    3668045
0        477
Name: count, dtype: int64

Attack classes:
0: 0
1: 1

Dataset shapes:
Training data: (2934817, 19)
Testing data: (733705, 19)
Combined data: (3668522, 20)


In [3]:
#Data Preprocessing

# 1. Remove duplicate rows
df.drop_duplicates(inplace=True)
print("Duplicates removal: Done")

# 2. Handle missing values by attack type
numeric_cols = df.select_dtypes(include=np.number).columns
missing_values_before = df[numeric_cols].isnull().sum().sum()

# Handle missing values per attack type for better representation
for attack_type in df['attack'].unique():
    mask = df['attack'] == attack_type
    df.loc[mask, numeric_cols] = df.loc[mask, numeric_cols].fillna(
        df.loc[mask, numeric_cols].mean()
    )

# Handle non-numeric columns
non_numeric_cols = df.select_dtypes(exclude=np.number).columns
for col in non_numeric_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

missing_values_after = df[numeric_cols].isnull().sum().sum()
print(f"Missing values removed: {missing_values_before - missing_values_after}")

# 3. Clean column names
df.columns = df.columns.str.replace(' ', '')
print("Column names cleaned")

# 4. Handle infinite values by attack type
infinite_values_before = df.isin([np.inf, -np.inf]).sum().sum()
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Handle infinites per attack type
for attack_type in df['attack'].unique():
    mask = df['attack'] == attack_type
    df.loc[mask, numeric_cols] = df.loc[mask, numeric_cols].fillna(
        df.loc[mask, numeric_cols].mean()
    )

infinite_values_after = df.isin([np.inf, -np.inf]).sum().sum()
print(f"Infinite values handled: {infinite_values_before - infinite_values_after}")

# 5. Drop single-value columns
cols_to_drop = [col for col in df.columns if df[col].nunique() == 1]
df.drop(cols_to_drop, axis=1, inplace=True)
print(f"Dropped {len(cols_to_drop)} single-value columns")

# 6. Print statistics per attack type
print("\nFeature statistics by attack type:")
for attack_type in df['attack'].unique():
    mask = df['attack'] == attack_type
    print(f"\nAttack type: {attack_type}")
    print(f"Number of samples: {mask.sum()}")
    print("Numeric feature statistics:")
    print(df.loc[mask, numeric_cols].describe().round(2))

Duplicates removal: Done
Missing values removed: 0
Column names cleaned
Infinite values handled: 0
Dropped 0 single-value columns

Feature statistics by attack type:

Attack type: 1
Number of samples: 3668045
Numeric feature statistics:
          pkSeqID         seq      stddev  N_IN_Conn_P_SrcIP         min  \
count  3668045.00  3668045.00  3668045.00         3668045.00  3668045.00   
mean   1834034.85   121335.04        0.89              82.55        1.02   
std    1058893.55    75788.29        0.80              24.39        1.48   
min          1.00        1.00        0.00               1.00        0.00   
25%     917012.00    54900.00        0.03              69.00        0.00   
50%    1834023.00   117786.00        0.79             100.00        0.00   
75%    2751034.00   184940.00        1.75             100.00        2.15   
max    3668522.00   262212.00        2.50             100.00        4.98   

       state_number        mean  N_IN_Conn_P_DstIP       drate       srate  \


In [4]:
#SMOTE and Feature Preparation
# Remove unnecessary columns
# Modified feature preparation code
columns_to_drop = [
    # these are non numeric or categorical columns that are not useful for the model
    'category', 'subcategory', 'proto', 'saddr', 'sport', 'daddr', 'dport', 'attack', 'attack_encoded',
    
    # Time-related columns (often not relevant for pattern detection)
    'starttime', 'ltime',
    
    # Session identifiers (unique per connection)
    'sid', 'sessionid',
    
    # Redundant or derived features
    'tcprtt', 'synack', 'ackdat',  # Often correlated with other timing features
    'state',  # Protocol state often captured by other features
    
    # Location or routing specific
    'service', 'smac', 'dmac',  # MAC addresses aren't relevant for botnet detection
    
    # Highly sparse or constant features
    'trans_depth', 'response_body_len'  # Often sparse in botnet traffic
]

X = df.drop(columns=columns_to_drop, axis=1, errors='ignore')
y = df['attack_encoded']  # Using encoded labels for multiclass

# Split data before SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print class distribution before SMOTE
print("Class distribution before SMOTE:")
print(y_train.value_counts(normalize=True))

# Apply SMOTE to training data only
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print class distribution after SMOTE
print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts(normalize=True))

# Scale features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Print shapes and feature names
print("\nData preparation completed:")
print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")
print(f"Number of features: {X_train_scaled.shape[1]}")
print("\nFeature names:")
for i, feature in enumerate(X.columns):
    print(f"{i + 1}. {feature}")

Class distribution before SMOTE:
attack_encoded
1    0.99987
0    0.00013
Name: proportion, dtype: float64

Class distribution after SMOTE:
attack_encoded
1    0.5
0    0.5
Name: proportion, dtype: float64

Data preparation completed:
Training set shape: (5868870, 11)
Test set shape: (733705, 11)
Number of features: 11

Feature names:
1. pkSeqID
2. seq
3. stddev
4. N_IN_Conn_P_SrcIP
5. min
6. state_number
7. mean
8. N_IN_Conn_P_DstIP
9. drate
10. srate
11. max


In [5]:
#Genetic Algorithm Feature Selection
# Clear existing DEAP creators
if 'FitnessMax' in creator.__dict__:
    del creator.FitnessMax
if 'Individual' in creator.__dict__:
    del creator.Individual

# Initialize GA components
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Setup toolbox
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, 
                 toolbox.attr_bool, n=X_train_scaled.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Define LightGBM parameters for multiclass
lgb_params = {
    'objective': 'multiclass',
    'num_class': len(label_encoder.classes_),
    'learning_rate': 0.05,
    'num_leaves': 50,
    'max_depth': 10,
    'min_child_samples': 40,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'n_estimators': 200,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
    'metric': 'multi_logloss'
}

# Define multiclass fitness function
def evaluate_individual(individual, X, y, k=5):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:
        return 0.0,
    
    X_selected = X[:, selected_features]
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in kf.split(X_selected):
        X_train_fold = X_selected[train_idx]
        X_val_fold = X_selected[val_idx]
        y_train_fold = y[train_idx]
        y_val_fold = y[val_idx]
        
        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)
        
        # Calculate weighted metrics for multiclass
        accuracy = accuracy_score(y_val_fold, y_pred)
        f1 = f1_score(y_val_fold, y_pred, average='weighted')
        score = (accuracy + f1) / 2
        scores.append(score)
    
    # Return mean CV score with feature penalty
    feature_penalty = 0.001 * len(selected_features)/X.shape[1]
    return np.mean(scores) - feature_penalty,

# Register GA operators
toolbox.register("evaluate", evaluate_individual, X=X_train_scaled, y=y_train_resampled)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

print("GA components initialized with multiclass support")

GA components initialized with multiclass support


In [None]:
# Initialize GA components
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Setup toolbox
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, 
                 toolbox.attr_bool, n=X_train_scaled.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Define LightGBM parameters for multiclass classification
lgb_params = {
    'objective': 'multiclass',
    'num_class': len(label_encoder.classes_),  # Number of attack classes
    'learning_rate': 0.01990086265614642,
    'num_leaves': 65,
    'max_depth': 7,
    'min_child_samples': 40,
    'subsample': 0.6971404971190901,
    'colsample_bytree': 0.8912957047621014,
    'reg_alpha': 1.1077475757773147e-05,
    'reg_lambda': 7.505137036788418,
    'n_estimators': 386,
    'feature_fraction': 0.6749863286377266,
    'bagging_fraction': 0.8742278952008886,
    'min_child_weight': 13,
    'metric': 'multi_logloss',  # Metric for multiclass
    'random_state': 42,
    'verbose': -1,
    'n_jobs': -1
}

# Define multiclass fitness function
def evaluate_individual(individual, X, y, k=5):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:
        return 0.0,
    
    X_selected = X[:, selected_features]
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in kf.split(X_selected):
        X_train_fold = X_selected[train_idx]
        X_val_fold = X_selected[val_idx]
        y_train_fold = y[train_idx]
        y_val_fold = y[val_idx]
        
        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)
        
        # Calculate weighted metrics for multiclass
        accuracy = accuracy_score(y_val_fold, y_pred)
        precision = precision_score(y_val_fold, y_pred, average='weighted')
        f1 = f1_score(y_val_fold, y_pred, average='weighted')
        
        # Combined score with multiple metrics
        score = (accuracy + precision + f1) / 3
        scores.append(score)
    
    # Return mean CV score with feature penalty
    feature_penalty = 0.001 * len(selected_features)/X.shape[1]
    return np.mean(scores) - feature_penalty,

# Register GA operators
toolbox.register("evaluate", evaluate_individual, X=X_train_scaled, y=y_train_resampled)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

print("GA components initialized with multiclass support")

GA components initialized with multiclass support


: 

In [None]:
#Run GA and Store Selected Features
print("Running Genetic Algorithm for feature selection...")

# Set GA parameters for multiclass problem
population_size = 50  # Increased for better exploration
n_generations = 20    # More generations for complex multiclass problem
crossover_prob = 0.8
mutation_prob = 0.1  # Slightly increased mutation rate

# Create initial population
population = toolbox.population(n=population_size)

# Statistics setup for tracking evolution
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

# Run GA with statistics
result, logbook = algorithms.eaSimple(population, toolbox,
                                    cxpb=crossover_prob,
                                    mutpb=mutation_prob,
                                    ngen=n_generations,
                                    stats=stats,
                                    verbose=True)

# Get best solution and its features
best_individual = tools.selBest(population, k=1)[0]
selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]

# Get feature names and their importance scores
feature_names = X.columns
chosen_features = list(feature_names[selected_features])

# Print detailed selection results
print(f"\nGA Feature Selection Results:")
print(f"Number of selected features: {len(chosen_features)} out of {len(feature_names)}")
print(f"Selection ratio: {len(chosen_features)/len(feature_names):.2%}")

print("\nSelected Features:")
for i, feature in enumerate(chosen_features, 1):
    print(f"{i}. {feature}")

# Save selected features with additional information
feature_file = "models_and_data/selected_features.txt"
os.makedirs(os.path.dirname(feature_file), exist_ok=True)

with open(feature_file, "w") as file:
    file.write("Selected Features from Genetic Algorithm\n")
    file.write("-" * 50 + "\n\n")
    file.write(f"Total Features Selected: {len(chosen_features)} out of {len(feature_names)}\n")
    file.write(f"Selection Ratio: {len(chosen_features)/len(feature_names):.2%}\n\n")
    
    file.write("GA Parameters:\n")
    file.write(f"Population Size: {population_size}\n")
    file.write(f"Number of Generations: {n_generations}\n")
    file.write(f"Crossover Probability: {crossover_prob}\n")
    file.write(f"Mutation Probability: {mutation_prob}\n\n")
    
    file.write("Selected Features List:\n")
    for i, feature in enumerate(chosen_features, 1):
        file.write(f"{i}. {feature}\n")
    
    # Add evolution statistics
    file.write("\nEvolution Statistics:\n")
    for gen, stats in enumerate(logbook):
        file.write(f"Generation {gen}:\n")
        file.write(f"  Avg Fitness: {stats['avg']:.4f}\n")
        file.write(f"  Max Fitness: {stats['max']:.4f}\n")

print(f"\nDetailed feature selection results saved to {feature_file}")

# Save selected feature indices for later use
np.save("models_and_data/selected_feature_indices.npy", selected_features)

Running Genetic Algorithm for feature selection...


In [None]:
#LightGBM Training and Evaluation for Multimodal
print("Training LightGBM with k-fold validation for multimodal data...")

# Use the features selected by GA for both modalities
X_network = X_train_scaled[:, selected_features]
X_host = X_train_scaled[:, selected_features]
X_test_network = X_test_scaled[:, selected_features]
X_test_host = X_test_scaled[:, selected_features]

# Initialize metrics storage
fold_results = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_network), 1):
    print(f"\nTraining fold {fold}/5...")

    # Split data for both modalities
    X_train_network = X_network[train_idx]
    X_val_network = X_network[val_idx]
    X_train_host = X_host[train_idx]
    X_val_host = X_host[val_idx]
    y_train_fold = y_train_resampled[train_idx]
    y_val_fold = y_train_resampled[val_idx]

    # Train network model
    network_model = lgb.LGBMClassifier(**lgb_params)
    network_model.fit(X_train_network, y_train_fold)

    # Train host model
    host_model = lgb.LGBMClassifier(**lgb_params)
    host_model.fit(X_train_host, y_train_fold)

    # Get predictions from both models
    network_pred_proba = network_model.predict_proba(X_val_network)
    host_pred_proba = host_model.predict_proba(X_val_host)

    # Combine predictions with weighted average
    combined_pred_proba = 0.6 * network_pred_proba + 0.4 * host_pred_proba
    y_pred = np.argmax(combined_pred_proba, axis=1)

    # Calculate metrics for multimodal prediction
    metrics = {
        "Fold": fold,
        "Accuracy": accuracy_score(y_val_fold, y_pred),
        "Precision": precision_score(y_val_fold, y_pred, average='weighted'),
        "Recall": recall_score(y_val_fold, y_pred, average='weighted'),
        "F1": f1_score(y_val_fold, y_pred, average='weighted'),
        "Confusion_Matrix": confusion_matrix(y_val_fold, y_pred)
    }
    fold_results.append(metrics)

    # Print individual modality performance
    network_pred = network_model.predict(X_val_network)
    host_pred = host_model.predict(X_val_host)

    print(f"Fold {fold} Results:")
    print("Network Features:")
    print(f"Accuracy: {accuracy_score(y_val_fold, network_pred):.4f}")
    print(f"F1 Score: {f1_score(y_val_fold, network_pred, average='weighted'):.4f}")

    print("\nHost Features:")
    print(f"Accuracy: {accuracy_score(y_val_fold, host_pred):.4f}")
    print(f"F1 Score: {f1_score(y_val_fold, host_pred, average='weighted'):.4f}")

    print("\nCombined Multimodal:")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1']:.4f}")

# Calculate average metrics
avg_metrics = {
    "Accuracy": np.mean([m["Accuracy"] for m in fold_results]),
    "Precision": np.mean([m["Precision"] for m in fold_results]),
    "Recall": np.mean([m["Recall"] for m in fold_results]),
    "F1": np.mean([m["F1"] for m in fold_results])
}

print("\nAverage Multimodal Metrics Across All Folds:")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")

# Save the best models (by F1 score)
best_fold = max(fold_results, key=lambda x: x['F1'])
best_fold_idx = best_fold['Fold'] - 1

# Retrain best models on full training set
final_network_model = lgb.LGBMClassifier(**lgb_params)
final_host_model = lgb.LGBMClassifier(**lgb_params)

final_network_model.fit(X_network, y_train_resampled)
final_host_model.fit(X_host, y_train_resampled)

# Save models and feature information
models_dir = "models_and_data/multimodal"
os.makedirs(models_dir, exist_ok=True)

dump(final_network_model, f"{models_dir}/network_model.joblib")
dump(final_host_model, f"{models_dir}/host_model.joblib")

In [None]:
# Save Results and Model for Multimodal Implementation
print("Evaluating and saving multimodal models...")

# Evaluate both models on test set
network_pred_proba = final_network_model.predict_proba(X_test_network)
host_pred_proba = final_host_model.predict_proba(X_test_host)

# Combine predictions with weighted average
combined_pred_proba = 0.6 * network_pred_proba + 0.4 * host_pred_proba
y_pred_test = np.argmax(combined_pred_proba, axis=1)

# Get individual model predictions
network_pred_test = final_network_model.predict(X_test_network)
host_pred_test = final_host_model.predict(X_test_host)

# Calculate metrics for each model type
test_metrics = {
    "Network": {
        "Accuracy": accuracy_score(y_test, network_pred_test),
        "Precision": precision_score(y_test, network_pred_test, average='weighted'),
        "Recall": recall_score(y_test, network_pred_test, average='weighted'),
        "F1": f1_score(y_test, network_pred_test, average='weighted'),
        "Confusion_Matrix": confusion_matrix(y_test, network_pred_test)
    },
    "Host": {
        "Accuracy": accuracy_score(y_test, host_pred_test),
        "Precision": precision_score(y_test, host_pred_test, average='weighted'),
        "Recall": recall_score(y_test, host_pred_test, average='weighted'),
        "F1": f1_score(y_test, host_pred_test, average='weighted'),
        "Confusion_Matrix": confusion_matrix(y_test, host_pred_test)
    },
    "Combined": {
        "Accuracy": accuracy_score(y_test, y_pred_test),
        "Precision": precision_score(y_test, y_pred_test, average='weighted'),
        "Recall": recall_score(y_test, y_pred_test, average='weighted'),
        "F1": f1_score(y_test, y_pred_test, average='weighted'),
        "Confusion_Matrix": confusion_matrix(y_test, y_pred_test)
    }
}

# Save detailed results
results_file = "models_and_data/multimodal_results.txt"
os.makedirs(os.path.dirname(results_file), exist_ok=True)

with open(results_file, "w") as file:
    file.write("Multimodal GA-LightGBM Results\n\n")
    
    # Write selected features (same for both modalities in this implementation)
    file.write("Selected Features Used (GA):\n")
    for i, feature in enumerate(chosen_features, 1):
        file.write(f"{i}. {feature}\n")
    
    # Write cross-validation results
    file.write("\nCross-Validation Results:\n")
    for metric, value in avg_metrics.items():
        file.write(f"Average {metric}: {value:.4f}\n")
    
    # Write test results for each model type
    for model_name, metrics in test_metrics.items():
        file.write(f"\n{model_name} Model Test Results:\n")
        for metric_name, value in metrics.items():
            if metric_name != "Confusion_Matrix":
                file.write(f"{metric_name}: {value:.4f}\n")
        file.write(f"Confusion Matrix:\n{metrics['Confusion_Matrix']}\n")
    
    # Write model parameters and weights
    file.write("\nModel Parameters:\n")
    file.write("Network Model Weight: 0.6\n")
    file.write("Host Model Weight: 0.4\n")

# Save models and metadata
models_dir = "models_and_data/multimodal"
os.makedirs(models_dir, exist_ok=True)

# Save final models
dump(final_network_model, f"{models_dir}/final_network_model.joblib")
dump(final_host_model, f"{models_dir}/final_host_model.joblib")

# Save feature indices and scaler (same indices for both modalities)
np.save(f"{models_dir}/selected_feature_indices.npy", selected_features)
dump(scaler, f"{models_dir}/feature_scaler.joblib")

print(f"\nResults saved to {results_file}")
print(f"Models and metadata saved to {models_dir}")

# Print final performance summary
print("\nTest Set Performance Summary:")
for model_name, metrics in test_metrics.items():
    print(f"\n{model_name} Model Performance:")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"F1 Score: {metrics['F1']:.4f}")

In [None]:
# Multimodal k-fold cross-validation using GA-selected features

accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
conf_matrices = []

print("Performing multimodal k-fold cross-validation...")
fold = 1
fold_results = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, val_index in kf.split(X_network):
    print(f"Training on fold {fold}...")

    # Split the data for both modalities (same features in this implementation)
    X_train_network, X_val_network = X_network[train_index], X_network[val_index]
    X_train_host, X_val_host = X_host[train_index], X_host[val_index]
    y_train_fold, y_val_fold = y_train_resampled[train_index], y_train_resampled[val_index]

    # Train models for each modality
    network_model = lgb.LGBMClassifier(**lgb_params)
    host_model = lgb.LGBMClassifier(**lgb_params)
    network_model.fit(X_train_network, y_train_fold)
    host_model.fit(X_train_host, y_train_fold)

    # Predict probabilities and combine
    network_pred_proba = network_model.predict_proba(X_val_network)
    host_pred_proba = host_model.predict_proba(X_val_host)
    combined_pred_proba = 0.6 * network_pred_proba + 0.4 * host_pred_proba
    y_pred = np.argmax(combined_pred_proba, axis=1)

    # Calculate metrics for this fold
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, average='weighted')
    recall = recall_score(y_val_fold, y_pred, average='weighted')
    f1 = f1_score(y_val_fold, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_val_fold, y_pred)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

    fold_results.append({
        "Fold": fold,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Confusion Matrix": conf_matrix
    })

    print(f"Fold {fold} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    fold += 1

# Calculate average metrics across all folds
avg_accuracy = np.mean(accuracy_scores)
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)
avg_conf_matrix = np.sum(conf_matrices, axis=0)

print("\nResults for Each Fold:")
for result in fold_results:
    print(f"Fold {result['Fold']}:")
    print(f"  Accuracy: {result['Accuracy']:.4f}")
    print(f"  Precision: {result['Precision']:.4f}")
    print(f"  Recall: {result['Recall']:.4f}")
    print(f"  F1 Score: {result['F1 Score']:.4f}")
    print(f"  Confusion Matrix:\n{result['Confusion Matrix']}\n")

print("\nAverage Metrics Across All Folds:")
print(f"Accuracy: {avg_accuracy:.4f}")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall: {avg_recall:.4f}")
print(f"F1 Score: {avg_f1:.4f}")
print("Confusion Matrix:")
print(avg_conf_matrix)

Performing k-fold cross-validation...
Training on fold 1...




[LightGBM] [Info] Number of positive: 232177, number of negative: 231915
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.106746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5883
[LightGBM] [Info] Number of data points in the train set: 464092, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500282 -> initscore=0.001129
[LightGBM] [Info] Start training from score 0.001129




Fold 1 - Accuracy: 0.9955871199062263, Precision: 0.9955933998341946, Recall: 0.9955871199062263, F1: 0.9955871213918758
Training on fold 2...




[LightGBM] [Info] Number of positive: 231996, number of negative: 232097
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5881
[LightGBM] [Info] Number of data points in the train set: 464093, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499891 -> initscore=-0.000435
[LightGBM] [Info] Start training from score -0.000435




Fold 2 - Accuracy: 0.9959318411004715, Precision: 0.9959331913920312, Recall: 0.9959318411004715, F1: 0.9959318343986221
Training on fold 3...




[LightGBM] [Info] Number of positive: 232242, number of negative: 231851
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.106208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5884
[LightGBM] [Info] Number of data points in the train set: 464093, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500421 -> initscore=0.001685
[LightGBM] [Info] Start training from score 0.001685




Fold 3 - Accuracy: 0.995664652698172, Precision: 0.9956738418127152, Recall: 0.995664652698172, F1: 0.9956646613589956
Training on fold 4...




[LightGBM] [Info] Number of positive: 232075, number of negative: 232018
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5878
[LightGBM] [Info] Number of data points in the train set: 464093, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500061 -> initscore=0.000246
[LightGBM] [Info] Start training from score 0.000246




Fold 4 - Accuracy: 0.9958887461968747, Precision: 0.9958926635091048, Recall: 0.9958887461968747, F1: 0.9958887392273627
Training on fold 5...




[LightGBM] [Info] Number of positive: 231810, number of negative: 232283
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.078039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5898
[LightGBM] [Info] Number of data points in the train set: 464093, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499490 -> initscore=-0.002038
[LightGBM] [Info] Start training from score -0.002038




Fold 5 - Accuracy: 0.9965437887315446, Precision: 0.9965480588528058, Recall: 0.9965437887315446, F1: 0.9965437589633337

Results for Each Fold:
Fold 1:
  Accuracy: 0.9955871199062263
  Precision: 0.9955933998341946
  Recall: 0.9955871199062263
  F1 Score: 0.9955871213918758
  Confusion Matrix:
[[57767   359]
 [  153 57745]]

Fold 2:
  Accuracy: 0.9959318411004715
  Precision: 0.9959331913920312
  Recall: 0.9959318411004715
  F1 Score: 0.9959318343986221
  Confusion Matrix:
[[57660   284]
 [  188 57891]]

Fold 3:
  Accuracy: 0.995664652698172
  Precision: 0.9956738418127152
  Recall: 0.995664652698172
  F1 Score: 0.9956646613589956
  Confusion Matrix:
[[57814   376]
 [  127 57706]]

Fold 4:
  Accuracy: 0.9958887461968747
  Precision: 0.9958926635091048
  Recall: 0.9958887461968747
  F1 Score: 0.9958887392273627
  Confusion Matrix:
[[57703   320]
 [  157 57843]]

Fold 5:
  Accuracy: 0.9965437887315446
  Precision: 0.9965480588528058
  Recall: 0.9965437887315446
  F1 Score: 0.99654375896

NameError: name 'os' is not defined

In [None]:
output_file = "models_and_data/performance_metrics.txt"
with open(output_file, "w") as file:
    file.write("Performance Metrics:\n")
    file.write(f"Average Accuracy: {avg_accuracy}\n")
    file.write(f"Average Precision: {avg_precision}\n")
    file.write(f"Average Recall: {avg_recall}\n")
    file.write(f"Average F1 Score: {avg_f1}\n")
    file.write("\nConfusion Matrix (Summed Across Folds):\n")
    file.write(f"{avg_conf_matrix}\n")
    file.write("\nResults for Each Fold:\n")
    for result in fold_results:
        file.write(f"Fold {result['Fold']}:\n")
        file.write(f"  Accuracy: {result['Accuracy']}\n")
        file.write(f"  Precision: {result['Precision']}\n")
        file.write(f"  Recall: {result['Recall']}\n")
        file.write(f"  F1 Score: {result['F1 Score']}\n")
        file.write(f"  Confusion Matrix:\n{result['Confusion Matrix']}\n\n")

print(f"Performance metrics saved to {output_file}")


Performance metrics saved to models_and_data/performance_metrics.txt


In [None]:
from joblib import dump

# Save the trained model
model_file = "models_and_data/final_model.joblib"
dump(final_model, model_file)

print(f"Model saved to {model_file}")

Model saved to models_and_data/final_model.joblib
