In [7]:
import optuna
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE  # Import SMOTE

data_test = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Testing.csv")
data_train = pd.read_csv(r"C:\VS code projects\data_files\UNSW_2018_IoT_Botnet_Final_10_best_Training.csv")
# Concatenate the training and testing datasets
df = pd.concat([data_train, data_test], axis=0, ignore_index=True)


In [8]:
# Print the shape of the original and combined datasets
print("Shape of training data:", data_train.shape)
print("Shape of testing data:", data_test.shape)
print("Shape of combined data:", df.shape)

# Display first few rows of combined dataset
print("\nFirst few rows of combined dataset:")
display(df.head())

Shape of training data: (2934817, 19)
Shape of testing data: (733705, 19)
Shape of combined data: (3668522, 19)

First few rows of combined dataset:


Unnamed: 0,pkSeqID,proto,saddr,sport,daddr,dport,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack,category,subcategory
0,3142762,udp,192.168.100.150,6551,192.168.100.3,80,251984,1.900363,100,0.0,4,2.687519,100,0.0,0.494549,4.031619,1,DDoS,UDP
1,2432264,tcp,192.168.100.150,5532,192.168.100.3,80,256724,0.078003,38,3.85693,3,3.934927,100,0.0,0.256493,4.012924,1,DDoS,TCP
2,1976315,tcp,192.168.100.147,27165,192.168.100.3,80,62921,0.268666,100,2.9741,3,3.341429,100,0.0,0.29488,3.609205,1,DDoS,TCP
3,1240757,udp,192.168.100.150,48719,192.168.100.3,80,99168,1.823185,63,0.0,4,3.222832,63,0.0,0.461435,4.942302,1,DoS,UDP
4,3257991,udp,192.168.100.147,22461,192.168.100.3,80,105063,0.822418,100,2.979995,4,3.983222,100,0.0,1.002999,4.994452,1,DDoS,UDP


In [9]:
# Display columns and their data types
print("Columns in the dataset with their data types:")
for i, (column, dtype) in enumerate(df.dtypes.items(), 1):
    print(f"{i}. {column:<30} {dtype}")

# Show detailed dataframe information
print("\nDetailed DataFrame Information:")
df.info(memory_usage='deep')

# Display summary statistics for numeric columns
print("\nSummary Statistics for Numeric Columns:")
print(df.describe())

Columns in the dataset with their data types:
1. pkSeqID                        int64
2. proto                          object
3. saddr                          object
4. sport                          object
5. daddr                          object
6. dport                          object
7. seq                            int64
8. stddev                         float64
9. N_IN_Conn_P_SrcIP              int64
10. min                            float64
11. state_number                   int64
12. mean                           float64
13. N_IN_Conn_P_DstIP              int64
14. drate                          float64
15. srate                          float64
16. max                            float64
17. attack                         int64
18. category                       object
19. subcategory                    object

Detailed DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3668522 entries, 0 to 3668521
Data columns (total 19 columns):
 #   Column        

In [10]:
#Data Preprocessing

# 1. Remove duplicate rows
df.drop_duplicates(inplace=True)
print("Duplicates removal: Done")

# 2. Handle missing values
# Impute numeric columns with the mean
numeric_cols = df.select_dtypes(include=np.number).columns
missing_values_before = df[numeric_cols].isnull().sum().sum()
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Impute non-numeric columns with the mode
non_numeric_cols = df.select_dtypes(exclude=np.number).columns
for col in non_numeric_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

missing_values_after = df[numeric_cols].isnull().sum().sum()
missing_values_removed = missing_values_before - missing_values_after

print(f"Missing values handling: Done. Number of missing values removed: {missing_values_removed}")

# 3. Remove spaces from column names
df.columns = df.columns.str.replace(' ', '')
print("Column names removal of spaces: Done")

# 4. Handle infinite values
# Replace infinite values with NaN
infinite_values_before = df.isin([np.inf, -np.inf]).sum().sum()
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute NaN values resulting from infinite values
# Impute numeric columns with the mean
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Impute non-numeric columns with the mode
non_numeric_cols = df.select_dtypes(exclude=np.number).columns
for col in non_numeric_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

infinite_values_after = df.isin([np.inf, -np.inf]).sum().sum()
infinite_values_handled = infinite_values_before - infinite_values_after

print(f"Infinite values handling: Done. Number of infinite values handled: {infinite_values_handled}")

# 5. Drop columns with only one unique value
cols_to_drop = [col for col in df.columns if df[col].nunique() == 1]
df.drop(cols_to_drop, axis=1, inplace=True)

print("One unique value columns removal: Done")
print("Columns removed due to one unique value:", cols_to_drop)



Duplicates removal: Done
Missing values handling: Done. Number of missing values removed: 0
Column names removal of spaces: Done
Infinite values handling: Done. Number of infinite values handled: 0
One unique value columns removal: Done
Columns removed due to one unique value: []


In [11]:
df['subcategory'].value_counts()

subcategory
UDP                  1981230
TCP                  1593180
Service_Scan           73168
OS_Fingerprint         17914
HTTP                    2474
Normal                   477
Keylogging                73
Data_Exfiltration          6
Name: count, dtype: int64

In [12]:
# Remove unnecessary columns and prepare data
X = df.drop(['category', 'subcategory', 'proto','saddr','sport','daddr','dport', 'attack'], axis=1)
y = df['attack']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

def objective(trial):
    param_grid = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 50)
    }

    # Create and train model
    model = lgb.LGBMClassifier(**param_grid, random_state=42)
    
    # Fit the model with early stopping
    model.fit(
        X_train_resampled,
        y_train_resampled, 
        eval_set=[(X_test, y_test)],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )

    # Predict and calculate accuracy
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Create and run the study
study = optuna.create_study(direction='maximize', study_name='lightgbm_optimization')
study.optimize(objective, n_trials=50, timeout=3600)

# Print and save results
print("\nBest trial:")
print(f"  Value: {study.best_trial.value:.4f}")
print("  Params:")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

# Save results
output_file = "models_and_data/best_hyperparameters_lightgbm.txt"
with open(output_file, "w") as f:
    f.write("Best Hyperparameters:\n")
    for key, value in study.best_trial.params.items():
        f.write(f"{key}: {value}\n")
    f.write(f"\nBest Accuracy: {study.best_trial.value:.4f}\n")

print(f"\nResults saved to {output_file}")

[I 2025-04-25 13:42:19,807] A new study created in memory with name: lightgbm_optimization
[I 2025-04-25 13:42:56,686] Trial 0 finished with value: 0.999995911163206 and parameters: {'learning_rate': 0.011055901328120189, 'num_leaves': 44, 'max_depth': 7, 'min_child_samples': 35, 'subsample': 0.9012714407342719, 'colsample_bytree': 0.8811295769726059, 'reg_alpha': 4.161069413205356, 'reg_lambda': 0.0013355995091145212, 'n_estimators': 265, 'feature_fraction': 0.8141454613424521, 'bagging_fraction': 0.8100557168692921, 'min_child_weight': 37}. Best is trial 0 with value: 0.999995911163206.
[I 2025-04-25 13:43:19,234] Trial 1 finished with value: 0.999995911163206 and parameters: {'learning_rate': 0.0878413626441053, 'num_leaves': 131, 'max_depth': 8, 'min_child_samples': 29, 'subsample': 0.6075115834542242, 'colsample_bytree': 0.5049851532134011, 'reg_alpha': 3.051696230070762e-08, 'reg_lambda': 1.3772340098581526, 'n_estimators': 301, 'feature_fraction': 0.9125772458350402, 'bagging_fr


Best trial:
  Value: 1.0000
  Params:
    learning_rate: 0.01990086265614642
    num_leaves: 65
    max_depth: 7
    min_child_samples: 40
    subsample: 0.6971404971190901
    colsample_bytree: 0.8912957047621014
    reg_alpha: 1.1077475757773147e-05
    reg_lambda: 7.505137036788418
    n_estimators: 386
    feature_fraction: 0.6749863286377266
    bagging_fraction: 0.8742278952008886
    min_child_weight: 13

Results saved to models_and_data/best_hyperparameters_lightgbm.txt
