In [4]:
import optuna
import pandas as pd
from deap import base, creator, tools, algorithms
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
import numpy as np
from imblearn.over_sampling import SMOTE

# Load the dataset
file_paths = [
    "C:/VS code projects/data_files/Monday-WorkingHours.pcap_ISCX.csv",
    "C:/VS code projects/data_files/Tuesday-WorkingHours.pcap_ISCX.csv",
    "C:/VS code projects/data_files/Wednesday-workingHours.pcap_ISCX.csv",
    "C:/VS code projects/data_files/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "C:/VS code projects/data_files/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "C:/VS code projects/data_files/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "C:/VS code projects/data_files/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "C:/VS code projects/data_files/Friday-WorkingHours-Morning.pcap_ISCX.csv"
]

# Read and clean datasets
dataframes = []
for file_path in file_paths:
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()  # Remove whitespace from column names
    dataframes.append(df)

# Combine all datasets into a single DataFrame
df = pd.concat(dataframes, ignore_index=True)

In [5]:
#Dealing with duplicates
print(f'Before Cremoving duplicates: {df.shape}')
duplicates = df[df.duplicated()]
print(f'Number of duplicates: {len(duplicates)}')
df.drop_duplicates(inplace = True)
print(f'After removing duplicates: {df.shape}')


#Handling missing values both numeric and non-numeric columns
# Identify columns with missing values
missing_val = df.isna().sum()
print("Columns with missing values:")
print(missing_val.loc[missing_val > 0])

# Handle missing values for numeric columns (fill with mean)
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Handle missing values for non-numeric columns (fill with mode)
non_numeric_cols = df.select_dtypes(exclude=['number']).columns
for col in non_numeric_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify if there are still any missing values
print(f"Missing values after filling: {df.isna().sum().sum()}")

#2.3 Handling infinite values

# Initial count of missing and infinite values
print(f'Initial missing values: {df.isna().sum().sum()}')
print(f'Initial infinite values: {df.isin([np.inf, -np.inf]).sum().sum()}')

# Drop rows with infinite values
df = df[~df.isin([np.inf, -np.inf]).any(axis=1)]

# Verify that infinite values are removed
inf_count = df.isin([np.inf, -np.inf]).sum()
print("Columns with infinite values after processing (should be empty):")
print(inf_count[inf_count > 0])

# Final missing value check
print(f"Missing values after dropping rows: {df.isna().sum().sum()}")
# Dropping columns with only one unique value
num_unique = df.nunique()
one_variable = num_unique[num_unique == 1]
not_one_variable = num_unique[num_unique > 1].index

dropped_cols = one_variable.index
df = df[not_one_variable]

print('Dropped columns:')
dropped_cols

Before Cremoving duplicates: (2830743, 79)
Number of duplicates: 330963
After removing duplicates: (2499780, 79)
Columns with missing values:
Flow Bytes/s    353
dtype: int64
Missing values after filling: 0
Initial missing values: 0
Initial infinite values: 3126
Columns with infinite values after processing (should be empty):
Series([], dtype: int64)
Missing values after dropping rows: 0
Dropped columns:


Index(['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Avg Bytes/Bulk',
       'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk',
       'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate'],
      dtype='object')

In [6]:
df['Label'] = df['Label'].apply(lambda x: 1 if x == 'BENIGN' else 0)

#Due to resource constraints, we will sample 20% of the dataset for training
# Randomly sample 20% of the dataset
df = df.sample(frac=0.2, random_state=42)

#SMOTE (Synthetic Minority Over-sampling Technique) is used to handle class imbalance after sampling
# Ensure there are no NaN values in the dataset before applying SMOTE
if df.isna().sum().sum() > 0:
    print("Dataset contains NaN values. Filling NaN values with column means...")
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Split the original dataset into features (X) and target (y)
X = df.drop('Label', axis=1)  # Features
y = df['Label']  # Target

# Perform SMOTE sampling to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine the resampled features and target into a new DataFrame
df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), 
                pd.DataFrame(y_resampled, columns=['Label'])], axis=1)

# Display the value counts to verify balance
print('Balanced dataset:')
print(df['Label'].value_counts())

Balanced dataset:
Label
1    414369
0    414369
Name: count, dtype: int64


In [None]:
print("Preprocessing dataset...")
df.dropna(inplace=True)  # Remove missing values
df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinite values with NaN
df.columns = df.columns.str.strip()  # Strip whitespaces from column names
X = df.drop(columns=['Label'])  # Replace 'Label' with the actual target column name if different
y = df['Label']


# Define the DEAP toolbox
num_features = X.shape[1]  # Number of features in the dataset

# Create the fitness function (maximize accuracy)
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)  # Binary representation (0 or 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register genetic operators
toolbox.register("mate", tools.cxTwoPoint)  # Two-point crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Bit-flip mutation
toolbox.register("select", tools.selTournament, tournsize=3)  # Tournament selection

# Define the evaluation function
def evaluate_model(selected_features):
    # Use only the selected features
    X_selected = X.iloc[:, selected_features]

    # Define a LightGBM classifier
    clf = lgb.LGBMClassifier(random_state=42)

    # Perform cross-validation and return the mean score
    scores = cross_val_score(clf, X_selected, y, cv=5, scoring='accuracy')
    return scores.mean()

# Register the evaluation function to the toolbox
def evaluate_individual(individual):
    # Convert the individual (binary list) into selected feature indices
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:  # Handle cases where no features are selected
        return 0.0,
    score = evaluate_model(selected_features)
    return score,

toolbox.register("evaluate", evaluate_individual)

# Define the objective function for Optuna
def objective(trial):
    # Suggest values for GA parameters
    population_size = trial.suggest_int("population_size", 20, 100, step=10)
    ngen = trial.suggest_int("ngen", 10, 50, step=10)
    cxpb = trial.suggest_float("cxpb", 0.5, 0.9, step=0.1)
    mutpb = trial.suggest_float("mutpb", 0.1, 0.3, step=0.05)

    # Set up the GA with suggested parameters
    population = toolbox.population(n=population_size)
    result_population = algorithms.eaSimple(population, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen, verbose=False)
    best_individual = tools.selBest(result_population[0], k=1)[0]

    # Evaluate the selected features
    selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]
    if len(selected_features) == 0:  
        return 0.0
    score = evaluate_model(selected_features)

    return score

# Run the optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=3600)

# Print the best parameters
print("Best Parameters:", study.best_params)
print("Best Score:", study.best_value)

Preprocessing dataset...


[I 2025-04-12 07:13:41,251] A new study created in memory with name: no-name-7c6cb878-478d-4ef9-8d03-60b56a081323


[LightGBM] [Info] Number of positive: 331495, number of negative: 331495
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036941 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6052
[LightGBM] [Info] Number of data points in the train set: 662990, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 331495, number of negative: 331495
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6043
[LightGBM] [Info] Number of data points in the train set: 662990, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco