# Embedded Feature Selection for DDoS Attack Detection

This notebook preprocesses the dataset and applies different embedded feature selection techniques to determine the most effective method.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

In [4]:
# Load dataset
df = pd.read_csv('/content/drive/MyDrive/Syn.csv')

  df = pd.read_csv('/content/drive/MyDrive/Syn.csv')


In [5]:
# Drop unnecessary columns
df.drop(df.columns[[1,2,3,4,5,7,85]], axis=1, inplace=True)

# Encode target variable
df['Label_new'] = df[' Label'].apply(lambda x: 1 if x == 'Syn' else 0 if x == 'BENIGN' else None)
df.drop(columns=[' Label'], inplace=True, errors='ignore')

# Remove NaN and infinite values
df = df.dropna()
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# **Dataset Balancing BEFORE Splitting**
df_benign = df[df['Label_new'] == 0]
df_attack = df[df['Label_new'] == 1]

# Upsample benign traffic (1/3 of attack samples)
df_benign_upsampled = resample(df_benign,
                               replace=True,
                               n_samples=len(df_attack) // 3,
                               random_state=42)

# Downsample attack traffic (50% reduction)
df_attack_downsampled = resample(df_attack,
                                 replace=False,
                                 n_samples=len(df_attack) // 2,
                                 random_state=42)

# Merge balanced dataset
df_balanced = pd.concat([df_benign_upsampled, df_attack_downsampled])

# Shuffle dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into features & target
X = df_balanced.drop(columns=['Label_new'])
y = df_balanced['Label_new']

In [6]:
from collections import Counter

label_counts = Counter(y)  # Count occurrences of each label
print("Label counts before splitting:", label_counts)


Label counts before splitting: Counter({1: 689991, 0: 459994})


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [8]:
# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# **1️⃣ LASSO Feature Selection**
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit LASSO for feature selection
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_scaled, y_train)

# Select only non-zero coefficient features
lasso_selected_features = X.columns[lasso.coef_ != 0]
X_train_lasso = X_train[lasso_selected_features]
X_test_lasso = X_test[lasso_selected_features]

# Train Logistic Regression on LASSO-selected features
lasso_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lasso_model.fit(X_train_lasso, y_train)
y_pred_lr = lasso_model.predict(X_test_lasso)

# Accuracy & Classification Report
print("LASSO Feature Selection Accuracy (Logistic Regression):", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


LASSO Feature Selection Accuracy (Logistic Regression): 0.9972912690165524
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     91999
           1       1.00      1.00      1.00    137998

    accuracy                           1.00    229997
   macro avg       1.00      1.00      1.00    229997
weighted avg       1.00      1.00      1.00    229997



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# **2️⃣ Ridge Feature Selection**
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Fit Ridge for feature selection
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)

# Select features with absolute coefficient above threshold
ridge_selected_features = X.columns[np.abs(ridge.coef_) > 0.01]
X_train_ridge = X_train[ridge_selected_features]
X_test_ridge = X_test[ridge_selected_features]

# Train Logistic Regression on Ridge-selected features
ridge_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
ridge_model.fit(X_train_ridge, y_train)
y_pred_ridge = ridge_model.predict(X_test_ridge)

# Accuracy & Classification Report
print("Ridge Feature Selection Accuracy (Logistic Regression):", accuracy_score(y_test, y_pred_ridge))
print(classification_report(y_test, y_pred_ridge))


Ridge Feature Selection Accuracy (Logistic Regression): 0.9333165215198459
              precision    recall  f1-score   support

           0       1.00      0.83      0.91     91999
           1       0.90      1.00      0.95    137998

    accuracy                           0.93    229997
   macro avg       0.95      0.92      0.93    229997
weighted avg       0.94      0.93      0.93    229997



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# **3️⃣ Decision Tree Feature Selection**
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train Decision Tree for feature importance
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_scaled, y_train)

# Select features with importance > 0.01
tree_selected_features = X.columns[tree.feature_importances_ > 0.01]
X_train_tree = X_train[tree_selected_features]
X_test_tree = X_test[tree_selected_features]

# Train Logistic Regression on selected features
tree_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
tree_model.fit(X_train_tree, y_train)
y_pred_tree = tree_model.predict(X_test_tree)

# Accuracy & Classification Report
print("Decision Tree Feature Selection Accuracy (Logistic Regression):", accuracy_score(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))


Decision Tree Feature Selection Accuracy (Logistic Regression): 0.9996086905481376
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     91999
           1       1.00      1.00      1.00    137998

    accuracy                           1.00    229997
   macro avg       1.00      1.00      1.00    229997
weighted avg       1.00      1.00      1.00    229997



In [12]:
# **4️⃣ XGBoost Feature Selection**
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train XGBoost for feature importance
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train_scaled, y_train)

# Select features with importance > 0.01
xgb_selected_features = X.columns[xgb.feature_importances_ > 0.01]
X_train_xgb = X_train[xgb_selected_features]
X_test_xgb = X_test[xgb_selected_features]

# Train Logistic Regression on selected features
xgb_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
xgb_model.fit(X_train_xgb, y_train)
y_pred_xgb = xgb_model.predict(X_test_xgb)

# Accuracy & Classification Report
print("XGBoost Feature Selection Accuracy (Logistic Regression):", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



XGBoost Feature Selection Accuracy (Logistic Regression): 0.9996086905481376
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     91999
           1       1.00      1.00      1.00    137998

    accuracy                           1.00    229997
   macro avg       1.00      1.00      1.00    229997
weighted avg       1.00      1.00      1.00    229997

