In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import seaborn as sns

In [2]:
def data_cleaning_in_batches(D, batch_size=10000):
    Cleaned_Data = D.drop_duplicates()
    
    # Drop columns with too many NaNs
    threshold = 0.7 * len(Cleaned_Data)
    Cleaned_Data = Cleaned_Data.dropna(thresh=threshold, axis=1)

    # Identify column types before batching
    categorical_cols = Cleaned_Data.select_dtypes(include=['object', 'category']).columns
    numerical_cols = Cleaned_Data.select_dtypes(include=['int64', 'float64']).columns

    # One-hot encode categoricals all at once before batching
    if len(categorical_cols) > 0:
        Cleaned_Data = pd.get_dummies(Cleaned_Data, columns=categorical_cols, drop_first=True)

    # Split into batches of rows
    num_batches = int(np.ceil(len(Cleaned_Data) / batch_size))
    batches = []
    for i in range(num_batches):
        batch = Cleaned_Data.iloc[i*batch_size : (i+1)*batch_size].copy()

        # Impute missing values in the batch using IterativeImputer
        if batch.isnull().sum().sum() > 0:
            imputer = IterativeImputer(random_state=42, max_iter=10)
            batch = pd.DataFrame(
                imputer.fit_transform(batch),
                columns=batch.columns,
                index=batch.index
            )

        # Scale numerical columns (if they exist in this batch)
        existing_numerical_cols = [col for col in numerical_cols if col in batch.columns]
        if existing_numerical_cols:
            scaler = MinMaxScaler()
            batch[existing_numerical_cols] = scaler.fit_transform(batch[existing_numerical_cols])

        batches.append(batch)

    # Recombine all cleaned batches
    final_cleaned_data = pd.concat(batches)

    return final_cleaned_data


In [3]:
def Balance_Data(Cleaned_Data, target_column, random_state=42):
  
    if target_column not in Cleaned_Data.columns:
        raise ValueError(f"Target column '{target_column}' not found")
    if Cleaned_Data[target_column].nunique() < 2:
        raise ValueError("Target must have at least two classes")
    
    # Separate features and target
    X = Cleaned_Data.drop(columns=[target_column])
    y = Cleaned_Data[target_column]
    
    # Apply SMOTE
    smote = SMOTE(random_state=random_state, k_neighbors=min(y.value_counts().min() - 1, 5))
    X_balanced, y_balanced = smote.fit_resample(X, y)
    
    # Create balanced DataFrame
    Balanced_Data = pd.DataFrame(X_balanced, columns=X.columns)
    Balanced_Data[target_column] = y_balanced
    
    return Balanced_Data

In [4]:

from pathlib import Path

from sklearn.model_selection import StratifiedKFold


def Reduce_Dimensionality(Cleaned_Data, k, target_column, n_splits=5, output_dir="feature_selection_results"):
    # ===== Check parameters =====
    if target_column not in Cleaned_Data.columns:
        raise ValueError(f"Target column '{target_column}' not found in dataset")
    if k <= 0 or k >= len(Cleaned_Data.columns) - 1:
        raise ValueError(f"Number of features k ({k}) must be between 1 and {len(Cleaned_Data.columns) - 1}")
    
    Path(output_dir).mkdir(exist_ok=True)

    # ===== Separate features and target =====
    X = Cleaned_Data.drop(columns=[target_column])
    y = Cleaned_Data[target_column]

    # ===== ANOVA scores =====
    f_values, _ = f_classif(X, y)
    f_values = np.nan_to_num(f_values, nan=0.0, posinf=0.0, neginf=0.0)
    f_values_norm = (f_values - f_values.min()) / (f_values.max() - f_values.min() + 1e-10)

    # ===== XGBoost importances =====
    xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
    xgb_model.fit(X, y)
    xgb_importances = xgb_model.feature_importances_

    # ===== Combine scores =====
    combined_scores = (f_values_norm + xgb_importances) / 2
    feature_scores = pd.Series(combined_scores, index=X.columns)
    selected_features = feature_scores.nlargest(k).index.tolist()

    # ===== Plots for feature importance =====
    scores_df = pd.DataFrame({
        'Feature': X.columns,
        'ANOVA Score': f_values_norm,
        'XGBoost Importance': xgb_importances,
        'Combined Score': combined_scores
    }).sort_values(by="Combined Score", ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x="Combined Score", y="Feature", data=scores_df.head(k), palette="viridis")
    plt.title(f"Top {k} Features (Combined Score)")
    plt.tight_layout()
    plt.savefig(Path(output_dir) / "top_features_combined.png")
    plt.close()

    # ===== Correlation heatmap =====
    corr = X[selected_features].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
    plt.title("Correlation Heatmap of Selected Features")
    plt.tight_layout()
    plt.savefig(Path(output_dir) / "correlation_heatmap.png")
    plt.close()

    # ===== Normalize selected features =====
    normalizer = Normalizer(norm='l2')
    X_norm = pd.DataFrame(
        normalizer.fit_transform(X[selected_features]),
        columns=selected_features,
        index=X.index
    )

    # # ===== Cross-validation split =====
    # skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # fold_num = 1
    # for train_idx, test_idx in skf.split(X_norm, y):
    #     X_train, X_test = X_norm.iloc[train_idx], X_norm.iloc[test_idx]
    #     y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    #     # Save to CSV
    #     X_train.join(y_train).to_csv(Path(output_dir) / f"train_fold_{fold_num}.csv", index=False)
    #     X_test.join(y_test).to_csv(Path(output_dir) / f"test_fold_{fold_num}.csv", index=False)
    #     y_train.to_csv(Path(output_dir) / f"y_train_fold_{fold_num}.csv", index=False)
    #     y_test.to_csv(Path(output_dir) / f"y_test_fold_{fold_num}.csv", index=False)

    #     fold_num += 1
    final_df = X_norm.join(y)
    final_path = Path(output_dir) / "dataset_selected_features.csv"
    final_df.to_csv(final_path, index=False)


    print(f"✅ Feature selection complete. Results saved in '{output_dir}'")
    return selected_features, scores_df


In [5]:
df = pd.read_csv("../Datasets/Combined.csv")


  df = pd.read_csv("../Datasets/Combined.csv")


In [6]:
cleaned_df = data_cleaning_in_batches(df, batch_size=10000)  

  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S**2

In [7]:
balanced_df = Balance_Data(Cleaned_Data=cleaned_df, target_column='Label_Malicious')


In [8]:
selected_features, scores_df = Reduce_Dimensionality (
    Cleaned_Data = balanced_df,
    k=8,  # number of top features
    target_column="Label_Malicious",  # change to your label column name
)

# Show results
print("Selected Features:", selected_features)
scores_df.head()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Combined Score", y="Feature", data=scores_df.head(k), palette="viridis")


✅ Feature selection complete. Results saved in 'feature_selection_results'
Selected Features: ['Attack Tool_Hping3', 'Attack Type_UDPFlood', 'Proto_tcp', 'Attack Type_HTTPFlood', 'Attack Tool_Goldeneye', 'AckDat', 'Proto_udp', 'sHops']


Unnamed: 0,Feature,ANOVA Score,XGBoost Importance,Combined Score
72,Attack Tool_Hping3,1.0,0.441347,0.720674
69,Attack Type_UDPFlood,0.938941,0.0,0.46947
38,Proto_tcp,0.235457,0.403278,0.319367
63,Attack Type_HTTPFlood,0.135904,0.001781,0.068843
71,Attack Tool_Goldeneye,0.135904,0.0,0.067952
