In [None]:
pip install numpy pandas scikit-learn catboost xgboost pytorch-tabnet optuna torch tabulate joblib matplotlib seaborn

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.1.0-py3-none-a

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
import optuna
from sklearn.model_selection import StratifiedKFold
import torch
from tabulate import tabulate
import warnings
import time
warnings.filterwarnings('ignore')

class AdvancedEnsembleModel:
    def __init__(self):
        self.scaler = StandardScaler()
        self.power_transformer = PowerTransformer(method='yeo-johnson')
        self.label_encoder = LabelEncoder()
        self.feature_encoders = {}
        self.models = {}
        self.best_params = {}
        self.feature_importance = {}
        self.known_labels = None

    def create_advanced_features(self, X):
        X = X.copy()
        numeric_cols = X.select_dtypes(include=[np.number]).columns

        # Reduced number of ratio features (only most important ones)
        main_numeric_cols = numeric_cols[:5]  # Take only first 5 numeric columns
        for i in range(len(main_numeric_cols)):
            for j in range(i + 1, len(main_numeric_cols)):
                col1, col2 = main_numeric_cols[i], main_numeric_cols[j]
                ratio_name = f'ratio_{col1}_{col2}'
                X[ratio_name] = X[col1] / (X[col2] + 1e-6)

        # Simplified statistical aggregations
        for col in main_numeric_cols:
            X[f'{col}_zscore'] = (X[col] - X[col].mean()) / (X[col].std() + 1e-6)

        return X

    def preprocess_data(self, X, train_mode=True):
        X = X.copy()

        # Handle missing values
        for col in X.columns:
            if X[col].dtype in [np.float64, np.float32, np.int64, np.int32]:
                X[col] = X[col].fillna(X[col].median())
            else:
                X[col] = X[col].fillna('unknown')

        # Encode categorical variables
        if train_mode:
            for col in X.select_dtypes(include=['object']).columns:
                self.feature_encoders[col] = LabelEncoder()
                # Add 'unknown' to encoder classes if not present
                unique_values = list(X[col].unique())
                if 'unknown' not in unique_values:
                    unique_values.append('unknown')
                self.feature_encoders[col].fit(unique_values)
                X[col] = self.feature_encoders[col].transform(X[col].astype(str))
        else:
            for col in self.feature_encoders.keys():
                if col in X.columns:
                    X[col] = X[col].astype(str)
                    # Map unseen categories to 'unknown'
                    X[col] = X[col].map(lambda x: 'unknown' if x not in self.feature_encoders[col].classes_ else x)
                    X[col] = self.feature_encoders[col].transform(X[col])

        # Create advanced features
        X = self.create_advanced_features(X)

        # Scale numeric features
        if train_mode:
            X = pd.DataFrame(self.scaler.fit_transform(X), columns=X.columns)
            X = pd.DataFrame(self.power_transformer.fit_transform(X), columns=X.columns)
        else:
            X = pd.DataFrame(self.scaler.transform(X), columns=X.columns)
            X = pd.DataFrame(self.power_transformer.transform(X), columns=X.columns)

        return X

    def fit(self, X, y):
        print("Starting advanced model training...")
        start_time = time.time()

        # Store known labels and ensure 'unknown' is included
        self.known_labels = list(np.unique(y))
        if 'unknown' not in self.known_labels:
            self.known_labels.append('unknown')

        # Preprocess the data
        print("Preprocessing data...")
        X_processed = self.preprocess_data(X, train_mode=True)
        self.label_encoder.fit(self.known_labels)
        y_encoded = self.label_encoder.transform(y)

        # Train CatBoost with optimized iterations
        print("Training CatBoost...")
        self.models['catboost'] = CatBoostClassifier(
            iterations=30,
            learning_rate=0.25,
            depth=4,
            l2_leaf_reg=3,
            bootstrap_type='Bayesian',
            verbose=0,
            random_seed=42
        )
        self.models['catboost'].fit(X_processed, y_encoded)

        # Train XGBoost with optimized iterations
        print("Training XGBoost...")
        self.models['xgboost'] = XGBClassifier(
            n_estimators=30,
            learning_rate=0.25,
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            tree_method='hist'
        )
        self.models['xgboost'].fit(X_processed, y_encoded)

        # Train TabNet with optimized parameters
        print("Training TabNet...")
        self.models['tabnet'] = TabNetClassifier(
            n_d=8,
            n_a=8,
            n_steps=3,
            gamma=1.5,
            n_independent=2,
            n_shared=2,
            verbose=0
        )

        self.models['tabnet'].fit(
            X_processed.values, y_encoded,
            eval_metric=['accuracy'],
            patience=3,
            max_epochs=5
        )

        # Store feature importance
        self.feature_importance['catboost'] = self.models['catboost'].feature_importances_
        self.feature_importance['xgboost'] = self.models['xgboost'].feature_importances_

        training_time = time.time() - start_time
        print(f"Training completed in {training_time:.2f} seconds")

    def predict(self, X):
        # Preprocess test data
        X_processed = self.preprocess_data(X, train_mode=False)

        # Get predictions from each model
        pred_catboost = self.models['catboost'].predict_proba(X_processed)
        pred_xgboost = self.models['xgboost'].predict_proba(X_processed)
        pred_tabnet = self.models['tabnet'].predict_proba(X_processed.values)

        # Weighted average of predictions
        weighted_pred = (0.4 * pred_catboost +
                        0.3 * pred_xgboost +
                        0.3 * pred_tabnet)

        # Convert to class labels
        final_pred = np.argmax(weighted_pred, axis=1)
        return self.label_encoder.inverse_transform(final_pred)

# Load and prepare your data
print("Loading data...")
try:
    train = pd.read_csv('/content/UNSW_NB15_training-set.csv')
    test = pd.read_csv('/content/UNSW_NB15_testing-set.csv')

    # Prepare data
    X_train = train.drop(['attack_cat'], axis=1)
    y_train = train['attack_cat']

    if 'attack_cat' in test.columns:
        X_test = test.drop(['attack_cat'], axis=1)
        y_test = test['attack_cat']
    else:
        X_test = test
        y_test = None

    # Train and evaluate model
    model = AdvancedEnsembleModel()
    model.fit(X_train, y_train)

    print("\nMaking predictions...")
    if y_test is not None:
        y_pred = model.predict(X_test)
        results = [
            ["Accuracy", f"{accuracy_score(y_test, y_pred):.4f}"],
            ["Classification Report", "\n" + classification_report(y_test, y_pred)]
        ]
        print("\nResults:")
        print(tabulate(results, headers=["Metric", "Value"]))
    else:
        print("Predictions:", model.predict(X_test))

except FileNotFoundError:
    print("Error: Could not find the dataset files. Please ensure the paths are correct.")
except Exception as e:
    print(f"An error occurred: {str(e)}")

Loading data...
Starting advanced model training...
Preprocessing data...
Training CatBoost...
Training XGBoost...
Training TabNet...
Training completed in 155.08 seconds

Making predictions...

Results:
Metric                 Value
---------------------  -------------------------------------------------------
Accuracy               0.8595
Classification Report  precision    recall  f1-score   support

                             Analysis       0.01      0.03      0.01       677
                             Backdoor       0.26      0.02      0.03       583
                                  DoS       0.86      0.00      0.00      4089
                             Exploits       0.64      0.73      0.68     11132
                              Fuzzers       0.83      0.76      0.79      6062
                              Generic       1.00      0.96      0.98     18871
                               Normal       0.91      1.00      0.95     37000
                       Reconnaissance    