In [None]:
# Required libraries: pandas, scikit-learn, tensorflow
# You can install them using pip:
# pip install pandas scikit-learn tensorflow

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# --- Part 1: Generate Scores from the Autoencoder ---
# Tunable settings for this cell, gathered in one place.
DATA_PATH = '../datasets/cybersecurity_attacks.csv'
SAMPLE_SIZE = 10000  # maximum number of rows drawn from the CSV
SEED = 42            # shared by the row sample and the model initialisation

# Seed the random generators used for weight initialisation and batch
# shuffling so that re-running the cell is repeatable as far as the random
# generators are concerned.
tf.keras.utils.set_random_seed(SEED)

try:
    df = pd.read_csv(DATA_PATH)
    print("📄 Successfully loaded original 'cybersecurity_attacks.csv'.")
    # Cap the request at the number of available rows: DataFrame.sample
    # raises ValueError when n exceeds len(df) and sampling is without replacement.
    df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED).copy()
except FileNotFoundError:
    print("❌ Error: 'cybersecurity_attacks.csv' not found.")
    df = None

if df is not None:
    # Define features for the autoencoder
    numerical_features = ['Packet Length', 'Source Port', 'Destination Port']
    categorical_features = ['Protocol', 'Packet Type', 'Traffic Type', 'Attack Type', 'Action Taken', 'Severity Level']

    # Only rows with every feature present are fed to the model; the original
    # index is kept so the scores can be aligned back onto `df` later.
    df_model = df.dropna(subset=numerical_features + categorical_features).copy()

    # Preprocessing: numeric columns are scaled to [0, 1], categorical columns
    # are one-hot encoded, so every input value lies in [0, 1].
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])
    X_processed = preprocessor.fit_transform(df_model)
    # ColumnTransformer may hand back a sparse matrix; Keras needs a dense array.
    if hasattr(X_processed, "toarray"):
        X_processed = X_processed.toarray()

    # Build and train the Autoencoder (64 -> 32 -> 14 -> 32 -> 64 hidden units).
    print("🤖 Training autoencoder to generate new scores...")
    input_dim = X_processed.shape[1]
    input_layer = Input(shape=(input_dim,))
    encoder = Dense(64, activation="relu")(input_layer)
    encoder = Dense(32, activation="relu")(encoder)
    encoder = Dense(14, activation="relu")(encoder)
    decoder = Dense(32, activation="relu")(encoder)
    decoder = Dense(64, activation="relu")(decoder)
    # Sigmoid output matches the [0, 1] range of the preprocessed inputs.
    output_layer = Dense(input_dim, activation='sigmoid')(decoder)
    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer='adam', loss='mae')
    autoencoder.fit(X_processed, X_processed, epochs=20, batch_size=32, shuffle=True, verbose=0)
    print("✅ Autoencoder training complete.")

    # Per-row mean absolute reconstruction error is the autoencoder's anomaly score.
    predictions = autoencoder.predict(X_processed)
    df_model['Autoencoder_Anomaly_Score'] = np.mean(np.abs(X_processed - predictions), axis=1)

    # --- Part 2: Combine the Scores ---
    print("\n🔄 Normalizing and combining scores...")
    df = df.merge(df_model[['Autoencoder_Anomaly_Score']], left_index=True, right_index=True, how='left')
    # Rows lacking either score cannot be combined; .copy() keeps later column
    # assignments from acting on a view of the merged frame.
    df = df.dropna(subset=['Anomaly Scores', 'Autoencoder_Anomaly_Score']).copy()
    # Each score gets its own scaler so both end up on a comparable [0, 1] scale;
    # ravel() turns the (n, 1) result into a plain 1-D column.
    df['Original_Scaled_Score'] = MinMaxScaler().fit_transform(df[['Anomaly Scores']]).ravel()
    df['Autoencoder_Scaled_Score'] = MinMaxScaler().fit_transform(df[['Autoencoder_Anomaly_Score']]).ravel()
    df['Combined_Score'] = (df['Original_Scaled_Score'] + df['Autoencoder_Scaled_Score']) / 2
    print("✅ Combined_Score created.")

    # --- Part 3: Classify Logs Based on a Threshold ---
    print("\n🔎 Setting a threshold to classify threats...")

    # Define what percentage of data to classify as a threat (e.g., top 5%)
    # You can tune this value (e.g., 0.01 for top 1%, 0.10 for top 10%)
    anomaly_percentage = 0.05

    # Calculate the threshold based on the percentile
    threshold = df['Combined_Score'].quantile(1 - anomaly_percentage)

    print(f"Threshold set at the {100 * (1 - anomaly_percentage):.0f}th percentile: {threshold:.4f}")

    # Classify based on the threshold (vectorised; strictly-greater comparison)
    df['Classification'] = np.where(df['Combined_Score'] > threshold, 'Threat', 'Not a Threat')

    # --- Display Final Results ---
    print("\n--- Classification Results ---")
    print(df['Classification'].value_counts())

    print("\n--- Top 5 Classified Threats ---")
    display_cols = ['Attack Type', 'Combined_Score', 'Classification']
    threats_df = df[df['Classification'] == 'Threat'].sort_values(by='Combined_Score', ascending=False)
    print(threats_df[display_cols].head().to_string())

    # Save the final results to a new file
    df.to_csv('cybersecurity_attacks_with_classifications.csv', index=False)
    print("\n💾 Results saved to 'cybersecurity_attacks_with_classifications.csv'")

In [None]:
# Required libraries: pandas, scikit-learn, tensorflow
# You can install them using pip:
# pip install pandas scikit-learn tensorflow

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# --- Part 1: Generate Scores from the Autoencoder ---
# Tunable settings for this cell, gathered in one place.
# The path matches the location the other cells of this notebook read from.
DATA_PATH = '../datasets/cybersecurity_attacks.csv'
SAMPLE_SIZE = 10000  # maximum number of rows drawn from the CSV
SEED = 42            # shared by the row sample and the model initialisation

# Seed the random generators used for weight initialisation and batch
# shuffling so that re-running the cell is repeatable as far as the random
# generators are concerned.
tf.keras.utils.set_random_seed(SEED)

try:
    df = pd.read_csv(DATA_PATH)
    print("📄 Successfully loaded original 'cybersecurity_attacks.csv'.")
    # Cap the request at the number of available rows: DataFrame.sample
    # raises ValueError when n exceeds len(df) and sampling is without replacement.
    df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED).copy()
except FileNotFoundError:
    print("❌ Error: 'cybersecurity_attacks.csv' not found.")
    df = None

if df is not None:
    # Define features for the autoencoder
    numerical_features = ['Packet Length', 'Source Port', 'Destination Port']
    categorical_features = ['Protocol', 'Packet Type', 'Traffic Type', 'Attack Type', 'Action Taken', 'Severity Level']

    # Only rows with every feature present are fed to the model; the original
    # index is kept so the scores can be aligned back onto `df` later.
    df_model = df.dropna(subset=numerical_features + categorical_features).copy()

    # Preprocessing: numeric columns are scaled to [0, 1], categorical columns
    # are one-hot encoded, so every input value lies in [0, 1].
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])
    X_processed = preprocessor.fit_transform(df_model)
    # ColumnTransformer may hand back a sparse matrix; Keras needs a dense array.
    if hasattr(X_processed, "toarray"):
        X_processed = X_processed.toarray()

    # Build and train the Autoencoder (64 -> 32 -> 14 -> 32 -> 64 hidden units).
    print("🤖 Training autoencoder to generate new scores...")
    input_dim = X_processed.shape[1]
    input_layer = Input(shape=(input_dim,))
    encoder = Dense(64, activation="relu")(input_layer)
    encoder = Dense(32, activation="relu")(encoder)
    encoder = Dense(14, activation="relu")(encoder)
    decoder = Dense(32, activation="relu")(encoder)
    decoder = Dense(64, activation="relu")(decoder)
    # Sigmoid output matches the [0, 1] range of the preprocessed inputs.
    output_layer = Dense(input_dim, activation='sigmoid')(decoder)
    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer='adam', loss='mae')
    autoencoder.fit(X_processed, X_processed, epochs=20, batch_size=32, shuffle=True, verbose=0)
    print("✅ Autoencoder training complete.")

    # Per-row mean absolute reconstruction error is the autoencoder's anomaly score.
    predictions = autoencoder.predict(X_processed)
    df_model['Autoencoder_Anomaly_Score'] = np.mean(np.abs(X_processed - predictions), axis=1)

    # --- Part 2: Combine the Scores ---
    print("\n🔄 Normalizing and combining scores...")
    df = df.merge(df_model[['Autoencoder_Anomaly_Score']], left_index=True, right_index=True, how='left')
    # Rows lacking either score cannot be combined; .copy() keeps later column
    # assignments from acting on a view of the merged frame.
    df = df.dropna(subset=['Anomaly Scores', 'Autoencoder_Anomaly_Score']).copy()
    # Each score gets its own scaler so both end up on a comparable [0, 1] scale;
    # ravel() turns the (n, 1) result into a plain 1-D column.
    df['Original_Scaled_Score'] = MinMaxScaler().fit_transform(df[['Anomaly Scores']]).ravel()
    df['Autoencoder_Scaled_Score'] = MinMaxScaler().fit_transform(df[['Autoencoder_Anomaly_Score']]).ravel()
    df['Combined_Score'] = (df['Original_Scaled_Score'] + df['Autoencoder_Scaled_Score']) / 2
    print("✅ Combined_Score created.")

    # --- Part 3: Classify Threats into Levels Based on Quantiles ---
    print("\n🔎 Classifying threats into levels based on score quantiles...")

    # Define the percentile cutoffs for each threat level
    level_3_threshold = df['Combined_Score'].quantile(0.99) # Top 1%
    level_2_threshold = df['Combined_Score'].quantile(0.95) # Top 5%
    level_1_threshold = df['Combined_Score'].quantile(0.90) # Top 10%

    print(f"Level 3 Threat threshold (Top 1%): > {level_3_threshold:.4f}")
    print(f"Level 2 Threat threshold (Top 5%): > {level_2_threshold:.4f}")
    print(f"Level 1 Threat threshold (Top 10%): > {level_1_threshold:.4f}")

    # Conditions are listed from most to least severe: np.select assigns the
    # first matching level, so a score above the 99th percentile is labelled
    # Level 3 even though it also exceeds the lower cutoffs.
    conditions = [
        df['Combined_Score'] > level_3_threshold,
        df['Combined_Score'] > level_2_threshold,
        df['Combined_Score'] > level_1_threshold
    ]
    levels = ['Level 3 Threat (Critical)', 'Level 2 Threat (High)', 'Level 1 Threat (Medium)']

    # Use numpy.select for efficient conditional assignment
    df['Threat_Level'] = np.select(conditions, levels, default='Not a Threat')

    # --- Display Final Results ---
    print("\n--- Threat Level Distribution ---")
    print(df['Threat_Level'].value_counts())

    print("\n--- Sample of Highest-Level Threats ---")
    display_cols = ['Attack Type', 'Combined_Score', 'Threat_Level']
    # Show the top threats from any level, sorted by score
    threats_df = df[df['Threat_Level'] != 'Not a Threat'].sort_values(by='Combined_Score', ascending=False)
    print(threats_df[display_cols].head().to_string())

    # Save the final results to a new file
    df.to_csv('cybersecurity_attacks_with_threat_levels.csv', index=False)
    print("\n💾 Results saved to 'cybersecurity_attacks_with_threat_levels.csv'")

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import os

def hex_to_int(val):
    """Convert a decimal or hexadecimal value to an integer, or NaN on failure.

    Args:
        val: A number, or a string holding a decimal integer or a hex literal
            with a ``0x``/``0X`` prefix. Surrounding whitespace and a leading
            sign are accepted for strings.

    Returns:
        The parsed ``int``, or ``np.nan`` when the value cannot be converted
        (non-numeric text, ``None``, NaN, or an infinite float).
    """
    try:
        if isinstance(val, str):
            text = val.strip()
            # int(text, 16) itself accepts an optional sign and either prefix
            # case, so only the detection needs to be case/sign-insensitive.
            if text.lower().lstrip("+-").startswith("0x"):
                return int(text, 16)
            return int(text)
        return int(val)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float('inf')), which would otherwise
        # escape and abort the caller's whole column conversion.
        return np.nan

def load_and_preprocess_unsupervised(csv_path, feature_cols, max_rows=10000):
    """Load one CSV and return numeric feature data for unsupervised scoring.

    The file is read, down-sampled to at most ``max_rows`` rows (fixed seed),
    and every feature column is converted with ``hex_to_int``. Rows with a
    missing or unconvertible feature value are dropped.

    Args:
        csv_path: Path of the CSV file to read.
        feature_cols: Names of the columns used as model features.
        max_rows: Maximum number of rows to keep before cleaning.

    Returns:
        ``(df, X)`` where ``df`` is the cleaned DataFrame (feature columns as
        float32) and ``X`` is the feature matrix as a NumPy array. Returns
        ``(None, None)`` if the file cannot be loaded or processed, or if no
        rows survive cleaning; the reason is printed.
    """
    try:
        df = pd.read_csv(csv_path)
        print(f"  📄 Loaded {len(df)} rows from {csv_path}")

        # Report absent columns by name instead of surfacing a bare KeyError.
        missing_cols = [col for col in feature_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"missing feature columns {missing_cols}")

        if len(df) > max_rows:
            df = df.sample(n=max_rows, random_state=42)
            print(f"  📉 Sampled down to {max_rows} rows")

        # Work on an explicit copy so the column assignments below never
        # target a view of the sampled/filtered frame.
        df = df.dropna(subset=feature_cols).copy()
        for col in feature_cols:
            df[col] = df[col].apply(hex_to_int)
        # hex_to_int yields NaN for unparseable values; drop those rows too.
        df = df.dropna(subset=feature_cols).copy()
        df[feature_cols] = df[feature_cols].astype(np.float32)

        # An empty matrix cannot be scaled or trained on, so report it here
        # through the same (None, None) channel callers already check.
        if df.empty:
            print(f"  ❌ Error in unsupervised loading for {csv_path}: no usable rows after cleaning")
            return None, None

        print(f"  ✅ Final unsupervised dataset: {len(df)} rows")
        return df, df[feature_cols].values
    except Exception as e:
        # Deliberately broad: one unreadable dataset should be skipped by the
        # caller rather than abort processing of the remaining datasets.
        print(f"  ❌ Error in unsupervised loading for {csv_path}: {e}")
        return None, None

def train_autoencoder(X_train_scaled, epochs=20, batch_size=32, output_activation=None):
    """Train a small dense autoencoder to reconstruct its own input.

    The network has three hidden layers whose widths are derived from the
    number of input features (0.75x, 0.5x, 0.75x, with floors of 16, 8 and 16
    units) and is trained with the Adam optimizer on mean absolute error.

    Args:
        X_train_scaled: 2-D array-like of scaled training features
            (rows = samples, columns = features).
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.
        output_activation: Activation of the reconstruction layer. When
            ``None`` it is chosen from the data: ``'sigmoid'`` if every value
            lies in [0, 1], otherwise ``'linear'``, because a sigmoid output
            cannot reproduce values outside [0, 1] (e.g. standardised data).

    Returns:
        The trained Keras ``Model``.

    Raises:
        ValueError: If the input is not a non-empty 2-D array.
    """
    X = np.asarray(X_train_scaled)
    if X.ndim != 2 or X.shape[0] == 0 or X.shape[1] == 0:
        raise ValueError(
            f"train_autoencoder expects a non-empty 2-D array, got shape {X.shape}"
        )

    if output_activation is None:
        # Small tolerance absorbs floating-point rounding from min-max scaling.
        tol = 1e-6
        in_unit_range = bool(np.nanmin(X) >= -tol and np.nanmax(X) <= 1 + tol)
        output_activation = 'sigmoid' if in_unit_range else 'linear'

    print("  🤖 Training autoencoder...")
    input_dim = X.shape[1]
    outer_units = max(16, int(input_dim * 0.75))
    bottleneck_units = max(8, int(input_dim * 0.5))

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(outer_units, activation="relu")(input_layer)
    encoder = Dense(bottleneck_units, activation="relu")(encoder)
    decoder = Dense(outer_units, activation="relu")(encoder)
    output_layer = Dense(input_dim, activation=output_activation)(decoder)

    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer='adam', loss='mae')
    # Input and target are the same array: the model learns to reconstruct it.
    autoencoder.fit(X, X, epochs=epochs, batch_size=batch_size, shuffle=True, verbose=0)
    print("  ✅ Autoencoder training complete.")
    return autoencoder

# === CONFIGURATION ===
# One entry per dataset. "unsupervised" entries read a single CSV from "path";
# "supervised" entries train on "good_path" and score "good_path" + "bad_path".
# "features" lists the model input columns, "out_csv" is the result file.
csv_list = [
    {
        "type": "unsupervised",
        "path": "../datasets/cybersecurity_attacks.csv",
        "features": ["Packet Length", "Source Port", "Destination Port"],
        "out_csv": "classified_cybersecurity_attacks.csv"
    },
    {
        "type": "supervised", # This dataset uses the special supervised method
        "good_path": "../datasets/2good_reqff.csv",
        "bad_path": "../datasets/2bad_reqff.csv",
        "features": ["path_length", "body_length", "badwords_count"],
        "out_csv": "classified_goodbad_requests.csv"
    },
    {
        "type": "unsupervised",
        "path": "../datasets/wls_day-02.csv",
        "features": ["ProcessID", "ParentProcessID", "EventID"],
        "out_csv": "classified_wls_events.csv"
    },
    {
        "type": "unsupervised",
        "path": "../datasets/netflow_day-02.csv",
        "features": ["Duration", "SrcPackets", "DstPackets", "SrcBytes", "DstBytes"],
        "out_csv": "classified_netflow.csv"
    }
]

# Score quantile cutoffs, ordered from most to least severe. np.select picks
# the first matching condition, so the order determines the assigned level.
THREAT_LEVELS = [
    (0.99, 'Level 3 Threat (Critical)'),
    (0.95, 'Level 2 Threat (High)'),
    (0.90, 'Level 1 Threat (Medium)'),
]
PIPELINE_SEED = 42

# === EXECUTION PIPELINE ===
print("🚀 Starting Hybrid Anomaly Classification Pipeline\n")

# Seed the random generators used for weight initialisation and batch
# shuffling so a fresh run of the pipeline is repeatable as far as the
# random generators are concerned.
tf.keras.utils.set_random_seed(PIPELINE_SEED)

# Labels of datasets that could not be processed, for the final summary.
skipped_datasets = []

for i, config in enumerate(csv_list, 1):
    source_label = config.get('path') or config.get('good_path')
    print(f"--- [⚙️] Processing Dataset {i}/{len(csv_list)}: {source_label} ---")

    df = None
    dataset_type = config.get("type")

    if dataset_type == "supervised":
        try:
            good_df = pd.read_csv(config['good_path'])
            bad_df = pd.read_csv(config['bad_path'])
            features = config['features']

            # The autoencoder is trained on known-good requests only ...
            X_train_good = good_df[features].values

            # ... and then scores good and bad requests together.
            df = pd.concat([good_df, bad_df], ignore_index=True)
            X_all = df[features].values

            # The scaler is fitted on the good data and reused for all rows.
            # NOTE(review): StandardScaler output is not confined to [0, 1];
            # confirm the autoencoder's output activation can represent it.
            scaler = StandardScaler()
            X_train_good_scaled = scaler.fit_transform(X_train_good)
            X_all_scaled = scaler.transform(X_all)

            ae = train_autoencoder(X_train_good_scaled)
            reconstructions = ae.predict(X_all_scaled)
            # Per-row mean absolute reconstruction error is the anomaly score.
            df['Final_Score'] = np.mean(np.abs(X_all_scaled - reconstructions), axis=1)
            print("  ✅ Generated scores using supervised method.")

        except Exception as e:
            print(f"  ❌ Error in supervised loading: {e}")
            skipped_datasets.append(source_label)
            continue

    elif dataset_type == "unsupervised":
        df, X_features = load_and_preprocess_unsupervised(config['path'], config['features'])
        # An empty frame cannot be scaled or trained on, so treat it like a
        # failed load.
        if df is None or len(df) == 0:
            print(f"--- ⏭️  Skipping {config['path']} ---\n")
            skipped_datasets.append(source_label)
            continue

        # Same policy as the supervised branch: a failure in one dataset is
        # reported and the loop moves on to the next one.
        try:
            X_scaled = MinMaxScaler().fit_transform(X_features)
            ae = train_autoencoder(X_scaled)
            predictions = ae.predict(X_scaled)
            df['Final_Score'] = np.mean(np.abs(X_scaled - predictions), axis=1)
            print("  ✅ Generated scores using unsupervised method.")
        except Exception as e:
            print(f"  ❌ Error in unsupervised scoring: {e}")
            skipped_datasets.append(source_label)
            continue

    else:
        # Previously an unrecognised type was skipped without any message.
        print(f"  ❌ Unknown dataset type {dataset_type!r}; expected 'supervised' or 'unsupervised'.")
        print(f"--- ⏭️  Skipping {source_label} ---\n")
        skipped_datasets.append(source_label)
        continue

    # --- Classify into threat levels based on the Final_Score ---
    print("  🔎 Classifying threats into levels...")
    scores = df['Final_Score']
    conditions = [scores > scores.quantile(q) for q, _ in THREAT_LEVELS]
    levels = [label for _, label in THREAT_LEVELS]
    df['Threat_Level'] = np.select(conditions, levels, default='Not a Threat')

    print("\n  --- Threat Level Distribution ---")
    print(df['Threat_Level'].value_counts())
    df.to_csv(config['out_csv'], index=False)
    print(f"\n  💾 Results saved to {config['out_csv']}")
    print("\n--- ✅ Completed processing ---\n")

# Only claim success when every configured dataset was actually processed.
if skipped_datasets:
    print(f"⚠️ Pipeline finished with {len(skipped_datasets)} of {len(csv_list)} dataset(s) skipped: {skipped_datasets}")
else:
    print("🎉🎉 Pipeline finished successfully! 🎉🎉")

🚀 Starting Hybrid Anomaly Classification Pipeline

--- [⚙️] Processing Dataset 1/4: ../datasets/cybersecurity_attacks.csv ---
  📄 Loaded 40000 rows from ../datasets/cybersecurity_attacks.csv
  📉 Sampled down to 10000 rows
  ✅ Final unsupervised dataset: 10000 rows
  🤖 Training autoencoder...
  ✅ Autoencoder training complete.
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 671us/step
  ✅ Generated scores using unsupervised method.
  🔎 Classifying threats into levels...

  --- Threat Level Distribution ---
Threat_Level
Not a Threat                 9000
Level 1 Threat (Medium)       500
Level 2 Threat (High)         400
Level 3 Threat (Critical)     100
Name: count, dtype: int64

  💾 Results saved to classified_cybersecurity_attacks.csv

--- ✅ Completed processing ---

--- [⚙️] Processing Dataset 2/4: ../datasets/2good_reqff.csv ---
  🤖 Training autoencoder...
  ✅ Autoencoder training complete.
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 845us/step