In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb
from sklearn.feature_selection import SelectKBest, f_classif



import warnings
import time
import os
from tqdm import tqdm

warnings.filterwarnings('ignore')

# Create a directory for saving visualizations
if not os.path.exists('visualizations'):
    os.makedirs('visualizations')

# Define column names for KDD Cup 1999 dataset
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'class'
]

# Load the dataset (assuming you've downloaded it)
print("Loading dataset...")
# Use a smaller subset for faster processing (10% of the data)
data = pd.read_csv('kddcup.data_10_percent', header=None, names=columns)
print(f"Dataset loaded with shape: {data.shape}")

# For full dataset, uncomment the following:
# data = pd.read_csv('kddcup.data', header=None, names=columns)


Loading dataset...
Dataset loaded with shape: (494021, 42)


In [2]:
# Basic data exploration
print("\nData Overview:")
print(data.head())
print("\nData Info:")
print(data.info())
print("\nData Statistics:")
print(data.describe())

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

# Class distribution
print("\nClass Distribution:")
class_counts = data['class'].value_counts()
print(class_counts)

# Visualize class distribution
plt.figure(figsize=(12, 6))
sns.countplot(y='class', data=data, order=data['class'].value_counts().index)
plt.title('Distribution of Attack Types')
plt.tight_layout()
plt.savefig('visualizations/attack_distribution.png')
plt.close()

# Convert attack types to binary classification (normal vs attack)
data['binary_class'] = data['class'].apply(lambda x: 0 if x == 'normal.' else 1)

# Visualize binary class distribution
plt.figure(figsize=(10, 6))
sns.countplot(x='binary_class', data=data)
plt.title('Binary Class Distribution (0: Normal, 1: Attack)')
plt.xticks([0, 1], ['Normal', 'Attack'])
plt.savefig('visualizations/binary_class_distribution.png')
plt.close()

# Analyze categorical features
categorical_cols = ['protocol_type', 'service', 'flag']
for col in categorical_cols:
    plt.figure(figsize=(12, 6))
    top_categories = data[col].value_counts().head(10).index
    sns.countplot(y=col, data=data[data[col].isin(top_categories)], 
                 hue='binary_class', palette='viridis')
    plt.title(f'Top 10 {col} by Class')
    plt.tight_layout()
    plt.savefig(f'visualizations/{col}_distribution.png')
    plt.close()

# Analyze numerical features
numerical_cols = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
for col in numerical_cols:
    plt.figure(figsize=(12, 6))
    
    # Log transform for better visualization
    data[f'log_{col}'] = np.log1p(data[col])
    
    # Plot histograms
    plt.subplot(1, 2, 1)
    sns.histplot(data=data, x=f'log_{col}', hue='binary_class', bins=50, kde=True)
    plt.title(f'Log Distribution of {col} by Class')
    
    # Plot boxplots
    plt.subplot(1, 2, 2)
    sns.boxplot(x='binary_class', y=f'log_{col}', data=data)
    plt.title(f'Log {col} by Class')
    
    plt.tight_layout()
    plt.savefig(f'visualizations/{col}_analysis.png')
    plt.close()
    
    # Remove log-transformed column to avoid using it in modeling
    data.drop(f'log_{col}', axis=1, inplace=True)

# Correlation analysis
numeric_data = data.select_dtypes(include=[np.number])
plt.figure(figsize=(20, 16))
correlation = numeric_data.corr()
mask = np.triu(correlation)
sns.heatmap(correlation, annot=False, mask=mask, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.savefig('visualizations/correlation_matrix.png')
plt.close()

# Top correlated features with the target
target_corr = correlation['binary_class'].sort_values(ascending=False)
print("\nTop Correlated Features with Target:")
print(target_corr.head(10))

plt.figure(figsize=(12, 8))
sns.barplot(x=target_corr.head(15).index, y=target_corr.head(15).values)
plt.title('Top 15 Features Correlated with Target')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('visualizations/top_correlated_features.png')
plt.close()



Data Overview:
   duration protocol_type service flag  src_bytes  dst_bytes  land  \
0         0           tcp    http   SF        181       5450     0   
1         0           tcp    http   SF        239        486     0   
2         0           tcp    http   SF        235       1337     0   
3         0           tcp    http   SF        219       1337     0   
4         0           tcp    http   SF        217       2032     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                   9   
1               0       0    0  ...                  19   
2               0       0    0  ...                  29   
3               0       0    0  ...                  39   
4               0       0    0  ...                  49   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                     1.0                     0.0   
1                     1.0                     0.0   
2                     1.0                     0.0

In [3]:
# Separate features and target
X = data.drop(['class', 'binary_class'], axis=1)
y = data['binary_class']

# Handle categorical features
categorical_cols = ['protocol_type', 'service', 'flag']
label_encoders = {}

for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

# Feature scaling
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=[np.number]).columns
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Feature selection
print("\nPerforming feature selection...")
selector = SelectKBest(f_classif, k=25)  # Select top 25 features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get selected feature names
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]
print("Selected features:", selected_features.tolist())

# Visualize feature importance scores
plt.figure(figsize=(12, 8))
scores = selector.scores_
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': scores})
feature_scores = feature_scores.sort_values('Score', ascending=False).head(25)
sns.barplot(x='Score', y='Feature', data=feature_scores)
plt.title('Top 25 Features by F-Score')
plt.tight_layout()
plt.savefig('visualizations/feature_importance.png')
plt.close()

# Apply SMOTE for handling class imbalance
print("\nApplying SMOTE for class balancing...")
smote = SMOTE(random_state=42)
X_train_selected_resampled, y_train_resampled = smote.fit_resample(X_train_selected, y_train)
print(f"After SMOTE - Training set shape: {X_train_selected_resampled.shape}")

# Class distribution after SMOTE
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

# Dimensionality reduction for visualization
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_selected)
X_test_pca = pca.transform(X_test_selected)

# Visualize PCA results
plt.figure(figsize=(12, 10))
plt.scatter(X_train_pca[y_train == 0, 0], X_train_pca[y_train == 0, 1], 
           alpha=0.5, label='Normal', s=5)
plt.scatter(X_train_pca[y_train == 1, 0], X_train_pca[y_train == 1, 1], 
           alpha=0.5, label='Attack', s=5)
plt.title('PCA: 2D Projection of the Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.savefig('visualizations/pca_visualization.png')
plt.close()

print("\nData preprocessing completed.")


Training set shape: (395216, 41), Test set shape: (98805, 41)

Performing feature selection...
Selected features: ['duration', 'protocol_type', 'service', 'flag', 'dst_bytes', 'wrong_fragment', 'logged_in', 'num_file_creations', 'num_access_files', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate']

Applying SMOTE for class balancing...
After SMOTE - Training set shape: (634788, 25)
Class distribution after SMOTE:
binary_class
1    317394
0    317394
Name: count, dtype: int64

Data preprocessing completed.


In [4]:
# Define a function to evaluate models
from sklearn.ensemble import VotingClassifier
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    start_time = time.time()
    y_pred = model.predict(X_test)
    inference_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Training time: {train_time:.2f} seconds")
    print(f"Inference time: {inference_time:.2f} seconds")
    print("Classification Report:")
    print(report)
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'visualizations/confusion_matrix_{model_name}.png')
    plt.close()
    
    # ROC Curve
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)
        
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend(loc="lower right")
        plt.savefig(f'visualizations/roc_curve_{model_name}.png')
        plt.close()
        
        # Precision-Recall Curve
        precision, recall, _ = precision_recall_curve(y_test, y_prob)
        plt.figure(figsize=(8, 6))
        plt.plot(recall, precision, lw=2)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'Precision-Recall Curve - {model_name}')
        plt.savefig(f'visualizations/pr_curve_{model_name}.png')
        plt.close()
    
    return model, accuracy, train_time, inference_time

# Train and evaluate individual models first
print("\nTraining and evaluating individual models...")
base_models = {
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5, 
                                           random_state=42, n_jobs=-1, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=5, 
                                                   random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=5, 
                                random_state=42, n_jobs=-1, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': lgb.LGBMClassifier(n_estimators=150, learning_rate=0.1, max_depth=5, 
                                  random_state=42, n_jobs=-1),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, alpha=0.0001,
                                   learning_rate='adaptive', random_state=42)
}

results = {}
trained_models = {}

for name, model in base_models.items():
    print(f"\nTraining {name}...")
    trained_model, accuracy, train_time, inference_time = evaluate_model(
        model, X_train_selected_resampled, X_test_selected, y_train_resampled, y_test, name
    )
    results[name] = {
        'model': trained_model,
        'accuracy': accuracy,
        'train_time': train_time,
        'inference_time': inference_time
    }
    trained_models[name] = trained_model

# Create a hybrid model using voting (combines predictions from multiple models)
print("\nTraining Voting Classifier (Hybrid Model 1)...")
voting_clf = VotingClassifier(
    estimators=[
        ('rf', trained_models['Random Forest']),
        ('gb', trained_models['Gradient Boosting']),
        ('xgb', trained_models['XGBoost']),
        ('lgb', trained_models['LightGBM']),
        ('nn', trained_models['Neural Network'])
    ],
    voting='soft'  # Use probability estimates for voting
)

voting_model, voting_accuracy, voting_train_time, voting_inference_time = evaluate_model(
    voting_clf, X_train_selected_resampled, X_test_selected, y_train_resampled, y_test, "Voting Ensemble"
)

results["Voting Ensemble"] = {
    'model': voting_model,
    'accuracy': voting_accuracy,
    'train_time': voting_train_time,
    'inference_time': voting_inference_time
}

# Create a weighted voting ensemble (gives more weight to better performing models)
print("\nTraining Weighted Voting Classifier (Hybrid Model 2)...")
# Calculate weights based on individual model accuracies
weights = [results[name]['accuracy'] for name in base_models.keys()]
weighted_voting_clf = VotingClassifier(
    estimators=[
        ('rf', trained_models['Random Forest']),
        ('gb', trained_models['Gradient Boosting']),
        ('xgb', trained_models['XGBoost']),
        ('lgb', trained_models['LightGBM']),
        ('nn', trained_models['Neural Network'])
    ],
    voting='soft',
    weights=weights  # Weight by accuracy
)

weighted_voting_model, weighted_voting_accuracy, weighted_voting_train_time, weighted_voting_inference_time = evaluate_model(
    weighted_voting_clf, X_train_selected_resampled, X_test_selected, y_train_resampled, y_test, "Weighted Voting Ensemble"
)

results["Weighted Voting Ensemble"] = {
    'model': weighted_voting_model,
    'accuracy': weighted_voting_accuracy,
    'train_time': weighted_voting_train_time,
    'inference_time': weighted_voting_inference_time
}

# Create a stacked ensemble (uses predictions from base models as features for a meta-model)
from sklearn.ensemble import StackingClassifier

print("\nTraining Stacking Classifier (Hybrid Model 3)...")
stacking_clf = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
        ('gb', GradientBoostingClassifier(n_estimators=150, random_state=42)),
        ('xgb', xgb.XGBClassifier(n_estimators=150, random_state=42, n_jobs=-1, 
                                 use_label_encoder=False, eval_metric='logloss')),
        ('lgb', lgb.LGBMClassifier(n_estimators=150, random_state=42, n_jobs=-1))
    ],
    final_estimator=xgb.XGBClassifier(n_estimators=100, learning_rate=0.05, random_state=42),
    cv=5  # 5-fold cross-validation
)

stacking_model, stacking_accuracy, stacking_train_time, stacking_inference_time = evaluate_model(
    stacking_clf, X_train_selected_resampled, X_test_selected, y_train_resampled, y_test, "Stacking Ensemble"
)

results["Stacking Ensemble"] = {
    'model': stacking_model,
    'accuracy': stacking_accuracy,
    'train_time': stacking_train_time,
    'inference_time': stacking_inference_time
}

# Create a custom hybrid model (combines the best features of multiple approaches)
print("\nTraining Custom Hybrid Model...")

# First level: Train base models and get their predictions
def get_oof_predictions(models, X, y, X_test, cv=5):
    """Generate out-of-fold predictions for training data and predictions for test data"""
    oof_train = np.zeros((X.shape[0], len(models)))
    oof_test = np.zeros((X_test.shape[0], len(models)))
    
    # For each model
    for i, (name, model) in enumerate(models.items()):
        print(f"  Generating OOF predictions for {name}...")
        # Create KFold object
        kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
        
        # For each fold
        for train_index, val_index in kf.split(X, y):
            # Split data
            X_train_fold, X_val_fold = X[train_index], X[val_index]
            y_train_fold = y[train_index]
            
            # Train model
            model.fit(X_train_fold, y_train_fold)
            
            # Predict on validation fold
            if hasattr(model, 'predict_proba'):
                oof_train[val_index, i] = model.predict_proba(X_val_fold)[:, 1]
            else:
                oof_train[val_index, i] = model.predict(X_val_fold)
        
        # Train model on full data and predict on test
        model.fit(X, y)
        if hasattr(model, 'predict_proba'):
            oof_test[:, i] = model.predict_proba(X_test)[:, 1]
        else:
            oof_test[:, i] = model.predict(X_test)
    
    return oof_train, oof_test

# Define base models for the hybrid approach
hybrid_base_models = {
    'RF': RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1),
    'XGB': xgb.XGBClassifier(n_estimators=150, max_depth=5, learning_rate=0.1, random_state=42, 
                           use_label_encoder=False, eval_metric='logloss'),
    'LGBM': lgb.LGBMClassifier(n_estimators=150, max_depth=5, learning_rate=0.1, random_state=42)
}

# Convert to numpy arrays for faster processing
X_train_np = X_train_selected_resampled
y_train_np = y_train_resampled.values if hasattr(y_train_resampled, 'values') else y_train_resampled
X_test_np = X_test_selected

# Generate out-of-fold predictions
oof_train, oof_test = get_oof_predictions(hybrid_base_models, X_train_np, y_train_np, X_test_np)

# Create a meta-model using the out-of-fold predictions
meta_model = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.05, random_state=42,
                             use_label_encoder=False, eval_metric='logloss')
meta_model.fit(oof_train, y_train_np)

# Make final predictions
meta_preds = meta_model.predict(oof_test)
meta_accuracy = accuracy_score(y_test, meta_preds)
meta_report = classification_report(y_test, meta_preds)

print("\nCustom Hybrid Model Results:")
print(f"Accuracy: {meta_accuracy:.4f}")
print("Classification Report:")
print(meta_report)

# Confusion Matrix for Custom Hybrid Model
cm = confusion_matrix(y_test, meta_preds)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Custom Hybrid Model')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('visualizations/confusion_matrix_Custom_Hybrid.png')
plt.close()

# ROC Curve for Custom Hybrid Model
if hasattr(meta_model, "predict_proba"):
    meta_probs = meta_model.predict_proba(oof_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, meta_probs)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - Custom Hybrid Model')
    plt.legend(loc="lower right")
    plt.savefig('visualizations/roc_curve_Custom_Hybrid.png')
    plt.close()

results["Custom Hybrid Model"] = {
    'model': meta_model,
    'accuracy': meta_accuracy,
    'train_time': None,  # Not directly measured
    'inference_time': None  # Not directly measured
}

# Compare all models
print("\nModel Comparison:")
accuracies = [results[name]['accuracy'] for name in results.keys()]
model_names = list(results.keys())

plt.figure(figsize=(12, 8))
bars = plt.bar(model_names, accuracies, color='skyblue')
plt.title('Model Accuracy Comparison')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.ylim(min(accuracies) - 0.05, 1.0)

# Add accuracy values on top of bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
            f'{height:.4f}', ha='center', va='bottom', rotation=0)

plt.tight_layout()
plt.savefig('visualizations/model_comparison.png')
plt.close()

# Find the best model
# Find the best model
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
best_accuracy = results[best_model_name]['accuracy']
print(f"\nBest Model: {best_model_name} with accuracy: {best_accuracy:.4f}")

# Save the best model
import joblib
best_model = results[best_model_name]['model']
joblib.dump(best_model, 'best_model.pkl')
print(f"Best model saved as 'best_model.pkl'")

# Create a function for real-time anomaly detection
def detect_anomaly(data_point, model, scaler, label_encoders, selected_features_indices):
    """
    Detect if a data point is an anomaly (attack) or normal
    
    Parameters:
    data_point (dict): Dictionary containing feature values
    model: Trained model
    scaler: Fitted scaler
    label_encoders: Dictionary of label encoders for categorical features
    selected_features_indices: Indices of selected features
    
    Returns:
    tuple: (prediction, probability)
    """
    # Convert to DataFrame
    df = pd.DataFrame([data_point])
    
    # Encode categorical features
    for col, encoder in label_encoders.items():
        if col in df.columns:
            df[col] = encoder.transform(df[col])
    
    # Scale numerical features
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    df[numerical_cols] = scaler.transform(df[numerical_cols])
    
    # Select features
    df_selected = df.iloc[:, selected_features_indices]
    
    # Make prediction
    prediction = model.predict(df_selected)[0]
    
    # Get probability if available
    probability = None
    if hasattr(model, 'predict_proba'):
        probability = model.predict_proba(df_selected)[0][1]
    
    return prediction, probability

# Example of using the anomaly detection function
print("\nExample of real-time anomaly detection:")
# Take a sample from the test set
sample_idx = np.random.randint(0, len(X_test))
sample_data = X_test.iloc[sample_idx].to_dict()
true_label = y_test.iloc[sample_idx] if hasattr(y_test, 'iloc') else y_test[sample_idx]

# Detect anomaly
prediction, probability = detect_anomaly(
    sample_data, 
    best_model, 
    scaler, 
    label_encoders, 
    selected_indices
)




print(f"Sample data: {sample_data}")
print(f"True label: {'Attack' if true_label == 1 else 'Normal'}")
print(f"Prediction: {'Attack' if prediction == 1 else 'Normal'}")
if probability is not None:
    print(f"Attack probability: {probability:.4f}")

# Summary of the analysis
print("\n=== Network Intrusion Detection System Analysis Summary ===")
print(f"Dataset: KDD Cup 1999 (10% subset)")
print(f"Total samples: {len(data)}")
print(f"Normal connections: {len(data[data['binary_class'] == 0])}")
print(f"Attack connections: {len(data[data['binary_class'] == 1])}")
print(f"Number of features used: {len(selected_features)}")
print(f"Best model: {best_model_name}")
print(f"Best model accuracy: {best_accuracy:.4f}")

# Print top attack types
print("\nTop attack types in the dataset:")
attack_counts = data[data['binary_class'] == 1]['class'].value_counts().head(5)
for attack, count in attack_counts.items():
    print(f"  - {attack}: {count} instances")






print("\nAnalysis completed successfully!")




Training and evaluating individual models...

Training Random Forest...

Random Forest Results:
Accuracy: 0.9997
Training time: 69.18 seconds
Inference time: 0.43 seconds
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19456
           1       1.00      1.00      1.00     79349

    accuracy                           1.00     98805
   macro avg       1.00      1.00      1.00     98805
weighted avg       1.00      1.00      1.00     98805


Training Gradient Boosting...

Gradient Boosting Results:
Accuracy: 0.9995
Training time: 692.39 seconds
Inference time: 0.48 seconds
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19456
           1       1.00      1.00      1.00     79349

    accuracy                           1.00     98805
   macro avg       1.00      1.00      1.00     98805
weighted avg       1.00      1.00      1.00     

ValueError: y contains previously unseen labels: -0.8118542882542638

In [6]:
def detect_anomaly(data_point, model, scaler, label_encoders, selected_features_indices):
    """
    Detect if a data point is an anomaly (attack) or normal
    
    Parameters:
    data_point (dict): Dictionary containing feature values
    model: Trained model
    scaler: Fitted scaler
    label_encoders: Dictionary of label encoders for categorical features
    selected_features_indices: Indices of selected features
    
    Returns:
    tuple: (prediction, probability)
    """
    # Convert to DataFrame
    df = pd.DataFrame([data_point])
    
    # Encode categorical features
    for col, encoder in label_encoders.items():
        if col in df.columns:
            try:
                # Handle unseen categories by using try-except
                df[col] = encoder.transform(df[col])
            except ValueError:
                # For unseen categories, assign a default value (e.g., -1)
                # or you could use the most common category from training
                df[col] = -1
    
    # Scale numerical features
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    try:
        df[numerical_cols] = scaler.transform(df[numerical_cols])
    except ValueError:
        # Handle case where numerical features are out of expected range
        # You might want to clip values to the min/max seen during training
        for col in numerical_cols:
            if col in df.columns:
                df[col] = df[col].clip(lower=scaler.data_min_[numerical_cols.get_loc(col)], 
                                       upper=scaler.data_max_[numerical_cols.get_loc(col)])
        df[numerical_cols] = scaler.transform(df[numerical_cols])
    
    # Select features
    # Make sure to handle the case where selected_features_indices might be out of bounds
    if max(selected_features_indices) < df.shape[1]:
        df_selected = df.iloc[:, selected_features_indices]
    else:
        # Handle the case where indices are out of bounds
        valid_indices = [i for i in selected_features_indices if i < df.shape[1]]
        df_selected = df.iloc[:, valid_indices]
        # Fill missing columns with zeros or appropriate default values
        missing_cols = len(selected_features_indices) - len(valid_indices)
        if missing_cols > 0:
            df_selected = pd.concat([df_selected, pd.DataFrame(np.zeros((1, missing_cols)))], axis=1)
    
    # Make prediction
    prediction = model.predict(df_selected)[0]
    
    # Get probability if available
    probability = None
    if hasattr(model, 'predict_proba'):
        probability = model.predict_proba(df_selected)[0][1]
    
    return prediction, probability


In [8]:
# Example of using the anomaly detection function
print("\nExample of real-time anomaly detection:")
# Take a sample from the test set
sample_idx = np.random.randint(0, len(X_test))
sample_data = X_test.iloc[sample_idx].to_dict()
true_label = y_test.iloc[sample_idx] if hasattr(y_test, 'iloc') else y_test[sample_idx]

# Detect anomaly
prediction, probability = detect_anomaly(
    sample_data, 
    best_model, 
    scaler, 
    label_encoders, 
    selected_indices
)




print(f"Sample data: {sample_data}")
print(f"True label: {'Attack' if true_label == 1 else 'Normal'}")
print(f"Prediction: {'Attack' if prediction == 1 else 'Normal'}")
if probability is not None:
    print(f"Attack probability: {probability:.4f}")

# Summary of the analysis
print("\n=== Network Intrusion Detection System Analysis Summary ===")
print(f"Dataset: KDD Cup 1999 (10% subset)")
print(f"Total samples: {len(data)}")
print(f"Normal connections: {len(data[data['binary_class'] == 0])}")
print(f"Attack connections: {len(data[data['binary_class'] == 1])}")
print(f"Number of features used: {len(selected_features)}")
print(f"Best model: {best_model_name}")
print(f"Best model accuracy: {best_accuracy:.4f}")

# Print top attack types
print("\nTop attack types in the dataset:")
attack_counts = data[data['binary_class'] == 1]['class'].value_counts().head(5)
for attack, count in attack_counts.items():
    print(f"  - {attack}: {count} instances")






print("\nAnalysis completed successfully!")




Example of real-time anomaly detection:
Sample data: {'duration': -0.06800307009034158, 'protocol_type': 0.9257808545370252, 'service': 1.593691861421257, 'flag': -3.03482773048322, 'src_bytes': -0.0031112784721511076, 'dst_bytes': -0.026438242020806794, 'land': -0.006558681512232836, 'wrong_fragment': -0.04774947552523989, 'urgent': -0.0028749940231173306, 'hot': -0.04453436318939023, 'num_failed_logins': -0.009641423014058285, 'logged_in': -0.41706846586997653, 'num_compromised': -0.005738098505962911, 'root_shell': -0.009934281761749255, 'su_attempted': -0.004499167168759692, 'num_root': -0.0055905729129599934, 'num_file_creations': -0.011324949787115271, 'num_shells': -0.009381729957126802, 'num_access_files': -0.02799290711955656, 'num_outbound_cmds': 0.0, 'is_host_login': 0.0, 'is_guest_login': -0.03776995724202753, 'count': -0.27747858242840523, 'srv_count': -1.1074497875524787, 'serror_rate': -0.4640605292301331, 'srv_serror_rate': -0.4635137469590858, 'rerror_rate': 4.0580220