# XIDS Feature Importance Analysis Notebook

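This notebook ranks features for intrusion detection using Random Forest importances, univariate selection (ANOVA F-score and mutual information), and correlation with the target, and then recommends feature subsets based on cumulative importance.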

## Contents:
1. Feature Importance from Tree-Based Models
2. Feature Selection Methods
3. Correlation Analysis
4. Feature Importance Visualization
5. Feature Selection Recommendations

## 1. Feature Importance from Tree-Based Models

In [None]:
from functools import partial
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# Plot styling shared by every figure in this notebook
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (14, 7)})

# Data locations (relative paths, resolved against the current working directory)
BACKEND_DIR = Path('../backend')
DATA_DIR = BACKEND_DIR.joinpath('data', 'raw')

# Banner: title line followed by a 50-character rule
print("Feature Importance Analysis for XIDS", "=" * 50, sep="\n")

In [None]:
# Load a sample of the training data; fall back to synthetic data if it cannot be read
train_file = DATA_DIR / 'KDDTrain+.txt'
try:
    # header=None: NSL-KDD's KDDTrain+.txt is distributed without a header row, so the
    # default header handling would silently consume the first record as column names.
    df = pd.read_csv(train_file, header=None, nrows=5000)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
    # Only file-access / parsing failures trigger the fallback; anything else
    # (KeyboardInterrupt, a typo in this cell, ...) still surfaces instead of being hidden.
    print(f"Could not load {train_file} ({type(exc).__name__}: {exc})")
    print("Using synthetic data for demonstration")
    n_samples = 1000
    n_features = 41
    rng = np.random.default_rng(42)  # seeded so the demo output is reproducible
    df = pd.DataFrame(rng.standard_normal((n_samples, n_features)))
    # Add a synthetic label
    df['label'] = rng.choice(['BENIGN', 'Attack'], n_samples)
else:
    # NOTE(review): the standard NSL-KDD layout is 41 features + label + difficulty score
    # (43 columns). The data-preparation cell treats the LAST column as the label, so the
    # trailing difficulty column is dropped when that layout is detected. Confirm against
    # the actual file if it comes from a different source.
    if df.shape[1] == 43:
        df = df.iloc[:, :-1]
    print(f"Data loaded: {df.shape}")

print(f"Dataset shape: {df.shape}")
print(f"Column names (first 5): {list(df.columns[:5])}")

In [None]:
# Prepare data: the last column is treated as the target, every other column as a feature
label_col = df.columns[-1]
X = df.drop(columns=[label_col]).copy()
y = df[label_col]

# The estimators and scorers used later require numeric inputs, so replace every
# non-numeric feature column (e.g. string-valued categorical fields) with integer
# category codes. Missing values are encoded as -1.
non_numeric_cols = [col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col])]
for col in non_numeric_cols:
    X[col] = X[col].astype('category').cat.codes

# Convert labels to numeric if needed. Testing "not numeric" (rather than dtype == 'object')
# also covers pandas' dedicated string dtypes. The encoded labels are kept as a Series that
# shares X's index so that pandas operations such as DataFrame.corrwith accept them.
if not pd.api.types.is_numeric_dtype(y):
    le = LabelEncoder()
    y = pd.Series(le.fit_transform(y), index=X.index, name=label_col)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Classes: {np.unique(y)}")

In [None]:
# Fit a Random Forest on all features and rank them by the fitted model's importances
print("Training Random Forest for feature importance...")
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X, y)

# One row per feature, most important first
importance_table = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
)
feature_importance_rf = importance_table.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features (Random Forest):")
print(feature_importance_rf.head(10))

## 2. Feature Selection Methods

In [None]:
# SelectKBest with F-score
print("SelectKBest Feature Selection (F-score)...")
# Cap k at the number of available features: a fixed k=20 is rejected (or warned about,
# depending on the scikit-learn version) when X has fewer than 20 columns.
k_fscore = min(20, X.shape[1])
select_kbest = SelectKBest(score_func=f_classif, k=k_fscore)
select_kbest.fit(X, y)

# Map the selector's support (as positional indices) back to column names
selected_indices = select_kbest.get_support(indices=True)
selected_features_kbest = X.columns[selected_indices].tolist()

print(f"Selected features: {len(selected_features_kbest)}")
print(f"Features: {selected_features_kbest[:5]}...")

In [None]:
# Mutual Information
print("\nMutual Information Feature Selection...")
# mutual_info_classif perturbs continuous features with small random noise, so its scores
# change from run to run unless random_state is pinned. 42 matches the seed already used
# for the Random Forest in this notebook.
mi_score_func = partial(mutual_info_classif, random_state=42)
# Cap k at the number of available features so small datasets do not break the selector
k_mi = min(20, X.shape[1])
select_mi = SelectKBest(score_func=mi_score_func, k=k_mi)
select_mi.fit(X, y)

# Rank every feature by its MI score (not just the k selected ones)
mi_scores = select_mi.scores_
mi_importance = pd.DataFrame({
    'feature': X.columns,
    'score': mi_scores
}).sort_values('score', ascending=False)

print("\nTop 10 Features (Mutual Information):")
print(mi_importance.head(10))

## 3. Correlation Analysis

In [None]:
# Correlation with target
print("Computing correlations...")

# Select only numeric columns for correlation
numeric_cols = X.select_dtypes(include=[np.number]).columns
X_numeric = X[numeric_cols]

# DataFrame.corrwith accepts only a Series or DataFrame. After label encoding, `y` may be a
# plain NumPy array, which makes corrwith raise a TypeError. Wrap it in a Series that shares
# X's index; values are matched to rows by position.
target = pd.Series(np.asarray(y), index=X_numeric.index, name='target')
if not pd.api.types.is_numeric_dtype(target):
    # Labels that were never encoded (e.g. strings) get integer codes in sorted label order
    target = pd.Series(pd.factorize(target, sort=True)[0], index=X_numeric.index, name='target')

# NOTE: with more than two classes the integer label codes are arbitrary, so these
# correlations are only a rough screen, not a measure of class separability.
if X_numeric.shape[1] > 0:
    correlations = X_numeric.corrwith(target).abs().sort_values(ascending=False)
    print("\nTop 10 Features by Correlation with Target:")
    print(correlations.head(10))
else:
    print("No numeric features for correlation analysis")

## 4. Feature Importance Visualization

In [None]:
# Visualize the feature rankings in a 2x2 grid:
#   [0, 0] top Random Forest importances   [0, 1] histogram of all RF importances
#   [1, 0] top mutual-information scores   [1, 1] cumulative RF importance
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Random Forest - top features
top_n = 15
top_features_rf = feature_importance_rf.head(top_n)
# head() returns fewer than top_n rows when the dataset has fewer features; size the bars
# from the actual row count so barh does not fail on mismatched lengths.
n_rf = len(top_features_rf)
rf_positions = range(n_rf)
axes[0, 0].barh(rf_positions, top_features_rf['importance'], color='steelblue')
axes[0, 0].set_yticks(rf_positions)
axes[0, 0].set_yticklabels(top_features_rf['feature'])
axes[0, 0].set_xlabel('Importance Score')
axes[0, 0].set_title(f'Top {n_rf} Features - Random Forest Importance', fontweight='bold')
axes[0, 0].invert_yaxis()  # most important feature at the top

# 2. Feature importance distribution
axes[0, 1].hist(feature_importance_rf['importance'], bins=30, color='steelblue', edgecolor='black')
axes[0, 1].set_xlabel('Importance Score')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Distribution of Feature Importances', fontweight='bold')

# 3. Mutual Information - top features
if len(mi_importance) > 0:
    top_features_mi = mi_importance.head(top_n)
    n_mi = len(top_features_mi)
    mi_positions = range(n_mi)
    axes[1, 0].barh(mi_positions, top_features_mi['score'], color='coral')
    axes[1, 0].set_yticks(mi_positions)
    axes[1, 0].set_yticklabels(top_features_mi['feature'])
    axes[1, 0].set_xlabel('MI Score')
    axes[1, 0].set_title(f'Top {n_mi} Features - Mutual Information', fontweight='bold')
    axes[1, 0].invert_yaxis()

# 4. Cumulative importance
cumulative_importance = np.cumsum(feature_importance_rf['importance'].values)
# The x-axis is a feature COUNT, so it starts at 1: the first point is the share of
# importance captured by the single best feature.
feature_counts = np.arange(1, len(cumulative_importance) + 1)
axes[1, 1].plot(feature_counts, cumulative_importance, marker='o', linewidth=2)
axes[1, 1].axhline(y=0.8, color='red', linestyle='--', label='80% threshold')
axes[1, 1].axhline(y=0.9, color='orange', linestyle='--', label='90% threshold')
axes[1, 1].set_xlabel('Number of Features')
axes[1, 1].set_ylabel('Cumulative Importance')
axes[1, 1].set_title('Cumulative Feature Importance', fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Visualization complete")

## 5. Feature Selection Recommendations

In [None]:
# Analyze feature importance distribution
importances = feature_importance_rf['importance']
print("Feature Importance Analysis:")
print(f"Total features: {len(feature_importance_rf)}")
print(f"Mean importance: {importances.mean():.6f}")
print(f"Std importance: {importances.std():.6f}")

# Running total over the importance column, in the table's current row order
cumulative = np.cumsum(importances.values)


def _features_needed(threshold):
    """Return how many leading features are needed for `cumulative` to reach `threshold`.

    Uses a binary search, which assumes `cumulative` is non-decreasing (true when all
    importances are non-negative). If no prefix reaches the threshold, e.g. when every
    importance is zero, the total number of features is returned instead of a
    misleading 1. An empty importance table yields 0.
    """
    first_hit = int(np.searchsorted(cumulative, threshold, side='left'))
    return min(first_hit + 1, len(cumulative))


# Find number of features for 80%, 90% and 95% importance
features_80 = _features_needed(0.8)
features_90 = _features_needed(0.9)
features_95 = _features_needed(0.95)

print(f"\nFeatures needed for:")
print(f"  80% importance: {features_80} features")
print(f"  90% importance: {features_90} features")
print(f"  95% importance: {features_95} features")

In [None]:
# Print three candidate feature subsets taken from the Random Forest ranking
ranked_names = feature_importance_rf['feature']
subset_80 = list(ranked_names.head(features_80))
subset_90 = list(ranked_names.head(features_90))
subset_15 = list(ranked_names.head(15))

rule = "=" * 60
print("\n" + rule)
print("FEATURE SELECTION RECOMMENDATIONS")
print(rule)

# Option 1: smallest subset reaching the 80% cumulative-importance mark
print("\n1. For Balanced Performance & Speed:")
print(f"   - Select top {features_80} features")
print("   - Explanation: Captures 80% of feature importance")
print(f"   - Top features: {subset_80}")

# Option 2: subset reaching the 90% cumulative-importance mark
print("\n2. For High Accuracy:")
print(f"   - Select top {features_90} features")
print("   - Explanation: Captures 90% of feature importance")
print(f"   - Top features: {subset_90}")

# Option 3: fixed-size short list
print("\n3. For Model Interpretability:")
print("   - Use top 10-15 features")
print("   - Easier to understand and visualize")
print(f"   - Top 15: {subset_15}")

## Summary

This analysis provides:
1. **Feature Importance Ranking**: Based on Random Forest trained on the dataset
2. **Feature Selection Methods**: Multiple techniques to identify relevant features
3. **Cumulative Analysis**: Shows diminishing returns of adding more features
4. **Recommendations**: Actionable guidance for feature selection

### Key Insights:
- A subset of features captures most of the predictive power
- Dimensionality reduction improves model efficiency
- Different methods may prioritize different features
- Balance between accuracy and interpretability is important

### Next Steps:
1. Apply selected features to train the final model
2. Use explainability techniques to understand predictions
3. Validate the selected features on a held-out test set
4. Monitor feature importance in production