# UNSW-NB15 Dataset Exploration for TDA

**Purpose**: Analyze UNSW-NB15 dataset structure and suitability for topological data analysis methods.

**Key Questions**:
1. What are the feature types and distributions?
2. How are attack types distributed?
3. What temporal structure exists?
4. Which features are suitable for TDA methods?
5. How does UNSW-NB15 compare with the CTDAPD dataset and its limitations?

**Expected Outcome**: Clear dataset assessment and TDA method recommendations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot configuration. The matplotlib style is applied first and the seaborn
# palette second, exactly as before, so the palette is the last setting made.
plt.style.use('default')
sns.set_palette('husl')

# Notebook banner: title line followed by a 50-character rule.
print("📊 UNSW-NB15 Dataset Exploration for TDA", "=" * 50, sep="\n")

In [None]:
# Point at the UNSW-NB15 data directory and report what it contains.
data_path = Path("../data/apt_datasets/UNSW-NB15")
print(f"📁 Looking for data in: {data_path}")

# Directory listing: parquet files first, then CSV files.
if not data_path.exists():
    print("❌ Data directory not found")
else:
    files = [*data_path.glob("*.parquet"), *data_path.glob("*.csv")]
    print(f"📄 Found files: {[found.name for found in files]}")

In [None]:
# Read the training and testing parquet files and stack them into `df`.
# Any failure is reported and `df` is set to None, which the later cells
# check before running.
try:
    train_df = pd.read_parquet(data_path / "UNSW_NB15_training-set.parquet")
    test_df = pd.read_parquet(data_path / "UNSW_NB15_testing-set.parquet")

    print(
        f"✅ Training set: {train_df.shape}",
        f"✅ Testing set: {test_df.shape}",
        f"📊 Total samples: {len(train_df) + len(test_df)}",
        sep="\n",
    )

    # Single frame for the full analysis; the row index is rebuilt from 0.
    df = pd.concat((train_df, test_df), ignore_index=True)
    print(f"🔗 Combined dataset: {df.shape}")

except Exception as load_error:
    print(f"❌ Error loading data: {load_error}")
    df = None

In [None]:
# High-level overview: shape, column count, memory footprint, column listing.
if df is not None:
    total_bytes = df.memory_usage(deep=True).sum()

    print("📋 DATASET OVERVIEW")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.shape[1]}")
    print(f"Memory usage: {total_bytes / 1024**2:.1f} MB")

    # Numbered (1-based) list of every column name.
    print("\n📝 Column Names:")
    for position, column_name in enumerate(df.columns, start=1):
        print(f"{position:2d}. {column_name}")

In [None]:
# Attack type analysis: class counts, bar chart, and a class-balance indicator.
if df is not None:
    # Resolve the target columns case-insensitively; elsewhere in this notebook
    # both 'label' and 'Label' are treated as the same target column.
    columns_by_lower = {str(col).lower(): col for col in df.columns}
    attack_col = columns_by_lower.get('attack_cat')
    label_col = columns_by_lower.get('label')

    if attack_col is not None:
        print("🎯 ATTACK TYPE DISTRIBUTION")
        # value_counts() drops NaN, so rows without a category are not counted.
        attack_counts = df[attack_col].value_counts()
        print(attack_counts)

        if attack_counts.empty or attack_counts.max() == 0:
            # Nothing to plot, and the min/max ratio would be undefined.
            print("\n⚠️ No non-null attack categories to analyse")
        else:
            # Visualization
            plt.figure(figsize=(12, 6))
            attack_counts.plot(kind='bar', rot=45)
            plt.title('UNSW-NB15 Attack Type Distribution')
            plt.xlabel('Attack Category')
            plt.ylabel('Count')
            plt.tight_layout()
            plt.show()

            # Compare with CTDAPD limitations
            print(f"\n✅ Attack diversity: {len(attack_counts)} categories")

            # Only describe the distribution as balanced when the ratio supports
            # it. The 0.1 cut-off (rarest class at least 10% of the most common
            # one) is a heuristic, not a standard threshold.
            balance_ratio = attack_counts.min() / attack_counts.max()
            if balance_ratio >= 0.1:
                print(f"✅ Balanced distribution: Min/Max ratio = {balance_ratio:.3f}")
            else:
                print(f"⚠️ Imbalanced distribution: Min/Max ratio = {balance_ratio:.3f}")

    elif label_col is not None:
        print("🎯 LABEL DISTRIBUTION")
        label_counts = df[label_col].value_counts()
        print(label_counts)

In [None]:
# Feature analysis for TDA suitability: pick the numeric, non-target columns,
# summarise them, and report missing values.
if df is not None:
    print("🔬 FEATURE ANALYSIS FOR TDA SUITABILITY")

    # Identify numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    print(f"📊 Numeric features: {len(numeric_cols)}")

    # Remove target/identifier columns (case-insensitive match). The lowercase
    # set is built once instead of once per candidate column.
    target_cols = ['label', 'Label', 'attack_cat', 'id']
    excluded_names = {target.lower() for target in target_cols}
    feature_cols = [col for col in numeric_cols if str(col).lower() not in excluded_names]
    print(f"🎯 Feature columns for TDA: {len(feature_cols)}")

    # Feature statistics
    print("\n📈 Feature Statistics Summary:")
    if feature_cols:
        feature_stats = df[feature_cols].describe()
        # Select rows by label: positional row 3 of describe() is 'min', not
        # 'std', so the previous positional lookup showed min instead of std.
        print(feature_stats.loc[['count', 'mean', 'std', 'max']].T)
    else:
        # describe() raises on a frame with no columns.
        print("⚠️ No numeric feature columns available")

    # Check for missing values
    missing_vals = df[feature_cols].isnull().sum()
    columns_with_missing = missing_vals[missing_vals > 0]
    if missing_vals.sum() > 0:
        print(f"\n⚠️ Missing values found in {columns_with_missing.shape[0]} columns")
        print(columns_with_missing)
    else:
        print("\n✅ No missing values in numeric features")

In [None]:
# Temporal structure scan plus a listing of network-flow columns, both found by
# case-insensitive substring matching on column names.
if df is not None:
    print("⏰ TEMPORAL STRUCTURE ANALYSIS")

    def _mentions_any(column_name, keywords):
        """Return True when any keyword occurs in the lowercased column name."""
        lowered = column_name.lower()
        return any(keyword in lowered for keyword in keywords)

    # NOTE(review): 'dur' is in this list, so a duration column alone is enough
    # for the "time-related" branch — confirm that is intended.
    time_keywords = ['time', 'date', 'timestamp', 'start', 'dur']
    time_cols = [col for col in df.columns if _mentions_any(col, time_keywords)]

    if not time_cols:
        print("❌ No clear timestamp columns found")
        print("⚠️ This may limit temporal network analysis methods")
    else:
        print(f"📅 Time-related columns: {time_cols}")

        # Per-column summary: dtype, value range, and number of distinct values.
        for col in time_cols:
            series = df[col]
            print(f"\n{col}:")
            print(f"  Type: {series.dtype}")
            print(f"  Range: {series.min()} to {series.max()}")
            print(f"  Unique values: {series.nunique()}")

    # Network flow features (good for TDA)
    flow_keywords = [
        'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
        'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
    ]
    flow_cols = [col for col in df.columns if _mentions_any(col, flow_keywords)]

    print(f"\n🌊 Network flow features: {len(flow_cols)}")
    print(f"✅ These are excellent for TDA methods: {flow_cols[:5]}...")

In [None]:
# TDA method recommendations, derived from the sample count, the number of
# feature columns, the number of classes, and whether time-related columns
# were found by the earlier cells.
if df is not None:
    print("🎯 TDA METHOD RECOMMENDATIONS FOR UNSW-NB15")
    print("=" * 60)

    n_samples = len(df)
    # The cells that define feature_cols / time_cols may not have been run;
    # fall back to neutral values instead of raising NameError.
    n_features = len(feature_cols) if 'feature_cols' in locals() else 0
    has_time_cols = 'time_cols' in locals() and bool(time_cols)

    # Choose the class column. Prefer the multi-class 'attack_cat' column so
    # the result does not depend on column order; otherwise fall back to the
    # first column whose name mentions 'attack' or 'label'. Columns are picked
    # by position so duplicate names cannot return a DataFrame.
    lowered_names = [str(col).lower() for col in df.columns]
    class_candidates = [
        pos for pos, name in enumerate(lowered_names)
        if 'attack' in name or 'label' in name
    ]
    preferred = [pos for pos in class_candidates if lowered_names[pos] == 'attack_cat']
    class_positions = preferred or class_candidates
    # dropna=False counts NaN as its own value, matching len(Series.unique()).
    n_attacks = (
        df.iloc[:, class_positions[0]].nunique(dropna=False)
        if class_positions else 0
    )

    print(f"📊 Dataset Scale: {n_samples:,} samples, {n_features} features, {n_attacks} classes")

    # Method recommendations based on dataset characteristics
    recommendations = []

    if n_samples > 100000:
        recommendations.append("✅ PERSISTENT HOMOLOGY: Large sample size suitable for robust topology")
        recommendations.append("✅ MAPPER: Can handle large-scale data with proper filtering")

    if n_features >= 20:
        recommendations.append("✅ VIETORIS-RIPS COMPLEX: Rich feature space for simplicial analysis")
        recommendations.append("✅ DIMENSION REDUCTION + TDA: Use PCA/UMAP preprocessing")

    if has_time_cols:
        recommendations.append("✅ SLIDING WINDOW TDA: Temporal structure available")
        recommendations.append("✅ COLLINS NETWORK METHOD: Good temporal connectivity")
    else:
        recommendations.append("⚠️ STATIC TOPOLOGY ONLY: No temporal analysis methods")

    if n_attacks > 5:
        recommendations.append("✅ MULTI-CLASS TDA: Rich attack diversity for topological signatures")

    for rec in recommendations:
        print(rec)

    print("\n🏆 RECOMMENDED PRIORITY ORDER:")
    print("1. Enhanced Topological Dissimilarity (proven 5% baseline)")
    print("2. Persistent Homology on Network Flow Features")
    print("3. Mapper Analysis for Attack Pattern Visualization")
    if has_time_cols:
        print("4. Collins Network Structure Method (if temporal density sufficient)")

    print("\n📈 EXPECTED IMPROVEMENTS vs CTDAPD:")
    print(f"• Richer feature space: {n_features} vs 15 features")
    print(f"• Better attack diversity: {n_attacks} vs 5 categories")
    print(f"• Larger sample size: {n_samples:,} vs 54,768")
    print("• Established benchmark: Literature comparisons available")

In [None]:
# Correlation analysis over (at most) the first 20 feature columns: list the
# strongly correlated pairs and draw a heatmap.
if df is not None and 'feature_cols' in locals() and len(feature_cols) > 0:
    print("🔗 FEATURE CORRELATION ANALYSIS")

    # Slicing already returns the whole list when it has 20 entries or fewer.
    sample_features = feature_cols[:20]

    corr_matrix = df[sample_features].corr()
    corr_names = corr_matrix.columns

    # Walk the strict upper triangle (row-major order) so each unordered pair
    # is visited once and the diagonal is skipped.
    upper_rows, upper_cols = np.triu_indices(len(corr_names), k=1)
    high_corr_pairs = [
        (corr_names[row], corr_names[col], corr_matrix.iloc[row, col])
        for row, col in zip(upper_rows, upper_cols)
        if abs(corr_matrix.iloc[row, col]) > 0.8
    ]

    if not high_corr_pairs:
        print("✅ No highly correlated features found")
    else:
        print(f"⚠️ High correlation pairs (>0.8): {len(high_corr_pairs)}")
        # Only the first five pairs are printed.
        for left_name, right_name, coefficient in high_corr_pairs[:5]:
            print(f"  {left_name} <-> {right_name}: {coefficient:.3f}")

    # Correlation heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, cmap='coolwarm', center=0, square=True, annot=False)
    plt.title(f'Feature Correlation Matrix (First {len(sample_features)} features)')
    plt.tight_layout()
    plt.show()

In [None]:
# Data quality checks on the numeric columns: infinite values, zero-variance
# columns, and the spread of feature value ranges.
if df is not None:
    print("🔍 DATA QUALITY ASSESSMENT FOR TDA")
    print("=" * 50)

    numeric_frame = df.select_dtypes(include=[np.number])

    # Infinite values, counted per numeric column.
    inf_counts = numeric_frame.apply(lambda column: np.isinf(column).sum())
    total_inf = inf_counts.sum()
    if total_inf > 0:
        # Counts are never negative, so the total equals the sum of the
        # positive entries.
        print(f"⚠️ Infinite values found: {total_inf}")
    else:
        print("✅ No infinite values")

    # Columns whose standard deviation is exactly zero.
    zero_var_features = [
        name for name in numeric_frame.columns if df[name].std() == 0
    ]

    if not zero_var_features:
        print("✅ All features have non-zero variance")
    else:
        print(f"⚠️ Zero variance features: {len(zero_var_features)}")
        print(f"  {zero_var_features}")

    # Feature scale analysis: bucket each feature by its max - min range.
    if 'feature_cols' in locals():
        feature_frame = df[feature_cols]
        feature_ranges = feature_frame.max() - feature_frame.min()

        is_small = feature_ranges < 1
        is_medium = (feature_ranges >= 1) & (feature_ranges <= 100)
        is_large = feature_ranges > 100
        scale_analysis = {
            'Small scale (< 1)': int(is_small.sum()),
            'Medium scale (1-100)': int(is_medium.sum()),
            'Large scale (> 100)': int(is_large.sum()),
        }

        print("\n📏 Feature Scale Distribution:")
        for scale_label, feature_count in scale_analysis.items():
            print(f"  {scale_label}: {feature_count} features")

        if scale_analysis['Large scale (> 100)'] > 0:
            print("⚠️ Consider feature scaling for TDA methods")
        else:
            print("✅ Feature scales relatively uniform")

In [None]:
# Closing summary: strengths, caveats, and follow-up work. Uses results left
# behind by the earlier cells where they are available.
if df is None:
    print("❌ Cannot complete analysis - dataset not loaded")
else:
    feature_count = len(feature_cols) if 'feature_cols' in locals() else 'N/A'

    print("📋 UNSW-NB15 DATASET SUMMARY FOR TDA")
    print("=" * 60)

    print("✅ STRENGTHS:")
    print(f"  • Large scale: {len(df):,} samples for robust topology")
    print(f"  • Rich features: {feature_count} numeric features")
    print("  • Attack diversity: Multiple attack categories for signature analysis")
    print("  • Established benchmark: Literature comparisons available")
    # NOTE(review): this line is printed unconditionally; it is not derived
    # from the data quality checks.
    print("  • Clean data: No major quality issues identified")

    if time_cols:
        print(f"  • Temporal features: {len(time_cols)} time-related columns")

    print("\n⚠️ CONSIDERATIONS:")
    if not time_cols:
        print("  • Limited temporal analysis: No clear timestamp columns")
    if 'high_corr_pairs' in locals() and high_corr_pairs:
        print(f"  • Feature redundancy: {len(high_corr_pairs)} highly correlated pairs")
    print("  • Computational scale: Large dataset may require sampling for some TDA methods")

    print("\n🎯 RECOMMENDED NEXT STEPS:")
    next_steps = (
        "1. Implement topological dissimilarity method on UNSW-NB15",
        "2. Compare results with CTDAPD baseline (5.0% attack recall)",
        "3. Explore persistent homology on network flow features",
        "4. Test enhanced methods (H1/H2 homology, multiple baselines)",
        "5. Create validation scripts following unified instructions",
    )
    for step in next_steps:
        print(step)

    print("\n📁 Next notebook: 'validate_topological_dissimilarity_unsw.ipynb'")