<a href="https://colab.research.google.com/github/sivavithu/Improving-QoS-in-SDN/blob/main/RF6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("🚀 Enhanced CIC-Darknet2020 Application Classifier")
print("Target: 95%+ Accuracy for SDN Application Classification")
print("=" * 60)

🚀 Enhanced CIC-Darknet2020 Application Classifier
Target: 95%+ Accuracy for SDN Application Classification


In [4]:
# ===============================================================================
# CELL 3: Import ML Libraries
# ===============================================================================

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                           precision_recall_fscore_support)
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import xgboost as xgb
import lightgbm as lgb
import joblib

print("✅ All ML libraries imported!")



✅ All ML libraries imported!


In [5]:
# ===============================================================================
# CELL 4: Load and Inspect Dataset
# ===============================================================================

# Load your dataset - UPDATE THIS PATH
df = pd.read_csv('/content/drive/MyDrive/Research/Darknet.CSV')

print(f"📊 Dataset Shape: {df.shape}")
print(f"📊 Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Display basic info
print("\n📋 Dataset Info:")
print(f"  Rows: {len(df):,}")
print(f"  Columns: {len(df.columns)}")
print(f"  Missing values: {df.isnull().sum().sum():,}")

# Show first few rows
print("\n📝 First 5 rows:")
df.head()


📊 Dataset Shape: (158616, 85)
📊 Memory Usage: 155.32 MB

📋 Dataset Info:
  Rows: 158,616
  Columns: 85
  Missing values: 48

📝 First 5 rows:


Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Label.1
0,10.152.152.11-216.58.220.99-57158-443-6,10.152.152.11,57158,216.58.220.99,443,6,24/07/2015 04:09:48 PM,229,1,1,...,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,AUDIO-STREAMING
1,10.152.152.11-216.58.220.99-57159-443-6,10.152.152.11,57159,216.58.220.99,443,6,24/07/2015 04:09:48 PM,407,1,1,...,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,AUDIO-STREAMING
2,10.152.152.11-216.58.220.99-57160-443-6,10.152.152.11,57160,216.58.220.99,443,6,24/07/2015 04:09:48 PM,431,1,1,...,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,AUDIO-STREAMING
3,10.152.152.11-74.125.136.120-49134-443-6,10.152.152.11,49134,74.125.136.120,443,6,24/07/2015 04:09:48 PM,359,1,1,...,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,AUDIO-STREAMING
4,10.152.152.11-173.194.65.127-34697-19305-6,10.152.152.11,34697,173.194.65.127,19305,6,24/07/2015 04:09:45 PM,10778451,591,400,...,0,0,0,0,1437765000000000.0,3117718.0,1437765000000000.0,1437765000000000.0,Non-Tor,AUDIO-STREAMING


In [6]:
# ===============================================================================
# CELL 5: Find Application Column
# ===============================================================================

def find_application_column_enhanced(df):
    """Find the correct application classification column"""

    print("🔍 SEARCHING FOR APPLICATION COLUMN")
    print("=" * 40)

    # Check all object columns
    object_columns = df.select_dtypes(include=['object']).columns

    app_candidates = []

    for col in object_columns:
        unique_vals = df[col].unique()
        sample_text = ' '.join([str(v).lower() for v in unique_vals[:10]])

        print(f"\n📊 Column: '{col}' ({len(unique_vals)} unique values)")
        print(f"   Sample values: {list(unique_vals[:5])}")

        # Enhanced application detection
        app_keywords = ['browsing', 'chat', 'email', 'audio', 'video', 'voip',
                       'p2p', 'stream', 'web', 'http', 'ftp', 'torrent']

        keyword_matches = sum(1 for keyword in app_keywords if keyword in sample_text)

        # Check for application patterns
        has_streaming = 'stream' in sample_text
        has_protocols = any(p in sample_text for p in ['http', 'ftp', 'smtp'])
        reasonable_classes = 5 <= len(unique_vals) <= 25

        score = keyword_matches
        if has_streaming: score += 2
        if has_protocols: score += 1
        if reasonable_classes: score += 1

        app_candidates.append((col, score, len(unique_vals)))

        if score >= 3:
            print(f"   ✅ STRONG application candidate (score: {score})")
        elif score >= 1:
            print(f"   ⚠️ Possible application column (score: {score})")
        else:
            print(f"   ❌ Unlikely application column (score: {score})")

    # Sort by score
    app_candidates.sort(key=lambda x: x[1], reverse=True)

    if app_candidates and app_candidates[0][1] > 0:
        best_col = app_candidates[0][0]
        print(f"\n🎯 SELECTED: '{best_col}' (score: {app_candidates[0][1]})")
        return best_col
    else:
        print("\n⚠️ No clear application column found!")
        return object_columns[0] if len(object_columns) > 0 else df.columns[-1]

# Find application column
app_column = find_application_column_enhanced(df)
print(f"\n✅ Using column: '{app_column}'")


🔍 SEARCHING FOR APPLICATION COLUMN

📊 Column: 'Flow ID' (79160 unique values)
   Sample values: ['10.152.152.11-216.58.220.99-57158-443-6', '10.152.152.11-216.58.220.99-57159-443-6', '10.152.152.11-216.58.220.99-57160-443-6', '10.152.152.11-74.125.136.120-49134-443-6', '10.152.152.11-173.194.65.127-34697-19305-6']
   ❌ Unlikely application column (score: 0)

📊 Column: 'Src IP' (4026 unique values)
   Sample values: ['10.152.152.11', '173.194.33.97', '74.125.28.189', '74.125.228.199', '173.194.65.100']
   ❌ Unlikely application column (score: 0)

📊 Column: 'Dst IP' (7553 unique values)
   Sample values: ['216.58.220.99', '74.125.136.120', '173.194.65.127', '10.152.152.11', '216.58.216.142']
   ❌ Unlikely application column (score: 0)

📊 Column: 'Timestamp' (34836 unique values)
   Sample values: ['24/07/2015 04:09:48 PM', '24/07/2015 04:09:45 PM', '24/07/2015 04:10:00 PM', '24/07/2015 04:09:46 PM', '24/07/2015 04:09:49 PM']
   ❌ Unlikely application column (score: 0)

📊 Column: 'Label' 

In [7]:
# ===============================================================================
# INSERT THIS CODE AFTER CELL 5 (Find Application Column)
# ===============================================================================

def normalize_application_names(df, app_column):
    """Normalize application class names to handle case inconsistencies"""

    print("🔧 NORMALIZING APPLICATION CLASS NAMES")
    print("=" * 40)

    # Get original distribution
    original_dist = df[app_column].value_counts()
    print(f"📊 Original classes: {len(original_dist)}")

    # Show original distribution
    print("\n📋 Original Distribution:")
    for app, count in original_dist.items():
        print(f"  {app:<30} {count:6,}")

    # Normalize class names
    df_normalized = df.copy()

    # Convert to lowercase and standardize
    df_normalized[app_column] = df_normalized[app_column].str.lower()

    # Apply consistent formatting rules
    normalization_rules = {
        'audio-streaming': 'Audio-Streaming',
        'video-streaming': 'Video-Streaming',
        'file-transfer': 'File-Transfer',
        'browsing': 'Browsing',
        'chat': 'Chat',
        'email': 'Email',
        'p2p': 'P2P',
        'voip': 'VOIP'
    }

    # Apply normalization
    for old_name, new_name in normalization_rules.items():
        df_normalized[app_column] = df_normalized[app_column].replace(old_name, new_name)

    # Get normalized distribution
    normalized_dist = df_normalized[app_column].value_counts()
    print(f"\n✅ Normalized classes: {len(normalized_dist)}")

    # Show normalized distribution
    print("\n📋 Normalized Distribution:")
    total_samples = len(df_normalized)
    for app, count in normalized_dist.items():
        percentage = count / total_samples * 100
        print(f"  {app:<30} {count:6,} ({percentage:5.1f}%)")

    # Show the consolidation effects
    print(f"\n🎯 CONSOLIDATION EFFECTS:")
    original_classes = set(original_dist.index)
    normalized_classes = set(normalized_dist.index)

    # Find merged classes
    for norm_class in normalized_classes:
        # Find all original classes that map to this normalized class
        original_variants = []
        for orig_class in original_classes:
            if orig_class.lower().replace('-', '-') == norm_class.lower().replace('-', '-'):
                original_variants.append(orig_class)

        if len(original_variants) > 1:
            total_count = sum(original_dist[variant] for variant in original_variants)
            print(f"  📎 {norm_class}: merged {original_variants} → {total_count:,} total samples")

    classes_reduced = len(original_dist) - len(normalized_dist)
    if classes_reduced > 0:
        print(f"\n✨ Reduced classes by {classes_reduced} (from {len(original_dist)} to {len(normalized_dist)})")

    return df_normalized

# Apply normalization
print(f"\n🔧 APPLYING CLASS NAME NORMALIZATION")
print("=" * 42)

df = normalize_application_names(df, app_column)

print(f"✅ Class normalization completed!")
print(f"📊 Final dataset shape: {df.shape}")

# Verify the fix worked
final_dist = df[app_column].value_counts()
print(f"\n🎯 FINAL CLASS VERIFICATION:")
print("=" * 30)
for i, (app, count) in enumerate(final_dist.items()):
    percentage = count / len(df) * 100
    print(f"  {i+1:2d}. {app:<25} {count:6,} ({percentage:5.1f}%)")

# Check for any remaining case issues
class_names_lower = [name.lower() for name in final_dist.index]
if len(class_names_lower) != len(set(class_names_lower)):
    print("⚠️ WARNING: Still have case inconsistencies!")
else:
    print("✅ All class names are now properly normalized!")


🔧 APPLYING CLASS NAME NORMALIZATION
🔧 NORMALIZING APPLICATION CLASS NAMES
📊 Original classes: 11

📋 Original Distribution:
  P2P                            48,520
  Browsing                       46,457
  Audio-Streaming                19,830
  Chat                           11,629
  File-Transfer                  11,098
  Video-Streaming                 9,486
  Email                           6,145
  VOIP                            3,566
  AUDIO-STREAMING                 1,520
  Video-streaming                   281
  File-transfer                      84

✅ Normalized classes: 8

📋 Normalized Distribution:
  P2P                            48,520 ( 30.6%)
  Browsing                       46,457 ( 29.3%)
  Audio-Streaming                21,350 ( 13.5%)
  Chat                           11,629 (  7.3%)
  File-Transfer                  11,182 (  7.0%)
  Video-Streaming                 9,767 (  6.2%)
  Email                           6,145 (  3.9%)
  VOIP                            3,566 

In [8]:
# ===============================================================================
# CELL 6: Analyze Application Distribution
# ===============================================================================

# Analyze the selected application column
y_raw = df[app_column].copy()
app_distribution = y_raw.value_counts()

print(f"🎯 APPLICATION ANALYSIS")
print("=" * 25)
print(f"Application Column: '{app_column}'")
print(f"Number of Applications: {len(app_distribution)}")

print(f"\n📊 Application Distribution:")
for i, (app, count) in enumerate(app_distribution.items()):
    percentage = count / len(df) * 100
    print(f"  {i+1:2d}. {app:<30} {count:6,} ({percentage:5.1f}%)")

# Check if suitable for classification
if len(app_distribution) < 2:
    print("❌ ERROR: Need at least 2 classes for classification!")
elif len(app_distribution) > 50:
    print("⚠️ WARNING: Too many classes, might affect performance")
else:
    print(f"✅ Good for classification: {len(app_distribution)} classes")

🎯 APPLICATION ANALYSIS
Application Column: 'Label.1'
Number of Applications: 8

📊 Application Distribution:
   1. P2P                            48,520 ( 30.6%)
   2. Browsing                       46,457 ( 29.3%)
   3. Audio-Streaming                21,350 ( 13.5%)
   4. Chat                           11,629 (  7.3%)
   5. File-Transfer                  11,182 (  7.0%)
   6. Video-Streaming                 9,767 (  6.2%)
   7. Email                           6,145 (  3.9%)
   8. VOIP                            3,566 (  2.2%)
✅ Good for classification: 8 classes


In [9]:
# ===============================================================================
# CELL 7: Data Preprocessing
# ===============================================================================

def preprocess_data_enhanced(df, target_col):
    """Enhanced data preprocessing for maximum accuracy"""

    print("🔧 ENHANCED DATA PREPROCESSING")
    print("=" * 35)

    # Separate features and target
    X = df.drop(columns=[target_col]).copy()
    y = df[target_col].copy()

    print(f"Initial: {X.shape[1]} features, {len(X)} samples")

    # Remove ID and network-specific columns (not available in SDN)
    sdn_incompatible = [
        'Flow ID', 'Src IP', 'Dst IP', 'Src Port', 'Dst Port', 'Timestamp',
        'flow_id', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'timestamp',
        'FlowID', 'SrcIP', 'DstIP', 'SrcPort', 'DstPort', 'Flow_ID'
    ]

    removed_cols = []
    for col in sdn_incompatible:
        if col in X.columns:
            X = X.drop(columns=[col])
            removed_cols.append(col)

    if removed_cols:
        print(f"🗑️ Removed SDN-incompatible: {removed_cols}")

    # Remove other label columns
    other_labels = ['Label', 'label', 'class', 'Class', 'Attack', 'attack']
    for col in list(X.columns):
        if any(label in col for label in other_labels):
            X = X.drop(columns=[col])
            print(f"🗑️ Removed label column: {col}")

    # Convert object columns to numeric
    for col in X.columns:
        if X[col].dtype == 'object':
            try:
                X[col] = pd.to_numeric(X[col], errors='coerce')
            except:
                le = LabelEncoder()
                X[col] = le.fit_transform(X[col].astype(str))
                print(f"🔤 Label encoded: {col}")

    # Handle infinite values
    print("🔄 Handling infinite values...")
    X = X.replace([np.inf, -np.inf], np.nan)

    # Remove high missing columns (>85% missing)
    missing_thresh = 0.85
    high_missing = X.columns[X.isnull().sum() / len(X) > missing_thresh]
    if len(high_missing) > 0:
        X = X.drop(columns=high_missing)
        print(f"🗑️ Removed {len(high_missing)} high-missing columns")

    # Remove constant columns
    constant_cols = []
    for col in X.columns:
        if X[col].nunique() <= 1:
            constant_cols.append(col)

    if constant_cols:
        X = X.drop(columns=constant_cols)
        print(f"🗑️ Removed {len(constant_cols)} constant columns")

    # Smart missing value imputation
    for col in X.columns:
        if X[col].isnull().sum() > 0:
            if abs(X[col].skew()) > 1:  # Skewed data
                X[col] = X[col].fillna(X[col].median())
            else:  # Normal data
                X[col] = X[col].fillna(X[col].mean())

    # Remove duplicates
    before_dedup = len(X)
    duplicated = X.duplicated()
    X = X[~duplicated]
    y = y[~duplicated]
    after_dedup = len(X)

    if before_dedup != after_dedup:
        print(f"📄 Removed {before_dedup - after_dedup} duplicates")

    print(f"✅ Final: {X.shape[1]} features, {len(X)} samples")

    return X, y

# Apply preprocessing
X_clean, y_clean = preprocess_data_enhanced(df, app_column)

# ===============================================================================
# CELL 8: Label Encoding and Analysis
# ===============================================================================

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_clean)
class_names = label_encoder.classes_

print("🏷️ LABEL ENCODING RESULTS")
print("=" * 28)
print(f"Number of classes: {len(class_names)}")

# Show class distribution
unique_classes, class_counts = np.unique(y_encoded, return_counts=True)
print(f"\n📊 Encoded Class Distribution:")
for i, (cls, count) in enumerate(zip(class_names, class_counts)):
    print(f"  {i:2d}. {cls:<25} {count:6,} samples ({count/len(y_encoded)*100:5.1f}%)")

# Calculate class imbalance
imbalance_ratio = max(class_counts) / min(class_counts)
print(f"\n📈 Class imbalance ratio: {imbalance_ratio:.2f}")

if imbalance_ratio > 10:
    print("⚠️ HIGH imbalance - will apply advanced balancing")
elif imbalance_ratio > 3:
    print("⚠️ MODERATE imbalance - will apply balancing")
else:
    print("✅ Classes reasonably balanced")

🔧 ENHANCED DATA PREPROCESSING
Initial: 84 features, 158616 samples
🗑️ Removed SDN-incompatible: ['Flow ID', 'Src IP', 'Dst IP', 'Src Port', 'Dst Port', 'Timestamp']
🗑️ Removed label column: Label
🔄 Handling infinite values...
🗑️ Removed 15 constant columns
📄 Removed 56244 duplicates
✅ Final: 62 features, 102372 samples
🏷️ LABEL ENCODING RESULTS
Number of classes: 8

📊 Encoded Class Distribution:
   0. Audio-Streaming           13,493 samples ( 13.2%)
   1. Browsing                  29,803 samples ( 29.1%)
   2. Chat                       9,654 samples (  9.4%)
   3. Email                      5,313 samples (  5.2%)
   4. File-Transfer             10,474 samples ( 10.2%)
   5. P2P                       23,233 samples ( 22.7%)
   6. VOIP                       1,930 samples (  1.9%)
   7. Video-Streaming            8,472 samples (  8.3%)

📈 Class imbalance ratio: 15.44
⚠️ HIGH imbalance - will apply advanced balancing


In [10]:
# ===============================================================================
# CELL 9: Feature Engineering
# ===============================================================================

def create_advanced_features(X):
    """Create engineered features for better classification"""

    print("⚙️ FEATURE ENGINEERING")
    print("=" * 23)

    X_eng = X.copy()
    initial_count = X_eng.shape[1]

    # Statistical features across all features for each sample
    print("Creating statistical features...")
    X_eng['row_sum'] = X_eng.sum(axis=1)
    X_eng['row_mean'] = X_eng.mean(axis=1)
    X_eng['row_std'] = X_eng.std(axis=1)
    X_eng['row_max'] = X_eng.max(axis=1)
    X_eng['row_min'] = X_eng.min(axis=1)
    X_eng['row_range'] = X_eng['row_max'] - X_eng['row_min']
    X_eng['row_median'] = X_eng.median(axis=1)

    # Network flow specific features
    flow_keywords = ['byte', 'packet', 'length', 'duration', 'rate', 'time']
    flow_cols = [col for col in X.columns if any(kw in col.lower() for kw in flow_keywords)]

    if len(flow_cols) >= 4:
        print(f"Creating ratio features from {len(flow_cols)} flow columns...")

        # Create meaningful ratios
        for i in range(0, min(len(flow_cols), 8), 2):
            if i+1 < len(flow_cols):
                col1, col2 = flow_cols[i], flow_cols[i+1]
                ratio_name = f'ratio_{i//2}'
                X_eng[ratio_name] = X_eng[col1] / (X_eng[col2] + 1e-8)

                # Handle infinite ratios
                X_eng[ratio_name] = X_eng[ratio_name].replace([np.inf, -np.inf], 0)

    # Remove any columns that became constant after engineering
    for col in X_eng.columns:
        if X_eng[col].nunique() <= 1:
            X_eng = X_eng.drop(columns=[col])

    new_features = X_eng.shape[1] - initial_count
    print(f"✅ Added {new_features} engineered features")
    print(f"📊 Total features: {X_eng.shape[1]}")

    return X_eng

# Apply feature engineering
X_engineered = create_advanced_features(X_clean)


⚙️ FEATURE ENGINEERING
Creating statistical features...
Creating ratio features from 32 flow columns...
✅ Added 11 engineered features
📊 Total features: 73


In [11]:
# ===============================================================================
# CELL 10: Advanced Feature Selection
# ===============================================================================

def select_best_features(X, y, target_features=80):
    """Multi-stage feature selection for optimal performance"""

    print(f"🎯 ADVANCED FEATURE SELECTION")
    print(f"Target: {target_features} features")
    print("=" * 32)

    # Stage 1: Remove highly correlated features
    print("Stage 1: Correlation filtering...")
    corr_matrix = X.corr().abs()

    # Find highly correlated pairs
    upper_triangle = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )

    high_corr_features = [column for column in upper_triangle.columns
                         if any(upper_triangle[column] > 0.95)]

    if high_corr_features:
        X = X.drop(columns=high_corr_features)
        print(f"  Removed {len(high_corr_features)} highly correlated features")

    # Stage 2: Univariate feature selection
    print("Stage 2: Univariate selection...")
    k_univariate = min(target_features * 2, X.shape[1])

    selector_univariate = SelectKBest(score_func=f_classif, k=k_univariate)
    X_univariate = selector_univariate.fit_transform(X, y)
    selected_features = X.columns[selector_univariate.get_support()]

    print(f"  Selected {len(selected_features)} features")

    # Stage 3: Recursive Feature Elimination
    print("Stage 3: Recursive elimination...")

    # Use a fast estimator for RFE
    rf_estimator = RandomForestClassifier(
        n_estimators=50,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    # RFECV with cross-validation
    rfecv = RFECV(
        estimator=rf_estimator,
        step=1,
        cv=3,
        scoring='accuracy',
        min_features_to_select=max(20, target_features//2),
        n_jobs=-1
    )

    X_rfe = rfecv.fit_transform(X_univariate, y)
    final_features = selected_features[rfecv.get_support()]

    print(f"  RFECV selected {len(final_features)} optimal features")

    # Stage 4: Final selection based on importance
    print("Stage 4: Importance-based selection...")

    rf_final = RandomForestClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )
    rf_final.fit(X_rfe, y)

    # Get feature importances
    importances = rf_final.feature_importances_

    # Select top features
    n_final = min(target_features, len(final_features))
    top_indices = np.argsort(importances)[-n_final:]

    X_final = pd.DataFrame(
        X_rfe[:, top_indices],
        columns=final_features[top_indices]
    )

    # Create feature importance dataframe
    feature_importance_df = pd.DataFrame({
        'feature': final_features[top_indices],
        'importance': importances[top_indices]
    }).sort_values('importance', ascending=False)

    print(f"✅ Final selection: {n_final} features")

    print(f"\n🏆 Top 10 Selected Features:")
    for i, (_, row) in enumerate(feature_importance_df.head(10).iterrows()):
        print(f"  {i+1:2d}. {row['feature']:<25} ({row['importance']:.4f})")

    return X_final, final_features[top_indices], feature_importance_df
# ===============================================================================
# CELL 15: Create Advanced Ensembles
# ===============================================================================

print("🔗 CREATING ADVANCED ENSEMBLES")
print("=" * 34)

# Voting Classifier (Soft Voting)
print("Creating Voting Ensemble...")
voting_classifier = VotingClassifier(
…print(f"\n🏆 ENSEMBLE RESULTS:")
print("=" * 20)
for name in ['Voting_Ensemble', 'Stacking_Ensemble']:
    score = validation_scores[name]
    status = "🎯" if score >= 0.95 else "📈" if score >= 0.90 else "⚠️"
    print(f"  {status} {name:<18} {score:.4f}")

# Apply feature selection
X_selected, selected_feature_names, feature_importance = select_best_features(
    X_engineered, y_encoded, target_features=80
)

print(f"\n✅ Feature selection completed!")
print(f"📊 Selected {X_selected.shape[1]} features from {X_engineered.shape[1]} original features")


🎯 ADVANCED FEATURE SELECTION
Target: 80 features
Stage 1: Correlation filtering...
  Removed 17 highly correlated features
Stage 2: Univariate selection...
  Selected 56 features
Stage 3: Recursive elimination...
  RFECV selected 51 optimal features
Stage 4: Importance-based selection...
✅ Final selection: 51 features

🏆 Top 10 Selected Features:
   1. Idle Min                  (0.0912)
   2. Idle Mean                 (0.0854)
   3. FWD Init Win Bytes        (0.0454)
   4. Flow IAT Max              (0.0404)
   5. Flow IAT Min              (0.0394)
   6. Flow Duration             (0.0372)
   7. Bwd Init Win Bytes        (0.0358)
   8. Fwd Packets/s             (0.0327)
   9. Flow Packets/s            (0.0310)
  10. ratio_0                   (0.0305)

✅ Feature selection completed!
📊 Selected 51 features from 73 original features


In [12]:
# ===============================================================================
# CELL 11: Data Splitting and Scaling
# ===============================================================================

print("📊 DATA SPLITTING AND SCALING")
print("=" * 32)

# Stratified train-test split
X_temp, X_test, y_temp, y_test = train_test_split(
    X_selected, y_encoded,
    test_size=0.15,
    random_state=RANDOM_STATE,
    stratify=y_encoded
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.176,  # This gives us 70-15-15 split
    random_state=RANDOM_STATE,
    stratify=y_temp
)

print(f"📈 Training set:   {len(X_train):,} samples ({len(X_train)/len(X_selected)*100:.1f}%)")
print(f"📈 Validation set: {len(X_val):,} samples ({len(X_val)/len(X_selected)*100:.1f}%)")
print(f"📈 Test set:       {len(X_test):,} samples ({len(X_test)/len(X_selected)*100:.1f}%)")

# Advanced scaling with RobustScaler (handles outliers better)
print("\n⚖️ Applying RobustScaler...")
scaler = RobustScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("✅ Scaling completed!")

# Verify no data leakage
print(f"\n🔍 Data validation:")
print(f"  Training features shape: {X_train_scaled.shape}")
print(f"  Validation features shape: {X_val_scaled.shape}")
print(f"  Test features shape: {X_test_scaled.shape}")
print(f"  No missing values: {not np.isnan(X_train_scaled).any()}")


📊 DATA SPLITTING AND SCALING
📈 Training set:   71,701 samples (70.0%)
📈 Validation set: 15,315 samples (15.0%)
📈 Test set:       15,356 samples (15.0%)

⚖️ Applying RobustScaler...
✅ Scaling completed!

🔍 Data validation:
  Training features shape: (71701, 51)
  Validation features shape: (15315, 51)
  Test features shape: (15356, 51)
  No missing values: True


In [13]:
# ===============================================================================
# CELL 12: Class Balancing
# ===============================================================================

def apply_advanced_balancing(X_train, y_train):
    """Apply advanced class balancing techniques"""

    print("⚖️ ADVANCED CLASS BALANCING")
    print("=" * 30)

    # Analyze current distribution
    unique_train, counts_train = np.unique(y_train, return_counts=True)

    print("📊 Current training distribution:")
    for cls, count in zip(unique_train, counts_train):
        cls_name = class_names[cls]
        print(f"  {cls_name:<25} {count:,} samples ({count/len(y_train)*100:.1f}%)")

    imbalance_ratio = max(counts_train) / min(counts_train)
    print(f"\n📈 Imbalance ratio: {imbalance_ratio:.2f}")

    if imbalance_ratio > 5:
        print("🔄 Applying SMOTE + Tomek Links for optimal balancing...")

        try:
            # SMOTETomek combines oversampling and undersampling
            smote_tomek = SMOTETomek(
                smote=SMOTE(random_state=RANDOM_STATE, k_neighbors=3),
                random_state=RANDOM_STATE
            )

            X_balanced, y_balanced = smote_tomek.fit_resample(X_train, y_train)

            print("📊 After SMOTE + Tomek:")
            unique_balanced, counts_balanced = np.unique(y_balanced, return_counts=True)
            for cls, count in zip(unique_balanced, counts_balanced):
                cls_name = class_names[cls]
                print(f"  {cls_name:<25} {count:,} samples")

            print(f"✅ Balanced dataset: {len(X_balanced):,} samples")
            return X_balanced, y_balanced

        except Exception as e:
            print(f"⚠️ Balancing failed: {e}")
            print("📊 Using original data with class weights in models")
            return X_train, y_train

    elif imbalance_ratio > 2:
        print("🔄 Applying standard SMOTE...")

        try:
            smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=3)
            X_balanced, y_balanced = smote.fit_resample(X_train, y_train)

            print(f"✅ SMOTE applied: {len(X_balanced):,} samples")
            return X_balanced, y_balanced

        except Exception as e:
            print(f"⚠️ SMOTE failed: {e}")
            return X_train, y_train

    else:
        print("✅ Classes reasonably balanced, no resampling needed")
        return X_train, y_train

# Apply balancing
X_train_balanced, y_train_balanced = apply_advanced_balancing(X_train_scaled, y_train)

# ===============================================================================
# CELL 13: Model Definition and Training
# ===============================================================================

def create_optimized_models():
    """Create highly optimized models for maximum accuracy"""

    print("🤖 CREATING OPTIMIZED MODELS")
    print("=" * 30)

    models = {}

    # 1. Enhanced Random Forest
    print("Creating Enhanced Random Forest...")
    models['Enhanced_RF'] = RandomForestClassifier(
        n_estimators=500,           # More trees
        max_depth=25,               # Deeper trees
        min_samples_split=2,        # More flexible splits
        min_samples_leaf=1,         # More flexible leaves
        max_features='sqrt',        # Good for high-dimensional data
        bootstrap=True,
        class_weight='balanced_subsample',  # Handle imbalance
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    # 2. Extra Trees (Extremely Randomized Trees)
    print("Creating Extra Trees Classifier...")
    models['Extra_Trees'] = ExtraTreesClassifier(
        n_estimators=500,
        max_depth=30,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        bootstrap=False,
        class_weight='balanced',
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    # 3. XGBoost
    print("Creating XGBoost Classifier...")
    models['XGBoost'] = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        eval_metric='mlogloss'
    )

    # 4. LightGBM
    print("Creating LightGBM Classifier...")
    models['LightGBM'] = lgb.LGBMClassifier(
        n_estimators=500,
        max_depth=10,
        learning_rate=0.1,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=-1
    )

    print(f"✅ Created {len(models)} optimized models")
    return models

# Create models
optimized_models = create_optimized_models()


⚖️ ADVANCED CLASS BALANCING
📊 Current training distribution:
  Audio-Streaming           9,450 samples (13.2%)
  Browsing                  20,874 samples (29.1%)
  Chat                      6,762 samples (9.4%)
  Email                     3,721 samples (5.2%)
  File-Transfer             7,336 samples (10.2%)
  P2P                       16,272 samples (22.7%)
  VOIP                      1,352 samples (1.9%)
  Video-Streaming           5,934 samples (8.3%)

📈 Imbalance ratio: 15.44
🔄 Applying SMOTE + Tomek Links for optimal balancing...
📊 After SMOTE + Tomek:
  Audio-Streaming           19,936 samples
  Browsing                  19,591 samples
  Chat                      18,891 samples
  Email                     18,626 samples
  File-Transfer             19,761 samples
  P2P                       20,336 samples
  VOIP                      18,969 samples
  Video-Streaming           19,664 samples
✅ Balanced dataset: 155,774 samples
🤖 CREATING OPTIMIZED MODELS
Creating Enhanced Random For

In [14]:
# ===============================================================================
# CELL 14: Train Individual Models
# ===============================================================================

print("🚀 TRAINING INDIVIDUAL MODELS")
print("=" * 32)

trained_models = {}
validation_scores = {}

for name, model in optimized_models.items():
    print(f"\n📚 Training {name}...")

    # Train the model
    model.fit(X_train_balanced, y_train_balanced)
    trained_models[name] = model

    # Validate on validation set
    val_pred = model.predict(X_val_scaled)
    val_accuracy = accuracy_score(y_val, val_pred)
    validation_scores[name] = val_accuracy

    print(f"   ✅ {name} validation accuracy: {val_accuracy:.4f}")

    # Check if we've reached our target
    if val_accuracy >= 0.95:
        print(f"   🎉 TARGET REACHED: {val_accuracy:.4f} >= 0.95!")

print(f"\n📊 INDIVIDUAL MODEL RESULTS:")
print("=" * 35)
for name, score in validation_scores.items():
    status = "🎯" if score >= 0.95 else "📈" if score >= 0.90 else "⚠️"
    print(f"  {status} {name:<15} {score:.4f}")


🚀 TRAINING INDIVIDUAL MODELS

📚 Training Enhanced_RF...
   ✅ Enhanced_RF validation accuracy: 0.8739

📚 Training Extra_Trees...
   ✅ Extra_Trees validation accuracy: 0.8601

📚 Training XGBoost...
   ✅ XGBoost validation accuracy: 0.8857

📚 Training LightGBM...
   ✅ LightGBM validation accuracy: 0.8855

📊 INDIVIDUAL MODEL RESULTS:
  ⚠️ Enhanced_RF     0.8739
  ⚠️ Extra_Trees     0.8601
  ⚠️ XGBoost         0.8857
  ⚠️ LightGBM        0.8855


In [15]:
# ===============================================================================
# CELL 15: Memory-Efficient Advanced Ensembles
# ===============================================================================

import gc  # For garbage collection

print("CREATING MEMORY-EFFICIENT ENSEMBLES")
print("=" * 40)

# Clear memory before ensemble creation
gc.collect()

# Memory monitoring function
def check_memory_usage():
    """Check current memory usage"""
    import psutil
    memory = psutil.virtual_memory()
    print(f"Memory usage: {memory.percent:.1f}% ({memory.used/1024**3:.1f}GB/{memory.total/1024**3:.1f}GB)")

check_memory_usage()

# Strategy 1: Use only the best performing individual models for ensemble
print("\nSelecting best models for ensemble...")

# Sort models by validation performance
sorted_models = sorted(validation_scores.items(), key=lambda x: x[1], reverse=True)
print("Model performance ranking:")
for i, (name, score) in enumerate(sorted_models):
    print(f"  {i+1}. {name}: {score:.4f}")

# Select only top 3 models to reduce memory usage
top_models = dict(sorted_models[:3])
print(f"\nSelected top 3 models for ensemble: {list(top_models.keys())}")

# Clear unused models from memory
for model_name in list(trained_models.keys()):
    if model_name not in top_models:
        del trained_models[model_name]
        print(f"Removed {model_name} from memory")

gc.collect()
check_memory_usage()

# Strategy 2: Lightweight Voting Classifier
print("\nCreating lightweight Voting Ensemble...")

try:
    # Use hard voting (less memory intensive than soft voting)
    voting_classifier = VotingClassifier(
        estimators=[(name, trained_models[name]) for name in top_models.keys()],
        voting='hard',  # Changed from 'soft' to reduce memory
        n_jobs=1        # Reduce parallel processing to save memory
    )

    print("Training Voting Ensemble...")
    voting_classifier.fit(X_train_balanced, y_train_balanced)

    # Clear training data temporarily to free memory
    X_temp = X_train_balanced
    y_temp = y_train_balanced
    del X_train_balanced, y_train_balanced
    gc.collect()

    # Validate voting ensemble
    voting_val_pred = voting_classifier.predict(X_val_scaled)
    voting_val_accuracy = accuracy_score(y_val, voting_val_pred)
    print(f"Voting Ensemble validation accuracy: {voting_val_accuracy:.4f}")

    # Restore training data
    X_train_balanced = X_temp
    y_train_balanced = y_temp
    del X_temp, y_temp

    # Add to models
    trained_models['Voting_Ensemble'] = voting_classifier
    validation_scores['Voting_Ensemble'] = voting_val_accuracy

    if voting_val_accuracy >= 0.95:
        print(f"TARGET REACHED: {voting_val_accuracy:.4f} >= 0.95!")

    gc.collect()
    check_memory_usage()

except Exception as e:
    print(f"Voting ensemble failed due to memory: {e}")
    print("Continuing with individual models only...")

# Strategy 3: Skip Stacking Classifier (most memory intensive)
print("\nSkipping Stacking Ensemble due to memory constraints...")
print("Stacking requires storing predictions from all base models, which exceeds available RAM")

# Strategy 4: Create a simple weighted ensemble instead
print("\nCreating Memory-Efficient Weighted Ensemble...")

class WeightedEnsemble:
    def __init__(self, models, weights):
        self.models = models
        self.weights = weights

    def predict(self, X):
        """Make weighted predictions"""
        predictions = []

        for (name, model), weight in zip(self.models.items(), self.weights):
            pred = model.predict(X)
            predictions.append(pred * weight)

        # Sum weighted predictions and take argmax
        weighted_sum = np.sum(predictions, axis=0)
        return np.array([np.argmax(np.bincount(row.astype(int))) for row in weighted_sum.reshape(-1, 1)])

    def predict_proba(self, X):
        """Make weighted probability predictions if available"""
        if not hasattr(list(self.models.values())[0], 'predict_proba'):
            return None

        prob_predictions = []
        for (name, model), weight in zip(self.models.items(), self.weights):
            try:
                proba = model.predict_proba(X)
                prob_predictions.append(proba * weight)
            except:
                # If predict_proba fails, skip this model
                continue

        if prob_predictions:
            return np.mean(prob_predictions, axis=0)
        return None

# Create weighted ensemble with performance-based weights
weights = [score for score in top_models.values()]
weights = np.array(weights) / np.sum(weights)  # Normalize weights

print(f"Model weights: {dict(zip(top_models.keys(), weights))}")

weighted_ensemble = WeightedEnsemble(
    {name: trained_models[name] for name in top_models.keys()},
    weights
)

# Test weighted ensemble
weighted_val_pred = weighted_ensemble.predict(X_val_scaled)
weighted_val_accuracy = accuracy_score(y_val, weighted_val_pred)
print(f"Weighted Ensemble validation accuracy: {weighted_val_accuracy:.4f}")

if weighted_val_accuracy >= 0.95:
    print(f"TARGET REACHED: {weighted_val_accuracy:.4f} >= 0.95!")

# Add weighted ensemble
trained_models['Weighted_Ensemble'] = weighted_ensemble
validation_scores['Weighted_Ensemble'] = weighted_val_accuracy

# Final memory cleanup
gc.collect()
check_memory_usage()

print(f"\nENSEMBLE RESULTS:")
print("=" * 20)
ensemble_models = ['Voting_Ensemble', 'Weighted_Ensemble']
for name in ensemble_models:
    if name in validation_scores:
        score = validation_scores[name]
        status = "TARGET REACHED" if score >= 0.95 else "CLOSE" if score >= 0.90 else "NEEDS IMPROVEMENT"
        print(f"  {status}: {name:<18} {score:.4f}")

print(f"\nMemory-efficient ensemble creation completed!")
print(f"Available models: {list(trained_models.keys())}")

CREATING MEMORY-EFFICIENT ENSEMBLES
Memory usage: 71.6% (8.7GB/12.7GB)

Selecting best models for ensemble...
Model performance ranking:
  1. XGBoost: 0.8857
  2. LightGBM: 0.8855
  3. Enhanced_RF: 0.8739
  4. Extra_Trees: 0.8601

Selected top 3 models for ensemble: ['XGBoost', 'LightGBM', 'Enhanced_RF']
Removed Extra_Trees from memory
Memory usage: 71.6% (8.7GB/12.7GB)

Creating lightweight Voting Ensemble...
Training Voting Ensemble...
Voting Ensemble validation accuracy: 0.8859
Memory usage: 84.2% (10.3GB/12.7GB)

Skipping Stacking Ensemble due to memory constraints...
Stacking requires storing predictions from all base models, which exceeds available RAM

Creating Memory-Efficient Weighted Ensemble...
Model weights: {'XGBoost': np.float64(0.3348309059491484), 'LightGBM': np.float64(0.33478153542335226), 'Enhanced_RF': np.float64(0.33038755862749936)}
Weighted Ensemble validation accuracy: 0.8611
Memory usage: 84.2% (10.3GB/12.7GB)

ENSEMBLE RESULTS:
  NEEDS IMPROVEMENT: Voting_Ense

In [16]:
# ===============================================================================
# CELL 16: Final Evaluation on Test Set
# ===============================================================================

print("🎯 FINAL EVALUATION ON TEST SET")
print("=" * 35)

def evaluate_model_comprehensive(model, X_test, y_test, model_name):
    """Comprehensive model evaluation"""

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted', zero_division=0
    )

    print(f"\n📊 {model_name} Test Results:")
    print(f"   🎯 Accuracy:  {accuracy:.4f} ({'✅ TARGET REACHED!' if accuracy >= 0.95 else '📈 Close!' if accuracy >= 0.90 else '⚠️ Needs improvement'})")
    print(f"   🔍 Precision: {precision:.4f}")
    print(f"   📥 Recall:    {recall:.4f}")
    print(f"   ⚖️ F1-Score:  {f1:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'predictions': y_pred
    }

# Evaluate all models on test set
test_results = {}
best_accuracy = 0
best_model_name = ""

print("🔬 TESTING ALL MODELS:")
print("=" * 25)

for name, model in trained_models.items():
    results = evaluate_model_comprehensive(model, X_test_scaled, y_test, name)
    test_results[name] = results

    if results['accuracy'] > best_accuracy:
        best_accuracy = results['accuracy']
        best_model_name = name

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"🎯 BEST TEST ACCURACY: {best_accuracy:.4f}")

if best_accuracy >= 0.95:
    print("🎉 CONGRATULATIONS! 95%+ ACCURACY ACHIEVED!")
else:
    gap = 0.95 - best_accuracy
    print(f"📈 Gap to 95% target: {gap:.4f} ({gap*100:.2f} percentage points)")

🎯 FINAL EVALUATION ON TEST SET
🔬 TESTING ALL MODELS:

📊 Enhanced_RF Test Results:
   🎯 Accuracy:  0.8635 (⚠️ Needs improvement)
   🔍 Precision: 0.8689
   📥 Recall:    0.8635
   ⚖️ F1-Score:  0.8653

📊 XGBoost Test Results:
   🎯 Accuracy:  0.8768 (⚠️ Needs improvement)
   🔍 Precision: 0.8825
   📥 Recall:    0.8768
   ⚖️ F1-Score:  0.8783

📊 LightGBM Test Results:
   🎯 Accuracy:  0.8780 (⚠️ Needs improvement)
   🔍 Precision: 0.8847
   📥 Recall:    0.8780
   ⚖️ F1-Score:  0.8797

📊 Voting_Ensemble Test Results:
   🎯 Accuracy:  0.8797 (⚠️ Needs improvement)
   🔍 Precision: 0.8855
   📥 Recall:    0.8797
   ⚖️ F1-Score:  0.8812

📊 Weighted_Ensemble Test Results:
   🎯 Accuracy:  0.8491 (⚠️ Needs improvement)
   🔍 Precision: 0.8543
   📥 Recall:    0.8491
   ⚖️ F1-Score:  0.8508

🏆 BEST MODEL: Voting_Ensemble
🎯 BEST TEST ACCURACY: 0.8797
📈 Gap to 95% target: 0.0703 (7.03 percentage points)
