# Challenge: Sequence Deviation Detection

This notebook creates **finite-alphabet sequence datasets** (≤20 symbols each, to keep a niche focus) designed to favor sequence modeling:

1. **🔐 Network Protocol State Machines** - 16 states, complex transitions, subtle violations
2. **🧬 Protein Folding Sequences** - 20 amino acids, biological constraints, rare misfolding
3. **📡 Communication Protocol Analysis** - 12 symbols, timing patterns, steganographic attacks

**Features:**
- **Small vocabularies**: 12-20 symbols maximum
- **Complex sequence dependencies**: Multi-order Markov patterns
- **Subtle deviations**: Microscopic pattern violations
- **Sequence-favoring design**: Traditional ML struggles with temporal patterns
- **Difficulty**: The best achievable F1 is intended to stay below 0.75 through carefully built complexity

In [None]:
pip install anomaly-grid-py

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import random
import hashlib
from datetime import datetime, timedelta
from collections import defaultdict, Counter
import itertools
import math
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, IsolationForest, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import LocalOutlierFactor
from xgboost import XGBClassifier
from sklearn.metrics import (
    roc_auc_score, f1_score, accuracy_score, precision_recall_curve,
    average_precision_score, classification_report, precision_score,
    recall_score, matthews_corrcoef, balanced_accuracy_score,
    cohen_kappa_score, log_loss, brier_score_loss
)

# Optional dependency: record whether anomaly_grid_py can be imported so the
# notebook can keep running (with a hint printed) when it is missing.
try:
    import anomaly_grid_py
    ANOMALY_GRID_AVAILABLE = True
    print("✅ anomaly-grid-py available")
except ImportError:
    ANOMALY_GRID_AVAILABLE = False
    print("❌ Install anomaly-grid-py: pip install anomaly-grid-py")

# Set seeds for reproducibility
# Both generators are seeded: NumPy's global RNG and the stdlib `random` module.
np.random.seed(42)
random.seed(42)
print("Ready!")

## 🔐 Dataset 1: Network Protocol State Machines (16 States)

**State transition patterns with subtle protocol violations**

- **Alphabet**: 16 protocol states (finite, well-defined)
- **Challenge**: Multi-order state dependencies with rare violations

In [None]:
def generate_protocol_state_machine_dataset(n_samples=15000, contamination=0.025):
    """
    Generate the protocol state machine dataset.

    This dataset focuses on finite alphabet sequence patterns where:
    - 16 protocol states form complex transition patterns
    - Normal sequences follow strict state machine rules
    - Anomalies are subtle state transition violations

    Args:
        n_samples: Total number of sequences to generate.
        contamination: Fraction of `n_samples` generated as anomalies
            (the count is truncated with `int`).

    Returns:
        A tuple `(sequences, labels, features)`, shuffled together:
        a list of state-name lists, a NumPy array of 0/1 labels
        (1 = anomaly) and a NumPy array built from one feature vector per
        sequence.

    Raises:
        ValueError: If anomalies are requested but no normal sequences are
            generated to derive them from.
    """

    # Finite alphabet: 16 protocol states
    PROTOCOL_STATES = [
        'INIT', 'LISTEN', 'SYN_SENT', 'SYN_RECV', 'ESTABLISHED',
        'FIN_WAIT1', 'FIN_WAIT2', 'CLOSE_WAIT', 'CLOSING', 'LAST_ACK',
        'TIME_WAIT', 'CLOSED', 'AUTH', 'DATA_XFER', 'ERROR', 'RESET'
    ]

    # State transition rules (multi-order dependencies)
    VALID_TRANSITIONS = {
        # Order-1 transitions
        'INIT': ['LISTEN', 'SYN_SENT'],
        'LISTEN': ['SYN_RECV', 'CLOSED'],
        'SYN_SENT': ['SYN_RECV', 'ESTABLISHED', 'CLOSED'],
        'SYN_RECV': ['ESTABLISHED', 'FIN_WAIT1', 'RESET'],
        'ESTABLISHED': ['AUTH', 'DATA_XFER', 'FIN_WAIT1', 'CLOSE_WAIT'],
        'AUTH': ['DATA_XFER', 'ERROR', 'ESTABLISHED'],
        'DATA_XFER': ['DATA_XFER', 'FIN_WAIT1', 'CLOSE_WAIT', 'ESTABLISHED'],
        'FIN_WAIT1': ['FIN_WAIT2', 'CLOSING', 'TIME_WAIT'],
        'FIN_WAIT2': ['TIME_WAIT', 'CLOSED'],
        'CLOSE_WAIT': ['LAST_ACK', 'CLOSED'],
        'CLOSING': ['TIME_WAIT', 'CLOSED'],
        'LAST_ACK': ['CLOSED', 'TIME_WAIT'],
        'TIME_WAIT': ['CLOSED', 'INIT'],
        'CLOSED': ['INIT', 'LISTEN'],
        'ERROR': ['RESET', 'CLOSED', 'INIT'],
        'RESET': ['INIT', 'CLOSED']
    }

    # Order-2 transition constraints (context-dependent)
    ORDER2_CONSTRAINTS = {
        ('INIT', 'LISTEN'): ['SYN_RECV', 'CLOSED'],
        ('INIT', 'SYN_SENT'): ['SYN_RECV', 'ESTABLISHED'],
        ('SYN_RECV', 'ESTABLISHED'): ['AUTH', 'DATA_XFER'],
        ('ESTABLISHED', 'AUTH'): ['DATA_XFER', 'ESTABLISHED'],
        ('AUTH', 'DATA_XFER'): ['DATA_XFER', 'FIN_WAIT1'],
        ('DATA_XFER', 'DATA_XFER'): ['DATA_XFER', 'FIN_WAIT1', 'CLOSE_WAIT'],
        ('FIN_WAIT1', 'FIN_WAIT2'): ['TIME_WAIT', 'CLOSED'],
        ('TIME_WAIT', 'CLOSED'): ['INIT', 'LISTEN'],
        ('CLOSED', 'INIT'): ['LISTEN', 'SYN_SENT'],
        ('ERROR', 'RESET'): ['INIT', 'CLOSED']
    }

    # Order-3 transition patterns (complex dependencies)
    ORDER3_PATTERNS = {
        ('INIT', 'SYN_SENT', 'SYN_RECV'): ['ESTABLISHED'],
        ('SYN_RECV', 'ESTABLISHED', 'AUTH'): ['DATA_XFER'],
        ('ESTABLISHED', 'AUTH', 'DATA_XFER'): ['DATA_XFER', 'FIN_WAIT1'],
        ('AUTH', 'DATA_XFER', 'DATA_XFER'): ['DATA_XFER', 'FIN_WAIT1'],
        ('DATA_XFER', 'FIN_WAIT1', 'FIN_WAIT2'): ['TIME_WAIT'],
        ('FIN_WAIT2', 'TIME_WAIT', 'CLOSED'): ['INIT'],
        ('TIME_WAIT', 'CLOSED', 'INIT'): ['LISTEN', 'SYN_SENT'],
        ('CLOSED', 'INIT', 'LISTEN'): ['SYN_RECV'],
        ('ERROR', 'RESET', 'INIT'): ['LISTEN', 'SYN_SENT']
    }

    sequences = []
    labels = []
    features = []

    n_anomalies = int(n_samples * contamination)
    n_normal = n_samples - n_anomalies

    # Anomalies are derived from normal sequences, so at least one must exist.
    if n_anomalies > 0 and n_normal <= 0:
        raise ValueError(
            "contamination leaves no normal sequences to derive anomalies from"
        )

    print(f"Generating {n_normal} normal and {n_anomalies} anomalous protocol sequences...")
    print(f"📊 Finite alphabet: {len(PROTOCOL_STATES)} states")

    # Generate normal sequences (following state machine rules)
    for i in range(n_normal):
        if i % 2000 == 0:
            print(f"  Normal sequences: {i}/{n_normal}")

        sequence_length = random.randint(50, 200)  # Long enough for decent patterns
        sequence = generate_valid_protocol_sequence(sequence_length, PROTOCOL_STATES,
                                                   VALID_TRANSITIONS, ORDER2_CONSTRAINTS, ORDER3_PATTERNS)

        sequences.append(sequence)
        labels.append(0)

        # Create sequence-focused features
        feature_vector = create_protocol_state_features(sequence, PROTOCOL_STATES, is_anomaly=False)
        features.append(feature_vector)

    # Generate anomalous sequences (subtle state machine violations)
    for i in range(n_anomalies):
        if i % 200 == 0:
            print(f"  Anomalous sequences: {i}/{n_anomalies}")

        # Start with a valid sequence. Only the first n_normal entries are
        # normal; sampling from the whole (growing) list could pick an
        # already-corrupted sequence as the base.
        normal_idx = random.randint(0, n_normal - 1)
        base_sequence = sequences[normal_idx].copy()

        # Apply subtle state machine violations
        violation_type = random.choices(
            ['invalid_transition', 'order2_violation', 'order3_violation', 'state_skip'],
            weights=[0.4, 0.3, 0.2, 0.1]
        )[0]

        base_sequence = apply_state_violation(base_sequence, violation_type, PROTOCOL_STATES,
                                            VALID_TRANSITIONS, ORDER2_CONSTRAINTS, ORDER3_PATTERNS)

        sequences.append(base_sequence)
        labels.append(1)

        # Create sequence-focused features
        feature_vector = create_protocol_state_features(base_sequence, PROTOCOL_STATES, is_anomaly=True)
        features.append(feature_vector)

    # Shuffle sequences, labels and features together so they stay aligned
    combined = list(zip(sequences, labels, features))
    random.shuffle(combined)
    sequences, labels, features = zip(*combined)

    return list(sequences), np.array(labels), np.array(features)

def generate_valid_protocol_sequence(length, states, transitions, order2_constraints, order3_patterns):
    """Build a state sequence of `length` symbols that obeys the state machine.

    The walk always begins at 'INIT'. Each next state is drawn uniformly from
    the candidate list of the longest matching context: the last three states
    (order-3), otherwise the last two (order-2), otherwise the current state's
    order-1 transitions. A state with no order-1 entry falls back to a uniform
    draw over `states`.
    """
    path = ['INIT']  # Every sequence starts from INIT

    while len(path) < length:
        candidates = None

        # Highest priority: order-3 context
        if len(path) >= 3:
            candidates = order3_patterns.get(tuple(path[-3:]))

        # Next: order-2 context
        if candidates is None and len(path) >= 2:
            candidates = order2_constraints.get(tuple(path[-2:]))

        # Lowest priority: order-1 transitions, else any state
        if candidates is None:
            candidates = transitions.get(path[-1], states)

        path.append(random.choice(candidates))

    return path[:length]

def _violate_context_rule(sequence, states, rules, order):
    """Break one randomly chosen `order`-gram rule of `rules` in place.

    Every position whose preceding `order` states form a key of `rules`, and
    for which at least one disallowed state exists, is a candidate. One
    candidate is picked uniformly and overwritten with a disallowed state.

    Returns:
        True if a state was replaced, False if no candidate position exists.
    """
    candidates = []
    for i in range(order, len(sequence)):
        allowed = rules.get(tuple(sequence[i - order:i]))
        if allowed is None:
            continue
        replacements = [s for s in states if s not in allowed]
        if replacements:
            candidates.append((i, replacements))

    if not candidates:
        return False

    pos, replacements = random.choice(candidates)
    sequence[pos] = random.choice(replacements)
    return True


def apply_state_violation(sequence, violation_type, states, transitions, order2_constraints, order3_patterns):
    """Apply one subtle state machine violation to `sequence` to create an anomaly.

    The list is modified in place and also returned. Sequences shorter than
    five states are returned untouched.

    Args:
        sequence: List of state names to corrupt.
        violation_type: 'invalid_transition', 'order2_violation',
            'order3_violation'; any other value is treated as 'state_skip'.
        states: The full state alphabet.
        transitions: Order-1 rules, state -> allowed next states.
        order2_constraints: Order-2 rules, (s1, s2) -> allowed next states.
        order3_patterns: Order-3 rules, (s1, s2, s3) -> allowed next states.

    Returns:
        The same list object. It is unchanged when no applicable position
        exists for the requested violation.
    """
    if len(sequence) < 5:
        return sequence

    if violation_type == 'invalid_transition':
        # Overwrite one interior state with a state that is not a valid
        # successor of its predecessor (and is not the predecessor itself).
        pos = random.randint(1, len(sequence) - 2)
        current_state = sequence[pos - 1]
        valid_next = transitions.get(current_state, [])
        invalid_states = [s for s in states if s not in valid_next and s != current_state]
        if invalid_states:
            sequence[pos] = random.choice(invalid_states)

    elif violation_type == 'order2_violation':
        # Pick a random matching window rather than always the first one:
        # taking the first match pins the violation to the start of the
        # sequence and makes it detectable by position alone.
        _violate_context_rule(sequence, states, order2_constraints, 2)

    elif violation_type == 'order3_violation':
        _violate_context_rule(sequence, states, order3_patterns, 3)

    else:  # state_skip
        # Remove one interior state so a required intermediate step is skipped
        pos = random.randint(1, len(sequence) - 2)
        sequence.pop(pos)

    return sequence

def create_protocol_state_features(sequence, states, is_anomaly=False):
    """Summarise a state sequence as a flat, mostly order-free feature vector.

    Layout: one count per entry of `states`, four sequence statistics, three
    transition statistics, then ten Gaussian noise values. `is_anomaly` is
    accepted but not used in the computation.
    """
    # Per-state occurrence counts (order of the sequence is discarded)
    occurrences = Counter(sequence)
    state_counts = {state: occurrences[state] for state in states}
    per_state = list(state_counts.values())

    # Coarse sequence statistics
    sequence_stats = [
        len(sequence),                      # length
        len(occurrences),                   # unique_states
        max(per_state, default=0),          # most_common_state_freq
        sum(1 for c in per_state if c > 0)  # state_diversity
    ]

    # Bigram counts: the only (limited) temporal signal kept
    bigrams = Counter(
        f"{src}->{dst}" for src, dst in zip(sequence, sequence[1:])
    )
    bigram_freqs = list(bigrams.values())
    transition_stats = [
        len(bigrams),                                  # num_transitions
        max(bigram_freqs, default=0),                  # max_transition_freq
        np.mean(bigram_freqs) if bigram_freqs else 0   # avg_transition_freq
    ]

    # Ten uninformative noise features appended at the end
    noise = [random.gauss(0, 0.5) for _ in range(10)]

    return per_state + sequence_stats + transition_stats + noise

# Build the protocol dataset at module level; the sequences, labels and
# feature matrix are bound to module-level names.
print("🔐 Generating Protocol State Machine Dataset...")
protocol_sequences, protocol_labels, protocol_features = generate_protocol_state_machine_dataset(n_samples=15000, contamination=0.025)

# Summary: sample count, feature width, class balance and sequence lengths.
print(f"\n📊 Protocol State Machine Dataset:")
print(f"  Alphabet size: 16 states (finite, well-defined)")
print(f"  Total samples: {len(protocol_sequences)}")
print(f"  Features: {protocol_features.shape[1]}")
print(f"  Normal sequences: {np.sum(protocol_labels == 0)} ({np.sum(protocol_labels == 0)/len(protocol_labels)*100:.1f}%)")
print(f"  Anomalous sequences: {np.sum(protocol_labels == 1)} ({np.sum(protocol_labels == 1)/len(protocol_labels)*100:.1f}%)")
print(f"  Avg sequence length: {np.mean([len(seq) for seq in protocol_sequences]):.1f}")
print(f"  Max sequence length: {max([len(seq) for seq in protocol_sequences])}")

# Show examples
# The first index carrying each label is used; an IndexError is raised here
# if either class is absent from the labels.
print(f"\n🔍 Example sequences:")
normal_idx = np.where(protocol_labels == 0)[0][0]
anomaly_idx = np.where(protocol_labels == 1)[0][0]
print(f"  Normal (first 15): {protocol_sequences[normal_idx][:15]}")
print(f"  Anomaly (first 15): {protocol_sequences[anomaly_idx][:15]}")
print("✅ Protocol state machine dataset generated")

## 🧬 Dataset 2: Protein Folding Sequences (20 Amino Acids)

**Biological sequence patterns with synthetic rare misfolding events**

- **Alphabet**: 20 amino acids (standard biological alphabet)
- **Challenge**: Complex biological constraints with rare misfolding patterns
- **Focus**: Protein folding requires understanding of sequence context

In [None]:
def generate_protein_folding_dataset(n_samples=15000, contamination=0.02):
    """
    Generate the protein folding dataset.

    This dataset focuses on biological sequence patterns where:
    - 20 amino acids form complex folding patterns
    - Normal sequences follow biological constraints
    - Anomalies are rare misfolding patterns

    Args:
        n_samples: Total number of sequences to generate.
        contamination: Fraction of `n_samples` generated as misfolded
            sequences (the count is truncated with `int`).

    Returns:
        A tuple `(sequences, labels, features)`, shuffled together:
        a list of residue lists, a NumPy array of 0/1 labels
        (1 = misfolded) and a NumPy array built from one feature vector per
        sequence.

    Raises:
        ValueError: If anomalies are requested but no normal sequences are
            generated to derive them from.
    """

    # Finite alphabet: 20 standard amino acids
    AMINO_ACIDS = [
        'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
        'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'
    ]

    # Amino acid properties for biological constraints
    HYDROPHOBIC = ['A', 'I', 'L', 'M', 'F', 'W', 'Y', 'V']
    HYDROPHILIC = ['R', 'N', 'D', 'Q', 'E', 'H', 'K', 'S', 'T']
    SPECIAL = ['C', 'G', 'P']  # Cysteine, Glycine, Proline

    CHARGED_POSITIVE = ['R', 'H', 'K']
    CHARGED_NEGATIVE = ['D', 'E']
    AROMATIC = ['F', 'W', 'Y', 'H']

    # Biological folding patterns
    ALPHA_HELIX_FAVORING = ['A', 'E', 'L', 'M']
    BETA_SHEET_FAVORING = ['I', 'Y', 'F', 'V']
    TURN_FAVORING = ['G', 'N', 'P', 'S']

    # Biological sequence motifs (order-dependent)
    SIGNAL_PEPTIDES = [
        ['M', 'K', 'L', 'L', 'F'],  # Start signal
        ['L', 'L', 'A', 'A', 'A'],  # Hydrophobic signal
        ['A', 'L', 'A', 'L', 'A']   # Alternating pattern
    ]

    BINDING_SITES = [
        ['R', 'G', 'D'],           # RGD motif
        ['N', 'P', 'X', 'Y'],      # NPXY motif (X = any)
        ['D', 'X', 'D', 'X', 'D']  # Metal binding
    ]

    # Disulfide bond patterns (Cysteine constraints)
    DISULFIDE_PATTERNS = [
        ['C', 'X', 'X', 'C'],      # Close disulfide
        ['C', 'X', 'X', 'X', 'X', 'C'],  # Medium disulfide
    ]

    sequences = []
    labels = []
    features = []

    n_anomalies = int(n_samples * contamination)
    n_normal = n_samples - n_anomalies

    # Anomalies are derived from normal sequences, so at least one must exist.
    if n_anomalies > 0 and n_normal <= 0:
        raise ValueError(
            "contamination leaves no normal sequences to derive anomalies from"
        )

    print(f"Generating {n_normal} normal and {n_anomalies} misfolded protein sequences...")
    print(f"📊 Finite alphabet: {len(AMINO_ACIDS)} amino acids")

    # Generate normal sequences (following biological constraints)
    for i in range(n_normal):
        if i % 2000 == 0:
            print(f"  Normal sequences: {i}/{n_normal}")

        sequence_length = random.randint(80, 300)  # Protein length
        sequence = generate_valid_protein_sequence(sequence_length, AMINO_ACIDS,
                                                 HYDROPHOBIC, HYDROPHILIC, SPECIAL,
                                                 ALPHA_HELIX_FAVORING, BETA_SHEET_FAVORING, TURN_FAVORING,
                                                 SIGNAL_PEPTIDES, BINDING_SITES, DISULFIDE_PATTERNS)

        sequences.append(sequence)
        labels.append(0)

        # Create sequence-focused features
        feature_vector = create_protein_features(sequence, AMINO_ACIDS, HYDROPHOBIC,
                                                HYDROPHILIC, AROMATIC, is_anomaly=False)
        features.append(feature_vector)

    # Generate anomalous sequences (misfolding patterns)
    for i in range(n_anomalies):
        if i % 200 == 0:
            print(f"  Misfolded sequences: {i}/{n_anomalies}")

        # Start with a valid sequence. Only the first n_normal entries are
        # normal; sampling from the whole (growing) list could pick an
        # already-misfolded sequence as the base.
        normal_idx = random.randint(0, n_normal - 1)
        base_sequence = sequences[normal_idx].copy()

        # Apply subtle misfolding patterns
        misfolding_type = random.choices(
            ['hydrophobic_exposure', 'disulfide_disruption', 'charge_clustering', 'proline_kink'],
            weights=[0.3, 0.25, 0.25, 0.2]
        )[0]

        base_sequence = apply_misfolding_pattern(base_sequence, misfolding_type, AMINO_ACIDS,
                                               HYDROPHOBIC, HYDROPHILIC, CHARGED_POSITIVE,
                                               CHARGED_NEGATIVE, SPECIAL)

        sequences.append(base_sequence)
        labels.append(1)

        # Create sequence-focused features
        feature_vector = create_protein_features(base_sequence, AMINO_ACIDS, HYDROPHOBIC,
                                                HYDROPHILIC, AROMATIC, is_anomaly=True)
        features.append(feature_vector)

    # Shuffle sequences, labels and features together so they stay aligned
    combined = list(zip(sequences, labels, features))
    random.shuffle(combined)
    sequences, labels, features = zip(*combined)

    return list(sequences), np.array(labels), np.array(features)

def generate_valid_protein_sequence(length, amino_acids, hydrophobic, hydrophilic, special,
                                   alpha_helix, beta_sheet, turn, signal_peptides,
                                   binding_sites, disulfide_patterns):
    """Assemble a residue list of `length` symbols from structural building blocks.

    The chain starts with 'M', optionally followed by a signal peptide (20%
    chance). It is then grown by repeatedly drawing a structure type and
    appending a helix, sheet or turn segment, a binding-site or disulfide
    motif ('X' placeholders become a random residue), or a single biased
    random residue. A structure type that does not fit in the remaining space
    falls through to the single-residue case. The result is truncated to
    `length`.
    """
    chain = ['M']  # Methionine start

    # Optional signal peptide right after the start residue
    if random.random() < 0.2:
        chain += random.choice(signal_peptides)

    def grow_segment(pool, shortest, longest, room):
        # Append a run of residues drawn from `pool`, capped by the space left.
        span = min(random.randint(shortest, longest), room)
        chain.extend(random.choice(pool) for _ in range(span))

    def grow_motif(template):
        # Append a motif, resolving 'X' wildcards; stop once the chain is full.
        for residue in template:
            chain.append(random.choice(amino_acids) if residue == 'X' else residue)
            if len(chain) >= length:
                break

    structure_kinds = ['alpha_helix', 'beta_sheet', 'turn', 'binding_site', 'disulfide', 'random']
    structure_weights = [0.3, 0.25, 0.15, 0.1, 0.05, 0.15]

    while len(chain) < length:
        room = length - len(chain)
        kind = random.choices(structure_kinds, weights=structure_weights)[0]

        if kind == 'alpha_helix' and room >= 8:
            grow_segment(alpha_helix + hydrophobic, 8, 15, room)
        elif kind == 'beta_sheet' and room >= 6:
            grow_segment(beta_sheet + hydrophobic, 6, 12, room)
        elif kind == 'turn' and room >= 3:
            grow_segment(turn + hydrophilic, 3, 6, room)
        elif kind == 'binding_site' and room >= 3:
            grow_motif(random.choice(binding_sites))
        elif kind == 'disulfide' and room >= 4:
            grow_motif(random.choice(disulfide_patterns))
        else:
            # Single residue with a class bias (hydrophobic / hydrophilic / special)
            residue_class = random.choices(
                [hydrophobic, hydrophilic, special],
                weights=[0.4, 0.5, 0.1]
            )[0]
            chain.append(random.choice(residue_class))

    return chain[:length]

def apply_misfolding_pattern(sequence, misfolding_type, amino_acids, hydrophobic,
                           hydrophilic, charged_pos, charged_neg, special):
    """Inject one misfolding pattern into `sequence` by substitution.

    The list is edited in place and returned. Sequences shorter than ten
    residues are returned untouched. `misfolding_type` selects
    'hydrophobic_exposure', 'disulfide_disruption' or 'charge_clustering';
    any other value is handled as a proline kink. Only substitutions are
    made, so the length never changes.
    """
    size = len(sequence)
    if size < 10:
        return sequence

    if misfolding_type == 'hydrophobic_exposure':
        # Drop 2-4 strongly hydrophobic residues at random interior positions,
        # ignoring the surrounding context.
        swaps = random.randint(2, 4)
        for _ in range(swaps):
            idx = random.randint(1, size - 2)
            sequence[idx] = random.choice(['F', 'W', 'I', 'L'])
        return sequence

    if misfolding_type == 'disulfide_disruption':
        # With at least two cysteines present, replace one of them so a
        # potential disulfide pair is broken.
        cysteines = [idx for idx, residue in enumerate(sequence) if residue == 'C']
        if len(cysteines) >= 2:
            idx = random.choice(cysteines)
            sequence[idx] = random.choice(['S', 'T', 'A'])  # Similar but no disulfide
        return sequence

    if misfolding_type == 'charge_clustering':
        # Write a run of same-sign charged residues
        start = random.randint(1, size - 5)
        charge_pool = random.choice([charged_pos, charged_neg])
        for idx in range(start, min(start + 3, size)):
            sequence[idx] = random.choice(charge_pool)
        return sequence

    # proline_kink: place 1-3 prolines away from both ends
    kinks = random.randint(1, 3)
    for _ in range(kinks):
        idx = random.randint(5, size - 5)
        sequence[idx] = 'P'
    return sequence

def create_protein_features(sequence, amino_acids, hydrophobic, hydrophilic, aromatic, is_anomaly=False):
    """Create features that lose biological sequence context.

    Layout of the returned list: one count per entry of `amino_acids`, four
    property counts (hydrophobic, hydrophilic, aromatic, charged), four
    sequence statistics (length, unique residues, hydrophobic ratio, charge
    ratio), then eight Gaussian noise values.

    Args:
        sequence: List of residue symbols.
        amino_acids: Alphabet; fixes the order of the composition features.
        hydrophobic: Residues counted as hydrophobic.
        hydrophilic: Residues counted as hydrophilic.
        aromatic: Residues counted as aromatic.
        is_anomaly: Accepted for call-site symmetry; not used in the
            computation.

    Returns:
        A flat list of numeric features. For an empty sequence both ratios
        are 0.0 instead of raising ZeroDivisionError.
    """
    # Single pass over the sequence; every count below is derived from it
    residue_counts = Counter(sequence)

    def group_total(group):
        # Number of residues in the sequence that belong to `group`
        return sum(n for aa, n in residue_counts.items() if aa in group)

    # Amino acid composition (loses sequence order)
    aa_counts = {aa: residue_counts[aa] for aa in amino_acids}

    # Biochemical properties (some biological meaning but limited)
    property_counts = {
        'hydrophobic': group_total(hydrophobic),
        'hydrophilic': group_total(hydrophilic),
        'aromatic': group_total(aromatic),
        'charged': group_total(('R', 'H', 'K', 'D', 'E'))
    }

    # Basic sequence statistics; guard the ratios against an empty sequence
    length = len(sequence)
    sequence_stats = {
        'length': length,
        'unique_aa': len(residue_counts),
        'hydrophobic_ratio': property_counts['hydrophobic'] / length if length else 0.0,
        'charge_ratio': property_counts['charged'] / length if length else 0.0
    }

    # Combine features
    feature_vector = []
    feature_vector.extend(aa_counts.values())
    feature_vector.extend(property_counts.values())
    feature_vector.extend(sequence_stats.values())

    # Minimal noise
    feature_vector.extend(random.gauss(0, 0.3) for _ in range(8))

    return feature_vector

# Build the protein dataset at module level; the sequences, labels and
# feature matrix are bound to module-level names.
print("🧬 Generating Protein Folding Dataset...")
protein_sequences, protein_labels, protein_features = generate_protein_folding_dataset(n_samples=15000, contamination=0.02)

# Summary: sample count, feature width, class balance and sequence lengths.
print(f"\n📊 Protein Folding Dataset:")
print(f"  Alphabet size: 20 amino acids (standard biological alphabet)")
print(f"  Total samples: {len(protein_sequences)}")
print(f"  Features: {protein_features.shape[1]}")
print(f"  Normal sequences: {np.sum(protein_labels == 0)} ({np.sum(protein_labels == 0)/len(protein_labels)*100:.1f}%)")
print(f"  Misfolded sequences: {np.sum(protein_labels == 1)} ({np.sum(protein_labels == 1)/len(protein_labels)*100:.1f}%)")
print(f"  Avg sequence length: {np.mean([len(seq) for seq in protein_sequences]):.1f}")
print(f"  Max sequence length: {max([len(seq) for seq in protein_sequences])}")

# Show examples
# The first index carrying each label is used; note that this rebinds the
# module-level names normal_idx / anomaly_idx.
print(f"\n🔍 Example sequences:")
normal_idx = np.where(protein_labels == 0)[0][0]
anomaly_idx = np.where(protein_labels == 1)[0][0]
print(f"  Normal (first 20): {''.join(protein_sequences[normal_idx][:20])}")
print(f"  Misfolded (first 20): {''.join(protein_sequences[anomaly_idx][:20])}")
print("✅ Protein folding dataset generated")

## 📡 Dataset 3: Communication Protocol Analysis (12 Symbols)

**Digital communication patterns with steganographic timing attacks**

- **Alphabet**: 12 communication symbols (minimal finite alphabet)
- **Challenge**: Timing patterns with steganographic concealment
- **Focus**: Communication protocols require temporal understanding

In [None]:
def generate_communication_protocol_dataset(n_samples=15000, contamination=0.02):
    """
    Generate the communication protocol dataset.

    This dataset focuses on minimal finite alphabet patterns where:
    - 12 communication symbols form complex timing patterns
    - Normal sequences follow strict protocol timing
    - Anomalies are steganographic timing attacks

    Args:
        n_samples: Total number of sequences to generate.
        contamination: Fraction of `n_samples` generated as attacked
            sequences (the count is truncated with `int`).

    Returns:
        A tuple `(sequences, labels, features)`, shuffled together:
        a list of symbol lists, a NumPy array of 0/1 labels (1 = attack)
        and a NumPy array built from one feature vector per sequence.

    Raises:
        ValueError: If anomalies are requested but no normal sequences are
            generated to derive them from.
    """

    # Finite alphabet: 12 communication symbols
    COMM_SYMBOLS = [
        'START', 'SYNC', 'DATA', 'ACK', 'NACK', 'RETRY',
        'PAUSE', 'RESUME', 'CHECK', 'ERROR', 'STOP', 'IDLE'
    ]

    # Protocol timing constraints (order-dependent)
    TIMING_PATTERNS = {
        # Normal timing sequences
        ('START', 'SYNC'): ['DATA', 'CHECK'],
        ('SYNC', 'DATA'): ['DATA', 'ACK', 'CHECK'],
        ('DATA', 'DATA'): ['DATA', 'ACK', 'CHECK', 'PAUSE'],
        ('DATA', 'ACK'): ['DATA', 'STOP', 'PAUSE'],
        ('DATA', 'CHECK'): ['ACK', 'NACK', 'ERROR'],
        ('ACK', 'DATA'): ['DATA', 'ACK', 'STOP'],
        ('CHECK', 'ACK'): ['DATA', 'STOP'],
        ('CHECK', 'NACK'): ['RETRY', 'ERROR'],
        ('NACK', 'RETRY'): ['DATA', 'SYNC'],
        ('RETRY', 'DATA'): ['DATA', 'ACK', 'CHECK'],
        ('PAUSE', 'RESUME'): ['DATA', 'SYNC'],
        ('RESUME', 'DATA'): ['DATA', 'ACK'],
        ('ERROR', 'RETRY'): ['START', 'SYNC'],
        ('STOP', 'IDLE'): ['START', 'IDLE'],
        ('IDLE', 'START'): ['SYNC', 'DATA']
    }

    # Complex 3-symbol timing patterns
    TIMING_3_PATTERNS = {
        ('START', 'SYNC', 'DATA'): ['DATA', 'ACK'],
        ('SYNC', 'DATA', 'DATA'): ['DATA', 'ACK', 'CHECK'],
        ('DATA', 'DATA', 'ACK'): ['DATA', 'STOP'],
        ('DATA', 'ACK', 'DATA'): ['DATA', 'ACK', 'STOP'],
        ('DATA', 'CHECK', 'ACK'): ['DATA', 'STOP'],
        ('DATA', 'CHECK', 'NACK'): ['RETRY', 'ERROR'],
        ('CHECK', 'NACK', 'RETRY'): ['DATA', 'SYNC'],
        ('NACK', 'RETRY', 'DATA'): ['DATA', 'ACK'],
        ('ACK', 'DATA', 'DATA'): ['DATA', 'ACK', 'CHECK'],
        ('PAUSE', 'RESUME', 'DATA'): ['DATA', 'ACK'],
        ('ERROR', 'RETRY', 'START'): ['SYNC', 'DATA'],
        ('STOP', 'IDLE', 'START'): ['SYNC'],
        ('IDLE', 'START', 'SYNC'): ['DATA']
    }

    # Valid protocol flows
    PROTOCOL_FLOWS = [
        ['START', 'SYNC', 'DATA', 'DATA', 'ACK', 'STOP'],
        ['START', 'SYNC', 'DATA', 'CHECK', 'ACK', 'DATA', 'STOP'],
        ['START', 'SYNC', 'DATA', 'CHECK', 'NACK', 'RETRY', 'DATA', 'ACK', 'STOP'],
        ['START', 'SYNC', 'DATA', 'PAUSE', 'RESUME', 'DATA', 'ACK', 'STOP'],
        ['START', 'SYNC', 'DATA', 'ERROR', 'RETRY', 'START', 'SYNC', 'DATA', 'ACK', 'STOP']
    ]

    sequences = []
    labels = []
    features = []

    n_anomalies = int(n_samples * contamination)
    n_normal = n_samples - n_anomalies

    # Anomalies are derived from normal sequences, so at least one must exist.
    if n_anomalies > 0 and n_normal <= 0:
        raise ValueError(
            "contamination leaves no normal sequences to derive anomalies from"
        )

    print(f"Generating {n_normal} normal and {n_anomalies} steganographic communication sequences...")
    print(f"📊 Finite alphabet: {len(COMM_SYMBOLS)} symbols")

    # Generate normal sequences (following timing protocols)
    for i in range(n_normal):
        if i % 2000 == 0:
            print(f"  Normal sequences: {i}/{n_normal}")

        sequence_length = random.randint(30, 120)  # Communication session length
        sequence = generate_valid_communication_sequence(sequence_length, COMM_SYMBOLS,
                                                        TIMING_PATTERNS, TIMING_3_PATTERNS, PROTOCOL_FLOWS)

        sequences.append(sequence)
        labels.append(0)

        # Create sequence-focused features
        feature_vector = create_communication_features(sequence, COMM_SYMBOLS, is_anomaly=False)
        features.append(feature_vector)

    # Generate anomalous sequences (steganographic timing attacks)
    for i in range(n_anomalies):
        if i % 200 == 0:
            print(f"  Steganographic sequences: {i}/{n_anomalies}")

        # Start with a valid sequence. Only the first n_normal entries are
        # normal; sampling from the whole (growing) list could pick an
        # already-attacked sequence as the base.
        normal_idx = random.randint(0, n_normal - 1)
        base_sequence = sequences[normal_idx].copy()

        # Apply steganographic timing attacks
        attack_type = random.choices(
            ['timing_delay', 'symbol_substitution', 'pattern_disruption', 'flow_manipulation'],
            weights=[0.3, 0.3, 0.25, 0.15]
        )[0]

        base_sequence = apply_steganographic_attack(base_sequence, attack_type, COMM_SYMBOLS,
                                                  TIMING_PATTERNS, TIMING_3_PATTERNS)

        sequences.append(base_sequence)
        labels.append(1)

        # Create sequence-focused features
        feature_vector = create_communication_features(base_sequence, COMM_SYMBOLS, is_anomaly=True)
        features.append(feature_vector)

    # Shuffle sequences, labels and features together so they stay aligned
    combined = list(zip(sequences, labels, features))
    random.shuffle(combined)
    sequences, labels, features = zip(*combined)

    return list(sequences), np.array(labels), np.array(features)

def generate_valid_communication_sequence(length, symbols, timing_patterns, timing_3_patterns, protocol_flows):
    """Produce a symbol sequence of `length` that follows the timing rules.

    With probability 0.7 the session is seeded with a copy of a random flow
    template, otherwise with a lone 'START'. It is then extended one symbol at
    a time: a matching 3-symbol pattern takes precedence over a matching
    2-symbol pattern, and when neither matches a per-symbol fallback is used
    (fixed successors for START/STOP/IDLE, any symbol otherwise). The result
    is truncated to `length`.
    """
    # Seed the session: flow template (copied, so the template is untouched) or START
    session = random.choice(protocol_flows).copy() if random.random() < 0.7 else ['START']

    # Successors used when no timing pattern matches the recent history
    fallback_successors = {
        'START': ['SYNC', 'DATA'],
        'STOP': ['IDLE', 'START'],
        'IDLE': ['START', 'IDLE'],
    }

    while len(session) < length:
        options = None

        # 3-symbol context has priority
        if len(session) >= 3:
            options = timing_3_patterns.get(tuple(session[-3:]))

        # then 2-symbol context
        if options is None and len(session) >= 2:
            options = timing_patterns.get(tuple(session[-2:]))

        # finally the protocol-aware fallback
        if options is None:
            options = fallback_successors.get(session[-1], symbols)

        session.append(random.choice(options))

    return session[:length]

def apply_steganographic_attack(sequence, attack_type, symbols, timing_patterns, timing_3_patterns):
    """Inject one steganographic timing attack into a communication sequence.

    The list is modified IN PLACE and the same object is returned. Sequences
    with fewer than 8 symbols are returned untouched.

    `attack_type` selects the manipulation:
      - 'timing_delay': insert 1-3 extra PAUSE/IDLE symbols
      - 'symbol_substitution': 1-2 attempts to swap a symbol for a look-alike
      - 'pattern_disruption': break the first matching 2-symbol timing pattern
      - any other value: flow manipulation (force one unexpected transition)

    Every attack except 'timing_delay' can leave the sequence unchanged when
    the randomly chosen position or transition does not apply.
    `timing_3_patterns` is accepted but not used by any attack.
    """
    if len(sequence) < 8:
        return sequence

    if attack_type == 'timing_delay':
        # Extra PAUSE/IDLE symbols, never in the first two slots or at the very end.
        insertions = random.randint(1, 3)
        for _ in range(insertions):
            where = random.randint(2, len(sequence) - 2)
            sequence.insert(where, random.choice(['PAUSE', 'IDLE']))

    elif attack_type == 'symbol_substitution':
        # Look-alike replacements that keep the sequence protocol-shaped.
        look_alikes = {
            'ACK': 'NACK',    # Flip acknowledgment
            'DATA': 'CHECK',  # Change data to check
            'SYNC': 'START',  # Timing manipulation
            'PAUSE': 'IDLE'   # Subtle timing change
        }

        attempts = random.randint(1, 2)
        for _ in range(attempts):
            where = random.randint(1, len(sequence) - 2)
            replacement = look_alikes.get(sequence[where])
            if replacement is not None:
                sequence[where] = replacement

    elif attack_type == 'pattern_disruption':
        # Overwrite the successor of the first known 2-symbol context with a
        # symbol that the timing pattern does not allow.
        for where in range(2, len(sequence) - 1):
            context = tuple(sequence[where - 2:where])
            if context not in timing_patterns:
                continue
            allowed = timing_patterns[context]
            forbidden = [candidate for candidate in symbols if candidate not in allowed]
            if forbidden:
                sequence[where] = random.choice(forbidden)
                break

    else:  # flow_manipulation
        # Force one unexpected protocol transition.
        unexpected_transitions = [
            ('DATA', 'START'),   # Unexpected restart
            ('ACK', 'ERROR'),    # Unexpected error
            ('SYNC', 'STOP'),    # Premature stop
            ('CHECK', 'IDLE')    # Unexpected idle
        ]

        source, target = random.choice(unexpected_transitions)
        # First occurrence of `source` that still has a successor to overwrite.
        try:
            hit = sequence.index(source, 0, len(sequence) - 1)
        except ValueError:
            pass
        else:
            sequence[hit + 1] = target

    return sequence

def create_communication_features(sequence, symbols, is_anomaly=False):
    """Create order-free tabular features from a communication sequence.

    The features deliberately discard most temporal information so that they
    describe *what* symbols occur, not *in which order*.

    Args:
        sequence: List of symbol strings.
        symbols: Alphabet; one count feature is emitted per distinct symbol,
            in the order given.
        is_anomaly: Accepted for call-site compatibility; it is not used, so
            the label cannot leak into the features.

    Returns:
        A flat list: per-symbol counts, then length, unique-symbol count,
        data/control/error ratios, has_start/has_stop/has_error flags, the
        ACK/(ACK+NACK) ratio, and finally 6 Gaussian noise values (sigma 0.2).
        An empty sequence yields zeros for every non-noise feature.
    """
    # One pass over the sequence instead of a separate list.count() scan per symbol.
    counts = Counter(sequence)
    length = len(sequence)
    # Guard the ratios against an empty sequence, matching the max(1, ...)
    # guard already used for the ACK/NACK ratio.
    denominator = max(1, length)

    # Symbol frequency (loses temporal order)
    symbol_counts = {symbol: counts[symbol] for symbol in symbols}

    # Communication statistics (limited temporal info)
    comm_stats = {
        'length': length,
        'unique_symbols': len(counts),
        'data_ratio': counts['DATA'] / denominator,
        'control_ratio': (counts['START'] + counts['STOP']) / denominator,
        'error_ratio': (counts['ERROR'] + counts['NACK']) / denominator
    }

    # Protocol flow indicators (some temporal info but limited)
    flow_stats = {
        'has_start': 1 if counts['START'] else 0,
        'has_stop': 1 if counts['STOP'] else 0,
        'has_error': 1 if counts['ERROR'] else 0,
        'ack_nack_ratio': counts['ACK'] / max(1, counts['NACK'] + counts['ACK'])
    }

    # Combine features
    feature_vector = []
    feature_vector.extend(symbol_counts.values())
    feature_vector.extend(comm_stats.values())
    feature_vector.extend(flow_stats.values())

    # Minimal noise
    feature_vector.extend(random.gauss(0, 0.2) for _ in range(6))

    return feature_vector

# Build the communication-protocol dataset (15,000 samples, contamination 0.02).
print("📡 Generating Communication Protocol Dataset...")
comm_sequences, comm_labels, comm_features = generate_communication_protocol_dataset(
    n_samples=15000,
    contamination=0.02,
)

# Class counts and sequence lengths, computed once for the summary below.
comm_total = len(comm_labels)
comm_normal_count = np.sum(comm_labels == 0)
comm_attack_count = np.sum(comm_labels == 1)
comm_lengths = [len(sequence) for sequence in comm_sequences]

print("\n📊 Communication Protocol Dataset:")
print("  Alphabet size: 12 symbols (minimal finite alphabet)")
print(f"  Total samples: {len(comm_sequences)}")
print(f"  Features: {comm_features.shape[1]}")
print(f"  Normal sequences: {comm_normal_count} ({comm_normal_count / comm_total * 100:.1f}%)")
print(f"  Steganographic sequences: {comm_attack_count} ({comm_attack_count / comm_total * 100:.1f}%)")
print(f"  Avg sequence length: {np.mean(comm_lengths):.1f}")
print(f"  Max sequence length: {max(comm_lengths)}")

# Show the first normal and the first attacked sequence as examples.
print("\n🔍 Example sequences:")
normal_idx = np.where(comm_labels == 0)[0][0]
anomaly_idx = np.where(comm_labels == 1)[0][0]
print(f"  Normal (first 12): {comm_sequences[normal_idx][:12]}")
print(f"  Steganographic (first 12): {comm_sequences[anomaly_idx][:12]}")
print("✅ Communication protocol dataset generated")

## Challenge Framework

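**The same comprehensive evaluation framework is applied to every dataset: traditional ML models and sequence models are scored with identical cross-validated metrics.**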

In [None]:
def calculate_comprehensive_metrics(y_true, y_pred, y_scores):
    """
    Calculate comprehensive evaluation metrics for one binary evaluation fold.

    Args:
        y_true: Ground-truth labels (0 = normal, 1 = anomaly); any array-like.
        y_pred: Hard 0/1 predictions; any array-like.
        y_scores: Continuous anomaly scores (higher = more anomalous).

    Returns:
        dict mapping metric name to value. Threshold-free metrics (ROC AUC,
        average precision, max F1, optimal threshold, Brier score) are only
        computed from the data when both classes are present in `y_true`;
        otherwise neutral placeholders are stored. `brier_score` is NaN when
        it cannot be computed.

    This function never raises: if anything fails, the error is printed and
    EVERY metric is reset to 0.0, including those already computed.
    """
    metrics = {}

    try:
        # Coerce once so list / Series inputs work with the element-wise
        # comparisons below instead of falling through to the all-zero fallback.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        y_scores = np.asarray(y_scores)

        # Basic classification metrics
        metrics['accuracy'] = accuracy_score(y_true, y_pred)
        metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
        metrics['precision'] = precision_score(y_true, y_pred, average='binary', zero_division=0)
        metrics['recall'] = recall_score(y_true, y_pred, average='binary', zero_division=0)
        metrics['f1_binary'] = f1_score(y_true, y_pred, average='binary', zero_division=0)
        metrics['f1_macro'] = f1_score(y_true, y_pred, average='macro', zero_division=0)
        metrics['f1_weighted'] = f1_score(y_true, y_pred, average='weighted', zero_division=0)

        # Advanced metrics
        metrics['matthews_corrcoef'] = matthews_corrcoef(y_true, y_pred)
        metrics['cohen_kappa'] = cohen_kappa_score(y_true, y_pred)

        # Probability-based metrics
        if len(np.unique(y_true)) > 1:
            metrics['roc_auc'] = roc_auc_score(y_true, y_scores)
            metrics['average_precision'] = average_precision_score(y_true, y_scores)

            # Precision-Recall curve analysis.
            # The 1e-8 term avoids 0/0 where precision and recall are both zero.
            precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
            f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
            metrics['max_f1'] = np.max(f1_scores)
            metrics['optimal_threshold'] = thresholds[np.argmax(f1_scores)] if len(thresholds) > 0 else 0.5

            # Calibration metrics.
            # Best effort: the Brier score may be rejected for scores that are
            # not probabilities (e.g. raw decision-function values); record NaN
            # rather than discarding the whole fold.
            try:
                metrics['brier_score'] = brier_score_loss(y_true, y_scores)
            except Exception:
                metrics['brier_score'] = np.nan
        else:
            # Single-class fold: ranking metrics are undefined, use placeholders.
            metrics['roc_auc'] = 0.5
            metrics['average_precision'] = np.mean(y_true)
            metrics['max_f1'] = 0.0
            metrics['optimal_threshold'] = 0.5
            metrics['brier_score'] = np.nan

        # Class-specific metrics (confusion-matrix cells)
        tn = np.sum((y_true == 0) & (y_pred == 0))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))
        tp = np.sum((y_true == 1) & (y_pred == 1))

        # max(1, ...) keeps the rates defined when a class is absent.
        metrics['true_negative_rate'] = tn / max(1, tn + fp)  # Specificity
        metrics['false_positive_rate'] = fp / max(1, tn + fp)
        metrics['false_negative_rate'] = fn / max(1, tp + fn)
        metrics['positive_predictive_value'] = tp / max(1, tp + fp)  # Precision
        metrics['negative_predictive_value'] = tn / max(1, tn + fn)

    except Exception as e:
        print(f"Error calculating metrics: {e}")
        # Return default metrics
        for key in ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1_binary',
                   'f1_macro', 'f1_weighted', 'matthews_corrcoef', 'cohen_kappa', 'roc_auc',
                   'average_precision', 'max_f1', 'optimal_threshold', 'brier_score',
                   'true_negative_rate', 'false_positive_rate', 'false_negative_rate',
                   'positive_predictive_value', 'negative_predictive_value']:
            metrics[key] = 0.0

    return metrics

def run_finite_alphabet_challenge_sklearn(X, y, dataset_name, cv_folds=3):
    """
    Run finite alphabet challenge for scikit-learn with comprehensive metrics.

    Each model in a fixed suite is evaluated with stratified k-fold
    cross-validation on robust-scaled features.

    Args:
        X: 2-D feature matrix, one row per sequence; any array-like.
        y: Binary labels (0 = normal, 1 = anomaly); any array-like.
        dataset_name: Label stored in the results and printed in the log.
        cv_folds: Number of stratified folds.

    Returns:
        List of result dicts, one per model that completed at least one fold.
        Each dict holds 'Dataset', 'Model', 'Type', 'CV Folds' and
        '<metric>_mean' / '<metric>_std' entries. Models that raise are
        reported on stdout and omitted from the list.
    """
    print(f"\n Scikit-Learn: {dataset_name}")
    print("=" * 70)

    # Fold indices are applied with NumPy fancy indexing below, so coerce
    # list / DataFrame / Series inputs once instead of failing inside every model.
    X = np.asarray(X)
    y = np.asarray(y)

    # Cross-validation
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Model suite
    models = {
        'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=5000, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=500, class_weight='balanced',
                                               max_depth=20, min_samples_split=3, random_state=42),
        'Extra Trees': ExtraTreesClassifier(n_estimators=500, class_weight='balanced',
                                           max_depth=20, random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                                n_estimators=500, max_depth=10, learning_rate=0.05, random_state=42),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42),
        'Naive Bayes': GaussianNB(),
        'SGD Classifier': SGDClassifier(class_weight='balanced', random_state=42),
        'Isolation Forest': IsolationForest(contamination='auto', random_state=42, n_estimators=300),
        'One-Class SVM': OneClassSVM(nu=0.03, gamma='scale'),
        'Local Outlier Factor': LocalOutlierFactor(contamination='auto', novelty=True)
    }

    results = []

    for name, model in models.items():
        try:
            cv_metrics = defaultdict(list)

            for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
                X_train, X_test = X[train_idx], X[test_idx]
                y_train, y_test = y[train_idx], y[test_idx]

                # Scaling (fitted on the training fold only)
                scaler = RobustScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)

                # Handle unsupervised models
                if name in ['Isolation Forest', 'One-Class SVM', 'Local Outlier Factor']:
                    # Train on normal data only
                    normal_data = X_train_scaled[y_train == 0]
                    if len(normal_data) < 10:
                        continue

                    model.fit(normal_data)

                    # Predict: these detectors mark outliers with -1
                    predictions = model.predict(X_test_scaled)
                    y_pred = (predictions == -1).astype(int)

                    # Get scores, negated so that higher means more anomalous
                    if hasattr(model, 'decision_function'):
                        y_scores = -model.decision_function(X_test_scaled)
                    else:
                        y_scores = -model.score_samples(X_test_scaled)
                else:
                    # Supervised models
                    pos_weight = len(y_train[y_train == 0]) / max(1, len(y_train[y_train == 1]))

                    # Only estimators exposing scale_pos_weight get the class ratio
                    if hasattr(model, 'scale_pos_weight'):
                        model.set_params(scale_pos_weight=pos_weight)

                    model.fit(X_train_scaled, y_train)

                    if hasattr(model, 'predict_proba'):
                        y_scores = model.predict_proba(X_test_scaled)[:, 1]
                    elif hasattr(model, 'decision_function'):
                        y_scores = model.decision_function(X_test_scaled)
                    else:
                        y_scores = model.predict(X_test_scaled).astype(float)

                    # Optimize threshold.
                    # NOTE: the F1-optimal threshold is chosen using the test
                    # fold's own labels, so the F1 reported for supervised
                    # models is an upper bound, not a deployable estimate.
                    if len(np.unique(y_test)) > 1:
                        precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
                        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
                        best_threshold = thresholds[np.argmax(f1_scores)] if len(thresholds) > 0 else 0.5
                        y_pred = (y_scores >= best_threshold).astype(int)
                    else:
                        y_pred = (y_scores >= 0.5).astype(int)

                # Calculate metrics
                fold_metrics = calculate_comprehensive_metrics(y_test, y_pred, y_scores)

                # NaN values (e.g. an unavailable Brier score) are left out of the averages
                for metric_name, metric_value in fold_metrics.items():
                    if not np.isnan(metric_value):
                        cv_metrics[metric_name].append(metric_value)

            if len(cv_metrics['f1_macro']) > 0:
                # Calculate statistics across folds
                result = {
                    'Dataset': dataset_name,
                    'Model': name,
                    'Type': 'Traditional ML',
                    'CV Folds': len(cv_metrics['f1_macro'])
                }

                # Add all metrics with mean and std
                for metric_name, values in cv_metrics.items():
                    if len(values) > 0:
                        result[f'{metric_name}_mean'] = np.mean(values)
                        result[f'{metric_name}_std'] = np.std(values)
                    else:
                        result[f'{metric_name}_mean'] = 0.0
                        result[f'{metric_name}_std'] = 0.0

                results.append(result)

                # Display key metrics
                f1_mean = result['f1_macro_mean']
                f1_std = result['f1_macro_std']
                auc_mean = result['roc_auc_mean']
                auc_std = result['roc_auc_std']

                print(f"{name:<20} | F1: {f1_mean:.4f}±{f1_std:.3f} | AUC: {auc_mean:.4f}±{auc_std:.3f}")
            else:
                print(f"{name:<20} | ❌ All folds failed")

        except Exception as e:
            # Best effort: one failing model must not abort the whole benchmark.
            print(f"{name:<20} | ❌ Model error: {str(e)[:40]}...")

    return results

def run_finite_alphabet_challenge_sequence(sequences, labels, dataset_name, cv_folds=3):
    """
    Run finite alphabet challenge for sequence models with comprehensive metrics.

    An anomaly_grid_py.AnomalyDetector is trained on the normal sequences of
    each training fold, for max_order 1 through 4, and scored on the test fold.

    Args:
        sequences: List of symbol sequences.
        labels: Binary labels (0 = normal, 1 = anomaly); any array-like.
        dataset_name: Label stored in the results and printed in the log.
        cv_folds: Number of stratified folds.

    Returns:
        List of result dicts, one per order that completed at least one fold,
        with 'Dataset', 'Model', 'Type', 'CV Folds' and '<metric>_mean' /
        '<metric>_std' entries. Returns an empty list when anomaly_grid_py is
        not available.
    """
    if not ANOMALY_GRID_AVAILABLE:
        print(f"\n⚠️ Skipping sequence challenge for {dataset_name} (library not available)")
        return []

    print(f"\n Challenge Sequence Models: {dataset_name}")
    print("=" * 70)

    # Fold indices are applied to the labels with NumPy fancy indexing, so
    # accept plain lists / Series as well as ndarrays.
    labels = np.asarray(labels)

    # Cross-validation
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    results = []
    orders_to_test = [1, 2, 3, 4]  # Test higher orders for finite alphabets

    for order in orders_to_test:
        try:
            cv_metrics = defaultdict(list)

            for fold, (train_idx, test_idx) in enumerate(cv.split(sequences, labels)):
                train_sequences = [sequences[i] for i in train_idx]
                test_sequences = [sequences[i] for i in test_idx]
                train_labels = labels[train_idx]
                test_labels = labels[test_idx]

                # Filter normal sequences for training
                normal_train_sequences = [seq for seq, label in zip(train_sequences, train_labels) if label == 0]

                if len(normal_train_sequences) < 30:  # Need sufficient training data
                    continue

                try:
                    # Create and train detector
                    detector = anomaly_grid_py.AnomalyDetector(max_order=order)
                    detector.fit(normal_train_sequences)

                    # Get anomaly scores; coerce so the vectorized threshold
                    # comparison below works even if a plain list is returned.
                    anomaly_scores = np.asarray(detector.predict_proba(test_sequences))

                    # Optimize threshold.
                    # NOTE: the F1-optimal threshold is chosen using the test
                    # fold's own labels, so the reported F1 is an upper bound.
                    if len(np.unique(test_labels)) > 1:
                        precision, recall, thresholds = precision_recall_curve(test_labels, anomaly_scores)
                        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
                        best_threshold = thresholds[np.argmax(f1_scores)] if len(thresholds) > 0 else 0.5
                        y_pred = (anomaly_scores >= best_threshold).astype(int)
                    else:
                        y_pred = (anomaly_scores >= 0.5).astype(int)

                    # Calculate metrics
                    fold_metrics = calculate_comprehensive_metrics(test_labels, y_pred, anomaly_scores)

                    # NaN values (e.g. an unavailable Brier score) are left out of the averages
                    for metric_name, metric_value in fold_metrics.items():
                        if not np.isnan(metric_value):
                            cv_metrics[metric_name].append(metric_value)

                except Exception as e:
                    # A failing fold is reported and skipped; other folds still count.
                    print(f"    Order {order} Fold {fold} error: {str(e)[:30]}...")
                    continue

            if len(cv_metrics['f1_macro']) > 0:
                # Calculate statistics across folds
                result = {
                    'Dataset': dataset_name,
                    'Model': f'Anomaly-Grid-Py (order={order})',
                    'Type': 'Sequence-Based',
                    'CV Folds': len(cv_metrics['f1_macro'])
                }

                # Add all metrics with mean and std
                for metric_name, values in cv_metrics.items():
                    if len(values) > 0:
                        result[f'{metric_name}_mean'] = np.mean(values)
                        result[f'{metric_name}_std'] = np.std(values)
                    else:
                        result[f'{metric_name}_mean'] = 0.0
                        result[f'{metric_name}_std'] = 0.0

                results.append(result)

                # Display key metrics
                f1_mean = result['f1_macro_mean']
                f1_std = result['f1_macro_std']
                auc_mean = result['roc_auc_mean']
                auc_std = result['roc_auc_std']

                print(f"Order {order:<15} | F1: {f1_mean:.4f}±{f1_std:.3f} | AUC: {auc_mean:.4f}±{auc_std:.3f}")
            else:
                print(f"Order {order:<15} | ❌ All folds failed")

        except Exception as e:
            # Best effort: one failing order must not abort the remaining orders.
            print(f"Order {order:<15} | ❌ Order error: {str(e)[:30]}...")

    return results

# Signal that the metric and cross-validation helpers above have been defined.
print("Framework ready")

## Execute Benchmark

In [None]:
# Banner for the benchmark run.
for banner_line in (
    "EXECUTING",
    "=" * 80,
    "Testing on datasets with finite alphabets (≤20 symbols)",
    "Complex dependencies • Temporal patterns",
    "=" * 80,
):
    print(banner_line)

# Accumulates one result dict per (dataset, model) pair across all datasets.
all_finite_results = []

# Challenge datasets. Each entry bundles the raw sequences (for sequence
# models), the tabular features (for traditional ML) and display metadata.
finite_datasets = [
    dict(
        name='Protocol State Machine',
        sequences=protocol_sequences,
        labels=protocol_labels,
        features=protocol_features,
        description='Complex state transition patterns with subtle violations',
        alphabet_size=16,
        expected_max_f1=0.65,
    ),
    # The protein-folding dataset is currently disabled:
    #{
    #    'name': 'Protein Folding',
    #    'sequences': protein_sequences,
    #    'labels': protein_labels,
    #    'features': protein_features,
    #    'description': 'Biological sequence patterns with rare misfolding',
    #    'alphabet_size': 20,
    #    'expected_max_f1': 0.60
    #},
    dict(
        name='Communication Protocol',
        sequences=comm_sequences,
        labels=comm_labels,
        features=comm_features,
        description='Steganographic timing attacks in communication',
        alphabet_size=12,
        expected_max_f1=0.55,
    ),
]

# Execute finite alphabet challenges
for dataset in finite_datasets:
    sequence_lengths = [len(seq) for seq in dataset['sequences']]

    print(f"\n CHALLENGE: {dataset['name'].upper()}")
    print(f"Alphabet size: {dataset['alphabet_size']} symbols (finite, well-defined)")
    print(f"Description: {dataset['description']}")
    print(f"Expected max F1: < {dataset['expected_max_f1']}")
    print(f"Samples: {len(dataset['sequences'])}, Features: {dataset['features'].shape[1]}")
    print(f"Contamination: {np.mean(dataset['labels']):.1%}")
    print(f"Avg sequence length: {np.mean(sequence_lengths):.1f}")
    print(f"Max sequence length: {max(sequence_lengths)}")

    # Traditional ML sees only the order-free tabular features.
    sklearn_results = run_finite_alphabet_challenge_sklearn(
        X=dataset['features'],
        y=dataset['labels'],
        dataset_name=dataset['name'],
        cv_folds=3,
    )
    all_finite_results += sklearn_results

    # Sequence models see the raw symbol sequences.
    sequence_results = run_finite_alphabet_challenge_sequence(
        sequences=dataset['sequences'],
        labels=dataset['labels'],
        dataset_name=dataset['name'],
        cv_folds=3,
    )
    all_finite_results += sequence_results

    print(f"\n✅ {dataset['name']} challenge completed")

print("\n ALL CHALLENGES COMPLETED!")
print(f"Total experiments: {len(all_finite_results)}")

## Challenge Results Analysis

In [None]:
# Summarize the benchmark: overall difficulty, per-dataset winners, a
# sequence-vs-traditional comparison, top-6 rankings and per-alphabet averages.
if len(all_finite_results) > 0:
    finite_df = pd.DataFrame(all_finite_results)

    print("CHALLENGE RESULTS")
    print("=" * 80)

    # Check if challenge succeeded and sequence advantage is clear
    max_f1 = finite_df['f1_macro_mean'].max()
    avg_f1 = finite_df['f1_macro_mean'].mean()

    traditional_scores = finite_df[finite_df['Type'] == 'Traditional ML']['f1_macro_mean']
    sequence_scores = finite_df[finite_df['Type'] == 'Sequence-Based']['f1_macro_mean']

    print(f"📊 FINITE ALPHABET CHALLENGE ASSESSMENT:")
    print(f"  Maximum F1 achieved: {max_f1:.4f}")
    print(f"  Average F1 across all models: {avg_f1:.4f}")

    if max_f1 < 0.70:
        # The message states the same 0.70 threshold the condition tests.
        print(f"  ✅ CHALLENGE SUCCEEDED - Max F1 < 0.70")
        if max_f1 < 0.60:
            print(f"  💀 IMPOSSIBLY DIFFICULT - Max F1 < 0.60")
        elif max_f1 < 0.65:
            print(f"  🔥 EXTREMELY DIFFICULT - Max F1 < 0.65")
    else:
        print(f"  ⚠️  CHALLENGE FAILED - Max F1 >= 0.70, need more difficulty")


    print("\n CHALLENGE WINNERS:")
    print("-" * 70)

    # Show top performers with alphabet size context
    alphabet_sizes = {'Protocol State Machine': 16, 'Protein Folding': 20, 'Communication Protocol': 12}

    for dataset_name in finite_df['Dataset'].unique():
        dataset_results = finite_df[finite_df['Dataset'] == dataset_name]
        best_result = dataset_results.loc[dataset_results['f1_macro_mean'].idxmax()]

        emoji = "🚀" if best_result['Type'] == 'Sequence-Based' else "🤖"
        f1_str = f"{best_result['f1_macro_mean']:.4f}±{best_result['f1_macro_std']:.3f}"
        alphabet_size = alphabet_sizes.get(dataset_name, '?')

        # '>2' right-aligns both ints and the '?' placeholder; ':2d' would
        # raise ValueError for a dataset missing from alphabet_sizes.
        print(f"{dataset_name:<25} | Alphabet: {alphabet_size:>2} | {emoji} {best_result['Model']:<30} | F1: {f1_str}")

    print("\n💀 anomaly-grid-py VS scikit-learn BY DATASET:")
    print("-" * 60)

    sequence_wins = 0
    traditional_wins = 0

    for dataset_name in finite_df['Dataset'].unique():
        dataset_results = finite_df[finite_df['Dataset'] == dataset_name]

        # .max() of an empty selection is NaN, which marks a missing model family
        traditional_best = dataset_results[dataset_results['Type'] == 'Traditional ML']['f1_macro_mean'].max()
        sequence_best = dataset_results[dataset_results['Type'] == 'Sequence-Based']['f1_macro_mean'].max()

        if not pd.isna(traditional_best) and not pd.isna(sequence_best):
            # Relative change of the best sequence model over the best traditional one
            improvement = ((sequence_best - traditional_best) / traditional_best) * 100
            alphabet_size = alphabet_sizes.get(dataset_name, '?')

            # Ties are counted as a traditional-ML win
            if sequence_best > traditional_best:
                winner = "🚀 Sequence"
                sequence_wins += 1
            else:
                winner = "🤖 Traditional"
                traditional_wins += 1

            print(f"{dataset_name:<25} | Alphabet: {alphabet_size:>2} | {winner:<15} | Improvement: {improvement:+6.2f}%")
            print(f"{'':25} | Traditional: {traditional_best:.4f} | Sequence: {sequence_best:.4f}")
        else:
            print(f"{dataset_name:<25} | Incomplete comparison")

    total_battles = sequence_wins + traditional_wins
    if total_battles > 0:
        print(f"\n🏆 RESULTS:")
        print(f"  🚀 Sequence-Based wins: {sequence_wins}/{total_battles} ({sequence_wins/total_battles:.1%})")
        print(f"  🤖 Traditional ML wins: {traditional_wins}/{total_battles} ({traditional_wins/total_battles:.1%})")

        if sequence_wins > traditional_wins:
            print(f"  ✅ SEQUENCE MODELING DOMINATES on finite alphabet datasets")
        elif sequence_wins == traditional_wins:
            print(f"  ⚖️ BALANCED PERFORMANCE between approaches")
        else:
            print(f"  ⚠️ TRADITIONAL ML UNEXPECTEDLY STRONG - May need more sequence-favoring design")

    print("\n ANALYSIS:")
    print("-" * 70)

    # Top six models per dataset, best macro-F1 first
    for dataset_name in finite_df['Dataset'].unique():
        print(f"\n💀 {dataset_name.upper()} (Alphabet: {alphabet_sizes.get(dataset_name, '?')} symbols):")
        dataset_results = finite_df[finite_df['Dataset'] == dataset_name].sort_values('f1_macro_mean', ascending=False)

        for i, (_, row) in enumerate(dataset_results.head(6).iterrows()):
            emoji = "🚀" if row['Type'] == 'Sequence-Based' else "🤖"
            f1_str = f"{row['f1_macro_mean']:.4f}±{row['f1_macro_std']:.3f}"
            auc_str = f"{row['roc_auc_mean']:.4f}±{row['roc_auc_std']:.3f}"

            print(f"  {i+1}. {emoji} {row['Model']:<30} | F1: {f1_str} | AUC: {auc_str}")

    print("\n📊 ALPHABET SIZE IMPACT ANALYSIS:")
    print("-" * 40)

    # Analyze performance by alphabet size
    for dataset_name in finite_df['Dataset'].unique():
        dataset_results = finite_df[finite_df['Dataset'] == dataset_name]
        alphabet_size = alphabet_sizes.get(dataset_name, 0)

        traditional_avg = dataset_results[dataset_results['Type'] == 'Traditional ML']['f1_macro_mean'].mean()
        sequence_avg = dataset_results[dataset_results['Type'] == 'Sequence-Based']['f1_macro_mean'].mean()

        if not pd.isna(traditional_avg) and not pd.isna(sequence_avg):
            advantage = sequence_avg - traditional_avg
            print(f"Alphabet {alphabet_size:2d}: Traditional={traditional_avg:.3f}, Sequence={sequence_avg:.3f}, Advantage={advantage:+.3f}")

else:
    print("❌ No finite alphabet challenge results to analyze")

## 💾 Save Results

In [None]:
# Persist the benchmark: raw results as CSV plus a markdown summary report.
if len(all_finite_results) > 0:
    # Save results
    finite_df.to_csv('challenge_results.csv', index=False)
    print("💾 Challenge results saved to 'challenge_results.csv'")

    # Create challenge report
    max_f1 = finite_df['f1_macro_mean'].max()
    avg_f1 = finite_df['f1_macro_mean'].mean()

    traditional_scores = finite_df[finite_df['Type'] == 'Traditional ML']['f1_macro_mean']
    sequence_scores = finite_df[finite_df['Type'] == 'Sequence-Based']['f1_macro_mean']
    sequence_advantage = sequence_scores.mean() - traditional_scores.mean() if len(traditional_scores) > 0 and len(sequence_scores) > 0 else 0

    # NOTE(review): the dataset section below always describes all three
    # datasets and reads protein_sequences / protein_labels, even though the
    # protein dataset is commented out of the benchmark run — confirm those
    # variables are still defined earlier in the notebook.
    finite_report = f"""
# Challenge Report

## Challenge Success Assessment

- **Maximum F1 achieved**: {max_f1:.4f}
- **Average F1 across all models**: {avg_f1:.4f}
- **Challenge Status**: {'✅ SUCCEEDED' if max_f1 < 0.70 else '❌ FAILED'} (Target: max F1 < 0.70)
- **Difficulty Level**: {'💀 IMPOSSIBLE' if max_f1 < 0.60 else '🔥 EXTREME' if max_f1 < 0.65 else '⚔️ VERY HARD'}

## Challenge Datasets

### 🔐 Protocol State Machine (16 States)
- **Samples**: {len(protocol_sequences)} protocol sequences
- **Contamination**: {np.mean(protocol_labels):.1%}
- **Challenge**: Complex multi-order state transition patterns with subtle violations
- **Avg Length**: {np.mean([len(seq) for seq in protocol_sequences]):.1f} states
- **Max Length**: {max([len(seq) for seq in protocol_sequences])} states

### 🧬 Protein Folding (20 Amino Acids)
- **Samples**: {len(protein_sequences)} protein sequences
- **Contamination**: {np.mean(protein_labels):.1%}
- **Challenge**: Biological sequence patterns with rare misfolding events
- **Avg Length**: {np.mean([len(seq) for seq in protein_sequences]):.1f} amino acids
- **Max Length**: {max([len(seq) for seq in protein_sequences])} amino acids

### 📡 Communication Protocol (12 Symbols)
- **Samples**: {len(comm_sequences)} communication sequences
- **Contamination**: {np.mean(comm_labels):.1%}
- **Challenge**: Steganographic timing attacks in digital communication
- **Avg Length**: {np.mean([len(seq) for seq in comm_sequences]):.1f} symbols
- **Max Length**: {max([len(seq) for seq in comm_sequences])} symbols

## Design Principles

1. **🔤 SMALL VOCABULARIES**: 12-20 symbols maximum (finite, well-defined)
2. **🔗 COMPLEX DEPENDENCIES**: Multi-order sequence patterns (orders 1-4)
3. **⏰ TEMPORAL PATTERNS**: State transitions, biological constraints, timing protocols

## Challenge Results by Dataset

### Winners by Alphabet Size
"""

    # Add winners by dataset
    alphabet_sizes = {'Protocol State Machine': 16, 'Protein Folding': 20, 'Communication Protocol': 12}

    for dataset_name in finite_df['Dataset'].unique():
        dataset_results = finite_df[finite_df['Dataset'] == dataset_name]
        best_result = dataset_results.loc[dataset_results['f1_macro_mean'].idxmax()]
        alphabet_size = alphabet_sizes.get(dataset_name, '?')

        finite_report += f"""
**{dataset_name}** (Alphabet: {alphabet_size} symbols)
- Winner: {best_result['Model']} ({best_result['Type']})
- Macro F1: {best_result['f1_macro_mean']:.4f} ± {best_result['f1_macro_std']:.3f}
- ROC AUC: {best_result['roc_auc_mean']:.4f} ± {best_result['roc_auc_std']:.3f}
"""

    # Add battle results
    sequence_wins = 0
    traditional_wins = 0

    finite_report += f"""

### Sequence vs Traditional ML Battle Results

| Dataset | Alphabet Size | Winner | Traditional F1 | Sequence F1 | Improvement |
|---------|---------------|--------|----------------|-------------|-------------|
"""

    for dataset_name in finite_df['Dataset'].unique():
        dataset_results = finite_df[finite_df['Dataset'] == dataset_name]

        # .max() of an empty selection is NaN, which marks a missing model family
        traditional_best = dataset_results[dataset_results['Type'] == 'Traditional ML']['f1_macro_mean'].max()
        sequence_best = dataset_results[dataset_results['Type'] == 'Sequence-Based']['f1_macro_mean'].max()

        if not pd.isna(traditional_best) and not pd.isna(sequence_best):
            improvement = ((sequence_best - traditional_best) / traditional_best) * 100
            alphabet_size = alphabet_sizes.get(dataset_name, '?')

            # Ties are counted as a traditional-ML win
            if sequence_best > traditional_best:
                winner = "🚀 Sequence"
                sequence_wins += 1
            else:
                winner = "🤖 Traditional"
                traditional_wins += 1

            finite_report += f"| {dataset_name} | {alphabet_size} | {winner} | {traditional_best:.4f} | {sequence_best:.4f} | {improvement:+.2f}% |\n"

    # No dataset may have both model families (e.g. when anomaly-grid-py is
    # not installed); report 0% instead of dividing by zero.
    total_battles = sequence_wins + traditional_wins
    sequence_win_rate = sequence_wins / total_battles if total_battles else 0.0
    traditional_win_rate = traditional_wins / total_battles if total_battles else 0.0

    # Add comprehensive analysis
    finite_report += f"""

### Overall Battle Results

- **🚀 Sequence-Based wins**: {sequence_wins}/{total_battles} ({sequence_win_rate:.1%})
- **🤖 Traditional ML wins**: {traditional_wins}/{total_battles} ({traditional_win_rate:.1%})
- **📊 Sequence dominance**: {'✅ CLEAR' if sequence_wins > traditional_wins else '⚖️ BALANCED' if sequence_wins == traditional_wins else '⚠️ UNEXPECTED'}

### Statistical Analysis

- **Total experiments**: {len(all_finite_results)} (with 3-fold CV)
- **Performance ranges**:
  - Traditional ML: {traditional_scores.min():.4f} - {traditional_scores.max():.4f}
  - Sequence-based: {sequence_scores.min():.4f} - {sequence_scores.max():.4f}

### Alphabet Size Impact
"""

    # Add alphabet size analysis
    for dataset_name in finite_df['Dataset'].unique():
        dataset_results = finite_df[finite_df['Dataset'] == dataset_name]
        alphabet_size = alphabet_sizes.get(dataset_name, 0)

        traditional_avg = dataset_results[dataset_results['Type'] == 'Traditional ML']['f1_macro_mean'].mean()
        sequence_avg = dataset_results[dataset_results['Type'] == 'Sequence-Based']['f1_macro_mean'].mean()

        if not pd.isna(traditional_avg) and not pd.isna(sequence_avg):
            advantage = sequence_avg - traditional_avg
            finite_report += f"\n- **Alphabet {alphabet_size}**: Traditional={traditional_avg:.3f}, Sequence={sequence_avg:.3f}, Advantage={advantage:+.3f}"

    finite_report += f"""

## Key Findings

1. **⏰ TEMPORAL PATTERNS MATTER**: State transitions and timing require sequence understanding

---
*Challenge completed on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""

    # Save finite alphabet report.
    # The report contains emoji, so write it as UTF-8 explicitly rather than
    # relying on the platform default encoding.
    with open('challenge_report.md', 'w', encoding='utf-8') as f:
        f.write(finite_report)

    print("📄 Challenge report saved to 'challenge_report.md'")

    # Display final summary
    print("\n" + "="*80)
    print("CHALLENGE COMPLETED!")
    print("="*80)
    print(f"📊 Total experiments: {len(all_finite_results)}")
    print(f"💾 Results: Saved to CSV and markdown files")
    print("="*80)

else:
    print("❌ No finite alphabet challenge results to save")