# Transformer WAF - LogBERT Training & Preprocessing Notebook

This notebook demonstrates the preprocessing and training pipeline for the LogBERT-based Web Application Firewall.

## Overview
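- **Data Source**: Tomcat-style access logs (`*.log`) read from the local `logs` directory; synthetic sample entries are generated when no logs are found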
- **Model**: LogBERT Transformer for anomaly detection
- **Training**: Unsupervised learning on benign traffic patterns
- **Output**: Trained model for real-time anomaly detection

## 1. Setup and Imports

In [None]:
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pathlib import Path
import re
from collections import Counter, defaultdict
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("📚 Libraries imported successfully!")
print(f"🐍 Python version: {sys.version}")

In [None]:
# Import WAF components
# These four modules are project-local; they must be importable from the
# notebook's working directory (or elsewhere on sys.path).
try:
    from log_parser_normalizer import LogParserNormalizer
    from logbert_transformer_model import LogBERTModel, LogBERTConfig
    from waf_training_pipeline import WAFTrainingPipeline
    from incremental_lora_learning import LoRAIncrementalLearner
    print("✅ WAF components imported successfully!")
except ImportError as e:
    # NOTE(review): the failure is only printed, not re-raised, so later cells
    # that reference these names will fail with NameError if this import failed.
    print(f"⚠️ Error importing WAF components: {e}")
    print("Make sure all WAF Python files are in the same directory.")

## 2. Configuration and Data Paths

In [None]:
# Configuration
# Single source of truth for paths, model hyper-parameters and training
# settings used by the cells below.
CONFIG = {
    'data_paths': {
        'tomcat_logs': '/opt/tomcat/logs',
        'local_logs': 'logs',
        'sample_logs': 'sample_data/access_logs.txt'
    },
    'model_config': {
        'vocab_size': 10000,
        'hidden_size': 256,
        'num_layers': 6,
        'num_heads': 8,
        'max_sequence_length': 512
    },
    'training': {
        'batch_size': 32,
        'learning_rate': 2e-5,
        'num_epochs': 3,
        'warmup_steps': 100
    },
    'output_paths': {
        'models': 'models',
        'plots': 'plots',
        'reports': 'reports'
    }
}

# Create directories
# parents=True so that a nested output path (e.g. 'artifacts/models') can be
# configured without the mkdir failing on a missing parent directory.
for path in CONFIG['output_paths'].values():
    Path(path).mkdir(parents=True, exist_ok=True)

print("📁 Directories created successfully!")
print(json.dumps(CONFIG, indent=2))

## 3. Data Loading and Exploration

In [None]:
def load_log_files(log_directories):
    """Load every non-empty line from the ``*.log`` files in the given directories.

    Args:
        log_directories: Iterable of directory paths (``str`` or ``Path``).
            Directories that do not exist are reported and skipped.

    Returns:
        A list of dicts, one per non-blank line, with the keys ``raw_log``
        (the stripped line), ``file`` (source path as ``str``) and
        ``timestamp`` (the time the line was loaded, not the time recorded
        inside the log line).
    """
    log_entries = []

    for log_dir in log_directories:
        log_path = Path(log_dir)
        if not log_path.exists():
            print(f"⚠️ Directory not found: {log_path}")
            continue

        # Sorted so entries are collected in the same order on every run.
        log_files = sorted(log_path.glob("*.log"))
        print(f"📁 Found {len(log_files)} log files in {log_path}")

        for log_file in log_files:
            try:
                # errors='replace': access logs can contain raw attack bytes
                # that are not valid UTF-8. Strict decoding would raise and
                # drop the whole file, hiding exactly the requests of interest.
                with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
                    # Stream line by line instead of loading the file at once.
                    for line in f:
                        line = line.strip()
                        if line:
                            log_entries.append({
                                'raw_log': line,
                                'file': str(log_file),
                                'timestamp': datetime.now()
                            })
            except OSError as e:
                # Unreadable file (permissions, directory named *.log, ...):
                # report it and carry on with the remaining files.
                print(f"❌ Error reading {log_file}: {e}")

    return log_entries

# Load logs
# Only the 'local_logs' directory from CONFIG is scanned here; the
# 'tomcat_logs' and 'sample_logs' entries of CONFIG['data_paths'] are not used
# by this cell.
log_directories = [
    CONFIG['data_paths']['local_logs'],
    # Add other directories as they become available
]

# raw_logs is a list of dicts with 'raw_log', 'file' and 'timestamp' keys.
raw_logs = load_log_files(log_directories)
print(f"📊 Loaded {len(raw_logs)} log entries")

In [None]:
# Generate sample data if no logs are found
# Fallback so the rest of the notebook can run without real log files.
if len(raw_logs) == 0:
    print("🔧 Generating sample log data for demonstration...")
    
    # Seven combined-log-format lines: five ordinary requests followed by a
    # path-traversal attempt and a script-injection attempt.
    sample_logs = [
        '192.168.1.100 - - [01/Dec/2023:10:00:00 +0000] "GET /index.html HTTP/1.1" 200 2345 "-" "Mozilla/5.0"',
        '192.168.1.101 - - [01/Dec/2023:10:00:01 +0000] "POST /api/login HTTP/1.1" 200 567 "http://example.com/login" "Mozilla/5.0"',
        '192.168.1.102 - - [01/Dec/2023:10:00:02 +0000] "GET /products?id=123 HTTP/1.1" 200 1234 "-" "Mozilla/5.0"',
        '192.168.1.103 - - [01/Dec/2023:10:00:03 +0000] "GET /api/users HTTP/1.1" 200 890 "-" "curl/7.68.0"',
        '192.168.1.104 - - [01/Dec/2023:10:00:04 +0000] "POST /checkout HTTP/1.1" 200 456 "http://example.com/cart" "Mozilla/5.0"',
        # Potentially anomalous entries
        '192.168.1.105 - - [01/Dec/2023:10:00:05 +0000] "GET /admin/../../etc/passwd HTTP/1.1" 404 123 "-" "curl/7.68.0"',
        '192.168.1.106 - - [01/Dec/2023:10:00:06 +0000] "POST /search?q=<script>alert(1)</script> HTTP/1.1" 200 789 "-" "Mozilla/5.0"',
    ]
    
    # Same record shape as load_log_files() produces. The list is repeated 50
    # times (350 entries) and each entry gets a load timestamp one second
    # after the previous one.
    raw_logs = []
    for i, log in enumerate(sample_logs * 50):  # Multiply for more data
        raw_logs.append({
            'raw_log': log,
            'file': 'sample_data',
            'timestamp': datetime.now() + timedelta(seconds=i)
        })
        
    print(f"✅ Generated {len(raw_logs)} sample log entries")

# Show first few logs
# Each preview is cut to 100 characters; the trailing "..." is always printed.
print("\n📋 Sample log entries:")
for i, log in enumerate(raw_logs[:5]):
    print(f"{i+1}. {log['raw_log'][:100]}...")

## 4. Log Parsing and Normalization

In [None]:
# Initialize log parser
# NOTE: the top-level `await` below only works in an environment with a
# running event loop that supports it (e.g. an IPython/Jupyter kernel); it is
# a SyntaxError in a plain Python script.
log_parser = LogParserNormalizer()
await log_parser.initialize()

print("🔧 Log parser initialized successfully!")

In [None]:
# Common/Combined Log Format pattern, compiled once instead of on every call.
# The trailing referer / user-agent pair is optional so that plain CLF lines
# (without those two quoted fields) are accepted as well.
_CLF_PATTERN = re.compile(
    r'(?P<remote_addr>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] '
    r'"(?P<method>\S+) (?P<uri>\S+) (?P<protocol>[^"]+)" '
    r'(?P<status>\d+) (?P<bytes_sent>\S+)'
    r'(?: "(?P<referer>[^"]*)" "(?P<user_agent>[^"]*)")?'
)


def parse_clf_log(log_line):
    """Parse one Common/Combined Log Format access-log line.

    Args:
        log_line: A single raw log line.

    Returns:
        ``None`` when the line does not match the expected format, otherwise a
        dict with the keys ``timestamp``, ``remote_addr``, ``method``, ``uri``
        (path without the query string), ``query_string`` (text after the
        first ``?``, ``''`` when absent), ``protocol``, ``status`` (int),
        ``bytes_sent`` (int, ``0`` when logged as ``-``), ``referer`` and
        ``user_agent`` (both ``''`` when the line carries no such fields).

    Raises:
        ValueError: If the bytes-sent field is neither ``-`` nor an integer.
    """
    match = _CLF_PATTERN.match(log_line)
    if match is None:
        return None

    groups = match.groupdict()

    # Split the request target into path and query string at the first '?'.
    uri, _, query_string = groups['uri'].partition('?')

    bytes_field = groups['bytes_sent']

    return {
        'timestamp': groups['timestamp'],
        'remote_addr': groups['remote_addr'],
        'method': groups['method'],
        'uri': uri,
        'query_string': query_string,
        'protocol': groups['protocol'],
        'status': int(groups['status']),
        'bytes_sent': 0 if bytes_field == '-' else int(bytes_field),
        # groupdict() maps an unmatched optional group to None (the key is
        # always present), so normalise to '' explicitly; otherwise None leaks
        # downstream and ends up stringified as the token "None".
        'referer': groups['referer'] or '',
        'user_agent': groups['user_agent'] or ''
    }

# Parse all logs
# Every raw entry is parsed with parse_clf_log(); successfully parsed entries
# are enriched with a template for the URI and the source file name.
parsed_logs = []
parsing_errors = 0
first_error = None  # kept so that a systematic failure is not silently hidden

print("🔍 Parsing log entries...")
for log_entry in raw_logs:
    try:
        parsed = parse_clf_log(log_entry['raw_log'])
        if parsed:
            # Add template for the URI path via the project log parser
            # (presumably Drain-based — confirm against LogParserNormalizer).
            template = log_parser.parse_log(parsed['uri'])
            parsed['template'] = template
            parsed['file'] = log_entry['file']
            parsed_logs.append(parsed)
        else:
            # Line did not match the access-log format.
            parsing_errors += 1
    except Exception as e:
        # Best effort: one bad entry must not abort the whole run, but the
        # first exception is remembered and reported below. Otherwise an API
        # mismatch or missing object looks identical to malformed log lines.
        parsing_errors += 1
        if first_error is None:
            first_error = f"{type(e).__name__}: {e}"

print(f"✅ Successfully parsed {len(parsed_logs)} log entries")
print(f"⚠️ Parsing errors: {parsing_errors}")
if first_error is not None:
    print(f"⚠️ First parsing exception: {first_error}")

# Convert to DataFrame for easier analysis
df_logs = pd.DataFrame(parsed_logs)
print(f"📊 Created DataFrame with shape: {df_logs.shape}")

In [None]:
# Display basic statistics
print("📈 Log Data Overview:")
print("="*50)
print(f"Total log entries: {len(df_logs)}")
print(f"Unique IP addresses: {df_logs['remote_addr'].nunique()}")
print(f"Unique URIs: {df_logs['uri'].nunique()}")
print(f"Unique templates: {df_logs['template'].nunique()}")
# NOTE(review): 'timestamp' holds the raw bracketed text from the log line
# (see parse_clf_log), so min()/max() compare strings lexicographically rather
# than chronologically — confirm whether a real date range is wanted here.
print(f"Date range: {df_logs['timestamp'].min()} to {df_logs['timestamp'].max()}")

# Display first few rows
# `display` is provided by IPython/Jupyter; it is not a Python builtin.
print("\n📋 Sample parsed logs:")
display(df_logs.head())

## 5. Exploratory Data Analysis

In [None]:
# HTTP methods distribution
# One figure with three side-by-side panels: method share (pie), the ten most
# frequent status codes (bar) and the ten most frequent URI templates (barh).
plt.figure(figsize=(12, 4))

plt.subplot(1, 3, 1)
method_counts = df_logs['method'].value_counts()
plt.pie(method_counts.values, labels=method_counts.index, autopct='%1.1f%%')
plt.title('HTTP Methods Distribution')

# Status codes distribution
plt.subplot(1, 3, 2)
status_counts = df_logs['status'].value_counts().head(10)
# Bars are placed at positions 0..n-1 and labelled with the status codes, so
# the x axis is categorical rather than numeric.
plt.bar(range(len(status_counts)), status_counts.values)
plt.xticks(range(len(status_counts)), status_counts.index, rotation=45)
plt.title('Top 10 Status Codes')
plt.ylabel('Count')

# URI templates distribution
plt.subplot(1, 3, 3)
template_counts = df_logs['template'].value_counts().head(10)
plt.barh(range(len(template_counts)), template_counts.values)
# Labels longer than 30 characters are shortened to keep the axis readable.
# Assumes every template is a string (len() and slicing are applied to it).
plt.yticks(range(len(template_counts)), [t[:30] + '...' if len(t) > 30 else t for t in template_counts.index])
plt.title('Top 10 URI Templates')
plt.xlabel('Count')

plt.tight_layout()
# Saved into the configured plots directory before being shown.
plt.savefig(f"{CONFIG['output_paths']['plots']}/log_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

print("📊 Exploratory data analysis complete!")

In [None]:
# Analyze request patterns
print("🔍 Request Pattern Analysis:")
print("="*50)

# Most common user agents
print("\n🕷️ Top User Agents:")
ua_counts = df_logs['user_agent'].value_counts().head(5)
for ua, count in ua_counts.items():
    print(f"  {count:3d}: {ua[:60]}...")

# Most active IPs
print("\n🌐 Most Active IP Addresses:")
ip_counts = df_logs['remote_addr'].value_counts().head(5)
for ip, count in ip_counts.items():
    print(f"  {count:3d}: {ip}")

# Suspicious patterns
# Each entry is (literal substring, human-readable attack label).
print("\n🚨 Potential Security Issues:")
suspicious_patterns = [
    ('../', 'Directory Traversal'),
    ('script>', 'XSS Attempt'),
    ('union', 'SQL Injection'),
    ('alert(', 'XSS Attempt'),
    ('etc/passwd', 'File Access Attempt')
]

# parse_clf_log() splits the query string off the URI, so payloads such as
# "?q=<script>..." live in 'query_string'. Search path and query together.
request_targets = df_logs['uri'].fillna('')
if 'query_string' in df_logs.columns:
    request_targets = request_targets + '?' + df_logs['query_string'].fillna('')

for pattern, attack_type in suspicious_patterns:
    # regex=False: the patterns are plain substrings. As regular expressions
    # 'alert(' is invalid (unbalanced parenthesis -> re.error) and '../'
    # would match almost any path because '.' matches every character.
    hits = request_targets.str.contains(pattern, case=False, na=False, regex=False)
    suspicious_logs = df_logs[hits]
    if len(suspicious_logs) > 0:
        print(f"  {attack_type}: {len(suspicious_logs)} instances")
        for _, log in suspicious_logs.head(2).iterrows():
            print(f"    Example: {log['method']} {log['uri'][:50]}...")

## 6. Feature Engineering and Preprocessing

In [None]:
def extract_features(df):
    """Derive numeric and indicator features from parsed access-log rows.

    Args:
        df: DataFrame with at least the columns ``uri``, ``query_string``,
            ``method``, ``status`` and ``bytes_sent``. It is not modified.

    Returns:
        A copy of ``df`` with these additional columns: ``uri_length``,
        ``query_length``, ``has_query``, ``uri_depth``, ``has_params``,
        ``has_extension``, ``has_traversal``, ``has_script``, ``has_sql``,
        ``has_exec``, ``has_special_chars``, ``method_encoded``,
        ``status_class`` and ``bytes_sent_log``.
    """
    features = df.copy()

    # Missing values are treated as empty strings so the string operations
    # below cannot produce NaN (which would make .astype(int) fail).
    uri = features['uri'].fillna('').astype(str)
    query = features['query_string'].fillna('').astype(str)

    # Basic features
    features['uri_length'] = uri.str.len()
    features['query_length'] = query.str.len()
    features['has_query'] = (features['query_length'] > 0).astype(int)

    # URI characteristics
    features['uri_depth'] = uri.str.count('/')
    # Literal '?' search (the previous '\?' relied on an invalid escape).
    features['has_params'] = uri.str.contains('?', regex=False).astype(int)
    features['has_extension'] = uri.str.contains(r'\.[a-zA-Z0-9]+$', regex=True).astype(int)

    # Suspicious pattern indicators are evaluated on path AND query string:
    # the query is stored in its own column, so checking 'uri' alone misses
    # payloads such as "?q=<script>..." entirely.
    request_target = uri.where(query == '', uri + '?' + query)

    suspicious_patterns = {
        'has_traversal': r'\.\.[\\/]',
        'has_script': r'<script',
        # A logged request target never contains raw whitespace; spaces
        # arrive as '+' or '%20', so accept those separators as well.
        # Non-capturing groups avoid pandas' "match groups" warning.
        'has_sql': r'(?:union|select|insert|update|delete|drop)(?:\s|\+|%20)',
        'has_exec': r'(?:exec|eval|cmd)',
        'has_special_chars': r'[<>"\';()]'
    }

    for feature_name, pattern in suspicious_patterns.items():
        features[feature_name] = request_target.str.contains(
            pattern, case=False, regex=True
        ).astype(int)

    # Categorical encoding
    # NOTE: codes depend on the set of methods present in this DataFrame, so
    # they are not guaranteed to be stable across different datasets.
    features['method_encoded'] = pd.Categorical(features['method']).codes
    features['status_class'] = (features['status'] // 100).astype(int)

    # Numerical features
    # log1p compresses the heavy-tailed byte counts and is defined at zero.
    features['bytes_sent_log'] = np.log1p(features['bytes_sent'])

    return features

# Extract features
print("🔧 Extracting features...")
df_features = extract_features(df_logs)

# Select feature columns for training
# This list mixes raw text columns (template, method, uri, user_agent) with
# the derived numeric/indicator columns. In this cell it is only reported; it
# is not used to subset df_features.
feature_columns = [
    'template', 'method', 'uri', 'status', 'user_agent',
    'uri_length', 'query_length', 'has_query', 'uri_depth',
    'has_params', 'has_extension', 'has_traversal', 'has_script',
    'has_sql', 'has_exec', 'has_special_chars', 'bytes_sent_log'
]

print(f"✅ Feature extraction complete! Features: {len(feature_columns)}")
print(f"Features: {feature_columns}")

In [None]:
# Create sequences for LogBERT training
def create_training_sequences(df, max_length=512):
    """Build one whitespace-separated text sequence per log row.

    Each sequence is ``method uri template status user_agent`` (user agent cut
    to 100 characters), with missing or empty fields left out and runs of
    whitespace collapsed to a single space.

    Args:
        df: DataFrame of parsed log rows; absent columns are treated as empty.
        max_length: Maximum sequence length in characters (not tokens).

    Returns:
        A list of dicts with the keys ``sequence`` (text, truncated to
        ``max_length`` characters), ``length`` (length of that text) and
        ``original_log`` (the full row as a dict). Rows that yield an empty
        sequence are skipped.
    """

    def _as_text(value, limit=None):
        """Return ``value`` as text, or '' when it is None/NaN/NA."""
        # Without this check a missing field is stringified to the literal
        # token "None", which would end up in the training data.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        text = str(value)
        return text if limit is None else text[:limit]

    sequences = []

    for _, row in df.iterrows():
        # Create sequence from log components
        parts = [
            _as_text(row.get('method', '')),
            _as_text(row.get('uri', '')),
            _as_text(row.get('template', '')),
            _as_text(row.get('status', '')),
            _as_text(row.get('user_agent', ''), limit=100)  # Truncate user agent
        ]

        # Join parts and clean ('nan' is still filtered for values that were
        # already stringified upstream).
        sequence = ' '.join(part for part in parts if part and part != 'nan')
        sequence = re.sub(r'\s+', ' ', sequence).strip()

        if sequence:
            sequences.append({
                'sequence': sequence[:max_length],
                'length': min(len(sequence), max_length),
                'original_log': row.to_dict()
            })

    return sequences

# Create training sequences
print("📝 Creating training sequences...")
training_sequences = create_training_sequences(df_features)
print(f"✅ Created {len(training_sequences)} training sequences")

# Show sequence statistics
# Lengths are character counts after truncation to max_length.
# NOTE(review): max()/min() raise ValueError when training_sequences is empty.
sequence_lengths = [seq['length'] for seq in training_sequences]
print(f"\n📏 Sequence Length Statistics:")
print(f"  Mean: {np.mean(sequence_lengths):.1f}")
print(f"  Median: {np.median(sequence_lengths):.1f}")
print(f"  Max: {max(sequence_lengths)}")
print(f"  Min: {min(sequence_lengths)}")

# Show sample sequences
# Previews are cut to 80 characters; the trailing "..." is always printed.
print("\n📋 Sample Training Sequences:")
for i, seq in enumerate(training_sequences[:3]):
    print(f"{i+1}. {seq['sequence'][:80]}...")

## 7. Model Training Setup

In [None]:
# Initialize LogBERT model
print("🤖 Initializing LogBERT model...")

# Create model configuration
# Values come from CONFIG['model_config']; the feed-forward (intermediate)
# size is set to four times the hidden size.
model_config = LogBERTConfig(
    vocab_size=CONFIG['model_config']['vocab_size'],
    hidden_size=CONFIG['model_config']['hidden_size'],
    num_hidden_layers=CONFIG['model_config']['num_layers'],
    num_attention_heads=CONFIG['model_config']['num_heads'],
    max_position_embeddings=CONFIG['model_config']['max_sequence_length'],
    intermediate_size=CONFIG['model_config']['hidden_size'] * 4
)

# Initialize model
model = LogBERTModel(model_config)
# Assumes the model exposes a PyTorch-style parameters() iterator whose items
# provide numel() — confirm against LogBERTModel.
print(f"✅ LogBERT model initialized with {sum(p.numel() for p in model.parameters()):,} parameters")

# Print model architecture
print(f"\n🏗️ Model Architecture:")
print(f"  Vocab Size: {model_config.vocab_size:,}")
print(f"  Hidden Size: {model_config.hidden_size}")
print(f"  Layers: {model_config.num_hidden_layers}")
print(f"  Attention Heads: {model_config.num_attention_heads}")
print(f"  Max Sequence Length: {model_config.max_position_embeddings}")

In [None]:
# Initialize training pipeline
print("🚀 Initializing training pipeline...")

# Create training pipeline
# NOTE(review): CONFIG['training']['warmup_steps'] is not passed here.
training_pipeline = WAFTrainingPipeline(
    model_config=model_config,
    batch_size=CONFIG['training']['batch_size'],
    learning_rate=CONFIG['training']['learning_rate'],
    num_epochs=CONFIG['training']['num_epochs']
)

# Prepare training data
# A sequence is kept as "benign" when its status code is in the allow-list
# below AND none of the traversal/script/sql/exec indicator columns is set.
# has_special_chars is not part of this filter, and 404 counts as benign.
benign_sequences = []
for seq in training_sequences:
    # Simple benign filtering (in practice, this would be more sophisticated)
    log_data = seq['original_log']
    if (log_data.get('status', 0) in [200, 201, 301, 302, 304, 404] and
        not any(log_data.get(f'has_{pattern}', False) for pattern in ['traversal', 'script', 'sql', 'exec'])):
        benign_sequences.append(seq['sequence'])

print(f"✅ Prepared {len(benign_sequences)} benign training sequences")
print(f"📊 Training data split: {len(benign_sequences)} benign sequences")

## 8. Model Training

In [None]:
# Train the model
print("🏋️ Starting model training...")
print("This may take several minutes depending on the data size and hardware.")

try:
    # Train the model
    # Top-level `await`: requires an IPython/Jupyter kernel (or another
    # environment that supports awaiting at module level).
    training_results = await training_pipeline.train(
        train_sequences=benign_sequences,
        save_path=CONFIG['output_paths']['models']
    )
    
    print("\n🎉 Training completed successfully!")
    print(f"📈 Training Results:")
    # Assumes train() returns a dict-like object (items() is called on it).
    for key, value in training_results.items():
        print(f"  {key}: {value}")
        
except Exception as e:
    # NOTE(review): every failure is reported as "expected in demo mode" and
    # execution continues; training_results stays undefined (or keeps a value
    # from an earlier run of this cell) when this branch is taken.
    print(f"❌ Training failed: {e}")
    print("💡 This is expected if running in demo mode without GPU support.")

In [None]:
# Visualize training metrics (if training was successful)
# At notebook top level locals() is the module namespace, so this checks
# whether the training cell managed to bind `training_results`.
if 'training_results' in locals() and training_results:
    plt.figure(figsize=(12, 4))
    
    # Training loss
    # Each panel is drawn only when the corresponding key is present.
    if 'train_losses' in training_results:
        plt.subplot(1, 2, 1)
        plt.plot(training_results['train_losses'])
        plt.title('Training Loss')
        plt.xlabel('Step')
        plt.ylabel('Loss')
        plt.grid(True)
    
    # Learning rate schedule
    if 'learning_rates' in training_results:
        plt.subplot(1, 2, 2)
        plt.plot(training_results['learning_rates'])
        plt.title('Learning Rate Schedule')
        plt.xlabel('Step')
        plt.ylabel('Learning Rate')
        plt.grid(True)
    
    plt.tight_layout()
    # The figure is saved even when neither key was present (empty figure).
    plt.savefig(f"{CONFIG['output_paths']['plots']}/training_metrics.png", dpi=300, bbox_inches='tight')
    plt.show()
    
    print("📊 Training metrics visualization saved!")
else:
    print("⚠️ Training metrics not available for visualization.")

## 9. Model Evaluation and Anomaly Detection Testing

In [None]:
# Create test cases for anomaly detection
# Hand-written sequences in the same "method uri template status user_agent"
# layout that create_training_sequences() produces, each with the expected
# label ('benign' or 'anomaly') and a short description.
test_cases = [
    # Benign requests
    {
        'sequence': 'GET /index.html GET /index.html 200 Mozilla/5.0',
        'expected': 'benign',
        'description': 'Normal homepage request'
    },
    {
        'sequence': 'POST /api/login POST /api/login 200 Mozilla/5.0',
        'expected': 'benign',
        'description': 'Normal login request'
    },
    # Anomalous requests
    {
        'sequence': 'GET /admin/../etc/passwd GET /admin/*/etc/passwd 404 curl/7.68.0',
        'expected': 'anomaly',
        'description': 'Directory traversal attempt'
    },
    {
        'sequence': 'POST /search?q=<script>alert(1)</script> POST /search?q=* 200 Mozilla/5.0',
        'expected': 'anomaly',
        'description': 'XSS attempt'
    },
    {
        'sequence': 'GET /api/users?id=1 UNION SELECT * FROM users GET /api/users?id=* 200 sqlmap/1.0',
        'expected': 'anomaly',
        'description': 'SQL injection attempt'
    }
]

print(f"🧪 Created {len(test_cases)} test cases for evaluation")

# Display test cases
# Sequence previews are cut to 60 characters; "..." is always appended.
print("\n📋 Test Cases:")
for i, test_case in enumerate(test_cases, 1):
    print(f"{i}. {test_case['description']} ({test_case['expected']})")
    print(f"   Sequence: {test_case['sequence'][:60]}...")

In [None]:
# Mock anomaly detection for demonstration
def mock_anomaly_detection(sequence):
    """Score a request sequence with a few hand-written rules (demo only).

    The score is the sum of 0.3 for every suspicious substring found
    (case-insensitive), 0.2 if the sequence is longer than 200 characters and
    0.4 if it mentions ``curl`` or ``sqlmap``, capped at 1.0.

    Returns a dict with ``anomaly_score`` (float in [0, 1]), ``is_anomaly``
    (score strictly above 0.5) and ``confidence`` (score * 1.2, capped at 1.0).
    """
    lowered = sequence.lower()

    markers = (
        '../', 'script>', 'alert(', 'union', 'select', 'exec', 'eval',
        'etc/passwd', 'cmd', 'shell',
    )

    # (rule fired?, weight) pairs, listed in the order the weights are added:
    # first the substring rules, then the length rule, then the client rule.
    rules = [(marker in lowered, 0.3) for marker in markers]
    rules.append((len(sequence) > 200, 0.2))
    rules.append(('curl' in lowered or 'sqlmap' in lowered, 0.4))

    score = 0.0
    for fired, weight in rules:
        if fired:
            score += weight

    # Cap the accumulated score at 1.0
    score = min(score, 1.0)
    flagged = score > 0.5
    certainty = min(score * 1.2, 1.0)

    return {
        'anomaly_score': score,
        'is_anomaly': flagged,
        'confidence': certainty
    }

# Test anomaly detection
# Runs the rule-based mock detector over every test case, prints a per-case
# verdict and finally the share of cases whose predicted label matched.
print("🔍 Testing anomaly detection...")
print("\n📊 Results:")
print("="*80)

total_predictions = len(test_cases)
correct_predictions = 0

for case_number, case in enumerate(test_cases, start=1):
    outcome = mock_anomaly_detection(case['sequence'])

    if outcome['is_anomaly']:
        label = 'anomaly'
    else:
        label = 'benign'

    matched = label == case['expected']
    correct_predictions += int(matched)

    marker = "✅" if matched else "❌"
    print(f"{marker} Test {case_number}: {case['description']}")
    print(f"   Expected: {case['expected']} | Predicted: {label}")
    print(f"   Anomaly Score: {outcome['anomaly_score']:.3f} | Confidence: {outcome['confidence']:.3f}")
    print()

# Fraction of test cases classified as expected (used later in the metadata
# and the summary report).
accuracy = correct_predictions / total_predictions
print(f"🎯 Overall Accuracy: {accuracy:.1%} ({correct_predictions}/{total_predictions})")

## 10. Model Saving and Deployment Preparation

In [None]:
# Save model configuration and metadata
# Collects the model/training configuration, training-set statistics and the
# mock evaluation result into one JSON-serialisable dict.
model_metadata = {
    # Falls back to the plain CONFIG dict when the config object has no
    # to_dict() method.
    'model_config': model_config.to_dict() if hasattr(model_config, 'to_dict') else CONFIG['model_config'],
    'training_config': CONFIG['training'],
    'training_data': {
        'total_sequences': len(training_sequences),
        'benign_sequences': len(benign_sequences),
        # Cast to built-in float/int so json.dump does not choke on NumPy
        # scalar types.
        'sequence_length_stats': {
            'mean': float(np.mean(sequence_lengths)),
            'median': float(np.median(sequence_lengths)),
            'max': int(max(sequence_lengths)),
            'min': int(min(sequence_lengths))
        }
    },
    'feature_columns': feature_columns,
    'timestamp': datetime.now().isoformat(),
    'evaluation': {
        'test_cases': len(test_cases),
        # Accuracy of the rule-based mock detector, not of the trained model.
        'mock_accuracy': accuracy
    }
}

# Save metadata
metadata_path = Path(CONFIG['output_paths']['models']) / 'model_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(model_metadata, f, indent=2)

print(f"💾 Model metadata saved to: {metadata_path}")

# Save training vocabulary (mock)
# Demo vocabulary: the five special tokens take ids 0-4, followed by the
# lower-cased whitespace tokens of the first 100 benign sequences in sorted
# order. The result is written to vocab.json in the models directory.
vocab_path = Path(CONFIG['output_paths']['models']) / 'vocab.json'

# Special tokens are numbered first, in list order
special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
vocab = {token: index for index, token in enumerate(special_tokens)}

# Unique tokens from a sample of the training sequences
all_tokens = {
    piece
    for sample in benign_sequences[:100]  # Sample for demo
    for piece in sample.lower().split()
}

# Remaining ids are handed out consecutively in alphabetical token order
unseen_tokens = [piece for piece in sorted(all_tokens) if piece not in vocab]
vocab.update(
    (piece, index) for index, piece in enumerate(unseen_tokens, start=len(vocab))
)
vocab_counter = len(vocab)

with open(vocab_path, 'w') as f:
    json.dump(vocab, f, indent=2)

print(f"📚 Vocabulary saved to: {vocab_path} (size: {len(vocab)})")

In [None]:
# Create deployment configuration
# Describes where the artefacts are expected to live and how inference should
# be configured.
# NOTE(review): 'logbert_model.pth' and 'logbert_config.json' are only
# referenced here; this notebook does not itself verify that they exist —
# presumably they are written by training_pipeline.train(); confirm.
deployment_config = {
    'model_path': str(Path(CONFIG['output_paths']['models']) / 'logbert_model.pth'),
    'config_path': str(Path(CONFIG['output_paths']['models']) / 'logbert_config.json'),
    'vocab_path': str(vocab_path),
    'metadata_path': str(metadata_path),
    'inference_config': {
        'batch_size': 32,
        'max_sequence_length': CONFIG['model_config']['max_sequence_length'],
        # Same cut-off the mock detector uses for is_anomaly.
        'anomaly_threshold': 0.5,
        'device': 'cpu'  # Default to CPU for deployment
    },
    'preprocessing': {
        'feature_columns': feature_columns,
        'normalization': 'standard',
        'max_uri_length': 500,
        'max_user_agent_length': 200
    }
}

# Save deployment configuration
deployment_path = Path(CONFIG['output_paths']['models']) / 'deployment_config.json'
with open(deployment_path, 'w') as f:
    json.dump(deployment_config, f, indent=2)

print(f"🚀 Deployment configuration saved to: {deployment_path}")

## 11. Training Summary and Next Steps

In [None]:
# Generate training summary report
summary_report = f"""
# LogBERT WAF Training Summary Report
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Data Summary
- **Total Log Entries**: {len(raw_logs):,}
- **Successfully Parsed**: {len(parsed_logs):,}
- **Training Sequences**: {len(training_sequences):,}
- **Benign Training Sequences**: {len(benign_sequences):,}

## Model Configuration
- **Architecture**: LogBERT Transformer
- **Vocabulary Size**: {CONFIG['model_config']['vocab_size']:,}
- **Hidden Size**: {CONFIG['model_config']['hidden_size']}
- **Layers**: {CONFIG['model_config']['num_layers']}
- **Attention Heads**: {CONFIG['model_config']['num_heads']}
- **Max Sequence Length**: {CONFIG['model_config']['max_sequence_length']}

## Training Configuration
- **Batch Size**: {CONFIG['training']['batch_size']}
- **Learning Rate**: {CONFIG['training']['learning_rate']}
- **Epochs**: {CONFIG['training']['num_epochs']}

## Data Analysis Results
- **Unique IP Addresses**: {df_logs['remote_addr'].nunique()}
- **Unique URIs**: {df_logs['uri'].nunique()}
- **Unique Templates**: {df_logs['template'].nunique()}
- **HTTP Methods**: {', '.join(df_logs['method'].value_counts().head().index.tolist())}
- **Common Status Codes**: {', '.join(map(str, df_logs['status'].value_counts().head().index.tolist()))}

## Feature Engineering
- **Total Features**: {len(feature_columns)}
- **Feature Types**: Categorical, Numerical, Boolean, Text
- **Suspicious Pattern Detection**: Directory Traversal, XSS, SQL Injection, Command Execution

## Mock Evaluation Results
- **Test Cases**: {len(test_cases)}
- **Mock Accuracy**: {accuracy:.1%}
- **Anomaly Detection**: Rule-based approach for demonstration

## Output Files Generated
- Model Metadata: `{metadata_path}`
- Vocabulary: `{vocab_path}`
- Deployment Config: `{deployment_path}`
- Analysis Plots: `{CONFIG['output_paths']['plots']}/`

## Next Steps for Production Deployment
1. **Real Model Training**: Train on larger dataset with GPU acceleration
2. **Hyperparameter Tuning**: Optimize model architecture and training parameters
3. **Validation Dataset**: Create proper validation set with labeled anomalies
4. **Performance Optimization**: Implement model quantization and optimization
5. **Integration Testing**: Test with real Tomcat logs and live traffic
6. **Continuous Learning**: Implement online learning for model updates
7. **Monitoring Setup**: Deploy monitoring and alerting for production use

## Technical Notes
- This notebook demonstrates the complete preprocessing and training pipeline
- Mock data was used for demonstration purposes
- Real production deployment requires substantial computing resources
- Model performance will improve significantly with larger, diverse datasets
"""

# Save summary report
# Writes the Markdown report into the reports directory and echoes it to the
# notebook output.
report_path = Path(CONFIG['output_paths']['reports']) / 'training_summary.md'
# Ensures the reports directory exists before the file is opened.
Path(CONFIG['output_paths']['reports']).mkdir(exist_ok=True)
with open(report_path, 'w') as f:
    f.write(summary_report)

print(f"📄 Training summary report saved to: {report_path}")
print("\n" + "="*80)
# NOTE(review): this banner is printed unconditionally, i.e. also when the
# training cell reported a failure.
print("🎉 LOGBERT WAF TRAINING PIPELINE COMPLETED SUCCESSFULLY!")
print("="*80)
print(summary_report)

## 12. Integration with Real-time WAF System

In [None]:
# Integration instructions and test commands
integration_instructions = """
🔗 INTEGRATION WITH REAL-TIME WAF SYSTEM

To integrate this trained model with the real-time WAF system:

1. **Start the Live Log Processor**:
   ```bash
   python live_log_processor.py
   ```

2. **Start the Continuous Trainer**:
   ```bash
   python continuous_logbert_trainer.py
   ```

3. **Launch the Real-time WAF Service**:
   ```bash
   python realtime_waf_service.py
   ```

4. **Access the Dashboard**:
   - Open http://localhost:8000 for the real-time dashboard
   - Monitor anomalies and system status

5. **Generate Test Traffic**:
   ```bash
   python live_traffic_generator.py
   ```

6. **View Tomcat Logs**:
   - Configure Tomcat with setup_complete_waf.sh
   - Monitor /opt/tomcat/logs/access_log.json

📊 Model files generated:
- Model metadata: models/model_metadata.json
- Vocabulary: models/vocab.json  
- Deployment config: models/deployment_config.json

🚀 The system is ready for live anomaly detection!
"""

# Show the instructions in the notebook output.
print(integration_instructions)

# Save integration instructions
integration_path = Path(CONFIG['output_paths']['reports']) / 'integration_instructions.md'
# encoding='utf-8' is required: the text contains emoji, and the platform
# default encoding (e.g. cp1252 on Windows) cannot represent them, which
# would raise UnicodeEncodeError while writing.
with open(integration_path, 'w', encoding='utf-8') as f:
    f.write(integration_instructions)

print(f"\n📋 Integration instructions saved to: {integration_path}")