# LogBERT-style Transformer WAF: Benign-only Training and Malicious Payload Evaluation

This notebook demonstrates:
- Parsing HTTP access logs into sequences.
- Training a LogBERT-style transformer on benign traffic only.
- Scoring custom payloads and computing simple metrics vs. benign samples.

Prerequisites: run your virtualenv and ensure `waf-system/requirements.txt` is installed.

In [1]:
# Imports and workspace setup
from pathlib import Path
import sys, os, time, random
import pandas as pd
import numpy as np

# Torch / training
import torch
from torch.utils.data import DataLoader

# Find waf-system root (search upwards for the folder)
CWD = Path.cwd().resolve()
WAF_ROOT = None
for p in [CWD] + list(CWD.parents):
    if (p / 'waf-system').is_dir():
        WAF_ROOT = p / 'waf-system'
        break
if WAF_ROOT is None:
    # If this notebook is already inside waf-system, use it
    if (CWD / 'ml-pipeline').is_dir():
        WAF_ROOT = CWD
    else:
        raise RuntimeError('Could not locate waf-system root from current working directory')

# Add pipeline paths for imports
sys.path.insert(0, str(WAF_ROOT / 'ml-pipeline'))
sys.path.insert(0, str(WAF_ROOT / 'ml-pipeline' / 'preprocessing'))
sys.path.insert(0, str(WAF_ROOT / 'ml-pipeline' / 'training'))

from log_processor import LogPreprocessor
from trainer import WAFTrainer, prepare_training_data, collate_fn
from waf_model import create_waf_model

print('WAF_ROOT =', WAF_ROOT)

WAF_ROOT = /Users/majjipradeepkumar/Downloads/WAF/Sample-apps-for-training-a-transformer-based-WAF-pipleline/waf-system


In [2]:
# Load benign access logs and build token sequences
from datetime import datetime

preprocessor = LogPreprocessor()

# Default log locations (nginx and tomcat)
log_globs = [
    str(WAF_ROOT / 'data' / 'logs' / 'access.log'),
    str(WAF_ROOT / 'tomcat' / 'current' / 'logs' / 'localhost_access_log*.txt'),
]

max_lines = 5000
sequences = []
parsed_cache = []

for pattern in log_globs:
    # Expand glob manually
    paths = list(Path().glob(pattern) if any(ch in pattern for ch in '*?[]') else [Path(pattern)])
    for path in paths:
        if not path.exists() or not path.is_file():
            continue
        with path.open('r', errors='ignore') as f:
            for i, line in enumerate(f):
                if len(sequences) >= max_lines:
                    break
                pl = line.strip()
                if not pl:
                    continue
                processed = preprocessor.process_log_entry(pl)
                if not processed:
                    continue
                parsed = processed['parsed']
                # Build minimal sequence: METHOD, PATH, STATUS, Agent
                seq = [
                    parsed.get('method', 'GET'),
                    parsed.get('path_only', '/'),
                    str(parsed.get('status', 200)),
                ]
                ua = parsed.get('http_user_agent', '') or ''
                if 'Mozilla' in ua:
                    seq.append('Mozilla')
                elif 'curl' in ua:
                    seq.append('curl')
                elif 'python' in ua.lower():
                    seq.append('python')
                else:
                    seq.append('Other-Agent')
                sequences.append(seq)
                parsed_cache.append(parsed)
        if len(sequences) >= max_lines:
            break

if len(sequences) < 50:
    # Fallback synthetic benign sequences if logs are scarce
    base = [
        ['GET', '/ecommerce/', '200', 'Mozilla'],
        ['GET', '/ecommerce/products', '200', 'Mozilla'],
        ['GET', '/blog-cms/', '200', 'Mozilla'],
        ['GET', '/rest-api/api/users', '200', 'curl']
    ]
    sequences.extend(base * 50)

print(f"Collected {len(sequences)} benign sequences for training")
print('Sample:', sequences[0] if sequences else None)

NotImplementedError: Non-relative patterns are unsupported

In [3]:
# Create model, tokenize, and prepare datasets
model, tokenizer = create_waf_model()
train_ds, val_ds = prepare_training_data(sequences, tokenizer)

batch_size = 32
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
trainer = WAFTrainer(model, tokenizer, device=device)

print('Train size:', len(train_ds), 'Val size:', len(val_ds), 'Device:', device)



ValueError: num_samples should be a positive integer value, but got num_samples=0

In [4]:
# Train briefly (1 epoch) and evaluate
train_metrics = trainer.train_epoch(train_loader)
val_metrics = trainer.evaluate(val_loader)

print('Train metrics:', train_metrics)
print('Val metrics:', val_metrics)

# Save a copy for reuse
models_dir = WAF_ROOT / 'data' / 'models'
models_dir.mkdir(parents=True, exist_ok=True)
save_path = models_dir / 'notebook_model.pt'
trainer.save_model(str(save_path))
print('Model saved to', save_path)

# Switch to eval
trainer.model.eval()

NameError: name 'trainer' is not defined

In [5]:
# Helper functions for scoring arbitrary URIs/payloads
from typing import Dict, Any, List, Tuple

def build_sequence_from_parsed(parsed: Dict[str, Any]) -> List[str]:
    seq = [
        parsed.get('method', 'GET'),
        parsed.get('path_only', '/'),
        str(parsed.get('status', 200)),
    ]
    ua = parsed.get('http_user_agent', '') or ''
    if 'Mozilla' in ua:
        seq.append('Mozilla')
    elif 'curl' in ua:
        seq.append('curl')
    elif 'python' in ua.lower():
        seq.append('python')
    else:
        seq.append('Other-Agent')
    return seq

# Construct a common log-format line for a test request
from datetime import datetime

def make_log_line(method: str, uri: str, user_agent: str = 'Mozilla/5.0', ip: str = '127.0.0.1') -> str:
    ts = datetime.utcnow().strftime('%d/%b/%Y:%H:%M:%S +0000')
    return f'{ip} - - [{ts}] "{method} {uri} HTTP/1.1" 200 0 "-" "{user_agent}"'

@torch.no_grad()
def score_request(method: str, uri: str, user_agent: str = 'Mozilla/5.0', threshold: float = 0.5) -> Tuple[float, bool]:
    line = make_log_line(method, uri, user_agent)
    processed = preprocessor.process_log_entry(line)
    if not processed:
        return 0.0, False
    seq = build_sequence_from_parsed(processed['parsed'])
    enc = tokenizer.encode(seq, max_length=128)
    input_ids = enc['input_ids'].unsqueeze(0)
    attention_mask = enc['attention_mask'].unsqueeze(0)
    outputs = trainer.model(input_ids=input_ids, attention_mask=attention_mask)
    score = float(outputs['anomaly_score'].item())
    is_anom = score > threshold
    return score, is_anom

print(score_request('GET', '/ecommerce/health'))

  ts = datetime.utcnow().strftime('%d/%b/%Y:%H:%M:%S +0000')


NameError: name 'trainer' is not defined

In [6]:
# Build a small eval set: benign vs malicious payloads
random.seed(42)

# Benign URIs from parsed_cache or defaults
benign_uris = []
for p in parsed_cache:
    benign_uris.append(p.get('path_only', '/'))
    if len(benign_uris) >= 50:
        break
if not benign_uris:
    benign_uris = ['/ecommerce/', '/ecommerce/products', '/blog-cms/', '/rest-api/api/users'] * 10

malicious_payloads = [
    "/ecommerce/search?q=' OR '1'='1",
    "/ecommerce/search?q=%27%20OR%201%3D1--",
    "/ecommerce/product?id=1;DROP TABLE users;--",
    "/blog-cms/?q=<script>alert(1)</script>",
    "/rest-api/api/users/../../../../etc/passwd",
    "/rest-api/api/users?name=`cat /etc/passwd`",
]

# Score samples
rows = []
for uri in random.sample(benign_uris, min(50, len(benign_uris))):
    s, yhat = score_request('GET', uri)
    rows.append({
        'uri': uri,
        'label': 0,
        'score': s,
        'pred': int(yhat)
    })
for uri in malicious_payloads:
    s, yhat = score_request('GET', uri)
    rows.append({
        'uri': uri,
        'label': 1,
        'score': s,
        'pred': int(yhat)
    })

df = pd.DataFrame(rows)
df

  ts = datetime.utcnow().strftime('%d/%b/%Y:%H:%M:%S +0000')


NameError: name 'trainer' is not defined

In [7]:
# Compute simple metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

y_true = df['label'].values
y_pred = df['pred'].values
y_score = df['score'].values

acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
try:
    auc = roc_auc_score(y_true, y_score)
except Exception:
    auc = float('nan')

print({'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'roc_auc': auc})

# Show top suspicious by score
df.sort_values('score', ascending=False).head(10)

NameError: name 'df' is not defined

## Notes
- This demo uses only benign traffic for training, so performance against malicious payloads depends on how "out-of-distribution" they appear.
- Improve results by adding more benign coverage, tuning thresholding, and expanding the sequence/features (templates, params, headers).
- For continuous learning, periodically retrain with fresh benign logs and validate on a held-out malicious set.