In [1]:
# synthetic_nginx_logs_with_attacks.py
import csv
import random
import datetime

# ------------------------
# CONFIG
# ------------------------
NUM_BENIGN = 5000
NUM_ATTACKS = 2000
OUTPUT_FILE = "synthetic_nginx_logs.csv"
NOW = datetime.datetime.utcnow()

# ------------------------
# HELPERS
# ------------------------
def random_ipv4():
    return f"{random.randint(1,255)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(0,255)}"

def random_ipv6_like():
    parts = [format(random.randint(0,0xffff), 'x') for _ in range(8)]
    return ":".join(parts)

def upstream_addr():
    return f"{random.randint(10,200)}.{random.randint(0,255)}.{random.randint(0,255)}:{random.randint(1024,65535)}"

def iso8601_minus(seconds):
    return (NOW - datetime.timedelta(seconds=seconds)).isoformat()

# ------------------------
# TEMPLATES / PAYLOADS
# ------------------------
BENIGN_PATHS = [
    "/dvwa/login.php",
    "/login",
    "/search",
    "/product/123",
    "/cart/add",
    "/api/v1/user",
    "/dashboard",
    "/profile",
    "/checkout",
]

ATTACK_PAYLOADS = [
    # SQL Injection
    "' OR 1=1 --",
    "admin' --",
    "1; DROP TABLE users;",
    "' UNION SELECT * FROM users --",
    # XSS
    "<script>alert(1)</script>",
    "\" onmouseover=\"alert('XSS')\"",
    # Path traversal
    "../../etc/passwd",
    "..%2F..%2F..%2Fetc%2Fpasswd",
    # Command injection
    "; rm -rf /",
    "| cat /etc/passwd",
]

USER_AGENTS_BENIGN = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/143.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0 Safari/537.36",
    "curl/7.68.0",
    "PostmanRuntime/7.28.4",
]

USER_AGENTS_ATTACK = [
    "sqlmap/1.5.6 (http://sqlmap.org)",
    "masscan/1.0",
    "python-requests/2.25.1",
    "nikto/2.1.6",
]

CONTENT_TYPES = ["-", "text/html", "application/json", "application/x-www-form-urlencoded"]

METHODS = ["GET", "POST"]

PROTOCOLS = ["HTTP/1.1", "HTTP/2.0"]

# ------------------------
# FIELD NAMES (exact nginx fields + label at end)
# ------------------------
FIELDNAMES = [
    "time","msec","client_ip","host","method","uri","path","query","protocol",
    "status","body_bytes_sent","request_length","request_time","upstream_response_time",
    "upstream_addr","user_agent","referer","content_type","x_forwarded_for",
    "ssl_protocol","ssl_cipher","label"
]

# ------------------------
# ATTACKER IP SET (simulate focused attackers)
# ------------------------
NUM_ATTACKER_IPS = max(5, int(NUM_ATTACKS * 0.01))  # a few attacker IPs
attacker_ips = [random_ipv4() for _ in range(NUM_ATTACKER_IPS)]

# ------------------------
# GENERATE ROW
# ------------------------
def generate_row(is_attack=False):
    seconds_ago = random.randint(0, 24 * 3600)
    time_iso = iso8601_minus(seconds_ago)
    msec = str(int((NOW - datetime.timedelta(seconds=seconds_ago)).timestamp() * 1000))

    if is_attack:
        # choose an attacker IP (clustered)
        client_ip = random.choice(attacker_ips)
        user_agent = random.choice(USER_AGENTS_ATTACK)
        # attacks: more likely to have payloads in query; occasionally in uri
        if random.random() < 0.85:
            query = random.choice(ATTACK_PAYLOADS)
            uri = random.choice(BENIGN_PATHS)
            path = uri
        else:
            payload = random.choice(ATTACK_PAYLOADS)
            uri = "/" + payload.replace(" ", "_")[:200]
            path = uri
            query = "-"
        # attack status skewed to errors or blocks
        status = random.choices([200,403,404,500,400], weights=[0.2,0.35,0.1,0.25,0.1])[0]
        body_bytes_sent = random.randint(20, 4000)
        request_length = random.randint(500, 6000)  # larger due to payloads
        request_time = round(random.uniform(0.001, 5.0), 3)
        upstream_response_time = f"{round(random.uniform(0.05, 3.0), 3):.3f}"
        upstream = random.choice(["172.18.0.5:80", upstream_addr()])
        referer = "-" if random.random() < 0.9 else "https://malicious.example"
        content_type = random.choice(CONTENT_TYPES)
        x_forwarded_for = random_ipv6_like() if random.random() < 0.6 else "-"
        ssl_protocol = "-"  # often absent for scanners
        ssl_cipher = "-"
        user_agent_final = user_agent
        label = 1
    else:
        # benign
        client_ip = random_ipv4()
        user_agent_final = random.choice(USER_AGENTS_BENIGN)
        uri = random.choice(BENIGN_PATHS)
        path = uri
        query = "-" if random.random() < 0.5 else f"q={random.randint(1,999)}"
        status = random.choices([200,301,302,404], weights=[0.92,0.02,0.02,0.04])[0]
        body_bytes_sent = random.randint(200, 8000)
        request_length = random.randint(200, 2000)
        request_time = round(random.uniform(0.001, 1.0), 3)
        upstream_response_time = f"{round(random.uniform(0.001, 0.5), 3):.3f}"
        upstream = random.choice(["172.18.0.5:80", upstream_addr()])
        referer = "-" if random.random() < 0.8 else "https://google.com"
        content_type = random.choice(CONTENT_TYPES)
        x_forwarded_for = "-" if random.random() < 0.98 else random_ipv6_like()
        ssl_protocol = random.choice(["TLSv1.2", "TLSv1.3", "-"])
        ssl_cipher = "-" if ssl_protocol == "-" else random.choice([
            "ECDHE-RSA-AES128-GCM-SHA256", "ECDHE-ECDSA-AES256-GCM-SHA384"
        ])
        label = 0

    row = {
        "time": time_iso,
        "msec": msec,
        "client_ip": client_ip,
        "host": random.choice(["alpha-triradiate-adalberto.ngrok-free.dev", "example.com", "shop.example.com"]),
        "method": random.choice(METHODS),
        "uri": uri,
        "path": path,
        "query": query,
        "protocol": random.choice(PROTOCOLS),
        "status": status,
        "body_bytes_sent": body_bytes_sent,
        "request_length": request_length,
        "request_time": request_time,
        "upstream_response_time": upstream_response_time,
        "upstream_addr": upstream,
        "user_agent": user_agent_final,
        "referer": referer,
        "content_type": content_type,
        "x_forwarded_for": x_forwarded_for,
        "ssl_protocol": ssl_protocol,
        "ssl_cipher": ssl_cipher,
        "label": label,
    }
    return row

# ------------------------
# GENERATE ALL ROWS (interleave benign + attack to avoid pure blocks)
# ------------------------
rows = []
# interleave to simulate mixed traffic
total = NUM_BENIGN + NUM_ATTACKS
benign_remaining = NUM_BENIGN
attack_remaining = NUM_ATTACKS

while benign_remaining > 0 or attack_remaining > 0:
    # probability proportional to remaining counts
    if attack_remaining > 0 and random.random() < (attack_remaining / (benign_remaining + attack_remaining)):
        rows.append(generate_row(is_attack=True))
        attack_remaining -= 1
    elif benign_remaining > 0:
        rows.append(generate_row(is_attack=False))
        benign_remaining -= 1

# ------------------------
# SAVE TO CSV
# ------------------------
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
    writer.writeheader()
    for r in rows:
        writer.writerow(r)

print("✔ CSV saved:", OUTPUT_FILE)
print("Total rows:", len(rows))
print("Benign:", NUM_BENIGN, "Attacks:", NUM_ATTACKS)
print("Attacker IPs example:", attacker_ips[:5])


✔ CSV saved: synthetic_nginx_logs.csv
Total rows: 7000
Benign: 5000 Attacks: 2000
Attacker IPs example: ['1.61.182.209', '204.227.137.15', '77.162.183.159', '145.206.127.103', '188.246.116.214']


  NOW = datetime.datetime.utcnow()
