In [1]:
# tokenized_preprocess_nginx_logs.py
import pandas as pd
import torch
from transformers import AutoTokenizer

# ------------------------------
# CONFIG
# ------------------------------
# Input CSV of nginx log rows; edit this path to point at your own file.
CSV_FILE = "synthetic_nginx_logs.csv"   # point to your CSV (change if needed)
# Fixed token length: the tokenizer call below truncates and pads every row to it.
MAX_LENGTH = 128
# Destination path for the dict of tensors written with torch.save.
OUTPUT_PT = "tokenized_logs.pt"
# Identifier handed to AutoTokenizer.from_pretrained.
# NOTE(review): "keep same" presumably means it must match the tokenizer/model
# used by the downstream training code — confirm.
TOKENIZER_NAME = "distilroberta-base"  # keep same

# ------------------------------
# LOAD DATA
# ------------------------------
# Read every column as a string to avoid dtype surprises.  Only a truly empty
# field is treated as missing: pandas' default NA list would also turn literal
# field values such as "null", "NA", "None" or "nan" into NaN, and those
# values would then be silently erased from the fused log text.
df = pd.read_csv(CSV_FILE, dtype=str, keep_default_na=False, na_values=[""])
print(f"Loaded {len(df)} rows from {CSV_FILE}")

# ------------------------------
# ENSURE EXPECTED FIELDS (from your nginx log_format)
# ------------------------------
# Columns the nginx log_format is expected to provide.
expected_fields = [
    "time", "msec", "client_ip", "host", "method", "uri", "path", "query",
    "protocol", "status", "body_bytes_sent", "request_length", "request_time",
    "upstream_response_time", "upstream_addr", "user_agent", "referer",
    "content_type", "x_forwarded_for", "ssl_protocol", "ssl_cipher",
]

# Backfill every absent expected column with empty strings so the schema
# is the same regardless of which fields the CSV actually contained.
absent_fields = [name for name in expected_fields if name not in df.columns]
for name in absent_fields:
    df[name] = ""

# Labels are kept as strings at this stage: missing values in an existing
# column become "0" (0 = benign), and a missing column is created as all "0".
if "label" in df.columns:
    df["label"] = df["label"].fillna("0").astype(str)
else:
    df["label"] = "0"

# Whatever NaN / None cells remain in any column become empty strings.
df = df.fillna("")

# ------------------------------
# TEXT FUSION FUNCTION (exact fields)
# ------------------------------
def row_to_text_exact(row):
    """Fuse the text-bearing nginx log fields of one row into a single string.

    Fields are emitted in a fixed order: method, host, uri, path, query,
    client_ip, upstream_addr, user_agent, referer, protocol, status,
    content_type, x_forwarded_for, ssl_protocol, ssl_cipher, time.

    Args:
        row: Any object exposing ``.get(key, default)`` -- a ``pandas.Series``
            (as passed by ``DataFrame.apply(axis=1)``) or a plain ``dict``.

    Returns:
        The stripped, non-empty field values joined by single spaces; an
        empty string when every field is missing or blank.
    """
    field_order = (
        "method",
        "host",
        "uri",
        "path",
        "query",
        "client_ip",
        "upstream_addr",
        "user_agent",
        "referer",
        "protocol",
        "status",
        "content_type",
        "x_forwarded_for",
        "ssl_protocol",
        "ssl_cipher",
        "time",
    )
    pieces = []
    for name in field_order:
        value = row.get(name, "")
        # Treat None / float NaN as missing so they are never rendered as the
        # literal tokens "None" / "nan" when the caller has not filled NaNs.
        if value is None or (isinstance(value, float) and value != value):
            continue
        text = str(value).strip()
        if text:
            pieces.append(text)
    return " ".join(pieces)

# Fuse each row into one string and store the result in a new "text" column.
df["text"] = df.apply(row_to_text_exact, axis=1)

# Show the first few fused strings as a quick sanity check.
preview = df["text"].head(5).to_list()
print("Example fused text (first 5):")
print(preview)

# ------------------------------
# TOKENIZER
# ------------------------------
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Tokenize all fused strings in one call; every row is truncated/padded to
# exactly MAX_LENGTH tokens and the result is returned as PyTorch tensors.
fused_texts = df["text"].tolist()
encodings = tokenizer(
    fused_texts,
    max_length=MAX_LENGTH,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

# ------------------------------
# LABELS
# ------------------------------
# Convert the label strings to an int64 tensor.  A plain ``astype(int)`` fails
# with an opaque "invalid literal" error on values such as "1.0" (what pandas
# writes for an integer column that once held NaN), so parse numerically first
# and raise a ValueError that names the offending values when a label is not
# a whole number.
label_values = pd.to_numeric(df["label"], errors="coerce")
bad_label_mask = label_values.isna() | (label_values % 1 != 0)
if bad_label_mask.any():
    bad_examples = df.loc[bad_label_mask, "label"].unique()[:5].tolist()
    raise ValueError(f"Non-integer values in 'label' column, e.g. {bad_examples}")
labels = torch.tensor(label_values.astype("int64").to_numpy(), dtype=torch.long)

# ------------------------------
# SAVE TENSORS
# ------------------------------
# Row positions are stored alongside the tensors as optional debugging metadata.
row_positions = torch.arange(len(df), dtype=torch.long)

dataset = dict(
    input_ids=encodings["input_ids"],
    attention_mask=encodings["attention_mask"],
    labels=labels,
    meta_index=row_positions,
)

torch.save(dataset, OUTPUT_PT)

# Report where the file went and the shape of each model-facing tensor.
print(f"\n✔️ Saved tokenized tensors to {OUTPUT_PT}")
for tensor_name in ("input_ids", "attention_mask", "labels"):
    print(f"{tensor_name} shape: {dataset[tensor_name].shape}")


Loaded 7000 rows from synthetic_nginx_logs.csv
Example fused text (first 5):
['GET shop.example.com /product/123 /product/123 1; DROP TABLE users; 243.87.246.107 170.192.56:15901 python-requests/2.25.1 - HTTP/1.1 500 application/json - - - 2025-12-10T20:48:08.841422', 'GET alpha-triradiate-adalberto.ngrok-free.dev /profile /profile q=879 197.1.74.78 74.85.97:8854 PostmanRuntime/7.28.4 - HTTP/1.1 200 application/json - - - 2025-12-11T05:39:32.841422', 'POST shop.example.com /dashboard /dashboard q=442 196.12.196.145 172.18.0.5:80 curl/7.68.0 https://google.com HTTP/1.1 200 application/x-www-form-urlencoded - TLSv1.3 ECDHE-ECDSA-AES256-GCM-SHA384 2025-12-11T02:38:04.841422', 'POST shop.example.com /dashboard /dashboard - 122.155.200.122 54.113.254:9409 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/143.0.0.0 Safari/537.36 - HTTP/2.0 200 text/html - TLSv1.2 ECDHE-ECDSA-AES256-GCM-SHA384 2025-12-11T16:09:13.841422', 'GET shop.example.com /"_onmouseover="alert(\'XSS\')"