# 셀 A) 전처리 유틸 (NFKC·소문자·마스킹·토큰화·길이제한)

In [None]:
import re, unicodedata, urllib.parse, base64, binascii
from typing import List, Dict

# Patterns used by mask_sensitive / numbers_to_hash / the tokenizer.
RE_EMAIL = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[A-Za-z]{2,}\b")
RE_IPV4  = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
RE_HEX   = re.compile(r"\b[0-9A-Fa-f]{16,}\b")
# A base64 run optionally followed by its "=" padding.  The padding branch
# cannot be closed with "\b": there is no word boundary between "=" and
# whitespace/end-of-string, so "={0,2}\b" always backtracked to zero "=" and
# left the padding behind.  The lookahead also keeps the "=" of a long
# "key=value" pair from being swallowed as padding.
RE_B64   = re.compile(r"\b[A-Za-z0-9+/]{16,}(?:={1,2}(?![\w=])|\b)")
RE_SID   = re.compile(r"\b[a-f0-9]{8,32}\b")  # session/token-like IDs (loose match)
RE_NUM   = re.compile(r"\d+")
# Token separator: anything that is not a word char, "/", "." or "-".
RE_SPLIT = re.compile(r"[^\w/\.\-]+")

# Keep these signature tokens as-is
KEEP_ATOM = {"b64","hex","mz","pk","gif","pdf"}

def normalize_nfkc_lower(s: str) -> str:
    """Return *s* after Unicode NFKC normalization, converted to lowercase."""
    folded = unicodedata.normalize("NFKC", s)
    return folded.lower()

def url_decode_once(s: str) -> str:
    """Percent-decode *s* a single time, treating "+" as an encoded space."""
    # "+" standing for a space is common in form-urlencoded data; unquote_plus
    # performs that substitution before percent-decoding.
    return urllib.parse.unquote_plus(s)

def mask_sensitive(s: str) -> str:
    """Replace e-mail, IPv4, long hex/base64 runs and session-like IDs in *s*.

    Each match is replaced by a bracketed placeholder ("[EMAIL]", "[IP]",
    "[HEX]", "[B64]", "[SID]") and the rewritten string is returned.
    """
    # Order matters: long HEX/B64 runs are replaced before the looser
    # session/token pattern gets a chance to match them.
    substitutions = (
        (RE_EMAIL, "[EMAIL]"),
        (RE_IPV4, "[IP]"),
        (RE_HEX, "[HEX]"),
        (RE_B64, "[B64]"),
        (RE_SID, "[SID]"),
    )
    masked = s
    for pattern, placeholder in substitutions:
        masked = pattern.sub(placeholder, masked)
    return masked

def tokenize_keep_slash_underscore(s: str) -> List[str]:
    """Split *s* on RE_SPLIT and return the non-empty pieces.

    "_" survives because it is a word character (\\w); "/" is allowed
    explicitly by the separator pattern.
    """
    return list(filter(None, RE_SPLIT.split(s)))

def numbers_to_hash(tokens: List[str]) -> List[str]:
    """Replace every digit run in each token with "#NUM".

    Tokens whose lowercase form is in KEEP_ATOM are passed through untouched.
    A token consisting only of digits becomes exactly "#NUM", since the
    substitution replaces the whole run.
    """
    return [
        tok if tok.lower() in KEEP_ATOM else RE_NUM.sub("#NUM", tok)
        for tok in tokens
    ]
def head_tail_trim(tokens: List[str], max_len=320, head=192, tail=128) -> List[str]:
    """Cap *tokens* at ``max_len`` items by keeping its head and its tail.

    Lists that already fit are returned unchanged (same object).  Longer lists
    become ``head`` leading tokens, the marker ``"[...]"``, and ``tail``
    trailing tokens.

    The marker occupies one slot, so when ``head + tail + 1`` would exceed
    ``max_len`` both counts are scaled down proportionally.  This keeps the
    result within ``max_len`` and guarantees head and tail never overlap.

    Args:
        tokens: token list to trim.
        max_len: maximum number of items in the result, marker included.
        head: desired number of leading tokens to keep.
        tail: desired number of trailing tokens to keep.

    Returns:
        A list of at most ``max_len`` items.
    """
    if len(tokens) <= max_len:
        return tokens
    if max_len <= 0:
        return []

    head = max(head, 0)
    tail = max(tail, 0)
    budget = max_len - 1  # one slot is reserved for the "[...]" marker
    if head + tail > budget:
        # Shrink both sides, preserving their ratio, to fill exactly `budget`.
        head = budget * head // (head + tail)
        tail = budget - head

    # tokens[-0:] would be the whole list, so an empty tail is special-cased.
    kept_tail = tokens[-tail:] if tail else []
    return tokens[:head] + ["[...]"] + kept_tail

def preprocess_text(raw: str) -> List[str]:
    """Run the full text pre-processing chain and return the token list.

    Steps, in order: NFKC + lowercase -> URL-decode once -> mask sensitive
    values -> tokenize -> replace digit runs.
    """
    text = mask_sensitive(url_decode_once(normalize_nfkc_lower(raw)))
    return numbers_to_hash(tokenize_keep_slash_underscore(text))


# 셀 B) HTTP 파트 추출 → 한 줄 표현 빌드

- 구분 토큰: [SEP_PATH][SEP_Q][SEP_UA][SEP_H][SEP_BODY]
- 바디는 길이/시그니처(B64/HEX/MZ/PK 등)만 요약

In [None]:
import dpkt
from urllib.parse import urlsplit, parse_qs

# Not read by body_signature_summary; kept so existing references keep working.
SIG_HINTS = ["mz", "pk", "gif89a", "%pdf"]

# File magic numbers, checked only at offset 0 of the body, with their label.
_BODY_MAGIC = (
    (b"MZ", "MZ"),
    (b"PK", "PK"),
    (b"GIF87a", "GIF"),
    (b"GIF89a", "GIF"),
    (b"%PDF", "PDF"),
)
_RE_BODY_B64 = re.compile(r"[A-Za-z0-9+/=\s]{32,}")
_RE_BODY_HEX = re.compile(r"[0-9A-Fa-f\s]{32,}")


def body_signature_summary(body: bytes) -> str:
    """Summarise an HTTP body as its length plus coarse content hints.

    Magic numbers are matched case-sensitively at the very start of the body.
    Searching for "mz"/"pk" anywhere in lowercased text flags ordinary
    payloads (e.g. JSON containing "pkg") as executables or archives.

    "B64?" / "HEX?" are added when the first 256 bytes consist solely of the
    respective alphabet (plus whitespace) and are at least 32 characters long.
    A pure hex dump satisfies both patterns and therefore gets both hints.

    Args:
        body: raw body bytes; may be empty.

    Returns:
        ``"BODY.len=0"`` for an empty body, otherwise
        ``"BODY.len=<n> BODY.sig=[<hints>]"`` with ``-`` when nothing matched.
    """
    if not body:
        return "BODY.len=0"

    sig = []
    for magic, label in _BODY_MAGIC:
        if body.startswith(magic) and label not in sig:
            sig.append(label)

    # latin1 maps every byte to one code point, so this decode cannot fail.
    sample = body[:256].decode("latin1")
    if _RE_BODY_B64.fullmatch(sample):
        sig.append("B64?")
    if _RE_BODY_HEX.fullmatch(sample):
        sig.append("HEX?")

    return f"BODY.len={len(body)} BODY.sig=[{','.join(sig) if sig else '-'}]"

def headers_pick(d: Dict[str,str]) -> str:
    """Render the headers of interest as space-separated ``h[xx]=value`` items.

    Only content-type, content-encoding and transfer-encoding are considered,
    and only when present with a truthy value.  The label is built from the
    initial of each hyphen-separated word (``ct``, ``ce``, ``te``); taking the
    first two characters of the name would label both content-* headers
    ``co`` and make them indistinguishable.

    Args:
        d: header mapping; keys are looked up in lowercase.

    Returns:
        The joined items, or ``""`` when none of the headers is present.
    """
    keys = ("content-type", "content-encoding", "transfer-encoding")
    parts = []
    for k in keys:
        v = d.get(k)
        if not v:
            continue
        # NOTE(review): a repeated header may arrive as a list of values
        # rather than a str - confirm against the parser; joined defensively.
        if isinstance(v, (list, tuple)):
            v = ",".join(str(item) for item in v)
        label = "".join(word[0] for word in k.split("-"))
        parts.append(f"h[{label}]={v}")
    return " ".join(parts)

def build_one_line_from_http(raw: bytes) -> str:
    """Summarise one raw HTTP message as a single pipe-delimited line.

    Formats produced:
        request:  ``METHOD PATH ? query_keys | host=... | ua=... | h[..]=... | BODY.len=... BODY.sig=[...]``
        response: ``RESP status reason | h[..]=... | BODY.len=... BODY.sig=[...]``
        fallback: ``RAW <first 120 bytes, CR/LF replaced by spaces>``

    Data starting with ``HTTP/1.`` is parsed as a response, anything else as a
    request.  Only the names of the query parameters are kept (sorted,
    comma-joined); ``-`` stands in for a missing query, host or user-agent.

    Args:
        raw: bytes of one HTTP message (e.g. a TCP payload).

    Returns:
        The one-line summary; never raises for unparsable input.
    """
    line = ""
    try:
        if raw.startswith(b"HTTP/1."):
            # Response
            resp = dpkt.http.Response(raw)
            hdrs = {k.lower(): v for k, v in resp.headers.items()}
            body = resp.body or b""
            hsum = headers_pick(hdrs)
            line = f"RESP {resp.status} {resp.reason} | {hsum} | {body_signature_summary(body)}"
        else:
            # Request
            req = dpkt.http.Request(raw)
            hdrs = {k.lower(): v for k, v in req.headers.items()}
            host = hdrs.get("host", "-")
            ua = hdrs.get("user-agent", "-")
            # Path and the *names* of the query parameters only.
            u = urlsplit(req.uri)
            qkeys = ",".join(sorted(parse_qs(u.query).keys())) if u.query else "-"
            hsum = headers_pick(hdrs)
            body = req.body or b""
            # NOTE(review): values are interpolated verbatim; a "|" inside a
            # header value will be treated as a field separator by
            # add_sep_tokens.
            line = f"{req.method} {u.path or '/'} ? {qkeys} | host={host} | ua={ua} | {hsum} | {body_signature_summary(body)}"
    except (dpkt.dpkt.NeedData, dpkt.dpkt.UnpackError, ValueError):
        # Parsing failed: fall back to a preview of the raw bytes.
        # ValueError is included because urlsplit raises it for a malformed
        # request target (e.g. unbalanced "[" in the netloc); one bad packet
        # must not abort the whole run.
        preview = raw[:120].decode("latin1", "ignore").replace("\r", " ").replace("\n", " ")
        line = f"RAW {preview}"
    return line

def add_sep_tokens(line: str) -> str:
    """Insert separator tokens into a pipe-delimited summary line.

    Expected input shape:
        ``... PATH ? Q | host=... | ua=... | h[..]=.. | BODY...``

    The text before the first "|" is emitted after ``[SEP_PATH]``; when it
    contains " ?" the part after it follows ``[SEP_Q]``.  The remaining
    fields are then emitted in fixed order - host (``[SEP_H]``), user-agent
    (``[SEP_UA]``), headers (``[SEP_H]``), body (``[SEP_BODY]``) - using the
    first field with the matching prefix; absent fields are skipped.
    """
    first, *others = line.split("|")
    fields = [field.strip() for field in others]

    def first_with(prefix: str) -> str:
        # First field whose lowercase form starts with `prefix`, else "".
        for field in fields:
            if field.lower().startswith(prefix):
                return field
        return ""

    # PATH / Q
    path_part, marker, query_part = first.partition(" ?")
    pieces = ["[SEP_PATH]", path_part.strip()]
    if marker:
        pieces += ["[SEP_Q]", query_part.strip()]

    # HOST / UA / HEADERS / BODY, in output order.
    layout = (
        ("[SEP_H]", "host="),
        ("[SEP_UA]", "ua="),
        ("[SEP_H]", "h["),
        ("[SEP_BODY]", "body.len="),
    )
    for tag, prefix in layout:
        found = first_with(prefix)
        if found:
            pieces += [tag, found]

    return " ".join(pieces)


# 셀 C) 전처리 파이프라인 → 토큰 제한 → 한 줄 저장 (samples_pre.txt)

In [None]:
from pathlib import Path

# Input capture read by the run cell below.
PCAP_PATH = "../pcap_file/demo_pcap/demo_payload_preproc.pcap"
# Output text file: one preprocessed line per HTTP record.
OUT_TXT   = "samples_pre.txt"

def iter_http_records(pcap_path: str):
    """Yield the TCP payload of each port-80 packet in a pcap file.

    Every frame is decoded as Ethernet.  Frames whose payload is not an
    IP packet carrying a TCP segment, or whose TCP payload is empty, are
    skipped.  A payload is yielded when either the source or the destination
    port is 80.
    """
    with open(pcap_path, "rb") as handle:
        for _ts, frame in dpkt.pcap.Reader(handle):
            network = getattr(dpkt.ethernet.Ethernet(frame), "data", None)
            if isinstance(network, dpkt.ip.IP):
                segment = getattr(network, "data", None)
                if (
                    isinstance(segment, dpkt.tcp.TCP)
                    and segment.data
                    and 80 in (segment.dport, segment.sport)
                ):
                    yield segment.data

def preprocess_one_line(raw_http_bytes: bytes, max_tokens=320) -> str:
    """Turn one raw HTTP message into a single space-joined token line.

    Pipeline: one-line HTTP summary -> separator tokens -> text
    pre-processing (NFKC/lowercase, URL-decode once, masking, tokenizing,
    digit replacement) -> head/tail length limit -> join with spaces.
    """
    summary = add_sep_tokens(build_one_line_from_http(raw_http_bytes))
    tokens = head_tail_trim(
        preprocess_text(summary),
        max_len=max_tokens,
        head=192,
        tail=128,
    )
    return " ".join(tokens)

# 실행: pcap → samples_pre.txt
# Collect one preprocessed line per HTTP record, dropping blank results.
lines = []
for raw in iter_http_records(PCAP_PATH):
    line = preprocess_one_line(raw, max_tokens=320)
    if not line.strip():
        continue
    lines.append(line)

# Persist the lines newline-separated, then report and show a short preview.
Path(OUT_TXT).write_text("\n".join(lines), encoding="utf-8")
print(f"Wrote {len(lines)} lines -> {OUT_TXT}")
print("\nPreview:\n", "\n".join(lines[:3]))


# 셀 D) (선택) 결과 테이블 미리보기 & 길이 점검

In [None]:
import pandas as pd

# One row per output line, with its whitespace-separated token count.
token_counts = [len(text.split()) for text in lines]
df = pd.DataFrame({"line": lines, "len_tokens": token_counts})
df
