# 설치 & 기본 설정 

In [None]:
# Install the third-party dependencies (only needed when they are missing)
%pip -q install dpkt chardet

# Path settings
# PCAP_PATH: capture file read by the extraction cell
PCAP_PATH = "../pcap_file/demo_pcap/demo_payload_preproc.pcap"
# OUT_TXT: text file that receives the preprocessed lines
OUT_TXT   = "samples_pre.txt"

# HTTP 파싱/디코딩 유틸

In [None]:
import dpkt, urllib.parse, base64, binascii, zlib, chardet, re, unicodedata
from typing import Optional, Dict
from urllib.parse import urlsplit, parse_qs
from pathlib import Path

def _maybe_decode_text(b: bytes) -> str:
    if not b: return ""
    guess = chardet.detect(b) or {}
    enc = guess.get("encoding") or "utf-8"
    try:
        return b.decode(enc, errors="ignore")
    except Exception:
        return b.decode("utf-8", errors="ignore")

def _http_extract(raw: bytes):
    """Parse raw bytes as an HTTP message and return (mode, headers, body).

    Bytes starting with b"HTTP/1." are parsed as a response (mode "resp");
    everything else is parsed as a request (mode "req"). Header names are
    lower-cased and an empty body is returned as b"".
    If dpkt raises NeedData or UnpackError the result is (None, None, raw).
    """
    try:
        is_response = raw.startswith(b"HTTP/1.")
        parser = dpkt.http.Response if is_response else dpkt.http.Request
        message = parser(raw)
        lowered = {name.lower(): value for name, value in message.headers.items()}
        mode = "resp" if is_response else "req"
        return (mode, lowered, message.body or b"")
    except (dpkt.dpkt.NeedData, dpkt.dpkt.UnpackError):
        return (None, None, raw)

def _decode_chunked(body: bytes) -> bytes:
    out = bytearray(); i = 0
    while True:
        j = body.find(b"\r\n", i)
        if j < 0: break
        sz_hex = body[i:j].split(b";", 1)[0]
        try: size = int(sz_hex, 16)
        except ValueError: break
        i = j + 2
        if size == 0: break
        out.extend(body[i:i+size]); i += size + 2
    return bytes(out) if out else body

def _maybe_gzip_deflate(headers: Optional[Dict[str,str]], body: bytes) -> bytes:
    if not headers: return body
    enc = headers.get("content-encoding", "").lower()
    if "gzip" in enc:
        try: return zlib.decompress(body, 16 + zlib.MAX_WBITS)
        except Exception: pass
    if "deflate" in enc:
        try: return zlib.decompress(body, -zlib.MAX_WBITS)
        except Exception: pass
    return body

def _maybe_chunked(headers: Optional[Dict[str,str]], body: bytes) -> bytes:
    if not headers: return body
    if "chunked" in headers.get("transfer-encoding","").lower():
        try: return _decode_chunked(body)
        except Exception: return body
    return body

def headers_pick(h: Dict[str,str]) -> str:
    """Summarise selected headers as space-separated ``h[<label>]=<value>`` items.

    Only headers that are present with a non-empty value are emitted, in the
    order content-type (ct), content-encoding (ce), transfer-encoding (te).
    Returns "" when none of them is set.
    """
    # Explicit labels: abbreviating to the first two letters of the name gave
    # "co" for both content-type and content-encoding, making them
    # indistinguishable in the output.
    labels = (
        ("content-type", "ct"),
        ("content-encoding", "ce"),
        ("transfer-encoding", "te"),
    )
    return " ".join(f"h[{label}]={h[name]}" for name, label in labels if h.get(name))


# 전처리 규칙(소문자/NFKC/마스킹/토큰/길이)

In [None]:
# Regexes / rules
# E-mail-like strings (local part, '@', domain, alphabetic TLD of 2+ letters)
RE_EMAIL = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[A-Za-z]{2,}\b")
# Four dot-separated groups of 1-3 digits; the 0-255 octet range is not checked
RE_IPV4  = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
# Runs of 16 or more hexadecimal digits
RE_HEX   = re.compile(r"\b[0-9A-Fa-f]{16,}\b")
# Runs of 16 or more base64-alphabet characters, optionally followed by up to two '='
RE_B64   = re.compile(r"\b[A-Za-z0-9+/]{16,}={0,2}\b")
# Runs of 8 to 32 lower-case hexadecimal characters
RE_SID   = re.compile(r"\b[a-f0-9]{8,32}\b")
# One or more digits
RE_NUM   = re.compile(r"\d+")
# Token separator: any run of characters that are neither word characters nor '/'
RE_SPLIT = re.compile(r"[^\w/]+")

def normalize_nfkc_lower(s: str) -> str:
    """Return ``s`` in Unicode NFKC form, converted to lower case."""
    compat = unicodedata.normalize("NFKC", s)
    return compat.lower()

def url_decode_once(s: str) -> str:
    """Apply a single round of URL decoding, treating '+' as a space."""
    # unquote_plus replaces '+' with ' ' and then percent-decodes once.
    return urllib.parse.unquote_plus(s)

def mask_sensitive(s: str) -> str:
    """Replace e-mail, IPv4, hex, base64 and session-id-like runs with tags.

    The substitutions run in a fixed order (EMAIL, IP, HEX, B64, SID), so a
    span consumed by an earlier pattern is no longer visible to later ones.
    """
    substitutions = (
        (RE_EMAIL, "[EMAIL]"),
        (RE_IPV4, "[IP]"),
        (RE_HEX, "[HEX]"),
        (RE_B64, "[B64]"),
        (RE_SID, "[SID]"),
    )
    for pattern, placeholder in substitutions:
        s = pattern.sub(placeholder, s)
    return s

def tokenize_keep_slash_underscore(s: str):
    """Split ``s`` on RE_SPLIT and return the non-empty pieces as a list.

    Word characters (which include '_') and '/' stay inside tokens; every
    other character acts as a separator.
    """
    return list(filter(None, RE_SPLIT.split(s)))

def numbers_to_hash(tokens):
    """Return a new list in which every digit run in each token is "#NUM".

    A token made only of digits therefore becomes exactly "#NUM".
    """
    return [RE_NUM.sub("#NUM", token) for token in tokens]

def head_tail_trim(tokens, max_len=320, head=192, tail=128):
    """Shorten a long token list to its head and tail around a "[...]" marker.

    Args:
        tokens: List of tokens.
        max_len: Lists of at most this many tokens are returned unchanged.
        head: Number of leading tokens to keep.
        tail: Number of trailing tokens to keep.

    Returns:
        ``tokens`` itself when it is short enough, otherwise a new list of
        ``head`` leading tokens, the marker "[...]", and ``tail`` trailing
        tokens. If ``head + tail`` exceeds ``max_len`` both are scaled down
        proportionally so that together they keep exactly ``max_len`` tokens.
    """
    if len(tokens) <= max_len:
        return tokens

    head = max(head, 0)
    tail = max(tail, 0)
    budget = head + tail
    limit = max(max_len, 0)
    if budget > limit:
        # Without this the two slices overlap and the "trimmed" result can be
        # longer than the input (e.g. max_len=100 with the default head/tail).
        head = limit * head // budget
        tail = limit - head

    # Slice from a positive index: tokens[-0:] would return the whole list.
    return tokens[:head] + ["[...]"] + tokens[len(tokens) - tail:]

def preprocess_text(raw: str):
    """Normalise, URL-decode, mask and tokenise one line of text.

    Steps: NFKC + lower-case, one round of URL decoding, NFKC + lower-case
    again, sensitive-value masking, tokenisation, digit runs -> "#NUM".

    Returns:
        The list of tokens.
    """
    s = normalize_nfkc_lower(raw)
    s = url_decode_once(s)
    # Characters produced by decoding (e.g. "%41" -> "A") have not been
    # normalised yet; normalise again so that they are lower-cased too.
    s = normalize_nfkc_lower(s)
    s = mask_sensitive(s)
    toks = tokenize_keep_slash_underscore(s)
    toks = numbers_to_hash(toks)
    return toks


# 한 줄 표현 생성 + 실행(PCAP→samples_pre.txt)

In [None]:
def body_signature_summary(body: bytes) -> str:
    """Describe a body as ``BODY.len=<n> BODY.sig=[<labels>]``.

    Labels come from case-insensitive substring checks on the first 64 bytes
    (MZ, PK, GIF, PDF) and from whether the first 256 bytes consist entirely
    of base64-alphabet ("B64?") or hexadecimal ("HEX?") characters and
    whitespace, with a minimum length of 32. "-" is used when nothing
    matches; an empty body gives a fixed summary.
    """
    if not body:
        return "BODY.len=0 BODY.sig=[-]"

    head64 = body[:64].decode("latin1", "ignore").lower()
    head256 = body[:256].decode("latin1", "ignore")

    # (label, matched) pairs, kept in output order.
    probes = (
        ("MZ", "mz" in head64),
        ("PK", "pk" in head64),
        ("GIF", "gif89a" in head64),
        ("PDF", "%pdf" in head64 or "pdf" in head64),
        ("B64?", re.fullmatch(r"[A-Za-z0-9+/=\s]{32,}", head256 or "") is not None),
        ("HEX?", re.fullmatch(r"[0-9A-Fa-f\s]{32,}", head256 or "") is not None),
    )
    labels = [label for label, matched in probes if matched]
    joined = ",".join(labels) if labels else "-"
    return f"BODY.len={len(body)} BODY.sig=[{joined}]"

def build_one_line_from_http(raw: bytes) -> str:
    """Render one TCP payload as a single '|'-separated summary line.

    Requests become ``METHOD path ? query-keys | host=.. | ua=.. | headers |
    body summary``; responses become ``RESP | headers | body summary``.
    Payloads that could not be parsed become ``RAW`` plus the first 120 bytes
    with CR and LF replaced by spaces.
    """
    mode, hdrs, body = _http_extract(raw)

    if mode == "req":
        # Parsed again because _http_extract does not return method or URI.
        req = dpkt.http.Request(raw)
        host = hdrs.get("host", "-")
        ua = hdrs.get("user-agent", "-")
        parts = urlsplit(req.uri)
        if parts.query:
            qkeys = ",".join(sorted(parse_qs(parts.query).keys()))
        else:
            qkeys = "-"
        return f"{req.method} {parts.path or '/'} ? {qkeys} | host={host} | ua={ua} | {headers_pick(hdrs)} | {body_signature_summary(req.body or b'')}"

    if mode == "resp":
        return f"RESP | {headers_pick(hdrs)} | {body_signature_summary(body)}"

    # Parsing failed: fall back to a short single-line preview.
    snippet = raw[:120].decode("latin1", "ignore")
    preview = snippet.replace("\r", " ").replace("\n", " ")
    return f"RAW {preview}"

def add_sep_tokens(line: str) -> str:
    """Insert section markers into a '|'-separated summary line.

    The text before the first '|' is emitted after "[SEP_PATH]"; if it
    contains " ?", the part after it follows "[SEP_Q]". From the remaining
    segments the first one starting (case-insensitively) with "host=", "ua=",
    "h[" and "body.len=" is emitted after "[SEP_H]", "[SEP_UA]", "[SEP_H]"
    and "[SEP_BODY]" respectively; missing segments are skipped.
    """
    left, *others = line.split("|")
    segments = [seg.strip() for seg in others]

    def first_with(prefix):
        # First segment whose lower-cased text starts with prefix, else "".
        for seg in segments:
            if seg.lower().startswith(prefix):
                return seg
        return ""

    path, found, query = left.partition(" ?")
    pieces = ["[SEP_PATH]", path.strip()]
    if found:
        pieces += ["[SEP_Q]", query.strip()]

    layout = (
        ("[SEP_H]", "host="),
        ("[SEP_UA]", "ua="),
        ("[SEP_H]", "h["),
        ("[SEP_BODY]", "body.len="),
    )
    for marker, prefix in layout:
        segment = first_with(prefix)
        if segment:
            pieces += [marker, segment]
    return " ".join(pieces)

def iter_http_records(pcap_path: str):
    """Yield the TCP payload of every port-80 packet in a pcap file.

    Each frame is decoded as Ethernet; frames that do not carry an
    ``dpkt.ip.IP`` packet with a ``dpkt.tcp.TCP`` segment, or whose segment
    has no payload, are skipped. A packet qualifies when its source or
    destination port is 80. Payloads are yielded per packet, as-is.
    """
    with open(pcap_path, "rb") as f:
        for _ts, frame in dpkt.pcap.Reader(f):
            ip = getattr(dpkt.ethernet.Ethernet(frame), "data", None)
            if not isinstance(ip, dpkt.ip.IP):
                continue
            tcp = getattr(ip, "data", None)
            if not isinstance(tcp, dpkt.tcp.TCP):
                continue
            if not tcp.data:
                continue
            if 80 in (tcp.dport, tcp.sport):
                yield tcp.data

def preprocess_one_line(raw_http: bytes, max_tokens=320) -> str:
    """Turn one raw HTTP payload into a space-joined string of tokens.

    The payload is summarised on one line, annotated with section markers,
    preprocessed into tokens and trimmed with ``max_len=max_tokens`` (keeping
    192 head and 128 tail tokens).
    """
    summary = add_sep_tokens(build_one_line_from_http(raw_http))
    tokens = head_tail_trim(
        preprocess_text(summary), max_len=max_tokens, head=192, tail=128
    )
    return " ".join(tokens)

# Run: PCAP -> samples_pre.txt
lines = []
for raw in iter_http_records(PCAP_PATH):
    s = preprocess_one_line(raw, max_tokens=320)
    # Keep only records whose preprocessed form is not blank
    if s.strip(): lines.append(s)

# Write one preprocessed record per line, then show the first five as a preview
Path(OUT_TXT).write_text("\n".join(lines), encoding="utf-8")
print(f"Wrote {len(lines)} lines -> {OUT_TXT}")
print("\nPreview:\n", "\n".join(lines[:5]))


# 키워드 스캔 룰 & 스캐너

In [None]:
import re
from pathlib import Path

# IN_TXT: lines to scan
IN_TXT  = "samples_pre.txt"
# OUT_KW: receives the sorted set of matched keywords
OUT_KW  = "suspect_keywords.txt"
# OUT_HIT: receives a sample of the lines that matched
OUT_HIT = "hits_sample.txt"

# === Lenient patterns tailored to the preprocessed one-line format ===
# Maps a rule name to a regex source; scan_lines searches every pattern in
# each lower-cased line with re.I and records the matched text.
RULES = {
    # 1) Execution / download commands (tolerates hyphen/whitespace variation)
    # NOTE(review): the "-enc" / "-c" alternatives still require a literal
    # '-'; confirm whether hyphens survive in the lines being scanned.
    "cmd": r"\b(wget|curl|powershell\s*-enc|mshta|certutil|bitsadmin|rundll32|python\s*-c|regsvr32)\b",

    # 2) File extensions: also accept the form where preprocessing removed the '.'
    # The boundaries must be written as \b in a raw string; "\\b" is a literal
    # backslash followed by 'b', which made this rule unable to match.
    "ext": r"(\.|\b)(exe|dll|ps1|js|vbs|bat)\b",

    # 3) Paths: also accept hyphens/dots that were turned into whitespace
    "path": r"(/wp[\-\s_]?includes/|/gate[.\s_]?php|/api/.*/upload|wp-admin/admin[\-\s_]?ajax[.\s_]?php)",

    # 4) Headers: also match MIME types whose hyphens became whitespace
    "header": r"(application[/\-\s]?x[/\-\s]?msdownload|application[/\-\s]?octet[/\-\s]?stream|multipart[/\-\s]?form[/\-\s]?data)",

    # 5) Encoding traces: %25, %2f, ../, +exec+, base64/eval/fromcharcode
    "encoding": r"(%25|%2f|\.\./|\+exec\+|base64|eval|fromcharcode)"
}

# === Context combinations: written for the preprocessed token form ===
# Each entry is (pattern_a, pattern_b, tag); scan_lines reports "[CTX:<tag>]"
# for a line only when BOTH patterns are found in it.
CONTEXT_RULES = [
    # Executable/binary content header + body signature (MZ/B64/HEX)
    # Accepts "h co" / "h ct" as well as "h[ct]"; body.sig=[..] is matched in
    # the tokenised form "body sig mz|b64|hex"
    # NOTE(review): numbers_to_hash rewrites digit runs to "#NUM", so the
    # "b64" alternative may not appear in preprocessed lines -- confirm.
    (
        r"\bh\s*(\[\s*(ct|co)\s*\]|\s*(ct|co))\b.*?(application[/\-\s]?x[/\-\s]?msdownload|application[/\-\s]?octet[/\-\s]?stream)",
        r"\bbody\s*len\b.*\bbody\s*sig\b.*\b(mz|b64|hex)\b",
        "header+sig"
    ),
    # Multipart upload (file names are easily lost in preprocessing, so the
    # header alone is checked first)
    (
        r"multipart[/\-\s]?form[/\-\s]?data",
        r"(/api/.*/upload|filename|body\s*sig\b)",
        "multipart"
    ),
    # C2-style path + encoding/evasion traces
    # NOTE(review): every alternative in the second pattern contains '-', '+',
    # '%' or '.', which RE_SPLIT treats as separators during preprocessing;
    # presumably this rule only fires on text that was not tokenised -- verify.
    (
        r"/gate[.\s_]?php",
        r"(powershell\s*-enc|\+exec\+|%25|%2f|\.\./)",
        "gate+enc"
    )
]

def scan_lines(lines):
    """Scan lines against RULES and CONTEXT_RULES.

    Args:
        lines: Iterable of text lines.

    Returns:
        A pair ``(keywords, hits)``: the sorted list of distinct matched
        strings (plus a "[CTX:<tag>]" entry for every context rule whose two
        patterns both occur in some line), and the list of original lines
        that triggered at least one match, in input order.
    """
    keywords = set()
    flagged = []

    for line in lines:
        lowered = line.lower()

        # 1) Single-pattern rules: collect every matched substring.
        single = [
            m.group(0)
            for pattern in RULES.values()
            for m in re.finditer(pattern, lowered, flags=re.I)
        ]

        # 2) Context rules: both patterns must be present in the same line.
        contextual = [
            f"[CTX:{tag}]"
            for first, second, tag in CONTEXT_RULES
            if re.search(first, lowered, flags=re.I)
            and re.search(second, lowered, flags=re.I)
        ]

        if single or contextual:
            keywords.update(single)
            keywords.update(contextual)
            flagged.append(line)

    return sorted(keywords), flagged


# 실행(스캔→파일 저장) + 미리보기

In [None]:
# Run: read the input lines (an empty list when the file does not exist) and scan them
lines = Path(IN_TXT).read_text(encoding="utf-8").splitlines() if Path(IN_TXT).exists() else []
kw, hits = scan_lines(lines)

# Save: all keywords, and only the first 10 hit lines as a sample
Path(OUT_KW).write_text("\n".join(kw), encoding="utf-8")
Path(OUT_HIT).write_text("\n".join(hits[:10]), encoding="utf-8")

print(f"[OK] keywords -> {OUT_KW} ({len(kw)}개)")
print(f"[OK] hits -> {OUT_HIT} (미리보기 {min(10, len(hits))}줄)")

# Debug: uncomment the lines below to print the first flagged lines
# for h in hits[:10]:
#     print("----")
#     print(h)
