## Detecta que el fichero sea válido

NOTA: Comprueba que sea un .pcap correcto.

In [8]:
import os
import struct

def comprobar_pcap(path):
    """Check that ``path`` is a classic PCAP file and print a short summary.

    Prints the file size, the magic number, the first packet's timestamp,
    captured length and first 32 bytes, and the total number of complete
    packet records. Every failure is reported with a printed ``ERROR`` line
    instead of an exception.

    Args:
        path: Path of the capture file to inspect.

    Returns:
        None. All results are printed.
    """
    print(f"Abriendo archivo: {path}")

    if not os.path.exists(path):
        print("ERROR: el archivo no existe")
        return

    tamaño = os.path.getsize(path)
    print(f"Tamaño del archivo: {tamaño} bytes")

    if tamaño == 0:
        print("ERROR: archivo vacío")
        return

    # Magic value as seen when read little-endian -> struct byte-order prefix.
    # 0xa1b2c3d4 (microseconds) / 0xa1b23c4d (nanoseconds) mean the file is
    # little-endian; the byte-swapped values mean it was written big-endian.
    ordenes = {
        0xA1B2C3D4: "<",
        0xA1B23C4D: "<",
        0xD4C3B2A1: ">",
        0x4D3CB2A1: ">",
    }

    with open(path, "rb") as f:
        cabecera = f.read(24)

        if len(cabecera) < 24:
            print("ERROR: archivo demasiado pequeño para ser PCAP")
            return

        magic = struct.unpack("<I", cabecera[:4])[0]

        print(f"Magic number: 0x{magic:08x}")

        orden = ordenes.get(magic)
        if orden is None:
            # Without a known magic the record headers cannot be trusted
            # (this also rejects pcapng files).
            print("ERROR: magic number desconocido, no es un PCAP clásico")
            return
        if orden == ">":
            print("Orden de bytes: big-endian")

        formato = orden + "IIII"

        # Count packet records.
        paquetes = 0

        while True:
            header = f.read(16)
            if len(header) < 16:
                if header:
                    print("Aviso: PCAP truncado (cabecera de paquete incompleta al final).")
                break

            ts_sec, _ts_frac, incl_len, _orig_len = struct.unpack(formato, header)

            data = f.read(incl_len)
            if len(data) < incl_len:
                # An incomplete trailing record is reported, not counted.
                print("Aviso: PCAP truncado (paquete incompleto al final).")
                break

            if paquetes == 0:
                print("\nPrimer paquete encontrado:")
                print(f"Timestamp: {ts_sec}")
                print(f"Tamaño capturado: {incl_len} bytes")

                print("\nPrimeros 32 bytes:")
                print(data[:32])

            paquetes += 1

        print(f"\nTotal paquetes: {paquetes}")


# Change this path if needed
comprobar_pcap("descargas.pcap")

Abriendo archivo: descargas.pcap
Tamaño del archivo: 153442222 bytes
Magic number: 0xa1b2c3d4

Primer paquete encontrado:
Timestamp: 1771923568
Tamaño capturado: 54 bytes

Primeros 32 bytes:
b'\x00\x10 0@P\x00\x1e\x10\x1f\x00\x00\x08\x00E\x00\x00(\xac\xc3@\x00\x80\x06\xbe\x10\xc0\xa8\td\xc0o'

Total paquetes: 185635


## Lista paquetes

In [9]:
import os
import struct
import socket

def ip_to_str(ip_bytes):
    """Convert a packed 4-byte IPv4 address into dotted-quad text."""
    dotted = socket.inet_ntoa(ip_bytes)
    return dotted

def analizar_pcap(path, max_print=20):
    """Print source, destination and protocol of the first IPv4 packets.

    Reads a classic little-endian PCAP with Ethernet framing and prints one
    line per IPv4 packet until ``max_print`` lines have been shown or the
    file ends. Packets that are not IPv4 (or are too short) are read and
    counted but do not use up the print quota.

    Args:
        path: Path of the capture file.
        max_print: Maximum number of IPv4 packets to list (default 20).

    Returns:
        None. Results are printed.
    """
    if not os.path.exists(path):
        print("Archivo no existe")
        return

    nombres_proto = {
        1: "ICMP",
        6: "TCP",
        17: "UDP"
    }

    with open(path, "rb") as f:

        # Skip the 24-byte global header.
        f.read(24)

        paquetes = 0   # records read so far (used as the packet number)
        mostrados = 0  # lines actually printed

        while mostrados < max_print:

            header = f.read(16)
            if len(header) < 16:
                break

            ts_sec, ts_usec, incl_len, orig_len = struct.unpack("<IIII", header)

            data = f.read(incl_len)
            if len(data) < incl_len:
                # Incomplete trailing record: stop instead of parsing a
                # partial frame.
                break

            paquetes += 1

            # Need at least Ethernet (14) + minimal IPv4 header (20).
            if len(data) < 34:
                continue

            eth_type = struct.unpack("!H", data[12:14])[0]

            # 0x0800 = IPv4
            if eth_type != 0x0800:
                continue

            ip_header = data[14:34]

            proto = ip_header[9]

            src_ip = ip_to_str(ip_header[12:16])
            dst_ip = ip_to_str(ip_header[16:20])

            proto_name = nombres_proto.get(proto, str(proto))

            print(f"Paquete {paquetes}: {src_ip} → {dst_ip} ({proto_name})")
            mostrados += 1

        print(f"\nTotal paquetes leídos: {paquetes}")


# List the first packets of the capture; change the path if needed.
analizar_pcap(r"descargas.pcap")

Paquete 1: 192.168.9.100 → 192.111.5.128 (TCP)
Paquete 2: 192.168.9.100 → 142.250.200.106 (TCP)
Paquete 3: 142.250.200.106 → 192.168.9.100 (TCP)
Paquete 4: 52.97.117.50 → 192.168.9.100 (UDP)
Paquete 5: 192.168.9.100 → 52.97.117.50 (UDP)
Paquete 6: 142.250.200.106 → 192.168.9.100 (TCP)
Paquete 7: 192.168.9.100 → 142.250.200.106 (TCP)
Paquete 8: 192.168.9.100 → 172.217.168.170 (TCP)
Paquete 9: 142.250.200.106 → 192.168.9.100 (TCP)
Paquete 10: 172.217.168.170 → 192.168.9.100 (TCP)
Paquete 11: 192.168.9.100 → 192.168.9.1 (UDP)
Paquete 12: 192.168.9.100 → 208.67.222.222 (UDP)
Paquete 13: 192.168.9.100 → 52.97.117.50 (UDP)
Paquete 14: 192.168.9.1 → 192.168.9.100 (UDP)
Paquete 15: 172.217.168.170 → 192.168.9.100 (TCP)
Paquete 16: 142.250.200.106 → 192.168.9.100 (TCP)
Paquete 17: 172.217.168.170 → 192.168.9.100 (TCP)
Paquete 18: 172.217.168.170 → 192.168.9.100 (TCP)
Paquete 19: 192.168.9.100 → 172.217.168.170 (TCP)
Paquete 20: 192.168.9.100 → 216.239.38.223 (TCP)

Total paquetes leídos: 20


## Resumen de paquetes

In [10]:
import os
import struct
import socket
from collections import Counter

def ip_to_str(b):
    """Return the dotted-quad form of a packed 4-byte IPv4 address."""
    texto = socket.inet_ntoa(b)
    return texto

def safe_ascii_preview(b, n=200):
    """Return up to ``n`` leading bytes of ``b`` as single-line text.

    The slice is decoded as UTF-8 with undecodable bytes dropped, and CR/LF
    are rendered as the literal two-character sequences ``\\r`` and ``\\n``.
    Any failure while slicing or decoding yields an empty string.
    """
    try:
        texto = b[:n].decode("utf-8", errors="ignore")
    except Exception:
        # Input that cannot be sliced/decoded gives an empty preview.
        return ""
    for crudo, visible in (("\r", "\\r"), ("\n", "\\n")):
        texto = texto.replace(crudo, visible)
    return texto

def parse_dns_name(payload, offset):
    """Decode an uncompressed DNS name starting at ``offset`` in ``payload``.

    Deliberately simple: compression pointers are not followed.

    Args:
        payload: DNS message bytes.
        offset: Index of the first length byte of the name.

    Returns:
        The labels joined with ``.``, or None when the name is empty (root),
        uses a compression pointer or a reserved label type, or is cut off
        before its zero terminator.
    """
    if offset < 0:
        return None

    labels = []
    i = offset
    fin = len(payload)
    while True:
        if i >= fin:
            # Ran off the buffer without the terminating zero byte: the name
            # is truncated, so do not report a partial name.
            return None
        largo = payload[i]
        if largo == 0:
            break
        # Top bits 11 = compression pointer; 01/10 are reserved label types.
        # A plain label has both bits clear (length <= 63).
        if largo & 0xC0:
            return None
        i += 1
        if i + largo > fin:
            return None
        labels.append(payload[i:i+largo].decode("utf-8", errors="ignore"))
        i += largo
    if not labels:
        return None
    return ".".join(labels)

def analizar_pcap(path, max_print=30):
    """Summarise the IPv4 traffic of a classic little-endian PCAP file.

    Prints the first ``max_print`` parsed packets (flagging payloads that
    look like clear-text HTTP), then the top conversations, top TCP/UDP
    ports and top DNS query names.

    Args:
        path: Path of the capture file.
        max_print: How many packets to print in detail (default 30).

    Returns:
        None. Results are printed.
    """
    if not os.path.exists(path):
        print("ERROR: archivo no existe")
        return

    if os.path.getsize(path) < 24:
        print("ERROR: demasiado pequeño para PCAP")
        return

    nombres_proto = {1: "ICMP", 6: "TCP", 17: "UDP"}

    conv = Counter()
    puertos = Counter()
    dns_queries = Counter()

    mostrados = 0
    total = 0

    with open(path, "rb") as f:
        f.read(24)  # global header

        while True:
            ph = f.read(16)
            if len(ph) < 16:
                break

            # Little-endian record headers are assumed.
            _ts_sec, _ts_usec, incl_len, _orig_len = struct.unpack("<IIII", ph)

            data = f.read(incl_len)
            if len(data) < incl_len:
                break

            total += 1

            # Minimal Ethernet header
            if len(data) < 14:
                continue

            eth_type = struct.unpack("!H", data[12:14])[0]
            if eth_type != 0x0800:  # IPv4
                continue

            # Minimal IPv4 header
            if len(data) < 14 + 20:
                continue

            ip_start = 14
            ver_ihl = data[ip_start]
            ihl = (ver_ihl & 0x0F) * 4
            if ihl < 20:
                continue
            if len(data) < ip_start + ihl:
                continue

            proto = data[ip_start + 9]
            src_ip = ip_to_str(data[ip_start+12:ip_start+16])
            dst_ip = ip_to_str(data[ip_start+16:ip_start+20])

            # Conversation keyed by (src, dst, proto)
            conv[(src_ip, dst_ip, proto)] += 1

            l4_start = ip_start + ihl
            if len(data) < l4_start + 4:
                continue

            proto_name = nombres_proto.get(proto, str(proto))

            # TCP/UDP: ports + payload
            src_port = dst_port = None
            payload = b""

            if proto == 6 and len(data) >= l4_start + 20:  # TCP
                src_port, dst_port = struct.unpack("!HH", data[l4_start:l4_start+4])
                data_offset = (data[l4_start+12] >> 4) * 4
                if data_offset >= 20 and len(data) >= l4_start + data_offset:
                    payload = data[l4_start+data_offset:]
                puertos[(proto_name, src_port)] += 1
                puertos[(proto_name, dst_port)] += 1

            elif proto == 17 and len(data) >= l4_start + 8:  # UDP
                src_port, dst_port, udplen = struct.unpack("!HHH", data[l4_start:l4_start+6])
                # Bound the payload by the UDP length so link-layer padding
                # is not read as data; fall back to the frame end when the
                # field is not usable.
                udp_end = l4_start + udplen if udplen >= 8 else len(data)
                payload = data[l4_start+8:udp_end]
                puertos[(proto_name, src_port)] += 1
                puertos[(proto_name, dst_port)] += 1

                # Basic DNS: port 53, 12-byte header, simple query name
                if (src_port == 53 or dst_port == 53) and len(payload) >= 12:
                    flags, qdcount = struct.unpack("!HH", payload[2:6])
                    # Responses (QR bit set) echo the question section, so
                    # counting them would count every lookup twice.
                    es_consulta = (flags & 0x8000) == 0
                    if es_consulta and qdcount >= 1:
                        name = parse_dns_name(payload, 12)
                        if name:
                            dns_queries[name] += 1

            # Print the first few packets in detail
            if mostrados < max_print:
                if src_port is not None:
                    print(f"#{total} {src_ip}:{src_port} → {dst_ip}:{dst_port} ({proto_name}) len={incl_len}")
                else:
                    print(f"#{total} {src_ip} → {dst_ip} ({proto_name}) len={incl_len}")

                # Hints of clear-text HTTP (text only)
                prev = safe_ascii_preview(payload, n=200)
                if prev.startswith("GET ") or prev.startswith("POST ") or "HTTP/1." in prev:
                    print("   HTTP(?)", prev)
                mostrados += 1

    print("\nResumen")
    print(f"Total paquetes leídos: {total}")

    print("\nTop 10 conversaciones (src → dst, proto) por nº de paquetes:")
    for (s, d, p), c in conv.most_common(10):
        pn = nombres_proto.get(p, str(p))
        print(f"  {c:5d}  {s} → {d}  {pn}")

    print("\nTop 10 puertos (TCP/UDP) más vistos:")
    for (pn, port), c in puertos.most_common(10):
        print(f"  {c:5d}  {pn}  puerto {port}")

    if dns_queries:
        print("\nTop 10 DNS queries (sin compresión):")
        for name, c in dns_queries.most_common(10):
            print(f"  {c:5d}  {name}")
    else:
        print("\nDNS queries: no detectadas (puede ser todo DNS cifrado/DoH o compresión/punteros).")


# Summarise the capture, printing the first 30 packets in detail.
analizar_pcap(r"descargas.pcap", max_print=30)

#1 192.168.9.100:64999 → 192.111.5.128:443 (TCP) len=54
#2 192.168.9.100:49328 → 142.250.200.106:443 (TCP) len=809
#3 142.250.200.106:443 → 192.168.9.100:49328 (TCP) len=54
#4 52.97.117.50:443 → 192.168.9.100:56866 (UDP) len=81
#5 192.168.9.100:56866 → 52.97.117.50:443 (UDP) len=81
#6 142.250.200.106:443 → 192.168.9.100:49328 (TCP) len=434
#7 192.168.9.100:49328 → 142.250.200.106:443 (TCP) len=1171
#8 192.168.9.100:64996 → 172.217.168.170:443 (TCP) len=720
#9 142.250.200.106:443 → 192.168.9.100:49328 (TCP) len=54
#10 172.217.168.170:443 → 192.168.9.100:64996 (TCP) len=54
#11 192.168.9.100:58140 → 192.168.9.1:53 (UDP) len=109
#12 192.168.9.100:58142 → 208.67.222.222:443 (UDP) len=366
#13 192.168.9.100:56866 → 52.97.117.50:443 (UDP) len=77
#14 192.168.9.1:53 → 192.168.9.100:58140 (UDP) len=129
#15 172.217.168.170:443 → 192.168.9.100:64996 (TCP) len=411
#16 142.250.200.106:443 → 192.168.9.100:49328 (TCP) len=356
#17 172.217.168.170:443 → 192.168.9.100:64996 (TCP) len=117
#18 172.217.168.1

## Top 20 por tamaño

In [5]:
import os
import struct
from statistics import mean

def analizar_tamaños_pcap(path, top_n=20):
    """Print packet-size statistics for a classic little-endian PCAP file.

    Reports the packet count, min/max/mean captured size, the P50/P90/P95/P99
    percentiles and the ``top_n`` largest packets by captured length. Among
    packets of equal size the earliest ones are kept.

    Args:
        path: Path of the capture file.
        top_n: Number of largest packets to list (default 20).

    Returns:
        None. Results are printed.
    """
    if not os.path.exists(path):
        print("ERROR: archivo no existe")
        return

    if os.path.getsize(path) < 24:
        print("ERROR: demasiado pequeño para PCAP")
        return

    longitudes = []
    # Tuples of (incl_len, packet number, ts_sec, ts_usec, orig_len)
    mayores = []

    with open(path, "rb") as fh:
        fh.read(24)  # global header

        numero = 0
        cabecera = fh.read(16)
        while len(cabecera) == 16:
            seg, useg, capturado, original = struct.unpack("<IIII", cabecera)

            if len(fh.read(capturado)) < capturado:
                # Truncated file: stop cleanly.
                print("Aviso: PCAP truncado (paquete incompleto al final).")
                break

            numero += 1
            longitudes.append(capturado)

            # Keep only the top_n largest: append, re-sort the short list,
            # drop the smallest when over the limit.
            mayores.append((capturado, numero, seg, useg, original))
            mayores.sort(key=lambda registro: registro[0], reverse=True)
            if len(mayores) > top_n:
                mayores.pop()

            cabecera = fh.read(16)

    if not longitudes:
        print("No se han leído paquetes.")
        return

    en_orden = sorted(longitudes)
    ultimo = len(en_orden) - 1

    def pct(p):
        # Percentile by rounding p% of the last index, clamped to the list.
        posicion = int(round((p/100) * ultimo))
        return en_orden[max(0, min(posicion, ultimo))]

    total = len(longitudes)
    print(f"Paquetes leídos: {total}")
    print(f"Tamaño min: {en_orden[0]} bytes")
    print(f"Tamaño max: {en_orden[-1]} bytes")
    print(f"Tamaño medio: {mean(longitudes):.2f} bytes")
    print(f"P50: {pct(50)} bytes | P90: {pct(90)} bytes | P95: {pct(95)} bytes | P99: {pct(99)} bytes")

    print(f"\nTop {top_n} paquetes más grandes (por tamaño capturado incl_len):")
    for incl_len, i, ts_sec, ts_usec, orig_len in mayores:
        print(f"  #{i:6d}  incl_len={incl_len:5d}  orig_len={orig_len:5d}  ts={ts_sec}.{ts_usec:06d}")

# Usage
analizar_tamaños_pcap(r"descargas.pcap", top_n=20)

Paquetes leídos: 185635
Tamaño min: 42 bytes
Tamaño max: 1514 bytes
Tamaño medio: 810.58 bytes
P50: 1354 bytes | P90: 1354 bytes | P95: 1494 bytes | P99: 1494 bytes

Top 20 paquetes más grandes (por tamaño capturado incl_len):
  #148592  incl_len= 1514  orig_len= 1514  ts=1771924003.950213
  #148605  incl_len= 1514  orig_len= 1514  ts=1771924003.953112
  # 41629  incl_len= 1494  orig_len= 1494  ts=1771923692.382848
  # 51852  incl_len= 1494  orig_len= 1494  ts=1771923727.040544
  # 62862  incl_len= 1494  orig_len= 1494  ts=1771923761.349634
  # 62888  incl_len= 1494  orig_len= 1494  ts=1771923761.425631
  # 62889  incl_len= 1494  orig_len= 1494  ts=1771923761.425631
  # 62892  incl_len= 1494  orig_len= 1494  ts=1771923761.426596
  # 62893  incl_len= 1494  orig_len= 1494  ts=1771923761.426596
  # 62895  incl_len= 1494  orig_len= 1494  ts=1771923761.427555
  # 62896  incl_len= 1494  orig_len= 1494  ts=1771923761.427555
  # 62898  incl_len= 1494  orig_len= 1494  ts=1771923761.428557
  # 6

In [6]:
import os
import re
import struct
import socket
from collections import defaultdict

OUT_DIR = "extraidos_http"  # folder where extracted HTTP bodies are written
MAX_BODY = 50 * 1024 * 1024  # 50 MB safety cap (adjust if you want)

def ip_to_str(b):
    """Format four packed IPv4 address bytes as dotted-quad text."""
    direccion = socket.inet_ntoa(b)
    return direccion

def safe_filename(name):
    """Turn a file name taken from captured traffic into a safe local name.

    Surrounding whitespace and quotes are stripped, and every run of
    characters outside ``[\\w\\-.() ]`` (path separators included) becomes a
    single ``_``. Names that end up empty, or made only of dots and spaces
    (``.``, ``..``), fall back to ``archivo.bin`` because they would refer to
    a directory rather than a file.

    Args:
        name: Candidate file name (e.g. from a Content-Disposition header).

    Returns:
        A name with no path separators, never empty.
    """
    name = name.strip().strip('"').strip("'")
    name = re.sub(r"[^\w\-.() ]+", "_", name)
    if not name.strip(". "):
        name = "archivo.bin"
    return name

def parse_headers(header_bytes):
    """Split a raw HTTP header block into its first line and its fields.

    Returns:
        ``(status_line, headers)`` where ``headers`` maps lower-cased field
        names to stripped values. Lines without a colon are ignored and a
        repeated field keeps its last value.
    """
    texto = header_bytes.decode("iso-8859-1", errors="ignore")
    status, *resto = texto.split("\r\n")

    headers = {}
    for linea in resto:
        nombre, separador, valor = linea.partition(":")
        if not separador:
            # Covers both empty lines and lines with no colon.
            continue
        headers[nombre.strip().lower()] = valor.strip()
    return status, headers

def decode_chunked(body):
    """Decode an HTTP/1.1 chunked body.

    Args:
        body: Bytes starting at the first chunk-size line.

    Returns:
        The concatenated chunk data once the terminating zero-size chunk is
        seen (trailers after it are ignored), or None when the input is not
        bytes, is incomplete, or is malformed.
    """
    if not isinstance(body, (bytes, bytearray)):
        return None

    digitos_hex = b"0123456789abcdefABCDEF"
    out = bytearray()
    i = 0
    while True:
        j = body.find(b"\r\n", i)
        if j == -1:
            return None
        # Chunk extensions after ';' are ignored.
        size_line = body[i:j].split(b";", 1)[0].strip()
        # Accept hex digits only: int(x, 16) would also take a sign, '0x' or
        # '_', and a negative size moves the cursor backwards, which can loop
        # forever on crafted input.
        if not size_line or any(c not in digitos_hex for c in size_line):
            return None
        size = int(size_line, 16)
        i = j + 2
        if size == 0:
            # Trailers + final CRLF may follow; they are not needed.
            return bytes(out)
        if i + size > len(body):
            return None
        out += body[i:i+size]
        i += size
        if body[i:i+2] != b"\r\n":
            return None
        i += 2

def parse_pcap_tcp_payloads(path):
    """Read a PCAP and collect the TCP payload segments of every connection.

    Classic little-endian PCAP with Ethernet framing is assumed; only IPv4/TCP
    packets that carry payload are kept.

    Args:
        path: Path of the capture file.

    Returns:
        A dict ``conn_key -> {'A': [(seq, payload), ...], 'B': [...]}`` where
        ``conn_key`` is ``((ip1, port1), (ip2, port2))`` with the endpoints in
        sorted order, 'A' holds segments sent ep1->ep2 and 'B' those sent
        ep2->ep1.

    Raises:
        ValueError: If the file is shorter than the 24-byte global header.
    """
    conns = defaultdict(lambda: {"A": [], "B": []})

    with open(path, "rb") as f:
        gh = f.read(24)
        if len(gh) < 24:
            raise ValueError("PCAP demasiado pequeño")

        while True:
            ph = f.read(16)
            if len(ph) < 16:
                break

            _ts_sec, _ts_usec, incl_len, _orig_len = struct.unpack("<IIII", ph)
            data = f.read(incl_len)
            if len(data) < incl_len:
                break

            # Ethernet
            if len(data) < 14:
                continue
            eth_type = struct.unpack("!H", data[12:14])[0]
            if eth_type != 0x0800:  # IPv4
                continue

            # IP header
            if len(data) < 14 + 20:
                continue
            ip_start = 14
            ver_ihl = data[ip_start]
            ihl = (ver_ihl & 0x0F) * 4
            if ihl < 20 or len(data) < ip_start + ihl:
                continue

            proto = data[ip_start + 9]
            if proto != 6:  # TCP
                continue

            total_len, _ident, frag = struct.unpack("!HHH", data[ip_start+2:ip_start+8])
            if frag & 0x1FFF:
                # Non-first IP fragment: there is no TCP header at tcp_start.
                continue
            # The datagram ends at the IP total length; bytes after it are
            # link-layer padding (short frames are padded to 60 bytes) and
            # must not be taken as TCP data. A total length of 0 (seen with
            # segmentation offload) or a bogus one falls back to the frame end.
            if total_len >= ihl:
                ip_end = min(len(data), ip_start + total_len)
            else:
                ip_end = len(data)

            src_ip = ip_to_str(data[ip_start+12:ip_start+16])
            dst_ip = ip_to_str(data[ip_start+16:ip_start+20])

            tcp_start = ip_start + ihl
            if ip_end < tcp_start + 20:
                continue

            src_port, dst_port = struct.unpack("!HH", data[tcp_start:tcp_start+4])
            seq = struct.unpack("!I", data[tcp_start+4:tcp_start+8])[0]
            data_offset = (data[tcp_start+12] >> 4) * 4
            if data_offset < 20 or ip_end < tcp_start + data_offset:
                continue

            payload = data[tcp_start+data_offset:ip_end]
            if not payload:
                continue

            # Normalise the connection by endpoints so the key is stable.
            ep1 = (src_ip, src_port)
            ep2 = (dst_ip, dst_port)
            if ep1 <= ep2:
                key = (ep1, ep2)
                direction = "A"  # ep1->ep2
            else:
                key = (ep2, ep1)
                direction = "B"  # ep2->ep1

            conns[key][direction].append((seq, payload))

    return conns

def assemble_stream(segments, max_bytes=None):
    """Best-effort reassembly of one direction of a TCP stream.

    Segments are ordered by sequence number relative to the first captured
    segment, using 32-bit wraparound-aware arithmetic, and concatenated.
    Overlapping or retransmitted bytes are dropped. Gaps are skipped without
    padding, so the result is only exact for captures without loss.

    Args:
        segments: List of ``(seq, payload)`` tuples in capture order.
        max_bytes: Stop adding segments once this many bytes are assembled.
            Defaults to the module-level ``MAX_BODY``.

    Returns:
        The reassembled bytes (empty when there are no segments).
    """
    if not segments:
        return b""
    if max_bytes is None:
        max_bytes = MAX_BODY

    base = segments[0][0]

    def relativo(seq):
        # Signed distance from the first captured segment modulo 2**32, so a
        # stream whose sequence numbers wrap past 0xFFFFFFFF keeps its order.
        delta = (seq - base) & 0xFFFFFFFF
        return delta - 0x100000000 if delta >= 0x80000000 else delta

    ordenados = sorted(
        ((relativo(seq), payload) for seq, payload in segments),
        key=lambda par: par[0],
    )
    out = bytearray()

    cur = ordenados[0][0]
    for pos, payload in ordenados:
        if len(out) >= max_bytes:
            break

        if pos > cur:
            # Gap: do not fill with zeros (that would fake data), just jump.
            cur = pos
        elif pos < cur:
            # Overlap: keep only the part not already emitted.
            overlap = cur - pos
            if overlap >= len(payload):
                continue
            payload = payload[overlap:]

        out += payload
        cur += len(payload)

    return bytes(out)

def find_http_responses(stream_bytes):
    """Locate HTTP/1.x responses inside a reassembled TCP stream.

    A response is recognised by a status line (``HTTP/1.x`` followed by a
    three-digit code), so request lines such as ``GET / HTTP/1.1`` are not
    reported. When a body is extracted through Content-Length, scanning
    resumes after it so text inside the body is not taken for a new response.

    Args:
        stream_bytes: Bytes of one direction of a TCP connection.

    Returns:
        A list of dicts with keys ``offset``, ``status``, ``content_type``,
        ``content_length``, ``transfer_encoding``, ``content_disposition``,
        ``filename``, ``extracted`` and ``body`` (None unless extracted).
    """
    results = []
    status_re = re.compile(rb"HTTP/1\.[01] \d{3}")
    i = 0
    while True:
        found = status_re.search(stream_bytes, i)
        if found is None:
            break
        j = found.start()

        # End of the header block
        k = stream_bytes.find(b"\r\n\r\n", j)
        if k == -1:
            break

        header_bytes = stream_bytes[j:k]
        status, headers = parse_headers(header_bytes)

        body_start = k + 4
        # Body framing: chunked or Content-Length
        te = headers.get("transfer-encoding", "").lower()
        cl = headers.get("content-length", "")
        cd = headers.get("content-disposition", "")
        ct = headers.get("content-type", "")

        filename = None
        m = re.search(r'filename\*?=(?:UTF-8\'\')?("?)([^";\r\n]+)\1', cd, re.IGNORECASE)
        if m:
            filename = m.group(2)

        body = None
        extracted = False
        next_scan = body_start

        if "chunked" in te:
            # The end of the chunked body is unknown here: try to decode
            # from body_start to the end of the stream.
            decoded = decode_chunked(stream_bytes[body_start:])
            if decoded is not None and len(decoded) <= MAX_BODY:
                body = decoded
                extracted = True
        else:
            try:
                n = int(cl) if cl else None
            except ValueError:
                n = None

            if n is not None and 0 <= n <= MAX_BODY and body_start + n <= len(stream_bytes):
                body = stream_bytes[body_start:body_start+n]
                extracted = True
                # The body length is known: never scan inside it.
                next_scan = body_start + n

        results.append({
            "offset": j,
            "status": status,
            "content_type": ct,
            "content_length": cl,
            "transfer_encoding": te,
            "content_disposition": cd,
            "filename": filename,
            "extracted": extracted,
            "body": body if extracted else None,
        })

        # Move on to look for the next response
        i = next_scan

    return results

def main(path):
    """Extract HTTP response bodies found in the capture at ``path``.

    Rebuilds every TCP connection in both directions, looks for HTTP
    responses, prints their main headers and writes each extractable body
    into ``OUT_DIR`` without overwriting existing files. Ends with a summary.
    """
    if not os.path.exists(path):
        print("ERROR: archivo no existe")
        return
    if os.path.getsize(path) < 24:
        print("ERROR: archivo demasiado pequeño")
        return

    os.makedirs(OUT_DIR, exist_ok=True)

    conns = parse_pcap_tcp_payloads(path)

    total_candidates = 0
    total_saved = 0

    print(f"Conexiones TCP vistas: {len(conns)}")
    print("Buscando respuestas HTTP con posibles descargas...\n")

    # Content-Type marker -> extension for bodies without a file name;
    # the first marker found wins.
    ext_por_tipo = (("pdf", ".pdf"), ("zip", ".zip"), ("json", ".json"), ("html", ".html"))

    for ((ip1, p1), (ip2, p2)), dirs in conns.items():
        # The server side is unknown, so both directions are scanned.
        for dir_name in ("A", "B"):
            stream = assemble_stream(dirs[dir_name])
            if not stream:
                continue

            for r in find_http_responses(stream):
                total_candidates += 1

                who = f"{ip1}:{p1}→{ip2}:{p2} dir={dir_name}"
                print(f"[HTTP] ({who}) {r['status']}")
                if r["content_type"]:
                    print(f"  Content-Type: {r['content_type']}")
                if r["content_length"]:
                    print(f"  Content-Length: {r['content_length']}")
                if r["transfer_encoding"]:
                    print(f"  Transfer-Encoding: {r['transfer_encoding']}")
                if r["content_disposition"]:
                    print(f"  Content-Disposition: {r['content_disposition']}")

                if not (r["extracted"] and r["body"] is not None):
                    print("  ⚠️ No extraíble (puede faltar reensamblado completo, o es HTTPS, o no hay Content-Length/chunked completo).")
                    print()
                    continue

                fname = r["filename"]
                if not fname:
                    # Make up a name from the content type, or a generic one.
                    tipo = (r["content_type"] or "").lower()
                    ext = next((sufijo for marca, sufijo in ext_por_tipo if marca in tipo), "")
                    fname = f"extraido_{total_saved+1}{ext}"

                out_path = os.path.join(OUT_DIR, safe_filename(fname))

                # Avoid overwriting: append _1, _2, ... until the name is free.
                raiz, sufijo = os.path.splitext(out_path)
                intento = 1
                while os.path.exists(out_path):
                    out_path = f"{raiz}_{intento}{sufijo}"
                    intento += 1

                with open(out_path, "wb") as wf:
                    wf.write(r["body"])

                total_saved += 1
                print(f"  ✅ Guardado: {out_path} ({len(r['body'])} bytes)")
                print()

    print("Resumen final")
    print(f"  Candidatos HTTP detectados: {total_candidates}")
    print(f"  Ficheros guardados: {total_saved}")
    print(f"  Carpeta salida: {OUT_DIR}")

# Change the path if needed
main(r"descargas.pcap")

Conexiones TCP vistas: 1609
Buscando respuestas HTTP con posibles descargas...

Resumen final
  Candidatos HTTP detectados: 0
  Ficheros guardados: 0
  Carpeta salida: extraidos_http


## Top 20 descargas y dominios

In [7]:
import os
import struct
import socket
from collections import defaultdict, Counter

def ip_to_str(b):
    """Turn a packed 4-byte IPv4 address into its dotted-quad string."""
    legible = socket.inet_ntoa(b)
    return legible

def parse_tls_sni(payload: bytes):
    """Return the first SNI host name of a TLS ClientHello held in ``payload``.

    The whole ClientHello must sit inside the first TLS record of ``payload``.
    Returns the host name as ``str``, or None whenever the data is not a
    complete ClientHello, carries no host_name entry, or anything fails to
    parse.
    """
    def be(trozo):
        return int.from_bytes(trozo, "big")

    try:
        # TLS record: type(1)=0x16 handshake, version(2), length(2)
        if len(payload) < 5 or payload[0] != 0x16:
            return None
        record_size = be(payload[3:5])
        if record_size <= 0 or len(payload) < 5 + record_size:
            # Possibly fragmented across packets; do not force it.
            return None
        handshake = payload[5:5 + record_size]

        # Handshake: msg_type(1)=0x01 client_hello, length(3)
        if len(handshake) < 4 or handshake[0] != 0x01:
            return None
        hello_size = be(handshake[1:4])
        if hello_size <= 0 or len(handshake) < 4 + hello_size:
            return None
        hello = handshake[4:4 + hello_size]
        total = len(hello)

        # version(2) + random(32) + at least the session_id length byte
        if total < 2 + 32 + 1:
            return None
        pos = 2 + 32

        # Skip the three length-prefixed vectors: session_id (1-byte length),
        # cipher_suites (2-byte length), compression_methods (1-byte length).
        for ancho in (1, 2, 1):
            if pos + ancho > total:
                return None
            largo = be(hello[pos:pos + ancho])
            pos += ancho
            if pos + largo > total:
                return None
            pos += largo

        # Extensions block: total length(2), then type(2) + len(2) + data each
        if pos + 2 > total:
            return None
        ext_total = be(hello[pos:pos + 2])
        pos += 2
        if pos + ext_total > total:
            return None
        limite = pos + ext_total

        while pos + 4 <= limite:
            tipo = be(hello[pos:pos + 2])
            tam = be(hello[pos + 2:pos + 4])
            pos += 4
            if pos + tam > limite:
                return None
            cuerpo = hello[pos:pos + tam]
            pos += tam

            if tipo != 0x0000:  # only server_name is of interest
                continue

            # server_name: list_len(2), entries of name_type(1) + name_len(2) + name
            if len(cuerpo) < 2:
                return None
            lista = be(cuerpo[0:2])
            if 2 + lista > len(cuerpo):
                return None
            cur = 2
            while cur + 3 <= 2 + lista:
                tipo_nombre = cuerpo[cur]
                tam_nombre = be(cuerpo[cur + 1:cur + 3])
                cur += 3
                if cur + tam_nombre > len(cuerpo):
                    return None
                bruto = cuerpo[cur:cur + tam_nombre]
                cur += tam_nombre
                if tipo_nombre == 0:  # host_name
                    host = bruto.decode("utf-8", errors="ignore").strip()
                    return host or None
        return None
    except Exception:
        return None

def parse_pcap_flows_and_sni(path):
    """Read a PCAP and accumulate per-flow TCP payload byte counts and SNI.

    Classic little-endian PCAP with Ethernet framing is assumed; only IPv4/TCP
    packets are considered.

    Direction heuristic: the side using a well-known server port is the
    server. When neither or both sides use one, the lexicographically smaller
    endpoint is taken as the client.

    Args:
        path: Path of the capture file.

    Returns:
        ``(flow_bytes, flow_sni)`` where
        ``flow_bytes[(client_ip, client_port, server_ip, server_port)]`` is
        ``{"c2s": int, "s2c": int, "pkts": int}`` (payload bytes in each
        direction and TCP packet count) and ``flow_sni`` maps the same key to
        the first SNI host name seen in client-to-server traffic.

    Raises:
        ValueError: If the file is shorter than the 24-byte global header.
    """
    common_server_ports = frozenset({443, 80, 8080, 8443, 8000, 21, 20, 22})

    flow_bytes = defaultdict(lambda: {"c2s": 0, "s2c": 0, "pkts": 0})
    flow_sni = {}

    with open(path, "rb") as f:
        gh = f.read(24)
        if len(gh) < 24:
            raise ValueError("PCAP demasiado pequeño")

        while True:
            ph = f.read(16)
            if len(ph) < 16:
                break

            _ts_sec, _ts_usec, incl_len, _orig_len = struct.unpack("<IIII", ph)
            data = f.read(incl_len)
            if len(data) < incl_len:
                break

            # Ethernet
            if len(data) < 14:
                continue
            eth_type = struct.unpack("!H", data[12:14])[0]
            if eth_type != 0x0800:
                continue

            # IP
            if len(data) < 14 + 20:
                continue
            ip_start = 14
            ver_ihl = data[ip_start]
            ihl = (ver_ihl & 0x0F) * 4
            if ihl < 20 or len(data) < ip_start + ihl:
                continue

            proto = data[ip_start + 9]
            if proto != 6:
                continue

            total_len, _ident, frag = struct.unpack("!HHH", data[ip_start+2:ip_start+8])
            if frag & 0x1FFF:
                # Non-first IP fragment: there is no TCP header at tcp_start.
                continue
            # The datagram ends at the IP total length; bytes after it are
            # link-layer padding and must not count as payload. A total
            # length of 0 (seen with segmentation offload) or a bogus one
            # falls back to the frame end.
            if total_len >= ihl:
                ip_end = min(len(data), ip_start + total_len)
            else:
                ip_end = len(data)

            src_ip = ip_to_str(data[ip_start+12:ip_start+16])
            dst_ip = ip_to_str(data[ip_start+16:ip_start+20])

            tcp_start = ip_start + ihl
            if ip_end < tcp_start + 20:
                continue

            src_port, dst_port = struct.unpack("!HH", data[tcp_start:tcp_start+4])
            data_offset = (data[tcp_start+12] >> 4) * 4
            if data_offset < 20 or ip_end < tcp_start + data_offset:
                continue

            payload = data[tcp_start+data_offset:ip_end]
            payload_len = len(payload)

            # Work out which side is the client (see docstring).
            if dst_port in common_server_ports and src_port not in common_server_ports:
                key = (src_ip, src_port, dst_ip, dst_port)
                direction = "c2s"
            elif src_port in common_server_ports and dst_port not in common_server_ports:
                key = (dst_ip, dst_port, src_ip, src_port)  # client is dst, server is src
                direction = "s2c"
            else:
                # Fallback: the lexicographically smaller endpoint is the "client".
                ep1 = (src_ip, src_port)
                ep2 = (dst_ip, dst_port)
                if ep1 <= ep2:
                    key = (src_ip, src_port, dst_ip, dst_port)
                    direction = "c2s"
                else:
                    key = (dst_ip, dst_port, src_ip, src_port)
                    direction = "s2c"

            flow_bytes[key]["pkts"] += 1
            if direction == "c2s":
                flow_bytes[key]["c2s"] += payload_len
                # Try SNI only client->server, where the ClientHello travels.
                if key not in flow_sni and payload_len >= 20:
                    sni = parse_tls_sni(payload)
                    if sni:
                        flow_sni[key] = sni
            else:
                flow_bytes[key]["s2c"] += payload_len

    return flow_bytes, flow_sni

def human(n):
    """Format a byte count with one decimal and a B/KB/MB/GB/TB unit."""
    valor = n
    unidades = ("B", "KB", "MB", "GB")
    indice = 0
    while indice < len(unidades):
        if valor < 1024:
            return f"{valor:.1f}{unidades[indice]}"
        valor = valor / 1024
        indice += 1
    # Anything of 1024 GB or more is shown in TB.
    return f"{valor:.1f}TB"

def main(path, top_n=20):
    """Print the largest TCP flows by server-to-client bytes and the top SNI domains.

    Args:
        path: Path of the capture file.
        top_n: Number of flows to list (default 20).

    Returns:
        None. Results are printed; a missing or too-small file is reported
        with an ``ERROR`` line.
    """
    if not os.path.exists(path):
        print("ERROR: archivo no existe")
        return
    if os.path.getsize(path) < 24:
        # Otherwise the parser raises ValueError for a file this short.
        print("ERROR: archivo demasiado pequeño")
        return

    flow_bytes, flow_sni = parse_pcap_flows_and_sni(path)

    print(f"Flujos TCP detectados (normalizados): {len(flow_bytes)}")
    print(f"SNI (TLS) detectados: {len(flow_sni)}\n")

    # Rank by s2c bytes (likely downloads)
    ranked = sorted(flow_bytes.items(), key=lambda kv: kv[1]["s2c"], reverse=True)

    print(f"Top {top_n} posibles 'descargas' (bytes servidor→cliente = s2c):")
    shown = 0
    for key, st in ranked:
        if shown >= top_n:
            break
        if st["s2c"] == 0:
            continue
        c_ip, c_port, s_ip, s_port = key
        sni = flow_sni.get(key)
        extra = f"  SNI={sni}" if sni else ""
        ratio = (st["s2c"] / (st["c2s"] + 1))  # +1 avoids division by zero
        print(f"- {c_ip}:{c_port} → {s_ip}:{s_port}  s2c={human(st['s2c'])}  c2s={human(st['c2s'])}  ratio~{ratio:.1f}{extra}")
        shown += 1

    # Top domains by downloaded bytes (only flows with SNI)
    by_domain = Counter()
    for key, dom in flow_sni.items():
        by_domain[dom] += flow_bytes[key]["s2c"]

    if by_domain:
        print("\nTop dominios (SNI) por bytes bajados (s2c):")
        for dom, b in by_domain.most_common(10):
            print(f"- {dom}: {human(b)}")
    else:
        print("\nNo se detectó SNI. Puede ser porque:")
        print("- el ClientHello está fragmentado y no cae entero en un paquete")
        print("- es QUIC (UDP/443) en vez de TCP")
        print("- o la captura empieza tarde (sin el inicio de la conexión)")

# Usage
main(r"descargas.pcap", top_n=20)

Flujos TCP detectados (normalizados): 1616
SNI (TLS) detectados: 43

Top 20 posibles 'descargas' (bytes servidor→cliente = s2c):
- 192.168.9.100:58246 → 77.209.227.96:443  s2c=33.2MB  c2s=16.1KB  ratio~2106.7
- 192.168.9.100:55610 → 142.250.200.99:443  s2c=15.8MB  c2s=6.0KB  ratio~2716.3
- 192.168.9.100:62984 → 212.145.41.98:443  s2c=10.8MB  c2s=6.8KB  ratio~1617.0
- 192.168.9.100:58806 → 23.40.114.46:443  s2c=4.9MB  c2s=49.3KB  ratio~101.2
- 192.168.9.100:51549 → 184.31.3.35:443  s2c=4.3MB  c2s=9.5KB  ratio~461.9
- 192.168.9.100:57701 → 172.217.17.4:443  s2c=3.9MB  c2s=1.5MB  ratio~2.5
- 192.168.9.100:58301 → 82.98.170.158:443  s2c=3.8MB  c2s=2.5KB  ratio~1523.0
- 192.168.9.100:50034 → 23.40.114.93:443  s2c=3.3MB  c2s=43.5KB  ratio~77.3
- 192.168.9.100:61321 → 13.107.246.42:443  s2c=2.1MB  c2s=54.2KB  ratio~39.2
- 192.168.9.100:51452 → 52.98.200.178:443  s2c=1.6MB  c2s=485.2KB  ratio~3.3
- 192.168.9.100:60414 → 23.46.84.203:443  s2c=1.6MB  c2s=2.5KB  ratio~639.7
- 192.168.9.100:57473 