In [None]:
from scapy.all import rdpcap, IP, TCP, UDP
import pandas as pd
from collections import defaultdict
import numpy as np

# Path to the pcap capture to analyse (adjust the filename as needed).
pcap_file = "/home/ubuntu/DoH_DGA_training/datasets/DGA/padcrypt-doh-24h.pcap"

# NOTE: rdpcap() does NOT read in chunks -- it loads every packet of the capture
# into memory at once, which may be heavy for 1GB+ files. For big files, prefer a
# streaming reader (e.g. dpkt or pyshark); scapy's rdpcap is used here for clarity.
packets = rdpcap(pcap_file)

# A flow key: (src_ip, dst_ip, src_port, dst_port, protocol)
def flow_key(pkt):
    """Return the directional 5-tuple identifying the flow of a packet.

    The key is built from the packet's own source and destination, so the
    two directions of one conversation produce two different (mirrored) keys.

    Args:
        pkt: A Scapy packet.

    Returns:
        ``(src_ip, dst_ip, src_port, dst_port, protocol)`` where ``protocol``
        is the string ``'TCP'`` or ``'UDP'``, or ``None`` when the packet has
        no ``IP`` layer or carries neither TCP nor UDP.
    """
    if IP not in pkt:
        return None

    ip = pkt[IP]
    # TCP is checked first, matching the original precedence when a packet
    # contains both layers.
    for layer, name in ((TCP, 'TCP'), (UDP, 'UDP')):
        if layer in pkt:
            l4 = pkt[layer]
            return (ip.src, ip.dst, l4.sport, l4.dport, name)
    return None

# Flow stats container
def _empty_flow_stats():
    """Build the zeroed accumulator used for a flow the first time it is seen."""
    stats = {}
    stats["timestamps"] = []  # arrival time of every packet in the flow
    stats["sizes"] = []       # length of every packet in the flow
    stats["src_bytes"] = 0
    stats["dst_bytes"] = 0
    return stats


# Accessing an unknown flow key creates a fresh, independent accumulator.
flows = defaultdict(_empty_flow_stats)

# Iterate over packets, folding both directions of a conversation into one flow.
#
# flow_key() is directional, so a reply packet yields the mirrored tuple of the
# packet it answers. Previously every packet was compared against a key rebuilt
# from that same packet, so the comparison matched and "dst_bytes" stayed at 0.
# Here the first packet seen fixes the flow's orientation: packets travelling
# the same way count as "src_bytes", packets travelling the other way are
# merged into the same flow and count as "dst_bytes".
for pkt in packets:
    k = flow_key(pkt)
    if k is None:
        continue

    ts = pkt.time
    size = len(pkt)

    src_ip, dst_ip, src_port, dst_port, proto = k
    reverse_k = (dst_ip, src_ip, dst_port, src_port, proto)

    # Membership tests on a defaultdict do not create entries. Checking the
    # forward key first keeps a self-mirrored key (same endpoints both ways)
    # in the forward direction.
    is_reply = k not in flows and reverse_k in flows
    if is_reply:
        k = reverse_k

    flows[k]["timestamps"].append(ts)
    flows[k]["sizes"].append(size)

    # track sent vs received
    if is_reply:
        flows[k]["dst_bytes"] += size
    else:
        flows[k]["src_bytes"] += size

# Build dataframe with per-flow stats
# Column order is fixed up front so the frame keeps its schema even when no
# flow has enough packets to produce a record.
columns = [
    "src_ip", "dst_ip", "src_port", "dst_port", "protocol",
    "n_packets",
    "pkt_size_min", "pkt_size_mean", "pkt_size_max",
    "iat_min", "iat_mean", "iat_max",
    "src_bytes", "dst_bytes",
]

records = []
for k, stats in flows.items():
    # A single packet has no inter-arrival time, so such flows are skipped.
    if len(stats["timestamps"]) < 2:
        continue

    # Inter-arrival times. Timestamps are sorted because packets are not
    # guaranteed to be stored in time order. The difference is taken in the
    # timestamps' native type and only then converted to float: capture
    # timestamps may be Decimal-like objects, which would otherwise leave the
    # iat_* columns with object dtype instead of float64.
    iats = np.diff(sorted(stats["timestamps"])).astype(float)
    sizes = np.asarray(stats["sizes"])

    src_ip, dst_ip, src_port, dst_port, protocol = k
    records.append({
        "src_ip": src_ip,
        "dst_ip": dst_ip,
        "src_port": src_port,
        "dst_port": dst_port,
        "protocol": protocol,
        "n_packets": len(sizes),
        "pkt_size_min": sizes.min(),
        "pkt_size_mean": sizes.mean(),
        "pkt_size_max": sizes.max(),
        "iat_min": iats.min(),
        "iat_mean": iats.mean(),
        "iat_max": iats.max(),
        "src_bytes": stats["src_bytes"],
        "dst_bytes": stats["dst_bytes"],
    })

df = pd.DataFrame(records, columns=columns)
df.head()
