In [2]:
import dpkt
import socket
import pandas as pd
import numpy as np
from collections import defaultdict
from pathlib import Path

base_dir = Path("/home/ubuntu/DoH_DGA_training/datasets/DGA")
pcap_files = list(base_dir.glob("*.pcap"))


In [3]:

def inet_to_str(inet):
    try:
        return socket.inet_ntop(socket.AF_INET, inet)
    except ValueError:
        return socket.inet_ntop(socket.AF_INET6, inet)

def normalize_flow(src, dst, sport, dport, proto):
    if (src, sport) < (dst, dport):
        return (src, dst, sport, dport, proto), "forward"
    else:
        return (dst, src, dport, sport, proto), "reverse"

def process_pcap(pcap_file, max_packets=None):
    flows = defaultdict(lambda: {
        "sent_timestamps": [], "recv_timestamps": [],
        "sent_sizes": [], "recv_sizes": []
    })

    with open(pcap_file, "rb") as f:
        pcap = dpkt.pcap.Reader(f)

        for i, (ts, buf) in enumerate(pcap):
            try:
                eth = dpkt.ethernet.Ethernet(buf)
                if not isinstance(eth.data, dpkt.ip.IP):
                    continue
                ip = eth.data
                l4 = ip.data
                if not isinstance(l4, (dpkt.tcp.TCP, dpkt.udp.UDP)):
                    continue

                proto = "TCP" if isinstance(l4, dpkt.tcp.TCP) else "UDP"
                src = inet_to_str(ip.src)
                dst = inet_to_str(ip.dst)
                sport = l4.sport
                dport = l4.dport

                fid, direction = normalize_flow(src, dst, sport, dport, proto)

                if direction == "forward":
                    flows[fid]["sent_timestamps"].append(float(ts))
                    flows[fid]["sent_sizes"].append(len(ip))
                else:
                    flows[fid]["recv_timestamps"].append(float(ts))
                    flows[fid]["recv_sizes"].append(len(ip))
            except Exception:
                continue

            if max_packets and i > max_packets:
                break

    # Aggregate features
    records = []
    for fid, stats in flows.items():
        sent_ts = sorted(stats["sent_timestamps"])
        recv_ts = sorted(stats["recv_timestamps"])
        sent_sizes = stats["sent_sizes"]
        recv_sizes = stats["recv_sizes"]

        sent_iats = np.diff(sent_ts) if len(sent_ts) > 1 else []
        recv_iats = np.diff(recv_ts) if len(recv_ts) > 1 else []

        server_delay = None
        if sent_ts and recv_ts:
            server_delay = max(0.0, recv_ts[0] - sent_ts[0])

        records.append({
            "flow_id": fid,
            "src_ip": fid[0],
            "dst_ip": fid[1],
            "src_port": fid[2],
            "dst_port": fid[3],
            "protocol": fid[4],

            "n_sent": len(sent_sizes),
            "sent_bytes": int(np.sum(sent_sizes)) if sent_sizes else 0,
            "sent_pkt_min": int(np.min(sent_sizes)) if sent_sizes else 0,
            "sent_pkt_mean": float(np.mean(sent_sizes)) if sent_sizes else 0.0,
            "sent_pkt_max": int(np.max(sent_sizes)) if sent_sizes else 0,
            "sent_iat_min": float(np.min(sent_iats)) if len(sent_iats) else 0.0,
            "sent_iat_mean": float(np.mean(sent_iats)) if len(sent_iats) else 0.0,
            "sent_iat_max": float(np.max(sent_iats)) if len(sent_iats) else 0.0,

            "n_recv": len(recv_sizes),
            "recv_bytes": int(np.sum(recv_sizes)) if recv_sizes else 0,
            "recv_pkt_min": int(np.min(recv_sizes)) if recv_sizes else 0,
            "recv_pkt_mean": float(np.mean(recv_sizes)) if recv_sizes else 0.0,
            "recv_pkt_max": int(np.max(recv_sizes)) if recv_sizes else 0,
            "recv_iat_min": float(np.min(recv_iats)) if len(recv_iats) else 0.0,
            "recv_iat_mean": float(np.mean(recv_iats)) if len(recv_iats) else 0.0,
            "recv_iat_max": float(np.max(recv_iats)) if len(recv_iats) else 0.0,

            "server_delay": server_delay if server_delay is not None else 0.0,
            "bytes_ratio": float(np.sum(sent_sizes)) / np.sum(recv_sizes)
                if recv_sizes and np.sum(recv_sizes) > 0 else 0.0
        })

    return pd.DataFrame(records)

In [None]:
import pandas as pd

all_dfs = []

for pcap_file in pcap_files:
    print(f"Processing {pcap_file.name}...")
    df = process_pcap(pcap_file)

    # Extract label (everything before the first underscore)
    label = pcap_file.stem.split("_")[0]
    df["label"] = label

    all_dfs.append(df)
    print(f"Loaded {df.shape[0]} flows with label={label}")

# Concatenate everything into one dataframe
final_df = pd.concat(all_dfs, ignore_index=True)

# Save once
final_df.to_csv("/home/ubuntu/DoH_DGA_training/datasets/DGA/all_pcaps.csv", index=False)
print(f"Saved aggregated dataframe with {final_df.shape[0]} flows across {final_df['label'].nunique()} labels")


Processing padcrypt-doh-24h.pcap...
Loaded 216 flows with label=padcrypt-doh-24h
Processing sisron-doh-24h.pcap...
Loaded 38 flows with label=sisron-doh-24h
Processing zloader-doh-24h.pcap...
Loaded 295 flows with label=zloader-doh-24h
Processing tinba-doh-24h.pcap...
Loaded 1707 flows with label=tinba-doh-24h
Saved aggregated dataframe with 2256 flows across 4 labels


In [15]:
print(final_df.shape)

(2256, 25)


In [4]:
import pandas as pd

base_dir = Path("/home/ubuntu/DoH_DGA_training/datasets/DoH_HKD/DoH-Pcaps/DoH-Pcaps-dnstt")
pcap_files = list(base_dir.glob("*.pcap"))

all_dfs = []

for pcap_file in pcap_files:
    print(f"Processing {pcap_file.name}...")
    df = process_pcap(pcap_file)

    # Extract label (everything before the first underscore)
    label = pcap_file.stem.split("_")[0]
    df["label"] = label

    all_dfs.append(df)
    print(f"Loaded {df.shape[0]} flows with label={label}")

# Concatenate everything into one dataframe
final_df = pd.concat(all_dfs, ignore_index=True)

# Save once
final_df.to_csv("/home/ubuntu/DoH_DGA_training/datasets/DoH_HKD/DoH-Pcaps/DoH-Pcaps-dnstt/dnstt_all_pcaps.csv", index=False)
print(f"Saved aggregated dataframe with {final_df.shape[0]} flows across {final_df['label'].nunique()} labels")


Processing dnstt-48h-2021-11-03-08_01_45.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-03-08
Processing dnstt-48h-2021-11-03-01_01_45.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-03-01
Processing dnstt-48h-2021-11-01-11_46_33.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-01-11
Processing dnstt-48h-2021-11-02-22_01_45.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-02-22
Processing dnstt-48h-2021-11-01-19_46_33.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-01-19
Processing dnstt-48h-2021-11-02-03_46_36.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-02-03
Processing dnstt-48h-2021-11-03-02_01_45.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-03-02
Processing dnstt-48h-2021-11-01-12_46_33.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-01-12
Processing dnstt-48h-2021-11-02-13_01_45.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-02-13
Processing dnstt-48h-2021-11-02-14_01_45.pcap...
Loaded 25 flows with label=dnstt-48h-2021-11-02-14
