In [None]:
import dpkt
import socket
import pandas as pd
import numpy as np
from collections import defaultdict
from pathlib import Path

In [None]:
# === CONFIG ===
# All benign capture folders share one parent directory; only the leaf differs.
_BENIGN_ROOT = r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH"
_BENIGN_SUBDIRS = (
    "BenignDoH-NonDoH-CSVs",
    "BenignDoH_NonDoH-Chrome-AdGuard",
    "BenignDoH_NonDoH-Chrome-Cloudflare",
    "BenignDoH_NonDoH-Chrome-Google",
    "BenignDoH_NonDoH-Chrome-Quad9",
    "BenignDoH_NonDoH-Firefox-AdGuard",
    "BenignDoH_NonDoH-Firefox-CloudFlare",
    "BenignDoH_NonDoH-Firefox-Google",
    "BenignDoH_NonDoH-Firefox-Quad9",
)
pcap_folders = [_BENIGN_ROOT + "/" + leaf for leaf in _BENIGN_SUBDIRS]

# IP addresses treated as benign DoH endpoints
doh_ips = [
    "1.1.1.1",
    "8.8.4.4", "8.8.8.8",
    "9.9.9.9", "9.9.9.10", "9.9.9.11",
    "176.103.130.131", "176.103.130.130",
    "149.112.112.10", "149.112.112.112",
    "104.16.248.249", "104.16.249.249",
]

In [None]:
def inet_to_str(inet):
    """Render a packed IP address as text.

    A 4-byte value is formatted as IPv4; anything else is handed to the
    IPv6 formatter, which raises ``ValueError`` when the length is not 16.
    """
    family = socket.AF_INET if len(inet) == 4 else socket.AF_INET6
    return socket.inet_ntop(family, inet)

def normalize_session(src, dst, sport, dport, proto):
    """
    Build a direction-independent session key for a port-443 flow.

    The endpoint using port 443 is taken to be the server and the other
    endpoint the client. The key is
    (client_ip, server_ip, client_port, server_port, proto).
    Returns None when neither port is 443. When both ports are 443 the
    packet's source is treated as the client.
    """
    tls_port = 443

    # Packet travelling client -> server: endpoints are already in key order.
    if dport == tls_port:
        return (src, dst, sport, dport, proto)

    # Packet travelling server -> client: swap the endpoints.
    if sport == tls_port:
        return (dst, src, dport, sport, proto)

    # Neither side uses port 443, so this is not a session of interest.
    return None

def process_pcap_tls(pcap_file, N_values=[8,16,32,64], max_packets=None):
    """
    Extract per-session TLS Application Data features from a pcap file.

    Each Ethernet/IPv4/TCP packet with port 443 on either side is parsed for
    TLS records within its own payload. There is no TCP stream reassembly, so
    a record split across several segments is not counted. Application Data
    records (TLS content type 23) are grouped per session, keyed by
    ``normalize_session``. For every ``N`` the first ``N`` records of a
    session, ordered by capture timestamp, are summarised into one row.

    Parameters
    ----------
    pcap_file : str or path-like
        Capture file, opened in binary mode and read with ``dpkt.pcap.Reader``.
    N_values : iterable of int
        Window sizes (number of leading Application Data records to summarise).
        The default list is only read, never mutated.
    max_packets : int or None
        Maximum number of packets to read from the capture. ``None`` or ``0``
        disables the limit.

    Returns
    -------
    dict
        Maps each ``N`` to a ``pandas.DataFrame`` with one row per session.
        The frame is empty (no columns) if no Application Data was found.
    """
    # Per session and direction: (timestamp, record payload length) pairs.
    sessions = defaultdict(lambda: {"client": [], "server": []})

    with open(pcap_file, "rb") as f:
        for i, (ts, buf) in enumerate(dpkt.pcap.Reader(f)):
            # Enforce the cap before any `continue` below can skip it, and use
            # `>=` so that exactly `max_packets` packets are examined.
            if max_packets and i >= max_packets:
                break

            try:
                eth = dpkt.ethernet.Ethernet(buf)
                if not isinstance(eth.data, dpkt.ip.IP):
                    continue
                ip = eth.data
                l4 = ip.data
                if not isinstance(l4, dpkt.tcp.TCP):
                    continue

                sport, dport = l4.sport, l4.dport
                fid = normalize_session(
                    inet_to_str(ip.src), inet_to_str(ip.dst), sport, dport, "TCP"
                )
                if fid is None:
                    continue

                # parse TLS records contained in this segment's payload
                try:
                    records, _ = dpkt.ssl.tls_multi_factory(l4.data)
                except (dpkt.ssl.SSL3Exception, dpkt.dpkt.NeedData):
                    continue

                # fid is not None, so one of the two ports is 443; a packet
                # addressed to port 443 is client -> server.
                direction = "client" if dport == 443 else "server"
                for rec in records:
                    if rec.type == 23:  # TLS Application Data
                        sessions[fid][direction].append((float(ts), len(rec.data)))

            except Exception:
                # Deliberately best-effort: a packet that cannot be parsed is
                # skipped rather than aborting the whole capture.
                continue

    # build one row per (session, N)
    results = {N: [] for N in N_values}
    for fid, events in sessions.items():
        # Client records first, then server records, followed by a stable sort
        # on timestamp: equal timestamps keep client before server.
        combined = [(t, "client", s) for t, s in events["client"]]
        combined += [(t, "server", s) for t, s in events["server"]]
        combined.sort(key=lambda event: event[0])

        for N in N_values:
            # take the first N Application Data records
            subset = combined[:N]
            if not subset:
                continue

            row = {
                "session_id": fid,
                "client_ip": fid[0],
                "server_ip": fid[1],
                "client_port": fid[2],
                "server_port": fid[3],
                "protocol": fid[4],
                "N": N,
            }
            for side in ("client", "server"):
                row.update(_direction_features(
                    side,
                    [s for _, d, s in subset if d == side],
                    [t for t, d, _ in subset if d == side],
                ))

            # Client-to-server byte ratio; 0.0 when the server sent no bytes.
            server_total = row["server_bytes"]
            row["bytes_ratio"] = (
                row["client_bytes"] / server_total if server_total > 0 else 0.0
            )

            results[N].append(row)

    # convert dict of lists to dict of DataFrames
    return {N: pd.DataFrame(rows) for N, rows in results.items()}


def _direction_features(prefix, sizes, timestamps):
    """Size and inter-arrival statistics for one direction, keyed by ``prefix``."""
    # Inter-arrival times need at least two records; otherwise report zeros.
    iats = np.diff(sorted(timestamps)) if len(timestamps) > 1 else []
    has_iats = len(iats) > 0
    return {
        f"n_{prefix}": len(sizes),
        f"{prefix}_bytes": int(np.sum(sizes)) if sizes else 0,
        f"{prefix}_pkt_min": int(np.min(sizes)) if sizes else 0,
        f"{prefix}_pkt_mean": float(np.mean(sizes)) if sizes else 0.0,
        f"{prefix}_pkt_max": int(np.max(sizes)) if sizes else 0,
        f"{prefix}_iat_min": float(np.min(iats)) if has_iats else 0.0,
        f"{prefix}_iat_mean": float(np.mean(iats)) if has_iats else 0.0,
        f"{prefix}_iat_max": float(np.max(iats)) if has_iats else 0.0,
    }


In [None]:
import pandas as pd
base_dir = Path("/home/ubuntu/DoH_DGA_training/datasets/DGA")
# Sort so the row order of the saved CSV does not depend on directory listing order.
pcap_files = sorted(base_dir.glob("*.pcap"))

all_dfs = []

for pcap_file in pcap_files:
    print(f"Processing {pcap_file.name}...")
    df_dict = process_pcap_tls(pcap_file, N_values=[8,16,32,64])

    # Label is the file stem up to the first underscore; a stem without an
    # underscore (e.g. "padcrypt-doh-24h") is used whole.
    label = pcap_file.stem.split("_")[0]

    for N, df in df_dict.items():
        print(f"Loaded {df.shape[0]} flows with label={label}, N={N}")
        if df.empty:
            # Nothing extracted for this window size; keep empty frames out of concat.
            continue
        df["label"] = label
        df["N"] = N  # add column to track window size
        all_dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the directory.
if not all_dfs:
    raise ValueError(f"No TLS application-data sessions extracted from any *.pcap in {base_dir}")

# Concatenate everything into one dataframe (one row per session per N)
final_df = pd.concat(all_dfs, ignore_index=True)

# Save once, next to the input captures
final_df.to_csv(base_dir / "all_pcaps_allN.csv", index=False)
print(f"Saved aggregated dataframe with {final_df.shape[0]} flows across {final_df['label'].nunique()} labels and {final_df['N'].nunique()} N values")


Processing padcrypt-doh-24h.pcap...
Loaded 214 flows with label=padcrypt-doh-24h, N=8
Loaded 214 flows with label=padcrypt-doh-24h, N=16
Loaded 214 flows with label=padcrypt-doh-24h, N=32
Loaded 214 flows with label=padcrypt-doh-24h, N=64
Processing sisron-doh-24h.pcap...
Loaded 38 flows with label=sisron-doh-24h, N=8
Loaded 38 flows with label=sisron-doh-24h, N=16
Loaded 38 flows with label=sisron-doh-24h, N=32
Loaded 38 flows with label=sisron-doh-24h, N=64
Processing zloader-doh-24h.pcap...
Loaded 294 flows with label=zloader-doh-24h, N=8
Loaded 294 flows with label=zloader-doh-24h, N=16
Loaded 294 flows with label=zloader-doh-24h, N=32
Loaded 294 flows with label=zloader-doh-24h, N=64
Processing tinba-doh-24h.pcap...
Loaded 1707 flows with label=tinba-doh-24h, N=8
Loaded 1707 flows with label=tinba-doh-24h, N=16
Loaded 1707 flows with label=tinba-doh-24h, N=32
Loaded 1707 flows with label=tinba-doh-24h, N=64
Saved aggregated dataframe with 9012 flows across 4 labels and 4 N values


In [5]:
# Sanity check: (rows, columns) of the aggregated frame built in the previous cell.
print(final_df.shape)

(9012, 25)


In [6]:
import pandas as pd
from pathlib import Path

# Base directory containing the three capture folders listed below
base_dir = Path("/home/ubuntu/DoH_DGA_training/datasets/DoH_HKD/DoH-Pcaps")

# The three subfolders; each folder name doubles as the class label
folders = [
    base_dir / "DoH-Pcaps-dnstt",
    base_dir / "DoH-Pcaps-tcp-over-dns",
    base_dir / "DoH-Pcaps-tuns"
]

all_dfs = []

for folder in folders:
    label = folder.name  # use folder name as label
    print(f"Processing folder: {label}")

    # Sort so the row order of the saved CSV does not depend on directory listing order.
    for pcap_file in sorted(folder.glob("*.pcap")):
        print(f"  Processing {pcap_file.name}...")
        df_dict = process_pcap_tls(pcap_file, N_values=[8,16,32,64])  # TLS app data extractor

        for N, df in df_dict.items():
            if df.empty:
                continue
            df["label"] = label
            df["pcap_file"] = pcap_file.name
            df["N"] = N
            all_dfs.append(df)

            print(f"    Loaded {df.shape[0]} flows for N={N}, label={label}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the directory.
if not all_dfs:
    raise ValueError(f"No TLS application-data sessions extracted from any *.pcap under {base_dir}")

# Concatenate everything into one dataframe (one row per session per N)
final_df = pd.concat(all_dfs, ignore_index=True)

# Save once
output_file = base_dir / "all_pcaps_allN.csv"
final_df.to_csv(output_file, index=False)

print(f"\n✅ Saved aggregated dataframe with {final_df.shape[0]} flows "
      f"across {final_df['label'].nunique()} labels and "
      f"{final_df['N'].nunique()} N values")


Processing folder: DoH-Pcaps-dnstt
  Processing dnstt-48h-2021-11-03-08_01_45.pcap...
    Loaded 25 flows for N=8, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=16, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=32, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=64, label=DoH-Pcaps-dnstt
  Processing dnstt-48h-2021-11-03-01_01_45.pcap...
    Loaded 25 flows for N=8, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=16, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=32, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=64, label=DoH-Pcaps-dnstt
  Processing dnstt-48h-2021-11-01-11_46_33.pcap...
    Loaded 25 flows for N=8, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=16, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=32, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=64, label=DoH-Pcaps-dnstt
  Processing dnstt-48h-2021-11-02-22_01_45.pcap...
    Loaded 25 flows for N=8, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=16, label=DoH-Pcaps-dnstt
    Loaded 25 flows for N=32, label=D

In [7]:
import pandas as pd

base_dir = Path("/home/ubuntu/DoH_DGA_training/datasets/DGA_PCAPS1_ENC")
# Sort so the row order of the saved CSV does not depend on directory listing order.
pcap_files = sorted(base_dir.glob("*.pcap"))

all_dfs = []

for pcap_file in pcap_files:
    print(f"Processing {pcap_file.name}...")
    df_dict = process_pcap_tls(pcap_file, N_values=[8,16,32,64])

    # Label is the file stem up to the first underscore, i.e. the leading
    # token of names shaped like "<token>_<id>_enc.pcap".
    label = pcap_file.stem.split("_")[0]

    for N, df in df_dict.items():
        print(f"Loaded {df.shape[0]} flows with label={label}, N={N}")
        if df.empty:
            # Nothing extracted for this window size; keep empty frames out of concat.
            continue
        df["label"] = label
        df["N"] = N  # add column to track window size
        all_dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the directory.
if not all_dfs:
    raise ValueError(f"No TLS application-data sessions extracted from any *.pcap in {base_dir}")

# Concatenate everything into one dataframe (one row per session per N)
final_df = pd.concat(all_dfs, ignore_index=True)

# Save once, next to the input captures
final_df.to_csv(base_dir / "all_pcaps_allN.csv", index=False)
print(f"Saved aggregated dataframe with {final_df.shape[0]} flows across {final_df['label'].nunique()} labels and {final_df['N'].nunique()} N values")


Processing 3cdb8764d79b80ec06905e6bb963840c0f3caff21d07ed7421c588af69f1d646_220604-3l9hqsbaar_enc.pcap...
Loaded 1 flows with label=3cdb8764d79b80ec06905e6bb963840c0f3caff21d07ed7421c588af69f1d646, N=8
Loaded 1 flows with label=3cdb8764d79b80ec06905e6bb963840c0f3caff21d07ed7421c588af69f1d646, N=16
Loaded 1 flows with label=3cdb8764d79b80ec06905e6bb963840c0f3caff21d07ed7421c588af69f1d646, N=32
Loaded 1 flows with label=3cdb8764d79b80ec06905e6bb963840c0f3caff21d07ed7421c588af69f1d646, N=64
Processing 837a7e8eed7112af737fdb8817eb2e41927053388afd2a959d1a1c6313616168_220628-t3gplsaggp_enc.pcap...
Loaded 1 flows with label=837a7e8eed7112af737fdb8817eb2e41927053388afd2a959d1a1c6313616168, N=8
Loaded 1 flows with label=837a7e8eed7112af737fdb8817eb2e41927053388afd2a959d1a1c6313616168, N=16
Loaded 1 flows with label=837a7e8eed7112af737fdb8817eb2e41927053388afd2a959d1a1c6313616168, N=32
Loaded 1 flows with label=837a7e8eed7112af737fdb8817eb2e41927053388afd2a959d1a1c6313616168, N=64
Processing 7d4