In [2]:
import dpkt
import socket
import pandas as pd
import numpy as np
from collections import defaultdict
from pathlib import Path

In [6]:
# List of PCAP folders to process
pcap_folders = [
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-AdGuard/AdGuard",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-Cloudflare/Cloudflare",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-Google/Google",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-Quad9/Quad9",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Firefox-AdGuard/AdGuard/1",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Firefox-AdGuard/AdGuard/2",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Firefox-CloudFlare/CloudFlare/1",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Firefox-CloudFlare/CloudFlare/2",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Firefox-Google/Google/large",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Firefox-Quad9/Quad9/1",
    r"/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Firefox-Quad9/Quad9/2",
]

# List of benign DoH IPs
doh_ips = [
    "1.1.1.1",
    "8.8.4.4",
    "8.8.8.8",
    "9.9.9.9",
    "9.9.9.10",
    "9.9.9.11",
    "176.103.130.131",
    "176.103.130.130",
    "149.112.112.10",
    "149.112.112.112",
    "104.16.248.249",
    "104.16.249.249"
]

In [7]:
def inet_to_str(inet):
    try:
        return socket.inet_ntop(socket.AF_INET, inet)
    except ValueError:
        return socket.inet_ntop(socket.AF_INET6, inet)

def normalize_session(src, dst, sport, dport, proto):
    """
    Define session by (client_ip, server_ip, client_port, server_port, proto).
    Client is assumed to be the side with port != 443, server is port 443.
    """
    if dport == 443:
        client, server = (src, sport), (dst, dport)
    elif sport == 443:
        client, server = (dst, dport), (src, sport)
    else:
        # not a TLS session, skip
        return None
    return (client[0], server[0], client[1], server[1], proto)

def process_pcap_tls(pcap_file, N_values=[8,16,32,64], max_packets=None):
    sessions = defaultdict(lambda: {
        "client_ts": [], "client_sizes": [],
        "server_ts": [], "server_sizes": [],
        "appdata_count": 0
    })

    with open(pcap_file, "rb") as f:
        pcap = dpkt.pcap.Reader(f)

        for i, (ts, buf) in enumerate(pcap):
            try:
                eth = dpkt.ethernet.Ethernet(buf)
                if not isinstance(eth.data, dpkt.ip.IP):
                    continue
                ip = eth.data
                l4 = ip.data
                if not isinstance(l4, dpkt.tcp.TCP):
                    continue

                src = inet_to_str(ip.src)
                dst = inet_to_str(ip.dst)
                sport, dport = l4.sport, l4.dport
                fid = normalize_session(src, dst, sport, dport, "TCP")
                if fid is None:
                    continue

                try:
                    records, _ = dpkt.ssl.tls_multi_factory(l4.data)
                except (dpkt.ssl.SSL3Exception, dpkt.dpkt.NeedData):
                    continue

                for rec in records:
                    if rec.type == 23:  # TLS App Data
                        if dport == 443:  # client → server
                            sessions[fid]["client_ts"].append(float(ts))
                            sessions[fid]["client_sizes"].append(len(rec.data))
                        elif sport == 443:  # server → client
                            sessions[fid]["server_ts"].append(float(ts))
                            sessions[fid]["server_sizes"].append(len(rec.data))

                        sessions[fid]["appdata_count"] += 1

            except Exception:
                continue

            if max_packets and i > max_packets:
                break

    results = {N: [] for N in N_values}
    for fid, stats in sessions.items():
        combined = []
        combined += [(t, "client", s) for t, s in zip(stats["client_ts"], stats["client_sizes"])]
        combined += [(t, "server", s) for t, s in zip(stats["server_ts"], stats["server_sizes"])]
        combined.sort(key=lambda x: x[0])

        for N in N_values:
            subset = combined[:N]
            if not subset:
                continue

            client_sizes = [s for _, d, s in subset if d == "client"]
            server_sizes = [s for _, d, s in subset if d == "server"]
            client_ts = [t for t, d, _ in subset if d == "client"]
            server_ts = [t for t, d, _ in subset if d == "server"]

            client_iats = np.diff(sorted(client_ts)) if len(client_ts) > 1 else []
            server_iats = np.diff(sorted(server_ts)) if len(server_ts) > 1 else []

            # ---- New features ----
            n_client, n_server = len(client_sizes), len(server_sizes)
            client_bytes, server_bytes = np.sum(client_sizes), np.sum(server_sizes)

            pkt_fraction_client = n_client / (n_client + n_server) if (n_client + n_server) > 0 else 0.0
            bytes_fraction_client = client_bytes / (client_bytes + server_bytes) if (client_bytes + server_bytes) > 0 else 0.0

            # direction switches
            dirs = [d for _, d, _ in subset]
            dir_switches = sum(1 for i in range(1, len(dirs)) if dirs[i] != dirs[i-1])

            # flow duration
            flow_duration = subset[-1][0] - subset[0][0] if len(subset) > 1 else 0.0

            # time to first response
            if client_ts and server_ts:
                first_client = min(client_ts)
                first_server = min(server_ts)
                time_first_response = first_server - first_client if first_server > first_client else 0.0
            else:
                time_first_response = 0.0

            # global packet size stats
            all_sizes = [s for _, _, s in subset]
            size_min = np.min(all_sizes) if all_sizes else 0
            size_mean = np.mean(all_sizes) if all_sizes else 0.0
            size_max = np.max(all_sizes) if all_sizes else 0

            # global IAT stats (all packets regardless of dir)
            all_ts = [t for t, _, _ in subset]
            all_iats = np.diff(sorted(all_ts)) if len(all_ts) > 1 else []
            iat_min = np.min(all_iats) if len(all_iats) else 0.0
            iat_mean = np.mean(all_iats) if len(all_iats) else 0.0
            iat_max = np.max(all_iats) if len(all_iats) else 0.0
            # ----------------------

            records = {
                "session_id": fid,
                "client_ip": fid[0],
                "server_ip": fid[1],
                "client_port": fid[2],
                "server_port": fid[3],
                "protocol": fid[4],
                "N": N,

                "n_client": n_client,
                "client_bytes": int(client_bytes),
                "client_pkt_min": int(np.min(client_sizes)) if client_sizes else 0,
                "client_pkt_mean": float(np.mean(client_sizes)) if client_sizes else 0.0,
                "client_pkt_max": int(np.max(client_sizes)) if client_sizes else 0,
                "client_iat_min": float(np.min(client_iats)) if len(client_iats) else 0.0,
                "client_iat_mean": float(np.mean(client_iats)) if len(client_iats) else 0.0,
                "client_iat_max": float(np.max(client_iats)) if len(client_iats) else 0.0,

                "n_server": n_server,
                "server_bytes": int(server_bytes),
                "server_pkt_min": int(np.min(server_sizes)) if server_sizes else 0,
                "server_pkt_mean": float(np.mean(server_sizes)) if server_sizes else 0.0,
                "server_pkt_max": int(np.max(server_sizes)) if server_sizes else 0,
                "server_iat_min": float(np.min(server_iats)) if len(server_iats) else 0.0,
                "server_iat_mean": float(np.mean(server_iats)) if len(server_iats) else 0.0,
                "server_iat_max": float(np.max(server_iats)) if len(server_iats) else 0.0,

                # new flow-level ratios
                "pkt_fraction_client": pkt_fraction_client,
                "bytes_fraction_client": bytes_fraction_client,

                # new flow-level timing
                "flow_duration": flow_duration,
                "time_first_response": time_first_response,

                # new flow-level directionality
                "dir_switches": dir_switches,

                # new global stats
                "size_min": int(size_min),
                "size_mean": float(size_mean),
                "size_max": int(size_max),
                "iat_min": float(iat_min),
                "iat_mean": float(iat_mean),
                "iat_max": float(iat_max),
            }

            results[N].append(records)

    for N in results:
        results[N] = pd.DataFrame(results[N])

    return results

In [8]:
import os
import pandas as pd

output_doh_csv = "all_doh.csv"
output_nondoh_csv = "all_nondoh.csv"

# === PROCESS ===
all_doh = []
all_nondoh = []

for folder in pcap_folders:
    for fname in os.listdir(folder):
        if not fname.endswith(".pcap"):
            continue
        fpath = os.path.join(folder, fname)
        print(f"[+] Processing {fpath}...")

        try:
            results = process_pcap_tls(fpath)  # uses your updated feature extractor

            # merge all N into a single DataFrame
            df_allN = pd.concat(results.values(), ignore_index=True)

            # classify by server IP
            mask_doh = df_allN["server_ip"].isin(doh_ips)
            df_doh = df_allN[mask_doh].copy()
            df_nondoh = df_allN[~mask_doh].copy()

            # add file origin (optional)
            df_doh["source_pcap"] = fname
            df_nondoh["source_pcap"] = fname

            all_doh.append(df_doh)
            all_nondoh.append(df_nondoh)

        except Exception as e:
            print(f"[-] Error processing {fpath}: {e}")

# combine across all pcaps
df_doh_all = pd.concat(all_doh, ignore_index=True) if all_doh else pd.DataFrame()
df_nondoh_all = pd.concat(all_nondoh, ignore_index=True) if all_nondoh else pd.DataFrame()

# save to CSV
df_doh_all.to_csv(output_doh_csv, index=False)
df_nondoh_all.to_csv(output_nondoh_csv, index=False)

print(f"[+] Saved {output_doh_csv} with {len(df_doh_all)} rows")
print(f"[+] Saved {output_nondoh_csv} with {len(df_nondoh_all)} rows")


[+] Processing /home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-AdGuard/AdGuard/dump_00004_20200114141606.pcap...
[+] Processing /home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-AdGuard/AdGuard/dump_00002_20200114114901.pcap...
[+] Processing /home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-AdGuard/AdGuard/dump_00005_20200114153502.pcap...
[+] Processing /home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-AdGuard/AdGuard/dump_00003_20200114130936.pcap...
[+] Processing /home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-AdGuard/AdGuard/dump_00001_20200114102945.pcap...
[+] Processing /home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_NonDoH-Chrome-Cloudflare/Cloudflare/dump_00004_20200113193921.pcap...
[+] Processing /home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/BenignDoH_N