In [1]:
import socket
from collections import defaultdict, deque
import dpkt
import numpy as np
import pandas as pd
from pathlib import Path
import os
import struct


def inet_to_str(inet):
    try:
        return socket.inet_ntop(socket.AF_INET, inet)
    except ValueError:
        return socket.inet_ntop(socket.AF_INET6, inet)

def normalize_session(src, dst, sport, dport, proto):
    """
    Define session by (client_ip, server_ip, client_port, server_port, proto).
    Client is assumed to be the side with port != 443, server is port 443.
    """
    if dport == 443:
        client, server = (src, sport), (dst, dport)
    elif sport == 443:
        client, server = (dst, dport), (src, sport)
    else:
        # not a TLS session, skip
        return None
    return (client[0], server[0], client[1], server[1], proto)

def process_pcap_tls_manual(pcap_file, N_values=[8,16,32,64], max_packets=None):
    sessions = defaultdict(lambda: {
        "client_ts": [], "client_sizes": [],
        "server_ts": [], "server_sizes": [],
        "appdata_count": 0
    })

    with open(pcap_file, "rb") as f:
        pcap = dpkt.pcap.Reader(f)

        for i, (ts, buf) in enumerate(pcap):
            try:
                # Check if this is a Linux Cooked Capture v1 frame
                # SLL header is 16 bytes: [2 bytes packet type, 2 bytes ARPHRD, 2 bytes link layer addr len, 
                # 8 bytes link layer addr, 2 bytes protocol type]
                if len(buf) >= 16:
                    # Unpack the protocol type (last 2 bytes of SLL header)
                    protocol_type = struct.unpack('>H', buf[14:16])[0]
                    
                    if protocol_type == 0x0800:  # IPv4 protocol
                        # Skip the 16-byte SLL header and parse the IP packet
                        ip = dpkt.ip.IP(buf[16:])
                    else:
                        # Not IPv4, skip this packet
                        continue
                else:
                    # Try to parse as regular Ethernet frame
                    eth = dpkt.ethernet.Ethernet(buf)
                    if not isinstance(eth.data, dpkt.ip.IP):
                        continue
                    ip = eth.data
                
            except:
                # If parsing fails, skip this packet
                continue
            
            # Check if we have an IP packet
            if not isinstance(ip, dpkt.ip.IP):
                continue
                
            # Check if we have TCP
            l4 = ip.data
            if not isinstance(l4, dpkt.tcp.TCP):
                continue

            src = inet_to_str(ip.src)
            dst = inet_to_str(ip.dst)
            sport, dport = l4.sport, l4.dport

            #print(f"Packet {i}: {src}:{sport} -> {dst}:{dport}, TCP payload length: {len(l4.data)}")
            fid = normalize_session(src, dst, sport, dport, "TCP")
            if fid is None:
                continue

            # Skip SSL/TLS parsing entirely - use raw TCP payload
            # Calculate payload size (TCP payload length)
            tcp_payload_len = len(l4.data)
            
            if tcp_payload_len > 0:  # Only process packets with TCP payload
                if dport == 443:  # client → server
                    sessions[fid]["client_ts"].append(float(ts))
                    sessions[fid]["client_sizes"].append(tcp_payload_len)
                elif sport == 443:  # server → client
                    sessions[fid]["server_ts"].append(float(ts))
                    sessions[fid]["server_sizes"].append(tcp_payload_len)

                sessions[fid]["appdata_count"] += 1

            if max_packets and i > max_packets:
                break

    results = {N: [] for N in N_values}
    for fid, stats in sessions.items():
        combined = []
        combined += [(t, "client", s) for t, s in zip(stats["client_ts"], stats["client_sizes"])]
        combined += [(t, "server", s) for t, s in zip(stats["server_ts"], stats["server_sizes"])]
        combined.sort(key=lambda x: x[0])

        for N in N_values:
            subset = combined[:N]
            if not subset:
                continue

            client_sizes = [s for _, d, s in subset if d == "client"]
            server_sizes = [s for _, d, s in subset if d == "server"]
            client_ts = [t for t, d, _ in subset if d == "client"]
            server_ts = [t for t, d, _ in subset if d == "server"]

            client_iats = np.diff(sorted(client_ts)) if len(client_ts) > 1 else []
            server_iats = np.diff(sorted(server_ts)) if len(server_ts) > 1 else []

            # ---- New features ----
            n_client, n_server = len(client_sizes), len(server_sizes)
            client_bytes, server_bytes = np.sum(client_sizes), np.sum(server_sizes)

            pkt_fraction_client = n_client / (n_client + n_server) if (n_client + n_server) > 0 else 0.0
            bytes_fraction_client = client_bytes / (client_bytes + server_bytes) if (client_bytes + server_bytes) > 0 else 0.0

            # direction switches
            dirs = [d for _, d, _ in subset]
            # direction switches
            if len(dirs) > 1:
                dir_switches = sum(1 for i in range(1, len(dirs)) if dirs[i] != dirs[i-1])
            else:
                dir_switches = 0

            # flow duration
            flow_duration = subset[-1][0] - subset[0][0] if len(subset) > 1 else 0.0

            # time to first response
            if client_ts and server_ts:
                first_client = min(client_ts)
                first_server = min(server_ts)
                time_first_response = first_server - first_client if first_server > first_client else 0.0
            else:
                time_first_response = 0.0

            # global packet size stats
            all_sizes = [s for _, _, s in subset]
            size_min = np.min(all_sizes) if all_sizes else 0
            size_mean = np.mean(all_sizes) if all_sizes else 0.0
            size_max = np.max(all_sizes) if all_sizes else 0

            # global IAT stats (all packets regardless of dir)
            all_ts = [t for t, _, _ in subset]
            all_iats = np.diff(sorted(all_ts)) if len(all_ts) > 1 else []
            iat_min = np.min(all_iats) if len(all_iats) else 0.0
            iat_mean = np.mean(all_iats) if len(all_iats) else 0.0
            iat_max = np.max(all_iats) if len(all_iats) else 0.0
            # ----------------------

            records = {
                "session_id": fid,
                "client_ip": fid[0],
                "server_ip": fid[1],
                "client_port": fid[2],
                "server_port": fid[3],
                "protocol": fid[4],
                "N": N,

                "n_client": n_client,
                "client_bytes": int(client_bytes),
                "client_pkt_min": int(np.min(client_sizes)) if client_sizes else 0,
                "client_pkt_mean": float(np.mean(client_sizes)) if client_sizes else 0.0,
                "client_pkt_max": int(np.max(client_sizes)) if client_sizes else 0,
                "client_iat_min": float(np.min(client_iats)) if len(client_iats) else 0.0,
                "client_iat_mean": float(np.mean(client_iats)) if len(client_iats) else 0.0,
                "client_iat_max": float(np.max(client_iats)) if len(client_iats) else 0.0,

                "n_server": n_server,
                "server_bytes": int(server_bytes),
                "server_pkt_min": int(np.min(server_sizes)) if server_sizes else 0,
                "server_pkt_mean": float(np.mean(server_sizes)) if server_sizes else 0.0,
                "server_pkt_max": int(np.max(server_sizes)) if server_sizes else 0,
                "server_iat_min": float(np.min(server_iats)) if len(server_iats) else 0.0,
                "server_iat_mean": float(np.mean(server_iats)) if len(server_iats) else 0.0,
                "server_iat_max": float(np.max(server_iats)) if len(server_iats) else 0.0,

                # new flow-level ratios
                "pkt_fraction_client": pkt_fraction_client,
                "bytes_fraction_client": bytes_fraction_client,

                # new flow-level timing
                "flow_duration": flow_duration,
                "time_first_response": time_first_response,

                # new flow-level directionality
                "dir_switches": dir_switches,

                # new global stats
                "size_min": int(size_min),
                "size_mean": float(size_mean),
                "size_max": int(size_max),
                "iat_min": float(iat_min),
                "iat_mean": float(iat_mean),
                "iat_max": float(iat_max),
            }

            results[N].append(records)

    for N in results:
        results[N] = pd.DataFrame(results[N])

    return results

In [2]:
import pandas as pd

base_dir = Path("/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/iodine")
pcap_files = list(base_dir.glob("*.pcap"))

all_dfs = []

for pcap_file in pcap_files:
    if pcap_file.stat().st_size == 0:
        print(f"Skipping {pcap_file.name} (empty file)")
        continue
    
    print(f"Processing {pcap_file.name}...")
    df_dict = process_pcap_tls_manual(pcap_file, N_values=[8,16,32,64])

    # Extract label (everything before the first underscore)
    label = pcap_file.stem.split("_")[0]

    for N, df in df_dict.items():
        df["label"] = label
        df["N"] = N  # add column to track window size
        all_dfs.append(df)
        print(f"Loaded {df.shape[0]} flows with label={label}, N={N}")

# Concatenate everything into one dataframe
final_df = pd.concat(all_dfs, ignore_index=True)

# Save once
final_df.to_csv("/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/iodine/all_pcaps_allN.csv", index=False)
print(f"Saved aggregated dataframe with {final_df.shape[0]} flows across {final_df['label'].nunique()} labels and {final_df['N'].nunique()} N values")


Processing iodine_txt-64-tunnel_1111_doh7_2020-03-21T02:05:11.780252.pcap...
Loaded 1 flows with label=iodine, N=8
Loaded 1 flows with label=iodine, N=16
Loaded 1 flows with label=iodine, N=32
Loaded 1 flows with label=iodine, N=64
Processing iodine_null-32-baseline_99911_doh7_2020-03-20T04:04:27.918783.pcap...
Loaded 1 flows with label=iodine, N=8
Loaded 1 flows with label=iodine, N=16
Loaded 1 flows with label=iodine, N=32
Loaded 1 flows with label=iodine, N=64
Processing iodine_srv-64-baseline_1111_doh10_2020-03-22T07:03:07.291153.pcap...
Loaded 1 flows with label=iodine, N=8
Loaded 1 flows with label=iodine, N=16
Loaded 1 flows with label=iodine, N=32
Loaded 1 flows with label=iodine, N=64
Processing iodine_txt-64-tunnel_dnsgoogle_doh1_2020-03-18T18:55:37.032750.pcap...
Loaded 11 flows with label=iodine, N=8
Loaded 11 flows with label=iodine, N=16
Loaded 11 flows with label=iodine, N=32
Loaded 11 flows with label=iodine, N=64
Processing iodine_null-64-tunnel_dnsadguardcom_doh1_2020

In [3]:
import pandas as pd

base_dir = Path("/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/dnscat2")
pcap_files = list(base_dir.glob("*.pcap"))

all_dfs = []

for pcap_file in pcap_files:
    if pcap_file.stat().st_size == 0:
        print(f"Skipping {pcap_file.name} (empty file)")
        continue
    
    print(f"Processing {pcap_file.name}...")
    df_dict = process_pcap_tls_manual(pcap_file, N_values=[8,16,32,64])

    # Extract label (everything before the first underscore)
    label = pcap_file.stem.split("_")[0]

    for N, df in df_dict.items():
        df["label"] = label
        df["N"] = N  # add column to track window size
        all_dfs.append(df)
        print(f"Loaded {df.shape[0]} flows with label={label}, N={N}")

# Concatenate everything into one dataframe
final_df = pd.concat(all_dfs, ignore_index=True)

# Save once
final_df.to_csv("/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/dnscat2/all_pcaps_allN.csv", index=False)
print(f"Saved aggregated dataframe with {final_df.shape[0]} flows across {final_df['label'].nunique()} labels and {final_df['N'].nunique()} N values")


Processing dsncat2_txt-tunnel_dnsgoogle_doh1_2020-03-23T16:04:33.473980.pcap...
Loaded 14 flows with label=dsncat2, N=8
Loaded 14 flows with label=dsncat2, N=16
Loaded 14 flows with label=dsncat2, N=32
Loaded 14 flows with label=dsncat2, N=64
Processing dsncat2_default-tunnel_dnsadguardcom_doh4_2020-03-29T07:17:49.089669.pcap...
Loaded 1 flows with label=dsncat2, N=8
Loaded 1 flows with label=dsncat2, N=16
Loaded 1 flows with label=dsncat2, N=32
Loaded 1 flows with label=dsncat2, N=64
Processing dsncat2_default-tunnel_dnsadguardcom_doh10_2020-03-28T21:10:33.852697.pcap...
Loaded 1 flows with label=dsncat2, N=8
Loaded 1 flows with label=dsncat2, N=16
Loaded 1 flows with label=dsncat2, N=32
Loaded 1 flows with label=dsncat2, N=64
Processing dsncat2_default-baseline_dnsgoogle_doh2_2020-03-26T03:13:21.391275.pcap...
Loaded 1 flows with label=dsncat2, N=8
Loaded 1 flows with label=dsncat2, N=16
Loaded 1 flows with label=dsncat2, N=32
Loaded 1 flows with label=dsncat2, N=64
Processing dsncat

In [4]:
import pandas as pd

base_dir = Path("/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/dns2tcp")
pcap_files = list(base_dir.glob("*.pcap"))

all_dfs = []

for pcap_file in pcap_files:
    if pcap_file.stat().st_size == 0:
        print(f"Skipping {pcap_file.name} (empty file)")
        continue
    
    print(f"Processing {pcap_file.name}...")
    df_dict = process_pcap_tls_manual(pcap_file, N_values=[8,16,32,64])

    # Extract label (everything before the first underscore)
    label = pcap_file.stem.split("_")[0]

    for N, df in df_dict.items():
        df["label"] = label
        df["N"] = N  # add column to track window size
        all_dfs.append(df)
        print(f"Loaded {df.shape[0]} flows with label={label}, N={N}")

# Concatenate everything into one dataframe
final_df = pd.concat(all_dfs, ignore_index=True)

# Save once
final_df.to_csv("/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/dns2tcp/all_pcaps_allN.csv", index=False)
print(f"Saved aggregated dataframe with {final_df.shape[0]} flows across {final_df['label'].nunique()} labels and {final_df['N'].nunique()} N values")


Processing dns2tcp_tunnel_99911_doh3_2020-03-31T10:37:01.655493.pcap...
Loaded 166 flows with label=dns2tcp, N=8
Loaded 166 flows with label=dns2tcp, N=16
Loaded 166 flows with label=dns2tcp, N=32
Loaded 166 flows with label=dns2tcp, N=64
Processing dns2tcp_tunnel_dnsgoogle_doh3_2020-03-31T11:11:09.236379.pcap...
Loaded 1 flows with label=dns2tcp, N=8
Loaded 1 flows with label=dns2tcp, N=16
Loaded 1 flows with label=dns2tcp, N=32
Loaded 1 flows with label=dns2tcp, N=64
Processing dns2tcp_tunnel_1111_doh8_2020-04-01T22:20:32.903725.pcap...
Loaded 1 flows with label=dns2tcp, N=8
Loaded 1 flows with label=dns2tcp, N=16
Loaded 1 flows with label=dns2tcp, N=32
Loaded 1 flows with label=dns2tcp, N=64
Processing dns2tcp_tunnel_1111_doh8_2020-04-01T11:27:10.596853.pcap...
Loaded 1 flows with label=dns2tcp, N=8
Loaded 1 flows with label=dns2tcp, N=16
Loaded 1 flows with label=dns2tcp, N=32
Loaded 1 flows with label=dns2tcp, N=64
Processing dns2tcp_tunnel_1111_doh1_2020-04-01T02:02:47.956265.pca