# Read Data

In [4]:
import os 
import scipy
import pandas

import numpy as np
from collections import defaultdict

def get_info(directory, timestamp_min_max=None, trim_last_seconds=0):
    
    time_info = defaultdict(list)
    incoming_byte_info = defaultdict(list)
    outgoing_byte_info = defaultdict(list)
    
    for filename in os.listdir(directory):
        
        filepath = os.path.join(directory, filename)
        
        if os.path.isfile(filepath):
            print(f"Processing file: {filename}")

            with open(filepath, 'r') as file:
                
                for line in file:
                    parts = line.strip().split()
                    
                    url = parts[0]
                    timestamps_bytes = parts[2:]
                    
                    timestamps = []
                    incoming_byte_counts = []
                    outgoing_byte_counts = []
                    
                    for tb in timestamps_bytes:
                        
                        # Skip malformed entries that don't contain ':'
                        if ':' not in tb:
                            continue
                        
                        split_result = tb.split(':')
                        if len(split_result) != 2:
                            continue
                        
                        timestamp, nb_of_bytes = split_result
                        timestamp = float(timestamp)                        
                        nb_of_bytes = float(nb_of_bytes)
                        
                        timestamps.append(timestamp)
                        
                        if nb_of_bytes > 0:  # Consider only incoming packets
                            incoming_byte_counts.append(nb_of_bytes)
                        else:
                            nb_of_bytes = int(np.abs(nb_of_bytes))
                            outgoing_byte_counts.append(nb_of_bytes)

                    # Skip lines with no valid timestamps
                    if not timestamps:
                        continue

                    timestamp_start = timestamps[0]
                    timestamp_end = timestamps[-1] 
                    
                    # If trimming last seconds is requested
                    if trim_last_seconds > 0:
                        # timestamps are in ms, trim_last_seconds is in seconds
                        trim_cutoff = timestamp_end - trim_last_seconds * 1000
                        # Only keep those timestamps strictly before the cutoff
                        valid_indices = [i for i, t in enumerate(timestamps) if t < trim_cutoff]
                        if valid_indices:
                            last_valid_index = valid_indices[-1]
                            timestamp_end = timestamps[last_valid_index]
                        else:
                            # If all timestamps are beyond cutoff, skip this line
                            continue
                    
                    if timestamp_min_max:
                        for t_start, t_end in timestamp_min_max[url]:
                            if timestamp_start == t_start:
                                timestamp_end = t_end
                                break
                                
                    end_index = timestamps.index(timestamp_end)
                
                    time_info[url].append((timestamp_start, timestamp_end))
                    incoming_byte_info[url].append(sum(incoming_byte_counts[:end_index]))
                    outgoing_byte_info[url].append(sum(outgoing_byte_counts[:end_index]))
                    
    return time_info, incoming_byte_info, outgoing_byte_info

In [3]:
directory = '../../data/reduced_list/no_proxy/'
no_proxy = get_info(directory)

Processing file: datatracker.ietf.org
Processing file: telegraph.co.uk
Processing file: google.com
Processing file: npr.org
Processing file: berkeley.edu
Processing file: etsy.com
Processing file: huffingtonpost.com
Processing file: creativecommons.org
Processing file: forbes.com
Processing file: ebay.com
Processing file: free.fr
Processing file: techcrunch.com
Processing file: nature.com
Processing file: dailymail.co.uk
Processing file: yelp.com
Processing file: unsplash.com
Processing file: baidu.com
Processing file: linkedin.com
Processing file: weibo.com
Processing file: yahoo.com
Processing file: github.com
Processing file: soundcloud.com
Processing file: latimes.com
Processing file: medium.com
Processing file: facebook.com
Processing file: springer.com
Processing file: un.org
Processing file: office.com
Processing file: apache.org
Processing file: bbc.co.uk
Processing file: wikipedia.org
Processing file: bing.com
Processing file: opera.com
Processing file: europa.eu
Processing fi

In [1]:
import pickle
with open('../../data/overheads/infos.pkl', 'rb') as f: 
    infos = pickle.load(f)

In [21]:
infos.keys()

dict_keys(['configuration00_default', 'no_proxy'])

In [2]:
import numpy as np
from collections import defaultdict
from tabulate import tabulate


def print_overheads_table(overheads_final):
    table = []
    #headers = ["Source", "Latency", "Incoming Bytes", "Outgoing Bytes", "Bandwidth"]
    #headers = ['latency', 'incoming_bytes', 'outgoing_bytes', 'bandwidth']
    headers = ['latency', 'incoming_bytes', 'outgoing_bytes', 'bandwidth']
    for source, metrics in overheads_final.items():
        row = [source]
        for key in headers:
            value = metrics.get(key, 'N/A')
            if isinstance(value, (int, float)):
                row.append(f"{value:.2f}")
            else:
                row.append(value)
        table.append(row)

    print(tabulate(table, headers=headers, tablefmt="grid"))


In [41]:
for k in infos.keys():
    if "extra" in k:
        print(k)

tamaraw_tor_lab_extra
wtf_tor_lab_extra
front_tor_lab_extra
tor_lab_extra
tamaraw_tor_lab_2_extra
tor_lab_tor_lab_extra
tamaraw2_extra
wtf_pad_filtered_extra
front_adv_filtered_extra
front_tor_lab2_extra


In [4]:
infos2 = {}
infos2["configuration00_default"] = infos["normal_setup_filtered"]
infos2["configuration01_lqp10"] = infos["lq10_filtered"]
infos2["configuration02_lqp40"] = infos["lq40_filtered"]
infos2["configuration03_ll0"] = infos["no-cover_filtered"]
infos2["configuration04_lqp0_ll0"] = infos["nothing_filtered"]
infos2["configuration05_lqp0_lm100_ll0"] = infos["nothing_lm100"]
infos2["configuration06_lqp0_lm0_ll0"] = infos["no_mix_nothing"]
infos2["configuration07_lm0"] = infos["lm0_filtered"]
infos2["configuration08_lm20"] = infos["lm20_filtered"]
infos2["configuration09_lm100"] = infos["lm100_filtered"]
infos2["configuration10_lqr10"] = infos["qs10_filtered"]
infos2["configuration11_lqr20"] = infos["qs20_filtered"]
infos2["configuration12_lqp0_ll0_lqr10"] = infos["qs10_nocover"]
infos2["no_proxy"] = infos["No proxy"]

infos2["tor_full"] = infos["Tor"]
infos2["nym_mainnet_full"] = infos["Nym"]
infos2["nym_labnet_full"] = infos["Nym Lab"]

infos2["tor_reduced"] = infos["tor_filtered"]
infos2["nym_mainnet_reduced"] = infos["Nym"]
infos2["nym_labnet_reduced"] = infos["normal_setup_filtered"]

infos2["nym_defence_wtf4nym"] = infos["nym_wtf_advanced"]

In [8]:
def compute_and_print_overheads(infos_dict, fields, baseline_key="no_proxy"):
    """
    Compute and print overhead table for given fields compared to baseline.
    
    Args:
        infos_dict: Dictionary containing traffic information
        fields: List of field names to compute overheads for
        baseline_key: Key for the baseline dataset (default: "no_proxy")
    """
    other_info = {n: f for n, f in infos_dict.items() if n in fields}
    overheads = compute_proportional_overhead(infos_dict[baseline_key], other_info)
    overheads_final = defaultdict(lambda: defaultdict(int))
    overheads_final = {key: overheads_final[key] for key in fields}
    for source, oh in overheads.items():
        for metric, values in oh.items():
            overheads_final[source][metric] = np.median(values)
    
    print_overheads_table(overheads_final)
    return overheads_final


# Example usage
fields = [
    "configuration00_default",
    "configuration01_lqp10",
    "configuration02_lqp40",
    "configuration03_ll0",
    "configuration04_lqp0_ll0",
    "configuration05_lqp0_lm100_ll0",
    "configuration06_lqp0_lm0_ll0",
    "configuration07_lm0",
    "configuration08_lm20",
    "configuration09_lm100",
    "configuration10_lqr10",
    "configuration11_lqr20",
    "configuration12_lqp0_ll0_lqr10"
]

overheads_final = compute_and_print_overheads(infos2, fields, "no_proxy")

+--------------------------------+-----------+------------------+------------------+-------------+
|                                |   latency |   incoming_bytes |   outgoing_bytes |   bandwidth |
| configuration00_default        |      3.8  |             1.56 |            39.11 |        3.77 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration01_lqp10          |      3.76 |             2.42 |            74.34 |        5.65 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration02_lqp40          |      4.57 |             1.1  |            24.33 |        2.47 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration03_ll0            |      3.88 |             1.56 |            38.87 |        3.45 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configur

In [9]:
overheads_final = compute_and_print_overheads(infos2, fields, "no_proxy")

+--------------------------------+-----------+------------------+------------------+-------------+
|                                |   latency |   incoming_bytes |   outgoing_bytes |   bandwidth |
| configuration00_default        |      3.8  |             1.56 |            39.11 |        3.77 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration01_lqp10          |      3.76 |             2.42 |            74.34 |        5.65 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration02_lqp40          |      4.57 |             1.1  |            24.33 |        2.47 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration03_ll0            |      3.88 |             1.56 |            38.87 |        3.45 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configur