# Get Latency + bandwidth overhead

## Read the data in the folders ../../data/{reduced_list, full_list}

In [1]:
import os 
import scipy
import pandas

import numpy as np
from collections import defaultdict

def get_info(directory, timestamp_min_max=None, trim_last_seconds=0):
    
    time_info = defaultdict(list)
    incoming_byte_info = defaultdict(list)
    outgoing_byte_info = defaultdict(list)
    
    for filename in os.listdir(directory):
        
        filepath = os.path.join(directory, filename)
        
        if os.path.isfile(filepath):
            print(f"Processing file: {filename}")

            with open(filepath, 'r') as file:
                
                for line in file:
                    parts = line.strip().split()
                    
                    url = parts[0]
                    timestamps_bytes = parts[2:]
                    
                    timestamps = []
                    incoming_byte_counts = []
                    outgoing_byte_counts = []
                    
                    for tb in timestamps_bytes:
                        
                        # Skip malformed entries that don't contain ':'
                        if ':' not in tb:
                            continue
                        
                        split_result = tb.split(':')
                        if len(split_result) != 2:
                            continue
                        
                        timestamp, nb_of_bytes = split_result
                        timestamp = float(timestamp)                        
                        nb_of_bytes = float(nb_of_bytes)
                        
                        timestamps.append(timestamp)
                        
                        if nb_of_bytes > 0:  # Consider only incoming packets
                            incoming_byte_counts.append(nb_of_bytes)
                        else:
                            nb_of_bytes = int(np.abs(nb_of_bytes))
                            outgoing_byte_counts.append(nb_of_bytes)

                    # Skip lines with no valid timestamps
                    if not timestamps:
                        continue

                    timestamp_start = timestamps[0]
                    timestamp_end = timestamps[-1] 
                    
                    # If trimming last seconds is requested
                    if trim_last_seconds > 0:
                        # timestamps are in ms, trim_last_seconds is in seconds
                        trim_cutoff = timestamp_end - trim_last_seconds * 1000
                        # Only keep those timestamps strictly before the cutoff
                        valid_indices = [i for i, t in enumerate(timestamps) if t < trim_cutoff]
                        if valid_indices:
                            last_valid_index = valid_indices[-1]
                            timestamp_end = timestamps[last_valid_index]
                        else:
                            # If all timestamps are beyond cutoff, skip this line
                            continue
                    
                    if timestamp_min_max:
                        for t_start, t_end in timestamp_min_max[url]:
                            if timestamp_start == t_start:
                                timestamp_end = t_end
                                break
                                
                    end_index = timestamps.index(timestamp_end)
                
                    time_info[url].append((timestamp_start, timestamp_end))
                    incoming_byte_info[url].append(sum(incoming_byte_counts[:end_index]))
                    outgoing_byte_info[url].append(sum(outgoing_byte_counts[:end_index]))
                    
    return time_info, incoming_byte_info, outgoing_byte_info

In [2]:
directory = '../../data/reduced_list/no_proxy/'
no_proxy = get_info(directory)

Processing file: datatracker.ietf.org
Processing file: telegraph.co.uk
Processing file: google.com
Processing file: npr.org
Processing file: google.com
Processing file: npr.org
Processing file: google.com
Processing file: npr.org
Processing file: berkeley.edu
Processing file: etsy.com
Processing file: huffingtonpost.com
Processing file: berkeley.edu
Processing file: etsy.com
Processing file: huffingtonpost.com
Processing file: berkeley.edu
Processing file: etsy.com
Processing file: huffingtonpost.com
Processing file: creativecommons.org
Processing file: forbes.com
Processing file: ebay.com
Processing file: free.fr
Processing file: creativecommons.org
Processing file: forbes.com
Processing file: ebay.com
Processing file: free.fr
Processing file: creativecommons.org
Processing file: forbes.com
Processing file: ebay.com
Processing file: free.fr
Processing file: techcrunch.com
Processing file: nature.com
Processing file: dailymail.co.uk
Processing file: techcrunch.com
Processing file: natu

## We provide the full extracted informations in infos

In [3]:
import pickle
with open('../../data/overheads/infos.pkl', 'rb') as f: 
    infos = pickle.load(f)

In [4]:
infos.keys()

dict_keys(['configuration00_default', 'configuration01_lqp10', 'configuration02_lqp40', 'configuration03_ll0', 'configuration04_lqp0_ll0', 'configuration05_lqp0_lm100_ll0', 'configuration06_lqp0_lm0_ll0', 'configuration07_lm0', 'configuration08_lm20', 'configuration09_lm100', 'configuration10_lqr10', 'configuration11_lqr20', 'configuration12_lqp0_ll0_lqr10', 'no_proxy', 'tor_full', 'nym_mainnet_full', 'nym_labnet_full', 'tor_reduced', 'nym_mainnet_reduced', 'nym_labnet_reduced', 'nym_defence_wtf4nym', 'nym_defence_poissoff'])

## These cells can be run to compute the overheads

In [5]:

def compute_proportional_overhead(base_info, other_info):

    base_time_info, base_incoming_byte_info, base_outgoing_byte_info = base_info
    
    websites_set = [set(base_time_info.keys())] + [set(time_info.keys()) for time_info, _, _ in other_info.values()]
    common_websites = set.intersection(*websites_set)
    print(len(common_websites))
    
    overheads = defaultdict(lambda: defaultdict(list))
    
    for website in common_websites:
        
        median_latency_base = np.median([end - start for start, end in base_time_info[website]])
        median_incoming_bytes_base = np.median(base_incoming_byte_info[website])
        median_outgoing_bytes_base = np.median(base_outgoing_byte_info[website])
        bandwidth_base = np.median(base_incoming_byte_info[website]+base_outgoing_byte_info[website])
    
        for source_name, (time_info, incoming_byte_info, outgoing_byte_info) in other_info.items():
            
            median_latency_source = np.median([end - start for start, end in time_info[website]])
            latency_overhead = (median_latency_source / median_latency_base) - 1 if median_latency_base != 0 else np.nan
            overheads[source_name]["latency"].append(latency_overhead)
    
            median_incoming_bytes_source = np.median(incoming_byte_info[website])
            incoming_byte_overhead = (median_incoming_bytes_source / median_incoming_bytes_base) - 1 if median_incoming_bytes_base != 0 else np.nan
            overheads[source_name]["incoming_bytes"].append(incoming_byte_overhead)
    
            median_outgoing_bytes_source = np.median(outgoing_byte_info[website])
            outgoing_byte_overhead = (median_outgoing_bytes_source / median_outgoing_bytes_base) - 1 if median_outgoing_bytes_base != 0 else np.nan
            overheads[source_name]["outgoing_bytes"].append(outgoing_byte_overhead)

            bandwidth_source = np.median(incoming_byte_info[website]+outgoing_byte_info[website])
            bandwidth_overhead = (bandwidth_source / bandwidth_base) - 1 if bandwidth_base != 0 else np.nan 
            overheads[source_name]["bandwidth"].append(bandwidth_overhead)
    
    return overheads
    


In [6]:
import numpy as np
from collections import defaultdict
from tabulate import tabulate


def print_overheads_table(overheads_final):
    table = []
    headers = ['latency', 'incoming_bytes', 'outgoing_bytes', 'bandwidth']
    for source, metrics in overheads_final.items():
        row = [source]
        for key in headers:
            value = metrics.get(key, 'N/A')
            if isinstance(value, (int, float)):
                row.append(f"{value:.2f}")
            else:
                row.append(value)
        table.append(row)

    print(tabulate(table, headers=headers, tablefmt="grid"))


In [7]:
def compute_and_print_overheads(infos_dict, fields, baseline_key="no_proxy"):
    """
    Compute and print overhead table for given fields compared to baseline.
    
    Args:
        infos_dict: Dictionary containing traffic information
        fields: List of field names to compute overheads for
        baseline_key: Key for the baseline dataset (default: "no_proxy")
    """
    other_info = {n: f for n, f in infos_dict.items() if n in fields}
    overheads = compute_proportional_overhead(infos_dict[baseline_key], other_info)
    overheads_final = defaultdict(lambda: defaultdict(int))
    overheads_final = {key: overheads_final[key] for key in fields}
    for source, oh in overheads.items():
        for metric, values in oh.items():
            overheads_final[source][metric] = np.median(values)
    
    print_overheads_table(overheads_final)
    return overheads_final

In [8]:
# Example usage
fields = [
    "configuration00_default",
    "configuration01_lqp10",
    "configuration02_lqp40",
    "configuration03_ll0",
    "configuration04_lqp0_ll0",
    "configuration08_lm20",
    "configuration09_lm100",
    "configuration10_lqr10",
    "configuration11_lqr20",
    "configuration12_lqp0_ll0_lqr10"
]

overheads_final = compute_and_print_overheads(infos, fields, "no_proxy")

580
+--------------------------------+-----------+------------------+------------------+-------------+
|                                |   latency |   incoming_bytes |   outgoing_bytes |   bandwidth |
| configuration00_default        |      3.78 |             1.53 |            38.75 |        3.54 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration01_lqp10          |      3.74 |             2.36 |            73.97 |        5.54 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration02_lqp40          |      4.51 |             1.09 |            24.03 |        2.43 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration03_ll0            |      3.84 |             1.49 |            38.74 |        3.39 |
+--------------------------------+-----------+------------------+------------------+-------------+
| conf

In [9]:
# Example usage
fields = [
    "configuration05_lqp0_lm100_ll0",
    "configuration06_lqp0_lm0_ll0",
    "configuration07_lm0",
]

overheads_final = compute_and_print_overheads(infos, fields, "no_proxy")

567
+--------------------------------+-----------+------------------+------------------+-------------+
|                                |   latency |   incoming_bytes |   outgoing_bytes |   bandwidth |
| configuration05_lqp0_lm100_ll0 |      5.64 |             0.64 |             9.35 |        1.31 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration06_lqp0_lm0_ll0   |      0.88 |             0.59 |             9.01 |        1.16 |
+--------------------------------+-----------+------------------+------------------+-------------+
| configuration07_lm0            |      1.8  |             1.07 |            22.1  |        2.54 |
+--------------------------------+-----------+------------------+------------------+-------------+
+--------------------------------+-----------+------------------+------------------+-------------+
|                                |   latency |   incoming_bytes |   outgoing_bytes |   bandwidth |
| conf

In [10]:
fields = [
    "tor_reduced",
    "nym_labnet_reduced",
    "nym_mainnet_reduced"
]
overheads_final = compute_and_print_overheads(infos, fields, "no_proxy")

580
+---------------------+-----------+------------------+------------------+-------------+
|                     |   latency |   incoming_bytes |   outgoing_bytes |   bandwidth |
| tor_reduced         |      2.79 |             0.36 |             3.6  |        0.54 |
+---------------------+-----------+------------------+------------------+-------------+
| nym_labnet_reduced  |      3.78 |             1.53 |            38.75 |        3.54 |
+---------------------+-----------+------------------+------------------+-------------+
| nym_mainnet_reduced |     12.38 |             3.85 |           113.33 |        8.9  |
+---------------------+-----------+------------------+------------------+-------------+
+---------------------+-----------+------------------+------------------+-------------+
|                     |   latency |   incoming_bytes |   outgoing_bytes |   bandwidth |
| tor_reduced         |      2.79 |             0.36 |             3.6  |        0.54 |
+---------------------+-----

In [11]:
fields = [
    "tor_full",
    "nym_labnet_full",
    "nym_mainnet_full"
]
overheads_final = compute_and_print_overheads(infos, fields, "no_proxy")

718
+------------------+-----------+------------------+------------------+-------------+
|                  |   latency |   incoming_bytes |   outgoing_bytes |   bandwidth |
| tor_full         |      2.8  |             0.42 |             3.55 |        0.64 |
+------------------+-----------+------------------+------------------+-------------+
| nym_labnet_full  |      3.69 |             1.68 |            37.29 |        3.73 |
+------------------+-----------+------------------+------------------+-------------+
| nym_mainnet_full |     11.93 |             3.71 |           102.55 |        8.97 |
+------------------+-----------+------------------+------------------+-------------+


In [12]:
fields = [
    "nym_defence_wtf4nym",
    "nym_defence_poissoff"
]
overheads_final = compute_and_print_overheads(infos, fields, "no_proxy")

580
+----------------------+-----------+------------------+------------------+-------------+
|                      |   latency |   incoming_bytes |   outgoing_bytes |   bandwidth |
| nym_defence_wtf4nym  |      3.78 |             2.95 |            38.75 |        4.38 |
+----------------------+-----------+------------------+------------------+-------------+
| nym_defence_poissoff |      9.87 |             4.48 |            89.97 |        8.06 |
+----------------------+-----------+------------------+------------------+-------------+
