In [1]:
import os
import gzip
import json
import ipaddress
import re
import datetime
import copy

import numpy as np
import pandas as pd

from util import *
from histogram_functions import *

In [2]:
# To get json dumps to format floats the same as epping (%g format)
# from https://stackoverflow.com/a/1733105
class PrettyFloat(float):
    """A ``float`` whose ``repr`` is its ``%g`` rendering.

    ``%g`` keeps six significant digits and switches to exponent notation
    for very large or very small magnitudes.

    NOTE(review): Python 3's ``json`` encoder may format float subclasses
    with ``float.__repr__`` rather than this override - verify that wrapping
    values in this class really changes ``json.dump`` output.
    """

    def __repr__(self):
        rendered = '%g' % (self,)
        return rendered
    
def pretty_floats(obj):
    """Return a copy of ``obj`` with every float wrapped in ``PrettyFloat``.

    Dicts, lists and tuples are walked recursively. Dicts are rebuilt as
    plain dicts, and both lists and tuples come back as lists. Any other
    value is returned unchanged.
    """
    if isinstance(obj, float):
        return PrettyFloat(obj)

    if isinstance(obj, dict):
        return {key: pretty_floats(value) for key, value in obj.items()}

    if isinstance(obj, (list, tuple)):
        return [pretty_floats(item) for item in obj]

    return obj

In [3]:
def is_valid_date(datestr):
    """Return True if ``datestr`` is shaped like ``YYYY-MM-DD``.

    Only the shape is checked: four digits, two digits and two digits,
    separated by hyphens. The digits are not checked against the calendar,
    so a string such as "2024-99-99" is accepted.
    """
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence. fullmatch() requires the whole string to match, whereas a
    # "$" anchor would also accept a trailing newline.
    return re.fullmatch(r"\d{4}-\d{2}-\d{2}", datestr) is not None

def is_epping_filename(filename):
    """Return True if ``filename`` looks like an epping JSON dump.

    Accepted names start with "pping.", contain ".json" followed by one
    character and an ISO-8601 style timestamp with a numeric UTC offset
    (e.g. 2024-08-21T17:14:16+02:00), and may end in ".gz".
    """
    # Raw string: "\d" and "\." inside a normal string literal are invalid
    # escape sequences. fullmatch() replaces the "^...$" anchors so that a
    # name with a trailing newline is no longer accepted.
    # NOTE(review): the "." right after "json" is unescaped, so it matches
    # any single character; presumably a literal dot was intended - confirm
    # against the real file names before tightening it.
    pattern = r"pping\..*\.json.\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[\+\-]\d{2}:\d{2}(\.gz)?"
    return re.fullmatch(pattern, filename) is not None

def get_epping_files(root_folder):
    """Return the sorted paths of all epping files below ``root_folder``.

    Only the direct sub-directories of ``root_folder`` whose name passes
    ``is_valid_date`` are searched, and within them only regular files
    whose name passes ``is_epping_filename`` are kept. The search does not
    descend any deeper.
    """
    found = []
    for day_entry in os.scandir(root_folder):
        is_date_dir = day_entry.is_dir() and is_valid_date(day_entry.name)
        if is_date_dir:
            found.extend(
                item.path
                for item in os.scandir(day_entry.path)
                if item.is_file() and is_epping_filename(item.name)
            )

    found.sort()
    return found

def classify_epping_entry_type(entry):
    """Classify an epping JSON entry by the keys it contains.

    Returns "specification", "subnet_stats" or "global_counters" depending
    on which marker key is present, or "unknown" if none is. The markers are
    tested in that order, so an entry carrying several of them gets the
    first matching type.
    """
    marker_types = (
        ("aggregation_interval_ns", "specification"),
        ("ip_prefix", "subnet_stats"),
        ("protocol_counters", "global_counters"),
    )

    for marker, entry_type in marker_types:
        if marker in entry:
            return entry_type

    return "unknown"

def verify_epping_specification(json_agg_format, nbins=250, bin_width_ms=4,
                                agg_interval_s=10, ipv4_prefix_len=24,
                                ipv6_prefix_len=48):
    """Check a specification entry against the expected aggregation setup.

    The bin width is given in milliseconds and the aggregation interval in
    seconds; both are converted to nanoseconds before being compared with
    the "bin_width_ns" and "aggregation_interval_ns" fields.

    Returns True when every field matches.
    Raises ValueError naming the first field that differs, and KeyError if
    a field is missing from ``json_agg_format``.
    """
    expectations = (
        ("bins", nbins),
        ("bin_width_ns", bin_width_ms * 1e6),
        ("aggregation_interval_ns", agg_interval_s * 1e9),
        ("ipv4_prefix_len", ipv4_prefix_len),
        ("ipv6_prefix_len", ipv6_prefix_len),
    )

    for field, wanted in expectations:
        actual = json_agg_format[field]
        if actual != wanted:
            raise ValueError("{} is {}, expected {}".format(field, actual, wanted))

    return True

def get_epping_json_entries(filename, verify_specification=True, **kwargs):
    """Load the entries of one gzip-compressed epping JSON file.

    The file must hold a JSON list whose first element is a specification
    entry and which contains no further specification entries.

    Parameters:
        filename: path of the gzip-compressed JSON file.
        verify_specification: if true, the specification entry is checked
            with ``verify_epping_specification``.
        **kwargs: forwarded to ``verify_epping_specification``.

    Returns the list of entries, specification entry included.
    Raises ValueError if the specification entry is missing, duplicated, or
    does not match the expected values.
    """
    with gzip.open(filename) as jfile:
        entries = json.load(jfile)

    # An empty or non-list document has no first entry to look at; report
    # it as a missing specification rather than crashing on entries[0].
    if (not isinstance(entries, list) or not entries
            or classify_epping_entry_type(entries[0]) != "specification"):
        raise ValueError("{} does not start with a specification entry".format(filename))

    if any(classify_epping_entry_type(entry) == "specification" for entry in entries[1:]):
        raise ValueError("{} contains multiple specification entires".format(filename))

    if verify_specification:
        verify_epping_specification(entries[0], **kwargs)

    return entries

def is_in_subnets(subnet, include_subnets):
    """Return True if ``subnet`` lies inside any network in ``include_subnets``.

    Both arguments are passed through ``netify`` first (defined outside this
    notebook; presumably it yields ``ipaddress`` network objects - confirm).
    Networks of a different IP version are skipped rather than compared.
    """
    candidate = netify(subnet)
    # Convert the whole list up front so every element is validated, even
    # when an early network already matches.
    networks = [netify(net) for net in include_subnets]

    return any(
        candidate.version == network.version and candidate.subnet_of(network)
        for network in networks
    )

In [4]:
def add_to_rxtx_stats(dst, src):
    """Add the per-category traffic counters of ``src`` onto ``dst`` in place.

    Both arguments map a traffic category to a dict with "packets" and
    "bytes" counters. Categories found only in ``src`` are copied into
    ``dst``; shared categories have their counters summed.

    Returns ``dst``.
    """
    for category, counters in src.items():
        if category not in dst:
            # Copy so that later additions to dst do not alter src.
            dst[category] = counters.copy()
            continue

        totals = dst[category]
        totals["packets"] += counters["packets"]
        totals["bytes"] += counters["bytes"]

    return dst

def add_to_entry(dst, src, bins):
    """Fold the subnet entry ``src`` into ``dst`` in place and return ``dst``.

    The rx/tx traffic counters are always summed. The RTT fields are only
    touched when ``src`` reports at least one RTT sample: min and max are
    combined directly, the histograms are summed, and count, mean, median
    and 95th percentile are recomputed from the summed histogram.

    Parameters:
        dst: entry that accumulates the result (modified in place).
        src: entry to add; must share ``dst``'s timestamp.
        bins: histogram bin edges; only the first ``len(histogram) + 1``
            of them are used.

    Raises ValueError if the two entries carry different timestamps.
    """
    if dst["timestamp"] != src["timestamp"]:
        raise ValueError("dst and src from different times, should not be merged!")

    for direction in ("rx_stats", "tx_stats"):
        add_to_rxtx_stats(dst[direction], src[direction])

    src_has_rtt = "count_rtt" in src and src["count_rtt"] > 0
    if not src_has_rtt:
        return dst

    # dst may have no RTT data yet, hence the neutral fallbacks.
    dst["min_rtt"] = min(dst.get("min_rtt", 1e100), src["min_rtt"])
    dst["max_rtt"] = max(dst.get("max_rtt", 0), src["max_rtt"])

    merged_hist = sum_histograms([dst.get("histogram", []), src["histogram"]])
    # Trim the edges so they line up with the merged histogram's length.
    edges = bins[:len(merged_hist) + 1]
    dst["histogram"] = merged_hist

    dst["count_rtt"] = int(bincount_count(edges, merged_hist))
    dst["mean_rtt"] = float(bincount_mean(edges, merged_hist))
    dst["median_rtt"] = float(bincount_median(edges, merged_hist))
    dst["p95_rtt"] = float(bincount_quantile(edges, merged_hist, 0.95))

    return dst

def merge_subnet_entries(json_entries, subnets, replacement_subnet="100.64.0.0/24",
                         del_subnets=None):
    """Merge and drop subnet entries of one epping file, in place.

    Subnet entries whose prefix lies in ``del_subnets`` are removed. Entries
    whose prefix lies in ``subnets`` are merged per timestamp: the first
    such entry of a run with the same timestamp has its "ip_prefix"
    replaced by ``replacement_subnet`` and absorbs the following ones, which
    are then removed. All other entries are left untouched.

    Merging only looks at the previously merged timestamp, so entries with
    the same timestamp are expected to be adjacent in ``json_entries``.

    Parameters:
        json_entries: list of entries, starting with the specification
            entry (its "bins" and "bin_width_ns" define the histogram bins).
        subnets: networks to merge into ``replacement_subnet``.
        replacement_subnet: prefix written into each merged entry.
        del_subnets: networks whose entries are dropped, or None.

    Returns ``json_entries`` (the same list object, modified in place).
    Raises ValueError if ``json_entries`` is empty or does not start with a
    specification entry.
    """
    curr_t = None
    merged_entry = None
    # A set keeps the final "is this index dropped?" filter linear.
    del_idx = set()

    subnets = [netify(subnet) for subnet in subnets]
    if del_subnets is not None:
        del_subnets = [netify(subnet) for subnet in del_subnets]

    if not json_entries or classify_epping_entry_type(json_entries[0]) != "specification":
        raise ValueError("Missing initial specification entry - cannot get histogram config")
    spec = json_entries[0]

    nbins = spec["bins"]
    bin_width = spec["bin_width_ns"]
    # nbins + 1 edges: 0, bin_width, ..., nbins * bin_width.
    bins = np.arange(0, nbins * bin_width + 1, bin_width)

    for i, entry in enumerate(json_entries):
        if classify_epping_entry_type(entry) != "subnet_stats":
            continue
        if del_subnets is not None and is_in_subnets(entry["ip_prefix"],
                                                     del_subnets):
            del_idx.add(i)
            continue
        if not is_in_subnets(entry["ip_prefix"], subnets):
            continue

        # Testing merged_entry explicitly (rather than relying on a
        # sentinel timestamp) keeps a first entry with timestamp 0 from
        # being merged into a non-existent accumulator.
        if merged_entry is None or entry["timestamp"] != curr_t:
            curr_t = entry["timestamp"]
            merged_entry = entry
            entry["ip_prefix"] = replacement_subnet
        else:
            add_to_entry(merged_entry, entry, bins)
            del_idx.add(i)

    json_entries[:] = [entry for idx, entry in enumerate(json_entries) if idx not in del_idx]
    return json_entries

In [11]:
def clean_jsondata(root_folder, new_root, merge_subnets=[], subnet_replacement="100.64.0.0/24",
                   del_subnets=[], specification_kwargs={}, report_freq=1440, compresslevel=6):
    '''
    Merges together all customer LAN subnets into a single 100.64.0.0/24 entry to
    avoid leaking information about the ISP's internal network.
    Also removes some subnets used for internal network management for similar reasons.

    Every epping file found below root_folder is loaded, cleaned and written,
    gzip-compressed, to the same relative path below new_root.

    Parameters:
        root_folder: folder holding the original per-date sub-folders.
        new_root: folder the cleaned files are written to.
        merge_subnets: subnets merged into subnet_replacement.
        subnet_replacement: prefix used for the merged entries.
        del_subnets: subnets whose entries are dropped entirely.
        specification_kwargs: keyword arguments forwarded to
            get_epping_json_entries (and from there to the specification check).
        report_freq: print progress every report_freq files and after the
            last file; 0 or less disables the reports.
        compresslevel: gzip compression level for the written files.

    Returns (number of files, number of entries read, number of entries
    merged or dropped).
    Raises FileExistsError if an output file already exists.
    '''
    nfiles = 0
    nentries = 0
    nmergedel = 0

    files = get_epping_files(root_folder)
    for file in files:
        # Forward the caller's expectations; previously specification_kwargs
        # was accepted but silently ignored.
        entries = get_epping_json_entries(file, **specification_kwargs)
        prelen = len(entries)

        entries = merge_subnet_entries(entries, merge_subnets,
                                       replacement_subnet=subnet_replacement,
                                       del_subnets=del_subnets)

        nentries += prelen
        nmergedel += prelen - len(entries)

        save_file = os.path.join(new_root, os.path.relpath(file, root_folder))
        os.makedirs(os.path.dirname(save_file), exist_ok=True)
        # Never overwrite an earlier run's output.
        if os.path.exists(save_file):
            raise FileExistsError("{} already exists".format(save_file))

        with gzip.open(save_file, "wt", compresslevel=compresslevel) as outfile:
            json.dump(entries, outfile, separators=(",",":"))

        nfiles += 1

        if (report_freq > 0 and (nfiles % report_freq == 0 or nfiles == len(files))):
            print("{}: Parsed {}/{} files, containing {} entries ({} merged or dropped)".format(
                datetime.datetime.now(), nfiles, len(files), nentries, nmergedel))

    return nfiles, nentries, nmergedel

In [12]:
# Subnets whose entries are merged into the replacement subnet.
# Removed list of internal subnets used by ISP
lan_subnets_to_merge = []

# Subnets whose entries are dropped entirely.
# Removed list of internal subnets used by ISP
lan_subnets_to_remove = []

In [13]:
%%time
# Clean every file below data/original/raw and write the result to data/raw.
# The cell's value is (files, entries read, entries merged or dropped).
clean_jsondata(
    "data/original/raw",
    "data/raw",
    merge_subnets=lan_subnets_to_merge,
    subnet_replacement="100.64.0.0/24",
    del_subnets=lan_subnets_to_remove,
)

2024-08-21 17:14:16.097685: Parsed 1440/46522 files, containing 10703453 entries (225462 merged or dropped)
2024-08-21 17:28:04.579010: Parsed 2880/46522 files, containing 21131617 entries (443351 merged or dropped)
2024-08-21 17:42:23.515666: Parsed 4320/46522 files, containing 31948463 entries (663273 merged or dropped)
2024-08-21 17:56:06.102021: Parsed 5760/46522 files, containing 42486616 entries (885994 merged or dropped)
2024-08-21 18:10:17.679646: Parsed 7200/46522 files, containing 53317821 entries (1112148 merged or dropped)
2024-08-21 18:25:14.759356: Parsed 8640/46522 files, containing 64566341 entries (1345935 merged or dropped)
2024-08-21 18:40:16.116448: Parsed 10080/46522 files, containing 75980364 entries (1588208 merged or dropped)
2024-08-21 18:55:09.230121: Parsed 11520/46522 files, containing 87200788 entries (1818702 merged or dropped)
2024-08-21 19:09:39.660753: Parsed 12960/46522 files, containing 98021512 entries (2028267 merged or dropped)
2024-08-21 19:21:32.

(46522, 357907307, 7211260)