In [4]:
# imports and functions, does nothing

from functools import partial
from IPython.display import Image, display, Video, Image, HTML
from matplotlib import rcParams
from prettytable import PrettyTable
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.multiclass import unique_labels
import constants as cst
import glob
import json
import sys
sys.path.append('../..')
import lib.plot_builder as plot_builder
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas
import pathlib
import random
import sklearn
import random
import matplotlib
matplotlib.rcParams.update({'font.size': 14})

def print_table(data, fields=None):
    """Print `data` as a left-aligned PrettyTable.

    Args:
        data: Sequence of rows; each row is handed to `PrettyTable.add_row`.
            Nothing is printed when it is empty.
        fields: Optional column names. They are truncated, or padded with
            '*', to the width of the first row. When omitted, no field names
            are set on the table.

    The caller's `fields` list is never modified.
    """
    if len(data) == 0:
        return

    if fields is not None:
        width = len(data[0])
        # Work on a copy: padding in place would leak the '*' placeholders
        # into the list owned by the caller.
        fields = list(fields)[:width]
        fields += ['*'] * (width - len(fields))

    t = PrettyTable()
    for row in data:
        t.add_row(row)
    t.align = 'l'
    if fields is not None:
        t.field_names = fields
    print(t)

def read_dataset(f):
    """Load the scores stored in the JSON file `f` and derive its plot paths.

    Returns:
        A 6-tuple `(accuracy, precision, recall, f1score, cm, fi)`.
        Each score is the first entry of the matching list under the
        file's "score" key, scaled by 100 and rounded to one decimal.
        `cm` and `fi` are `f` with 'datasets/' swapped for 'plots/' and the
        '.json' suffix swapped for '-cm.png' / '-fi.png' respectively.
    """
    with open(f) as handle:
        scores = json.load(handle)['score']

    def as_percent(metric):
        # Only the first element of each score list is reported.
        return round(100*scores[metric][0], 1)

    values = [as_percent(m) for m in ('accuracy', 'precision', 'recall', 'f1score')]

    plot_stem = f.replace('datasets/', 'plots/')
    cm = plot_stem.replace('.json', '-cm.png')
    fi = plot_stem.replace('.json', '-fi.png')

    return (*values, cm, fi)

def img(path, width=400):
    """Return an HTML `<img>` tag for `path`, styled `width` pixels wide.

    A random `nocache` query parameter is appended to the source URL
    (presumably so a re-rendered notebook does not show a cached image).

    Args:
        path: Image location, inserted verbatim into the `src` attribute.
        width: Display width in pixels.

    Returns:
        The tag as a string.
    """
    # The bound must be an int: random.randint rejects float arguments such
    # as 2e9 with a TypeError on Python 3.12+ (deprecated since 3.10).
    rnd = random.randint(0, 2_000_000_000)
    return f"""<img src="{path}?nocache={rnd}" style="width:{width}px; "></img>"""

def latex_table(table, header):
    """Render `table` and its `header` as a LaTeX `tabular` environment.

    Args:
        table: Sequence of rows; every cell is converted with `str`.
        header: Column titles. They are truncated, or padded with '*', to
            the width of the first row. The caller's list is not modified.

    Returns:
        The LaTeX source as a string, or None when `table` is empty.
    """
    if len(table) == 0:
        return

    n_cols = len(table[0])
    # Copy before padding so the '*' placeholders never end up in the
    # list owned by the caller.
    titles = list(header)[:n_cols]
    titles += ['*'] * (n_cols - len(titles))

    # Tables of up to five columns keep the historical "lrrrr" spec so that
    # existing output stays byte-identical; wider tables get one extra 'r'
    # per additional column instead of an invalid, too-short spec.
    col_spec = 'l' + 'r' * max(4, n_cols - 1)

    header_bold = ["\\textbf{" + str(t) + "}" for t in titles]
    lines = ["\\begin{tabular}{" + col_spec + "}\n"]
    for row in [header_bold, *table]:
        lines.append("    " + " & ".join(map(str, row)) + " \\\\\n")
    lines.append("\\end{tabular}")

    return "".join(lines)

def dataset_name_to_friendly_name(name):
    """Split a dataset name into its root, a display name and the variant.

    Returns:
        `(root_dataset, nicename, variant)` where `root_dataset` is `name`
        with the known suffixes removed, `nicename` is `name` with those
        suffixes replaced by readable labels (and '/' by a space), and
        `variant` is `nicename` with `root_dataset` removed.
    """
    # Order matters in both tables: longer suffixes must be handled before
    # the shorter ones they contain (e.g. '-netflow1000' before '-netflow10').
    stripped_suffixes = (
        '-netflow1000',
        '-netflow100',
        '-netflow10',
        '-netflows',
        '-netflow',
        '-defended',
        '-google',
    )
    labels = (
        ('-netflow1000', "NF 0.1%"),
        ('-netflow100', "NF   1%"),
        ('-netflow10', "NF  10%"),
        ('-netflow', "NF 100%"),
        ('/', ' '),
        ('-defended', ' (defended)'),
        ('-google', '(Google dest filter)'),
    )

    root_dataset = name
    for suffix in stripped_suffixes:
        root_dataset = root_dataset.replace(suffix, '')

    nicename = name
    for old, new in labels:
        nicename = nicename.replace(old, new)

    return root_dataset, nicename, nicename.replace(root_dataset, '')

def find_pos(name, features):
    """Return the index of the first entry of `features` equal to `name`.

    Entries are compared after stripping surrounding whitespace. When no
    entry matches, a message is printed and the interpreter is asked to
    exit via `sys.exit(0)`.
    """
    for position, feature in enumerate(features):
        if feature.strip() == name:
            return position

    print("Couldn't find", name)
    sys.exit(0)

In [5]:
# Collect the paths of all dataset description files, in sorted order
files = sorted(glob.glob("datasets/*.json"))

In [24]:
# Table 3
# Summary table: dataset, mean capture size and F1 score (pcap + netflows)
tuples = list()

def size(dataset_path, path="**/nfcapd*", name=""):
    """Summarise file sizes under `dataset_path`, grouped by top-level entry.

    Files matching the recursive glob `dataset_path + path` are grouped by
    the first path component below `dataset_path` (called `url` here;
    presumably one directory per visited URL). The mean size in bytes of
    each group is rounded to two decimals, and the minimum, mean and maximum
    of those per-group means are reported.

    Args:
        dataset_path: Directory prefix. It is concatenated with `path` as a
            plain string, so it is expected to end with '/'.
        path: Glob pattern appended to `dataset_path`.
        name: Unused; kept so existing callers keep working.

    Returns:
        `[dataset, min, mean, max]`, where `dataset` is a display label
        derived from `dataset_path` and the three statistics are rounded to
        whole bytes.

    Raises:
        ValueError: If no file matches the pattern.
    """
    matches = glob.glob(dataset_path + path, recursive=True)
    if not matches:
        # Fail with an actionable message instead of numpy's opaque
        # "zero-size array" error further down.
        raise ValueError(f"no files match {dataset_path + path!r}")

    sizes_by_url = dict()
    for p in matches:
        url = p.replace(dataset_path, '').split('/')[0]
        sizes_by_url.setdefault(url, []).append(os.path.getsize(p))

    means = [round(np.mean(group), 2) for group in sizes_by_url.values()]

    dataset = dataset_path.replace("../../cf-clusters-datasets/", "").replace('netflows_1000', "0.1%").replace('netflows_100', "  1%").replace('netflows_10', " 10%").replace('netflows', "100%").replace('/', ' ')

    return [dataset, round(np.min(means)), round(np.mean(means)), round(np.max(means))]

def getf1(f):
    """Return only the F1 score reported by `read_dataset` for file `f`."""
    # read_dataset returns (acc, precision, recall, f1score, cm, fi)
    return read_dataset(f)[3]


# File-size statistics ([label, min, mean, max]) for the raw captures and
# for each NetFlow sampling rate.
sizes = {
    'pcap': size("../../cf-clusters-datasets/quic-100p-150/pcaps/", path="**/*.pcap"),
    '100%': size("../../cf-clusters-datasets/quic-100p-150/netflows/"),
    '10%': size("../../cf-clusters-datasets/quic-100p-150/netflows_10/"),
    '1%': size("../../cf-clusters-datasets/quic-100p-150/netflows_100/"),
    '0.1%': size("../../cf-clusters-datasets/quic-100p-150/netflows_1000/"),
}

# F1 score of the matching classifier run (the name `accs` is historical).
accs = {
    'pcap': getf1("../randomforests/datasets/quic-100p-150.json"),
    '100%': getf1("./datasets/quic-100p-150-netflow.json"),
    '10%': getf1("./datasets/quic-100p-150-netflow10.json"),
    '1%': getf1("./datasets/quic-100p-150-netflow100.json"),
    '0.1%': getf1("./datasets/quic-100p-150-netflow1000.json"),
}

# One row per dataset: label, mean size (index 2 of the size() result), F1.
table_data = []
for key in sizes:
    table_data.append([key, sizes[key][2], accs[key]])

print("table_data = ", table_data)

# '%' must be escaped for LaTeX.
# NOTE(review): the "Sampling" header labels the mean-size column - confirm
# this is the intended column title.
latex_rows = [[label.replace('%', '\\%'), mean_size, f1] for label, mean_size, f1 in table_data]
print(latex_table(latex_rows, header=["Dataset", "Sampling", "F1 Score"]))

table_data =  [['pcap', 319914, 95.8], ['100%', 26537, 90.5], ['10%', 3034, 66.4], ['1%', 923, 41.7], ['0.1%', 433, 16.8]]
\begin{tabular}{lrrrr}
    \textbf{Dataset} & \textbf{Sampling} & \textbf{F1 Score} \\
    pcap & 319914 & 95.8 \\
    100\% & 26537 & 90.5 \\
    10\% & 3034 & 66.4 \\
    1\% & 923 & 41.7 \\
    0.1\% & 433 & 16.8 \\
\end{tabular}


In [23]:
# Defended netflows: cost versus F1 score for each sampling rate

# Hard-coded pairs per sampling rate; only the first value of each pair is
# used below (NOTE(review): the second value looks like a variance - confirm).
cost = {
    '100%': [41480379.910333335, 19176125091471.715],
    '10%': [4449720.7485, 71816947398.78859],
    '1%': [443051.086, 1554101157.4342709],
    '0.1%': [44110.254, 27883060.782484],
}

# F1 score of the "-nototsize" run for each sampling rate.
accs = {
    '100%': getf1("./datasets/quic-100p-150-netflow-nototsize.json"),
    '10%': getf1("./datasets/quic-100p-150-netflow10-nototsize.json"),
    '1%': getf1("./datasets/quic-100p-150-netflow100-nototsize.json"),
    '0.1%': getf1("./datasets/quic-100p-150-netflow1000-nototsize.json"),
}

# One row per sampling rate: label, cost divided by 1024 (1 decimal), F1.
table_data = []
for key in cost:
    table_data.append([key, round(cost[key][0]/1024, 1), accs[key]])

print("table_data = ", table_data)

# '%' must be escaped for LaTeX.
latex_rows = [[label.replace('%', '\\%'), kb_cost, f1] for label, kb_cost, f1 in table_data]
print(latex_table(latex_rows, header=["Sampling", "Cost [kB/sample]", "F1 Score"]))

table_data =  [['100%', 40508.2, 53.1], ['10%', 4345.4, 33.1], ['1%', 432.7, 21.6], ['0.1%', 43.1, 8.6]]
\begin{tabular}{lrrrr}
    \textbf{Sampling} & \textbf{Cost [kB/sample]} & \textbf{F1 Score} \\
    100\% & 40508.2 & 53.1 \\
    10\% & 4345.4 & 33.1 \\
    1\% & 432.7 & 21.6 \\
    0.1\% & 43.1 & 8.6 \\
\end{tabular}
