In [7]:
# Imports and helper function definitions; running this cell produces no output

from functools import partial
from IPython.display import Image, display, Video, Image, HTML
from matplotlib import rcParams
from prettytable import PrettyTable
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.multiclass import unique_labels
import constants as cst
import glob
import json
import sys
sys.path.append('../..')
import lib.plot_builder as plot_builder
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas
import pathlib
import random
import sklearn
import random
import matplotlib
matplotlib.rcParams.update({'font.size': 14})

def print_table(data, fields=None):
    """Print ``data`` as a left-aligned PrettyTable.

    Args:
        data: iterable of rows; each row is handed to ``PrettyTable.add_row``.
        fields: optional sequence of column names. When it is ``None`` no
            field names are set explicitly.
    """
    t = PrettyTable()
    for row in data:
        t.add_row(row)
    t.align = 'l'
    # Identity test rather than `!= None`: an equality comparison would call
    # a custom or element-wise __ne__ (e.g. on an array of names) instead of
    # simply checking whether the argument was supplied.
    if fields is not None:
        t.field_names = fields
    print(t)
    
def read_dataset(f):
    """Read the score summary stored in the JSON results file ``f``.

    Returns:
        A tuple ``(acc, precision, recall, f1score, cm, fi)``. The four
        scores are the first element of the matching ``data['score'][...]``
        list, multiplied by 100 and rounded to one decimal. ``cm`` and ``fi``
        are ``f`` with ``'datasets/'`` replaced by ``'plots/'`` and the
        ``'.json'`` suffix replaced by ``'-cm.png'`` / ``'-fi.png'``.
    """
    with open(f) as handle:
        scores = json.load(handle)['score']

    def as_percent(key):
        # First entry of the score list, expressed as a percentage.
        return round(100*scores[key][0], 1)

    acc = as_percent('accuracy')
    precision = as_percent('precision')
    recall = as_percent('recall')
    f1score = as_percent('f1score')

    # Plot images live next to the datasets, under 'plots/'.
    plot_base = f.replace('datasets/', 'plots/')
    cm = plot_base.replace('.json', '-cm.png')
    fi = plot_base.replace('.json', '-fi.png')

    return acc, precision, recall, f1score, cm, fi
def img(path, width=400):
    """Return an HTML ``<img>`` tag for ``path`` with a cache-busting query.

    Args:
        path: image location placed in the ``src`` attribute.
        width: display width in pixels (default 400).

    Returns:
        The tag as a string; a random ``nocache`` query parameter is appended
        to ``path`` so that every call yields a different URL.
    """
    # randint needs integer bounds: a float such as 2e9 is deprecated since
    # Python 3.10 and raises TypeError from Python 3.12 on.
    rnd = random.randint(0, 2_000_000_000)
    return f"""<img src="{path}?nocache={rnd}" style="width:{width}px; "></img>"""

def latex_table(table, header, column_spec=None):
    """Render ``table`` as a LaTeX ``tabular`` environment.

    Args:
        table: iterable of rows; every cell is converted with ``str``.
        header: list of column titles (strings); each is wrapped in
            ``\\textbf{...}`` and emitted as the first row.
        column_spec: optional explicit column specification such as
            ``"lrrrr"``. When omitted it is derived from the widest row:
            one left-aligned column followed by right-aligned ones.

    Returns:
        The LaTeX source as a single string (no trailing newline).
    """
    header_bold = ["\\textbf{" + t + "}" for t in header]
    rows = [header_bold]
    rows.extend(table)

    if column_spec is None:
        # Match the spec to the real number of columns instead of assuming
        # five; always keep at least the leading 'l' column.
        n_columns = max(len(row) for row in rows)
        column_spec = "l" + "r" * max(n_columns - 1, 0)

    lines = ["\\begin{tabular}{" + column_spec + "}\n"]
    lines.extend("    " + " & ".join(map(str, row)) + " \\\\\n" for row in rows)
    lines.append("\\end{tabular}")
    return "".join(lines)

def dataset_name_to_friendly_name(name):
    """Split a dataset identifier into root name, display name and variant.

    Returns:
        ``(root_dataset, nicename, variant)`` where ``root_dataset`` is
        ``name`` with the known suffixes removed, ``nicename`` is ``name``
        with those suffixes replaced by readable labels, and ``variant`` is
        ``nicename`` with every occurrence of ``root_dataset`` removed.
    """
    # Order matters in both tables: the longer '-netflowN' tokens must be
    # consumed before the shorter tokens that are their prefixes.
    suffixes_to_strip = (
        '-netflow1000',
        '-netflow100',
        '-netflow10',
        '-netflows',
        '-netflow',
        '-defended',
        '-google',
    )
    root_dataset = name
    for suffix in suffixes_to_strip:
        root_dataset = root_dataset.replace(suffix, '')

    friendly_labels = (
        ('-netflow1000', "NF 0.1%"),
        ('-netflow100', "NF   1%"),
        ('-netflow10', "NF  10%"),
        ('-netflow', "NF 100%"),
        ('/', ' '),
        ('-defended', ' (defended)'),
        ('-google', '(Google dest filter)'),
    )
    nicename = name
    for token, label in friendly_labels:
        nicename = nicename.replace(token, label)

    variant = nicename.replace(root_dataset, '')
    return root_dataset, nicename, variant

def find_pos(name, features):
    """Return the index of the first entry of ``features`` matching ``name``.

    Each entry is compared after stripping surrounding whitespace.

    Args:
        name: the feature name to look for.
        features: sequence of strings to search.

    Raises:
        SystemExit: with status 1 when no entry matches; a message naming
            the missing feature is printed first.
    """
    for i, feature in enumerate(features):
        if feature.strip() == name:
            return i
    print("Couldn't find", name)
    # A missing feature is an error, so do not report success (status 0).
    sys.exit(1)

In [2]:
# Collect the paths of the JSON result files of interest, in sorted order
files = sorted(glob.glob("datasets/*.json"))

In [8]:
# For every "-google" results file, show the dataset name and accuracy
# together with its two plot images (the '-cm.png' and '-fi.png' files).
for f in files:
    if "-google" not in f:
        continue

    # Strip directory and extension to get the bare dataset identifier.
    # The prefix must be 'datasets/': 'dataset/' never matches these paths,
    # which left the directory name inside the displayed heading.
    dataset = f.replace('datasets/', '').replace('.json', '')
    acc, _, _, _, cm, fi = read_dataset(f)
    dataset_name, _, variant = dataset_name_to_friendly_name(dataset)

    # One-row, one-cell table with balanced <tr>/<td> tags.
    display(HTML(f"""
    <div class="row" style="text-align:left">
            <table>
            <tr>
            <td style="text-align: left">
            <h2>{dataset_name}</h2>
            <h3>Accuracy {acc}%</h3>
            """ + img(cm) + img(fi) + f"""
            </td>
            </tr>
            </table>
    </div>
    """))

In [5]:
# Table 2
# Print the table summary with Dataset + Accuracy (just the google-destination filter)
tuples = []

for f in files:
    if "-google" not in f:
        continue

    print(f)

    # Bare dataset identifier: directory and extension removed.
    dataset = f.replace('datasets/', '').replace('.json', '')

    acc, _, _, _, cm, fi = read_dataset(f)
    dataset_name, _, variant = dataset_name_to_friendly_name(dataset)

    tuples.append([dataset_name, variant, acc])

print_table(tuples, fields=["Dataset", "Variant", "Acc"])
# The third column holds the accuracy, so the LaTeX header is labelled
# accordingly (it previously claimed "F1 Score").
print(latex_table([[f"\\texttt{{{t[0]}}}", t[1], t[2]] for t in tuples], header=["Dataset", "Variant", "Accuracy"]))

datasets/quic-100p-150-google.json
+---------------+----------------------+------+
| Dataset       | Variant              | Acc  |
+---------------+----------------------+------+
| quic-100p-150 | (Google dest filter) | 78.4 |
+---------------+----------------------+------+
\begin{tabular}{lrrrr}
    \textbf{Dataset} & \textbf{Variant} & \textbf{F1 Score} \\
    \texttt{quic-100p-150} & (Google dest filter) & 78.4 \\
\end{tabular}


In [9]:
# Google Filter PCAP sizes

#!/usr/bin/python3

import sys
import os
import glob
import numpy as np
from pathlib import Path

def getsizesnetflows(name, dataset_path):
    """Size summary for the ``nfcapd*`` files found below ``dataset_path``.

    Delegates to ``getsizespcap`` with a glob pattern selecting ``nfcapd*``
    files instead of ``*.pcap`` files; the return value is passed through.
    """
    nfcapd_pattern = "**/nfcapd*"
    return getsizespcap(name, dataset_path, filterstr=nfcapd_pattern)

def getsizespcap(name, dataset_path, filterstr="**/*.pcap"):
    """Summarise the mean file size per top-level directory of a dataset.

    Files below ``dataset_path`` matching the recursive glob ``filterstr``
    are grouped by the first component of their path relative to
    ``dataset_path`` (called ``url`` in the code). For every group the mean
    file size is computed in units of 1024 bytes, rounded to two decimals.

    Args:
        name: label copied into the first element of the result.
        dataset_path: directory to search, with or without a trailing
            separator.
        filterstr: glob pattern relative to ``dataset_path``.

    Returns:
        ``[name, min, mean, max]`` over the per-group means, each rounded to
        two decimals.

    Raises:
        ValueError: when no file matches the pattern.
    """
    # os.path.join keeps the pattern correct whether or not dataset_path
    # ends with a separator.
    matches = glob.glob(os.path.join(dataset_path, filterstr), recursive=True)
    if not matches:
        # Fail with an explicit message instead of an opaque numpy
        # reduction error on an empty list.
        raise ValueError(f"no files matching {filterstr!r} under {dataset_path!r}")

    sizes = dict()
    for p in matches:
        # The group key is the first path component below dataset_path.
        url = os.path.relpath(p, dataset_path).split(os.sep)[0]
        sizes.setdefault(url, []).append(os.path.getsize(p))

    means = [round(np.mean(group)/1024, 2) for group in sizes.values()]

    return [name, round(np.min(means),2), round(np.mean(means),2), round(np.max(means),2)]

# One summary row per dataset: the plain captures and the Google-filtered ones
table = [
    getsizespcap('quic-150', '../../cf-clusters-datasets/quic-100p-150/pcaps/'),
    getsizespcap('quic-150 (Google filter)', '../../cf-clusters-datasets/quic-100p-150-google/pcaps/'),
]
# Disabled netflow rows, kept for reference:
#table.append(getsizesnetflows('quic-150 (Google filter + netflows 100%)', '../../cf-clusters-datasets/quic-100p-338-google/netflows/'))
#table.append(getsizesnetflows('quic-150 (Google filter + netflows 10%)', '../../cf-clusters-datasets/quic-100p-338-google/netflows_10/'))
#table.append(getsizesnetflows('quic-150 (Google filter + netflows 1%)', '../../cf-clusters-datasets/quic-100p-338-google/netflows_100/'))
#table.append(getsizesnetflows('quic-150 (Google filter + netflows 0.1%)', '../../cf-clusters-datasets/quic-100p-338-google/netflows_1000/'))


print_table(table, fields=["Dataset", "Min", "Mean", "Max"])
# The LaTeX version only reports the dataset label and the mean column
mean_rows = [[f"\\texttt{{{row[0]}}}", row[2]] for row in table]
print(latex_table(mean_rows, header=["Dataset", "Mean [kB]"]))

+--------------------------+-------+--------+---------+
| Dataset                  | Min   | Mean   | Max     |
+--------------------------+-------+--------+---------+
| quic-150                 | 24.46 | 312.42 | 2101.94 |
| quic-150 (Google filter) | 0.02  | 112.15 | 1066.75 |
+--------------------------+-------+--------+---------+
\begin{tabular}{lrrrr}
    \textbf{Dataset} & \textbf{Mean [kB]} \\
    \texttt{quic-150} & 312.42 \\
    \texttt{quic-150 (Google filter)} & 112.15 \\
\end{tabular}


In [7]:
# NOTE (author): I think this comparison is invalid. We're comparing apples and oranges.

# List the netflow result files of interest, in sorted order
files_google_netflows = sorted(glob.glob("../randomforests-netflows/datasets/*.json"))

tuples = []

# Fixed rows: (dataset label, variant label, results file)
for row_label, row_variant, results_file in (
    ('quic-150', "", '../randomforests/datasets/quic-100p-150.json'),
    ('', "(Google view)", './datasets/quic-100p-150-google.json'),
):
    acc, precision, recall, f1score, cm, fi = read_dataset(results_file)
    tuples.append([row_label, row_variant, "", f1score])

# The netflow rows are switched off: while the flag is False every iteration
# is skipped and no row is added.
INCLUDE_NETFLOW_ROWS = False
for f in files_google_netflows:
    if not INCLUDE_NETFLOW_ROWS:
        continue
    if "google" not in f or "-no" in f:
        continue

    dataset = f.replace('../randomforests-netflows/datasets/', '').replace('.json', '').replace("-google", "")
    dataset_name, _, variant = dataset_name_to_friendly_name(dataset)
    # Keep only the sampling rate of the variant label
    variant = variant.replace('NF ', '')

    acc, precision, recall, f1score, cm, fi = read_dataset(f)

    tuples.append(["", "(Google dest filter + netflow)", variant, f1score])

f1_fields = ["Dataset", "Variant", "Sampling", "F1 Score"]
print_table(tuples, fields=list(f1_fields))
# Escape '%' in the sampling column so the LaTeX source stays valid
latex_rows = [[f"\\texttt{{{t[0]}}}", t[1], t[2].replace('%', '\\%'), t[3]] for t in tuples]
print(latex_table(latex_rows, header=f1_fields))

+----------+---------------+----------+----------+
| Dataset  | Variant       | Sampling | F1 Score |
+----------+---------------+----------+----------+
| quic-150 |               |          | 95.8     |
|          | (Google view) |          | 78.4     |
+----------+---------------+----------+----------+
\begin{tabular}{lrrrr}
    \textbf{Dataset} & \textbf{Variant} & \textbf{Sampling} & \textbf{F1 Score} \\
    \texttt{quic-150} &  &  & 95.8 \\
    \texttt{} & (Google view) &  & 78.4 \\
\end{tabular}
