In [1]:
# Imports and helper definitions only; running this cell produces no output.

from IPython.display import Image
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import numpy as np
import os
import random
from json import JSONEncoder
import json

class NumpyArrayEncoder(JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to plain Python values.

    Pass it as ``cls=NumpyArrayEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: A value the stock encoder could not serialize.

        Returns:
            ``bool`` for ``np.bool_``, ``int`` for NumPy integers, ``float`` for
            NumPy floats, and a (nested) list for ``np.ndarray``.

        Raises:
            TypeError: Raised by ``JSONEncoder.default`` for any other type.
        """
        # np.bool_ is not a subclass of Python's bool, so the stock encoder
        # rejects it unless it is converted here.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

def serialize(uri, o, version=''):
    """Write ``o`` as JSON to ``.cache/<uri>``, replacing any previous file.

    Args:
        uri: File name (relative to the ``.cache`` directory) to write.
        o: Object to store; encoded with ``NumpyArrayEncoder``.
        version: Optional tag. When non-empty, a ``#version: <version>`` header
            line is written before the JSON payload.

    Raises:
        TypeError: If ``o`` contains a value the encoder cannot serialize. The
            object is encoded before the disk is touched, so an existing cache
            file is left intact in that case.
        OSError: If the cache file cannot be created or written.
    """
    path = ".cache/"+uri

    # Encode first: a serialization failure must not destroy the old cache
    # entry or leave a half-written file behind.
    payload = json.dumps(o, cls=NumpyArrayEncoder)

    # Create the cache directory (and any sub-directory named in uri) on first use.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    try:
        # Removing first means a fresh file is created rather than an existing
        # one being truncated in place.
        os.remove(path)
    except OSError:
        # Best effort: the file may simply not exist yet; open(..., "w") below
        # will surface any real problem.
        pass

    with open(path, "w") as f:
        if version != '':
            f.write('#version: '+version+'\n')
        f.write(payload)

def deserialize(uri, version=''):
    """Load the JSON payload stored at ``.cache/<uri>``.

    Lines starting with ``#version:`` are treated as a header and are not part
    of the JSON payload.

    Args:
        uri: File name (relative to the ``.cache`` directory) to read.
        version: Optional tag the cache entry must carry. When empty (the
            default) any entry is accepted, whether or not it has a header.

    Returns:
        The decoded object, or ``None`` when the file does not exist or when a
        non-empty ``version`` was requested and the stored header is missing or
        different.

    Raises:
        json.JSONDecodeError: If the payload is not valid JSON.
    """
    path = ".cache/"+uri
    if not os.path.isfile(path):
        return None

    stored_version = None
    payload = []
    with open(path, "r") as f:
        for line in f:
            if line.startswith('#version:'):
                stored_version = line[len('#version:'):].strip()
            else:
                payload.append(line)

    # A requested version that does not match the header is a cache miss.
    if version != '' and stored_version != version.strip():
        return None
    return json.loads(''.join(payload))

def load_or_compute(uri, compute_function, rebuild=False):
    """Return the cached value for ``uri``, computing and caching it on a miss.

    Args:
        uri: Cache file name, passed to ``deserialize`` / ``serialize``.
        compute_function: Zero-argument callable producing the value.
        rebuild: When true, skip the cache lookup and always recompute.

    Returns:
        The value read from the cache, or the freshly computed value (which is
        written to the cache before being returned).
    """
    cached = None if rebuild else deserialize(uri)
    if cached is not None:
        return cached

    # Cache miss (or forced rebuild): compute, persist, return.
    fresh = compute_function()
    serialize(uri, fresh)
    return fresh

def img(path, width=400):
    """Build an HTML ``<img>`` tag for ``path`` with a random ``nocache`` query.

    Args:
        path: Image location, inserted verbatim into the ``src`` attribute.
        width: Display width in pixels, used in the inline ``style``.

    Returns:
        The HTML snippet as a string. The ``nocache`` value is a random integer
        in ``[0, 2_000_000_000]``, so consecutive calls yield different URLs.
    """
    # random.randint needs integer bounds: the float literal 2e9 raises
    # TypeError on Python 3.12+.
    rnd = random.randint(0, 2_000_000_000)
    return f"""<img src="{path}?nocache={rnd}" style="width:{width}px; "></img>"""

def plot_fi(features_of_interest, title, top_n=10):
    """Draw a horizontal bar chart of the highest-scoring features.

    Args:
        features_of_interest: Iterable of rows indexed as ``row[0]`` (bar
            label), ``row[1]`` (bar length) and ``row[2]`` (error-bar size).
            The input is not modified.
            NOTE(review): row[2] is presumably a standard deviation; confirm
            against the producer of these rows.
        title: Plot title.
        top_n: Number of rows with the largest ``row[1]`` to keep (default 10).
            ``None`` keeps every row.
    """
    def score(row):
        return row[1]

    # sorted() works on copies, so the caller's list keeps its original order.
    top = sorted(features_of_interest, key=score, reverse=True)[:top_n]
    # barh draws the first entry at the bottom, so plot in ascending order to
    # put the largest bar on top.
    top = sorted(top, key=score)

    labels = [row[0] for row in top]
    scores = [row[1] for row in top]
    # Lower error is capped at the bar length itself; the upper error is row[2].
    xerr = [[min(row[1], row[2]) for row in top], [row[2] for row in top]]

    plt.barh(labels, scores, xerr=xerr)
    plt.title(title)
    plt.show()

In [2]:
# Load the captured dataset. The .npy file holds a pickled Python object, so
# allow_pickle=True is required and .item() unwraps it from the NumPy array.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
data_all = np.load(
    'datasets/quic-100p-150-40runs-again.npy',
    allow_pickle=True,
).item()

def add(a, b):
    """Return ``abs(a) + abs(b)``, treating a ``None`` operand as absent.

    With both operands ``None`` the result is ``0``.
    """
    return sum(abs(value) for value in (a, b) if value is not None)

def toOldHARFormat(data):
    """Reduce each 8-field request record to a 3-element list.

    Args:
        data: Nested mapping ``data[url][sample]`` -> iterable of requests, each
            unpacking to ``(domain, fullurl, t, out_h, out_b, t_resp, inc_h,
            inc_b)``.
            NOTE(review): the *_h / *_b fields look like header and body sizes
            for the outgoing and incoming direction; confirm with the producer.

    Returns:
        A new nested dict with the same ``url`` / ``sample`` keys, where each
        request becomes ``[t, add(out_h, out_b), add(inc_h, inc_b)]``.
    """
    converted = {}
    for url in data:
        samples = data[url]
        converted[url] = {
            sample: [
                [t, add(out_h, out_b), add(inc_h, inc_b)]
                for _domain, _fullurl, t, out_h, out_b, _t_resp, inc_h, inc_b
                in samples[sample]
            ]
            for sample in samples
        }
    return converted

In [3]:
from lib.features import *
from lib.rf import *

In [4]:
def attack_and_plot(data, title):
    """Run the classifier on ``data``, print the result and plot its features.

    Args:
        data: Input handed to ``get_features``.
        title: Title for the plot produced by ``plot_fi``.

    Returns:
        The object returned by ``rf_with_rfe``; its ``'features'`` entry is what
        gets plotted.
    """
    # n_classes is fixed at 141 here, matching the top-level training loop.
    result = rf_with_rfe(get_features(data), n_classes=141)
    print(result)
    plot_fi(result['features'], title=title)
    return result

# Train and evaluate one classifier per entry of data_all, collecting the
# results in clf_results keyed by the same keys as data_all.
clf_results = {}
for data_type in data_all:

    # Convert this entry's requests to the 3-field [t, out, inc] layout.
    data = toOldHARFormat(data_all[data_type])

    # get_features / rf_with_rfe come from the star imports above (presumably
    # lib.features and lib.rf respectively).
    # NOTE(review): n_classes=141 is hard-coded; confirm it matches the dataset.
    features = get_features(data)
    clf_res = rf_with_rfe(features, n_classes=141)
    print(data_type, clf_res)

    clf_results[data_type] = clf_res

# Persist all results. np.save stores a dict as a pickled object array, so
# reading it back needs np.load(..., allow_pickle=True).item().
np.save('datasets/variants.npy', clf_results)


Skipping guestnode.com , 15 empty samples
Skipping school54.ks.ua , 25 empty samples
Skipping ragazzeconlavaligia.com , 14 empty samples
Skipping funakoshikarateaustralia.com.au , 24 empty samples
[feature_extract] Number of classes 131
[feature_extract] Number of features 123
[feature_extract] Number of samples 5211
[feature_extract] Number of labels 5211
Number of classes 131
Number of features 123
Number of samples 5211
Number of labels 5211
Fold 0
141 but only found 131
Scores were 0.9712368168744008 but were corrected to 0.9620132953466287.
Fold 1
141 but only found 131
Scores were 0.9789069990412272 but were corrected to 0.9696106362773029.
Fold 2
141 but only found 131
Scores were 0.9702780441035475 but were corrected to 0.9610636277302944.
Fold 3
141 but only found 131
Scores were 0.9664429530201343 but were corrected to 0.9572649572649573.
Fold 4
141 but only found 131
Scores were 0.9674017257909875 but were corrected to 0.9582146248812915.
Fold 5
141 but only found 131
Scores

In [9]:
# One line per variant: its name followed by the stored 'f1score' entry.
for variant, variant_result in clf_results.items():
    print(variant, variant_result['score']['f1score'])

adblock {'accuracy': (0.963247863247863, 0.006525121894131777), 'precision': (0.963247863247863, 0.006525121894131777), 'recall': (0.963247863247863, 0.006525121894131777), 'f1score': (0.963247863247863, 0.006525121894131794)}
both {'accuracy': (0.9644486692015211, 0.005157665386407031), 'precision': (0.9644486692015211, 0.005157665386407031), 'recall': (0.9644486692015211, 0.005157665386407031), 'f1score': (0.9644486692015211, 0.005157665386407031)}
nofilter {'accuracy': (0.9571144278606966, 0.006409227030920575), 'precision': (0.9571144278606966, 0.006409227030920575), 'recall': (0.9571144278606966, 0.006409227030920575), 'f1score': (0.9571144278606966, 0.006409227030920575)}
decentraleyes {'accuracy': (0.9596421471172961, 0.0025997409206007867), 'precision': (0.9596421471172961, 0.0025997409206007867), 'recall': (0.9596421471172961, 0.0025997409206007867), 'f1score': (0.9596421471172961, 0.0025997409206007867)}
