In [1]:
# Imports and helper definitions only; running this cell performs no computation.

from IPython.display import Image
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import numpy as np
import os
import random
from json import JSONEncoder
import json

class NumpyArrayEncoder(JSONEncoder):
    """JSON encoder that turns NumPy scalars and arrays into plain Python values."""

    def default(self, obj):
        """Convert `obj` to a JSON-serializable value.

        NumPy arrays become (nested) lists, NumPy floating scalars become
        `float` and NumPy integer scalars become `int`. Anything else is
        delegated to `JSONEncoder.default`, which raises `TypeError`.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        return super().default(obj)

def serialize(uri, o, version=''):
    """Write `o` as JSON to the cache file ``.cache/<uri>``.

    Args:
        uri: File name (relative to the ``.cache`` directory) to write.
        o: Object to store; NumPy scalars/arrays are handled by
            `NumpyArrayEncoder`.
        version: Optional version tag. When non-empty, a first line
            ``#version: <version>`` is written before the JSON payload.

    Raises:
        TypeError: If `o` contains a value the encoder cannot serialize.
        OSError: If the cache file cannot be written.
    """
    path = ".cache/" + uri
    # Create the cache directory (and any sub-directory named in `uri`) on
    # first use instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Write to a temporary sibling file and swap it in afterwards, so that a
    # failure while dumping never leaves a truncated cache entry behind.
    tmp_path = path + ".tmp"
    try:
        with open(tmp_path, "w") as f:
            if version != '':
                f.write('#version: ' + version + '\n')
            json.dump(o, f, cls=NumpyArrayEncoder)
        # os.replace overwrites an existing entry, so no prior removal is needed.
        os.replace(tmp_path, path)
    finally:
        # Only still present when something above failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def deserialize(uri, version=''):
    """Load the JSON payload stored in the cache file ``.cache/<uri>``.

    Lines starting with ``#version:`` are skipped before parsing.

    Args:
        uri: File name relative to the ``.cache`` directory.
        version: Accepted for symmetry with `serialize`; it is currently not
            compared against the stored version line.

    Returns:
        The decoded JSON value, or None when the cache file does not exist.
    """
    cache_path = ".cache/" + uri
    if not os.path.isfile(cache_path):
        return None
    with open(cache_path, "r") as handle:
        payload = [row for row in handle if not row.startswith('#version:')]
    return json.loads(''.join(payload))

def load_or_compute(uri, compute_function, rebuild=False):
    """Return the cached value for `uri`, computing and caching it if needed.

    Args:
        uri: Cache file name passed to `deserialize` / `serialize`.
        compute_function: Zero-argument callable producing the value.
        rebuild: When true, skip the cache lookup and always recompute.

    Returns:
        The cached value when one was found (and `rebuild` is false),
        otherwise the freshly computed value, which is also written to the cache.
    """
    cached = None if rebuild else deserialize(uri)
    if cached is not None:
        return cached

    fresh = compute_function()
    serialize(uri, fresh)
    return fresh

def img(path, width=400):
    """Build an HTML ``<img>`` tag for `path` with a cache-busting query string.

    Args:
        path: Image URL or file path placed in the ``src`` attribute.
        width: Display width in pixels.

    Returns:
        The HTML snippet as a string. A random ``nocache`` query parameter is
        appended to `path` so the generated URL differs between calls.
    """
    # The bound must be an int: random.randint() rejects float arguments such
    # as 2e9 on Python 3.12+ (and emitted a DeprecationWarning before that).
    rnd = random.randint(0, 2_000_000_000)
    return f"""<img src="{path}?nocache={rnd}" style="width:{width}px; "></img>"""

def plot_fi(features_of_interest, title):
    """Plot the ten highest-valued features as a horizontal bar chart.

    Args:
        features_of_interest: Sequence of rows where ``row[0]`` is the bar
            label, ``row[1]`` the bar length and ``row[2]`` the error size
            (presumably feature name, mean importance and its spread — confirm
            against the code that produces the rows).
        title: Title of the figure.

    The input sequence is left untouched; earlier versions sorted it in place,
    which silently reordered the caller's result list.
    """
    # sorted() builds a new list, so the caller's data keeps its order.
    top = sorted(features_of_interest, key=lambda row: row[1], reverse=True)[:10]
    # Re-sort ascending for plotting, as the original implementation did.
    top.sort(key=lambda row: row[1])

    labels = [row[0] for row in top]
    values = [row[1] for row in top]
    # Asymmetric error bars: the lower error is capped at the bar length so it
    # never extends past the start of the bar; the upper error is used as is.
    errors = [[min(row[1], row[2]) for row in top], [row[2] for row in top]]

    plt.barh(labels, values, xerr=errors)
    plt.title(title)
    plt.show()

In [9]:
# Load the .npy dataset built from HAR files (40 loops).
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
dataset_path = 'datasets/quic-100p-150-40runs.npy'
data_all = np.load(dataset_path, allow_pickle=True).item()
# Keep only the part of the dataset recorded without ad-blocking
data = data_all['nofilter']
urls = list(data)

def add(a, b):
    """Return ``|a| + |b|``, treating a value of None as zero."""
    return sum(abs(value) for value in (a, b) if value is not None)

def toOldHARFormat(data):
    """Convert 8-field request records into ``[t, outgoing, incoming]`` triples.

    `data` maps url -> sample -> list of requests, where each request unpacks
    into ``(domain, fullurl, t, out_h, out_b, t_resp, inc_h, inc_b)``. The
    result has the same url/sample nesting, with each request replaced by
    ``[t, add(out_h, out_b), add(inc_h, inc_b)]``.
    """
    converted = {}
    for url in data:
        converted[url] = {
            sample: [
                [t, add(out_h, out_b), add(inc_h, inc_b)]
                for _domain, _fullurl, t, out_h, out_b, _t_resp, inc_h, inc_b
                in data[url][sample]
            ]
            for sample in data[url]
        }
    return converted

In [10]:
from lib.features import *
from lib.rf import *

In [11]:
# split in 1st/3rd/google

def is_google_domain(d):
    """Tell whether domain `d` looks like a Google-operated domain.

    The check is case-insensitive and ignores surrounding whitespace. A domain
    matches when it ends with ``ggpht.com`` or contains one of ``google``,
    ``youtube``, ``doubleclick`` or ``gstatic.com`` anywhere in its name.
    """
    host = d.lower().strip()
    markers = ("google", "youtube", "doubleclick", "gstatic.com")
    return host.endswith('ggpht.com') or any(marker in host for marker in markers)

# Per-party views of `data`, each keeping the same url -> sample -> requests nesting.
data_1st, data_3rd, data_google = {}, {}, {}
for url in data:
    data_1st[url], data_3rd[url], data_google[url] = {}, {}, {}
    for sample in data[url]:
        data_1st[url][sample] = []
        data_3rd[url][sample] = []
        data_google[url][sample] = []
        for request in data[url][sample]:
            domain, fullurl, t, out_h, out_b, t_resp, inc_h, inc_b = request

            # A request is first-party when the site url occurs in its domain
            # (case-insensitive substring match); everything else is third-party.
            party = data_1st if url.lower() in domain.lower() else data_3rd
            party[url][sample].append(request)

            # Google requests are collected in addition to the 1st/3rd split.
            if is_google_domain(domain):
                data_google[url][sample].append(request)

print("Data split")

def attack_and_plot(data, title, n_classes=141):
    """Run the classifier on `data`, print the result and plot its features.

    Args:
        data: Dataset passed to `get_features` (a helper brought in by the
            star imports from ``lib`` — its expected format is not visible here).
        title: Title of the feature plot.
        n_classes: Number of classes forwarded to `rf_with_rfe`. Defaults to
            141, the value that used to be hard-coded; pass the actual number
            of sites (e.g. ``len(common)``, as `attack` does) when the dataset
            has been filtered.

    Returns:
        The result returned by `rf_with_rfe`; its ``'features'`` entry is what
        gets plotted.
    """
    features = get_features(data)
    clf_res = rf_with_rfe(features, n_classes=n_classes)
    print(clf_res)
    plot_fi(clf_res['features'], title=title)
    return clf_res


Data split


In [12]:
def _sites_with_enough_samples(dataset, min_samples=20, max_empty=5):
    """Return the sites of `dataset` that have enough usable samples.

    A site is kept when it has at least `min_samples` samples and at most
    `max_empty` of them contain no request at all.
    """
    kept = []
    for site, samples in dataset.items():
        n_empty = sum(1 for s in samples if len(samples[s]) == 0)
        if len(samples) >= min_samples and n_empty <= max_empty:
            kept.append(site)
    return kept


# Sites passing the filter in the full, 1st-party, 3rd-party and Google datasets
a = _sites_with_enough_samples(data)
f = _sites_with_enough_samples(data_1st)
t = _sites_with_enough_samples(data_3rd)
g = _sites_with_enough_samples(data_google)

#Get sites common to all datasets
# sorted() instead of list(): set iteration order of strings changes between
# interpreter runs, which would make the site order non-reproducible.
common = sorted(set(a) & set(f) & set(t) & set(g))
print(len(common))

100


In [13]:
#Create dataset for common sites
# Restrict every dataset to the sites listed in `common`
data_new, data_1st_new, data_3rd_new, data_google_new = (
    {key: full[key] for key in common}
    for full in (data, data_1st, data_3rd, data_google)
)

# From here on the short names refer to the filtered datasets.
data, data_1st, data_3rd, data_google = (
    data_new, data_1st_new, data_3rd_new, data_google_new
)

In [16]:
def attack(data):
    """Extract features from `data` and classify them with one class per common site.

    Returns whatever `rf_with_rfe` returns for the extracted features, with
    ``n_classes`` set to ``len(common)``.
    """
    return rf_with_rfe(get_features(data), n_classes=len(common))

# Run the attack on each party-specific dataset, keyed by a short label.
party_datasets = {
    'all': data,
    '1st': data_1st,
    '3rd': data_3rd,
    'google': data_google,
}
clf_results = {
    label: attack(toOldHARFormat(dataset))
    for label, dataset in party_datasets.items()
}
# Persist the results so they can be reloaded without re-running the attacks.
np.save('datasets/attack_by_parties.npy', clf_results)

[feature_extract] Number of classes 100
[feature_extract] Number of features 123
[feature_extract] Number of samples 3363
[feature_extract] Number of labels 3363
Number of classes 100
Number of features 123
Number of samples 3363
Number of labels 3363
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
[feature_extract] Number of classes 100
[feature_extract] Number of features 123
[feature_extract] Number of samples 3363
[feature_extract] Number of labels 3363
Number of classes 100
Number of features 123
Number of samples 3363
Number of labels 3363
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
[feature_extract] Number of classes 100
[feature_extract] Number of features 123
[feature_extract] Number of samples 3363
[feature_extract] Number of labels 3363
Number of classes 100
Number of features 123
Number of samples 3363
Number of labels 3363
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
[feature_extract] Number of classe

In [17]:
# Reload the saved attack results.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
saved_results = np.load('datasets/attack_by_parties.npy', allow_pickle=True)
clf_results = saved_results.item()  # unwrap the 0-d object array into the dict
print(clf_results)

{'all': {'score': {'accuracy': (0.9698365527488855, 0.006712842197965216), 'precision': (0.9741682539682539, 0.00561689262845105), 'recall': (0.969657142857143, 0.006905256139947033), 'f1score': (0.9694916166994341, 0.006923055278620777)}, 'features': [('bytes_outgoing', 0.11988490805192839, 0.0059498983682333955), ('bytes_total', 0.11909149117158446, 0.003885135245277039), ('bytes_incoming', 0.12118054927369488, 0.007864698273361182), ('bytes_%_in', 0.10443707419881092, 0.005421408786461499), ('bytes_%_out', 0.10309698021250575, 0.004484259652080371), ('hist_2006', 0.0927124816052653, 0.004922753619486559), ('n_outgoing', 0.08887569549709519, 0.00896767029909386), ('n_total', 0.06777687183616564, 0.03477798530783672), ('hist_1672', 0.04766280541300781, 0.039068518258798764), ('n_incoming', 0.07529382584978664, 0.026235909709618428), ('hist_2341', 0.05998731689015503, 0.039292307788619925)]}, '1st': {'score': {'accuracy': (0.9827637444279347, 0.005069456793233891), 'precision': (0.9849