In [2]:
# Imports and helper definitions only; running this cell performs no computation

from IPython.display import Image
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import numpy as np
import os
import random
from json import JSONEncoder
import json

class NumpyArrayEncoder(JSONEncoder):
    """JSON encoder that turns NumPy scalars and arrays into plain Python values."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        NumPy integers are converted with ``int``, NumPy floats with ``float``
        and arrays with ``ndarray.tolist()``. Every other object is handed to
        the base ``JSONEncoder.default``.
        """
        # Checked in order; the first matching NumPy type decides the conversion.
        converters = (
            (np.integer, int),
            (np.floating, float),
            (np.ndarray, lambda array: array.tolist()),
        )
        for numpy_type, convert in converters:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return JSONEncoder.default(self, obj)

def serialize(uri, o, version=''):
    """Write ``o`` as JSON to the cache file ``.cache/<uri>``.

    Parameters
    ----------
    uri : str
        File name, relative to the ``.cache/`` directory.
    o : object
        Value to store. It is encoded with ``NumpyArrayEncoder``.
    version : str, optional
        When non-empty, a ``#version: <version>`` header line is written
        before the JSON payload.

    Raises
    ------
    TypeError
        If ``o`` contains a value the encoder cannot serialize. An existing
        cache file is left untouched in that case.
    """
    path = ".cache/" + uri
    # Encode before touching the file system, so a failed encode neither
    # truncates nor corrupts an existing cache entry.
    payload = json.dumps(o, cls=NumpyArrayEncoder)
    # Create the cache directory (and any sub-directory named in `uri`) on first use.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Mode "w" truncates an existing file, so no explicit removal is needed.
    with open(path, "w") as f:
        if version != '':
            f.write('#version: '+version+'\n')
        f.write(payload)

def deserialize(uri, version=''):
    """Load the JSON value stored in the cache file ``.cache/<uri>``.

    Parameters
    ----------
    uri : str
        File name, relative to the ``.cache/`` directory.
    version : str, optional
        Expected cache version. When non-empty, the file's ``#version:``
        header must match it; otherwise the entry is treated as stale. When
        empty (the default), any header is ignored and the payload is
        always returned.

    Returns
    -------
    object or None
        The decoded JSON value, or ``None`` if the file does not exist or its
        version does not match the requested one.
    """
    path = ".cache/" + uri
    if not os.path.isfile(path):
        return None

    header_prefix = '#version:'
    stored_version = ''
    payload = []
    with open(path, "r") as f:
        for line in f:
            if line.startswith(header_prefix):
                # Header lines are metadata, never part of the JSON payload.
                stored_version = line[len(header_prefix):].strip()
            else:
                payload.append(line)

    # A mismatching (or missing) header means the cache was produced by a
    # different version, so report a cache miss.
    if version != '' and stored_version != version.strip():
        return None
    return json.loads(''.join(payload))

def load_or_compute(uri, compute_function, rebuild=False):
    """Return the cached value for ``uri``, computing and caching it on a miss.

    The cache is consulted through ``deserialize`` unless ``rebuild`` is true.
    When nothing usable is cached, ``compute_function()`` is called, its
    result is stored with ``serialize`` and then returned.
    """
    cached = None if rebuild else deserialize(uri)
    if cached is not None:
        return cached

    fresh = compute_function()
    serialize(uri, fresh)
    return fresh

def img(path, width=400):
    """Return an HTML ``<img>`` tag for ``path`` with a random cache-busting query.

    Parameters
    ----------
    path : str
        Image source placed in the ``src`` attribute.
    width : int, optional
        Displayed width in pixels.

    Returns
    -------
    str
        The ``<img>`` markup; ``?nocache=<random int>`` is appended to the
        source.
    """
    # The bound must be an int: random.randint rejects float arguments such
    # as 2e9 on Python 3.12+ (deprecated since 3.10).
    rnd = random.randint(0, 2_000_000_000)
    return f"""<img src="{path}?nocache={rnd}" style="width:{width}px; "></img>"""

def plot_fi(features_of_interest, title, top_n=10):
    """Draw a horizontal bar chart of the highest-valued features.

    Parameters
    ----------
    features_of_interest : iterable
        Rows indexable as ``row[0]`` (bar label), ``row[1]`` (bar length) and
        ``row[2]`` (error-bar size). Presumably (name, importance, spread)
        triples as produced by ``rf_with_rfe`` — verify against lib.rf.
        The input is not modified.
    title : str
        Title of the figure.
    top_n : int, optional
        Number of rows with the largest ``row[1]`` to display.
    """
    # sorted() works on a copy, so the caller's list keeps its original order.
    top = sorted(features_of_interest, key=lambda row: row[1], reverse=True)[:top_n]
    # barh draws the first entry at the bottom, so ascending order puts the
    # largest bar at the top of the chart.
    top.sort(key=lambda row: row[1])

    labels = [row[0] for row in top]
    values = [row[1] for row in top]
    # Asymmetric horizontal error bars: the lower whisker is capped at the bar
    # length so it never extends below zero.
    xerr = [[min(row[1], row[2]) for row in top], [row[2] for row in top]]

    plt.barh(labels, values, xerr=xerr)
    plt.title(title)
    plt.show()

In [3]:
# Load the pickled capture dictionary. allow_pickle=True unpickles arbitrary
# objects, so this must only be used on trusted dataset files.
data_all = np.load('datasets/quic-100p-150-40runs.npy', allow_pickle=True).item()
data = data_all['nofilter'] # adblock + decentraleyes
# Keys of `data`, in iteration order.
urls = list(data)

def add(a, b):
    """Return ``abs(a) + abs(b)``, counting a ``None`` operand as zero."""
    return sum(abs(value) for value in (a, b) if value is not None)

def toOldHARFormat(data):
    """Convert 8-field request records into ``[t, outgoing, incoming]`` triples.

    ``data`` maps url -> sample -> list of requests, where each request
    unpacks into ``(domain, fullurl, t, out_h, out_b, t_resp, inc_h, inc_b)``.
    The result keeps the same url/sample nesting; each request becomes
    ``[t, add(out_h, out_b), add(inc_h, inc_b)]``.
    """
    return {
        url: {
            sample: [
                [t, add(out_h, out_b), add(inc_h, inc_b)]
                for (_domain, _fullurl, t, out_h, out_b, _t_resp, inc_h, inc_b) in requests
            ]
            for sample, requests in ((key, data[url][key]) for key in data[url])
        }
        for url in data
    }

In [4]:
from lib.features import *
from lib.rf import *

In [5]:
# Split the requests into 1st-party, 3rd-party and Google subsets

def is_google_domain(d):
    """Tell whether domain ``d`` matches one of the Google markers.

    After lower-casing and stripping whitespace, the domain matches when it
    ends with ``ggpht.com`` or contains ``google``, ``youtube``,
    ``doubleclick`` or ``gstatic.com`` anywhere in the string.
    """
    name = d.lower().strip()
    substrings = ("google", "youtube", "doubleclick", "gstatic.com")
    return name.endswith('ggpht.com') or any(marker in name for marker in substrings)

# Split every captured request into a first-party or third-party bucket, and
# additionally collect requests to Google domains (that bucket overlaps with
# the other two).
data_1st, data_3rd, data_google = {}, {}, {}
for url in data:
    data_1st[url], data_3rd[url], data_google[url] = {}, {}, {}
    for sample in data[url]:
        own, foreign, google = [], [], []
        for request in data[url][sample]:
            domain, fullurl, t, out_h, out_b, t_resp, inc_h, inc_b = request

            # First party: the visited site's name occurs inside the request's domain.
            bucket = own if url.lower() in domain.lower() else foreign
            bucket.append(request)
            if is_google_domain(domain):
                google.append(request)
        data_1st[url][sample] = own
        data_3rd[url][sample] = foreign
        data_google[url][sample] = google

# Persist the full data set and each subset.
for target, subset in (
    ('fbleau/table7_all.npy', data),
    ('fbleau/table7_1st.npy', data_1st),
    ('fbleau/table7_3rd.npy', data_3rd),
    ('fbleau/table7_google.npy', data_google),
):
    np.save(target, subset)

print("Data splitted")

def attack_and_plot(data, title, n_classes=141):
    """Run the classifier on ``data``, print the result and plot its features.

    Parameters
    ----------
    data : object
        Input handed to ``get_features`` (imported from ``lib.features``).
    title : str
        Title for the plot drawn by ``plot_fi``.
    n_classes : int, optional
        Forwarded unchanged to ``rf_with_rfe``; defaults to the previously
        hard-coded 141. Its exact meaning is defined in ``lib.rf`` — confirm
        there before changing it.

    Returns
    -------
    The value returned by ``rf_with_rfe``; its ``'features'`` entry is plotted.
    """
    features = get_features(data)
    clf_res = rf_with_rfe(features, n_classes=n_classes)
    print(clf_res)
    plot_fi(clf_res['features'], title=title)
    return clf_res


Data splitted


In [7]:
def attack(data, n_classes=141):
    """Extract features from ``data`` and return the classifier result.

    Parameters
    ----------
    data : object
        Input handed to ``get_features`` (imported from ``lib.features``).
    n_classes : int, optional
        Forwarded unchanged to ``rf_with_rfe``; defaults to the previously
        hard-coded 141. Its exact meaning is defined in ``lib.rf`` — confirm
        there before changing it.

    Returns
    -------
    The value returned by ``rf_with_rfe``.
    """
    features = get_features(data)
    return rf_with_rfe(features, n_classes=n_classes)

# Run the attack on the full data set and on each party subset, in this order,
# then store all results together.
party_datasets = (
    ('all', data),
    ('1st', data_1st),
    ('3rd', data_3rd),
    ('google', data_google),
)
clf_results = {}
for label, subset in party_datasets:
    clf_results[label] = attack(toOldHARFormat(subset))
np.save('datasets/attack_by_parties.npy', clf_results)

Skipping sahafah24.com only 19 samples
Skipping 9quotes.com only 19 samples
Skipping ragazzeconlavaligia.com , 19 empty samples
Skipping bieporn.com only 17 samples
Skipping faktabmr.com only 14 samples
Skipping miglianicocalcio.net only 14 samples
Skipping www.cscpro.org only 14 samples
Skipping levitra100pudoff.com only 14 samples
Skipping megagroup.ir only 13 samples
Skipping educortex.in only 12 samples
Skipping arthurmurray.com only 13 samples
Skipping hitburada.com only 13 samples
Skipping odlotw.blogspot.com only 13 samples
Skipping funakoshikarateaustralia.com.au only 13 samples
Skipping whoisdog.com only 13 samples
Skipping ksgroupscans.com only 1 samples
Skipping kk-vine.at only 13 samples
Skipping myegypthost.com only 13 samples
Skipping matchingimages.com only 13 samples
Skipping onlinecasino002.com only 10 samples
Skipping kayseriolaylari.com only 9 samples
Skipping hpsconline.in only 1 samples
[feature_extract] Number of classes 123
[feature_extract] Number of features 12

In [15]:
# Reload the stored attack results. np.save wrapped the dict in a 0-d object
# array, and .item() unwraps it again.
stored_results = np.load('datasets/attack_by_parties.npy', allow_pickle=True)
clf_results = stored_results.item()

print(clf_results)


{'all': {'score': {'accuracy': (0.9372202591283862, 0.008262643065863634), 'precision': (0.9372202591283862, 0.008262643065863634), 'recall': (0.9372202591283862, 0.008262643065863634), 'f1score': (0.9372202591283862, 0.008262643065863634)}, 'features': [('bytes_incoming', 0.13082546987524182, 0.006796574033830774), ('bytes_total', 0.12361000165520157, 0.006608706446274057), ('bytes_%_in', 0.1130431582211651, 0.003059168964773161), ('bytes_%_out', 0.11088599729044357, 0.0027900044017173894), ('bytes_outgoing', 0.1176716474689515, 0.003745232798404053), ('hist_2006', 0.06795124822993352, 0.03403877732678598), ('hist_2341', 0.04056035631347335, 0.04062981262822632), ('n_outgoing', 0.08463790977437932, 0.007498450643906147), ('n_total', 0.07284217368163609, 0.02551171076150184), ('n_incoming', 0.06222894505178385, 0.03169236431859323), ('time_outgoing_p25', 0.007644679983275933, 0.022934039949827796), ('hist_1672', 0.051933285542652266, 0.03415171988825709), ('time_incoming_p25', 0.008035

In [17]:
# min/max number of parties for subresources
# For every site, count the distinct request domains seen across all samples.
tuples = []
for url in data:
    distinct_domains = {
        request[0]
        for sample in data[url]
        for request in data[url][sample]
    }
    tuples.append([url, len(distinct_domains)])

# Ascending by domain count: first entry is the minimum, last the maximum.
tuples.sort(key=lambda row: row[1])
fewest, most = tuples[0], tuples[-1]
print(fewest)
print(most)

['uggsoutletofficial.com', 1]
['pcdominant.com', 126]
