In [64]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
import numpy as np
import common # type: ignore
from sklearn.feature_selection import VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector
import warnings
import pickle
from pprint import pprint
from IPython.display import display, Markdown, Latex

# Load the project dataset through the local `common` helper module.
dataset = common.Dataset()

# Bare expression: it is evaluated, but nothing is rendered because it is not
# the last statement of the cell.
dataset.label().value_counts()

# Labels of the "eps" zone columns, written as half-open intervals.
# The last five characters before ")" are the interval's upper bound.
zones = [
    "[0.000, 0.100)",
    "[0.100, 0.250)",
    "[0.250, 0.500)",
    "[0.500, 0.900)",
    "[0.900, 1.100)",
]

# Combine the two dataset flags and check that ONLY_1M is part of the result.
flags = common.DatasetFlags.ONLY_1M ^ common.DatasetFlags.ONLY_TLD
print(common.DatasetFlags.ONLY_1M in flags)

# Bare expression again (evaluated, not rendered).
dataset.feature(flags).columns

True


In [88]:
# Build `df_normal_approach`: one row per (source id, nn, threshold) holding the
# counts that fall above ("fp") and at-or-below ("tn") the threshold, plus
# rates normalised by the source's duration and by its u/q ratio.

# Restrict the analysis to rows whose ("-", "source", "mc") value is 0.
# NOTE(review): presumably mc == 0 marks non-malicious captures, which is why
# everything above the threshold is counted as a false positive — confirm.
df = dataset.df[dataset.df[("-", "source", "mc")] == 0].copy()

# Summed duration per source id. Not used by the loop below; kept so the name
# stays available to any other cell that may rely on it.
durations = df[[("-", "source", "id"), ("-", "-", "duration")]].groupby([("-", "source", "id")]).sum()

results = {}
# Iterate over the distinct source ids (first-occurrence order) instead of once
# per row: every row of a source produces exactly the same entries. The loop
# variable is also no longer called `id`, which shadowed the builtin.
for source_id in df[("-", "source", "id")].drop_duplicates():
    df_source = df[df[("-", "source", "id")] == source_id]

    # Per-source totals do not depend on nn or on the threshold.
    q = df_source[("-", "-", "q")].sum()
    u = df_source[("-", "-", "u")].sum()
    duration = df_source[("-", "-", "duration")].sum()
    # NOTE(review): dividing by 60 assumes `duration` is in seconds — confirm.
    minutes = duration / 60
    u_ratio = u / q

    for nn in ["NONE", "TLD", "ICANN", "PRIVATE"]:
        # Column sum of every zone for this source / nn, in `zones` order.
        zone_counts = [df_source[("eps", nn, zone)].sum() for zone in zones]
        tot = sum(zone_counts)

        # Each zone except the last acts as a threshold: zones up to and
        # including `nzone` count as tn, the zones after it as fp.
        for nzone in range(len(zones) - 1):
            tn = sum(zone_counts[: nzone + 1])
            fp = sum(zone_counts[nzone + 1 :])

            # Sanity check: the two partitions must cover all zones.
            if fp + tn != tot:
                raise Exception(f"exception {fp} + {tn} != {tot}")

            # zones[nzone][8:13] is the upper bound of the zone label,
            # e.g. "[0.000, 0.100)" -> "0.100".
            results[(source_id, nn, zones[nzone][8:13])] = {
                "q": q,
                "u": u,
                "fp": fp,
                "tn": tn,
                "fp_rate_min": fp / minutes,
                "u/q": u_ratio,  # was `u`, which did not match the key
                "u_fp": fp * u_ratio,
                "u_tn": tn * u_ratio,  # was computed from fp by mistake
                "u_fp_rate_min": (fp * u_ratio) / minutes,
            }

# Keys become a 3-level row index (source id, nn, threshold); metrics are columns.
df_normal_approach = pd.DataFrame.from_dict(results).T

In [135]:

# Turn index levels 1 and 2 into the ordinary columns 'nn' and 'th' so they can
# be used as group keys; index level 0 is left in place as the row index.
tmp = df_normal_approach.reset_index(level=[1,2], names=['id','nn','th'])

# Summary statistics of the raw `fp` counts for every (nn, th) pair.
# describe() covers all metric columns; xs() keeps only the 'fp' block.
(
    tmp.groupby(['nn', 'th'])
    .describe()
    .round(2)
    .xs('fp', level=0, axis=1)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
nn,th,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ICANN,0.1,17.0,150.35,188.42,0.0,28.0,76.0,176.0,644.0
ICANN,0.25,17.0,123.29,161.34,0.0,15.0,60.0,141.0,538.0
ICANN,0.5,17.0,99.0,128.01,0.0,9.0,55.0,122.0,419.0
ICANN,0.9,17.0,71.47,97.26,0.0,6.0,33.0,82.0,322.0
NONE,0.1,17.0,267.18,317.63,1.0,46.0,113.0,406.0,983.0
NONE,0.25,17.0,245.12,291.03,1.0,44.0,104.0,373.0,918.0
NONE,0.5,17.0,215.71,249.94,0.0,42.0,100.0,340.0,811.0
NONE,0.9,17.0,174.12,203.33,0.0,33.0,78.0,286.0,656.0
PRIVATE,0.1,17.0,430.76,476.41,1.0,82.0,218.0,597.0,1496.0
PRIVATE,0.25,17.0,368.29,398.01,1.0,73.0,218.0,532.0,1288.0


In [142]:

# Heading followed by the summary statistics of `fp_rate_min` for every
# (nn, th) pair; display() renders its arguments in order.
display(
    Markdown("# fp_rate_min"),
    tmp.groupby(['nn', 'th']).describe().round(2).xs('fp_rate_min', level=0, axis=1),
)

# fp_rate_min

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
nn,th,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ICANN,0.1,17.0,1.78,2.48,0.0,0.34,0.7,1.54,7.73
ICANN,0.25,17.0,1.45,2.1,0.0,0.09,0.55,1.24,6.46
ICANN,0.5,17.0,1.17,1.69,0.0,0.07,0.46,1.07,5.24
ICANN,0.9,17.0,0.87,1.33,0.0,0.03,0.34,0.7,4.12
NONE,0.1,17.0,2.38,2.83,0.05,0.37,1.66,3.46,11.4
NONE,0.25,17.0,2.19,2.58,0.05,0.35,1.52,3.04,10.31
NONE,0.5,17.0,1.87,2.07,0.0,0.29,1.39,2.64,8.23
NONE,0.9,17.0,1.45,1.6,0.0,0.17,1.17,1.8,6.34
PRIVATE,0.1,17.0,3.66,4.08,0.05,0.76,2.44,4.72,15.88
PRIVATE,0.25,17.0,2.98,2.99,0.05,0.64,2.17,4.21,11.76


In [143]:
from IPython.display import display, Markdown, Latex

# Summary statistics of every metric per (nn, th) pair, rounded for display.
nn_th_stats = tmp.groupby(['nn', 'th']).describe().round(2)

# Show only the `u_fp_rate_min` block under its own heading.
display(Markdown("# u_fp_rate_min"))
display(nn_th_stats.xs('u_fp_rate_min', level=0, axis=1))

# u_fp_rate_min

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
nn,th,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ICANN,0.1,17.0,0.81,1.13,0.0,0.09,0.36,0.66,3.57
ICANN,0.25,17.0,0.66,0.95,0.0,0.07,0.3,0.53,2.98
ICANN,0.5,17.0,0.53,0.77,0.0,0.06,0.23,0.45,2.32
ICANN,0.9,17.0,0.4,0.6,0.0,0.02,0.16,0.3,1.8
NONE,0.1,17.0,1.06,1.31,0.03,0.17,0.68,1.47,5.27
NONE,0.25,17.0,0.98,1.19,0.03,0.17,0.62,1.29,4.76
NONE,0.5,17.0,0.83,0.96,0.0,0.16,0.57,1.12,3.8
NONE,0.9,17.0,0.64,0.74,0.0,0.13,0.48,0.84,2.93
PRIVATE,0.1,17.0,1.64,1.89,0.05,0.44,1.0,2.04,7.34
PRIVATE,0.25,17.0,1.33,1.38,0.05,0.39,0.89,1.82,5.43


In [None]:
from IPython.display import display, Markdown, Latex

# NOTE(review): this cell has no execution count and repeats the
# u_fp_rate_min summary of the preceding cell; consider removing it.
display(Markdown("# u_fp_rate_min"))

display(
    tmp.groupby(['nn', 'th'])
    .describe()
    .round(2)
    .xs('u_fp_rate_min', level=0, axis=1)
)

In [168]:
from IPython.display import display, Markdown, Latex

# Move index levels 1 and 2 into the columns 'nn' and 'th'; level 0 stays as
# the row index.
df_na_th = df_normal_approach.reset_index(level=[1,2], names=['id','nn','th'])

# Hand-computed lower quartile of `fp` at threshold '0.100': the mean of the
# two sorted values around the 25% position.
ei = df_na_th[df_na_th['th'] == '0.100']["fp"].sort_values()
quarter_pos = int(ei.shape[0] * 0.25)
print(ei.iloc[quarter_pos] / 2 + ei.iloc[quarter_pos - 1] / 2)

# Summary statistics per threshold, pooling every source and nn together.
# Computed once and sliced per metric below.
th_stats = df_na_th.groupby(['th']).describe().round(2)

display(Markdown("# Grouping by sources and NN"))
for heading, metric in (
    ("## FP", 'fp'),
    ("## fp_rate_min", 'fp_rate_min'),
    ("## u_fp_rate_min", 'u_fp_rate_min'),
):
    display(Markdown(heading))
    display(th_stats.xs(metric, level=0, axis=1))

31.5


# Grouping by sources and NN

## FP

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
th,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.1,68.0,233.88,327.62,0.0,32.75,91.5,268.25,1496.0
0.25,68.0,201.43,283.32,0.0,23.5,76.5,229.0,1288.0
0.5,68.0,174.76,248.25,0.0,20.25,60.0,204.25,1159.0
0.9,68.0,135.93,195.48,0.0,13.5,48.5,148.0,968.0


## fp_rate_min

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
th,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.1,68.0,2.17,2.97,0.0,0.29,0.81,3.25,15.88
0.25,68.0,1.84,2.42,0.0,0.22,0.65,2.88,11.76
0.5,68.0,1.56,2.02,0.0,0.21,0.55,2.57,9.77
0.9,68.0,1.18,1.48,0.0,0.15,0.45,1.9,6.34


## u_fp_rate_min

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
th,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.1,68.0,0.97,1.37,0.0,0.11,0.38,1.43,7.34
0.25,68.0,0.82,1.11,0.0,0.1,0.3,1.25,5.43
0.5,68.0,0.7,0.92,0.0,0.09,0.24,1.12,4.51
0.9,68.0,0.53,0.67,0.0,0.07,0.17,0.86,2.93
