In [1]:
from pathlib import Path
from typing import cast

import numpy as np
import pandas as pd
from scipy.io.arff import loadarff

%load_ext autoreload
%autoreload 2
import ml_project.data as d
import ml_project.helpers as h

Data files can be downloaded from the nextcloud. VPN2016/Scenario B-ARFF/TimeBasedFeatures-Dataset-15s.arff

In [2]:
ds = d.Dataset(Path("../data/15s.arff"))
ds.cls_id_repr_map

{1: 'BROWSING (1)',
 2: 'CHAT (2)',
 3: 'STREAMING (3)',
 4: 'MAIL (4)',
 5: 'VOIP (5)',
 6: 'P2P (6)',
 7: 'FT (7)',
 8: 'VPN-VOIP (8)',
 9: 'VPN-CHAT (9)',
 10: 'VPN-STREAMING (10)',
 11: 'VPN-FT (11)',
 12: 'VPN-BROWSING (12)',
 13: 'VPN-P2P (13)',
 14: 'VPN-MAIL (14)'}

In [3]:
ds.orig

Unnamed: 0,duration,total_fiat,total_biat,min_fiat,min_biat,max_fiat,max_biat,mean_fiat,mean_biat,pps,...,std_flowiat,min_active,mean_active,max_active,std_active,min_idle,mean_idle,max_idle,std_idle,cls
0,9368711.0,16.0,4.0,1564818.0,1549373.0,190205.285714,203290.456522,389822.391917,370323.719754,10.353612,...,267600.198443,1871488.0,1.983656e+06,2195089.0,1.832197e+05,1234883.0,1420565.0,1523088.0,161096.539275,2
1,7340238.0,18.0,4.0,1567554.0,1527893.0,165686.977273,186914.846154,317267.548742,304370.651301,11.580006,...,221462.862028,1491627.0,3.572433e+06,5653239.0,2.942704e+06,1131498.0,1324636.0,1517774.0,273138.379008,2
2,4644225.0,29.0,15.0,1270547.0,1079974.0,165865.178571,195302.130435,329473.126261,300492.588227,11.412022,...,217475.425246,1758922.0,1.758922e+06,1758922.0,0.000000e+00,1079974.0,1079974.0,1079974.0,0.000000,2
3,4978735.0,19.0,8.0,2492050.0,2457286.0,239543.250000,276596.388889,612435.304238,628339.573544,8.034169,...,436959.716436,1710925.0,2.382905e+06,3054885.0,9.503232e+05,1346073.0,1894031.5,2441990.0,774930.342317,2
4,11838189.0,19.0,10.0,3094089.0,3093543.0,243766.500000,295954.725000,599721.781709,625632.703972,7.602514,...,436129.639296,1747431.0,2.400446e+06,3240696.0,6.232744e+05,1394455.0,1983227.0,3042717.0,725987.829075,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18753,73240.0,-1.0,-1.0,-1.0,-1.0,0.000000,0.000000,0.000000,0.000000,27.307482,...,0.000000,-1.0,0.000000e+00,-1.0,0.000000e+00,-1.0,0.0,-1.0,0.000000,6
18754,52083.0,-1.0,-1.0,-1.0,-1.0,0.000000,0.000000,0.000000,0.000000,38.400246,...,0.000000,-1.0,0.000000e+00,-1.0,0.000000e+00,-1.0,0.0,-1.0,0.000000,6
18755,67923.0,-1.0,-1.0,-1.0,-1.0,0.000000,0.000000,0.000000,0.000000,29.445107,...,0.000000,-1.0,0.000000e+00,-1.0,0.000000e+00,-1.0,0.0,-1.0,0.000000,6
18756,313588.0,-1.0,-1.0,-1.0,-1.0,0.000000,0.000000,0.000000,0.000000,6.377795,...,0.000000,-1.0,0.000000e+00,-1.0,0.000000e+00,-1.0,0.0,-1.0,0.000000,6


In [4]:
ds.class_fraction(ds.orig)

Unnamed: 0,pop,pop_f,f
BROWSING (1),2500,13.3,100.0
CHAT (2),890,4.7,100.0
STREAMING (3),482,2.6,100.0
MAIL (4),249,1.3,100.0
VOIP (5),2826,15.1,100.0
P2P (6),1000,5.3,100.0
FT (7),1018,5.4,100.0
VPN-VOIP (8),2271,12.1,100.0
VPN-CHAT (9),1196,6.4,100.0
VPN-STREAMING (10),475,2.5,100.0


First, missing values

In [5]:
with_na = ds.orig.loc[ds.orig.apply(lambda x: x.isna().any(), axis=1)]
ds.class_fraction(with_na)

Unnamed: 0,pop,pop_f,f
BROWSING (1),,,
CHAT (2),,,
STREAMING (3),,,
MAIL (4),,,
VOIP (5),,,
P2P (6),,,
FT (7),,,
VPN-VOIP (8),,,
VPN-CHAT (9),,,
VPN-STREAMING (10),,,


In [6]:
d.describe(ds.orig)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,18758.0,16.0,16.4,-inf,12.7,16.2,16.5,20.2
total_fiat,18758.0,13.3,14.6,,1.9,4.1,7.4,17.4
total_biat,18758.0,13.3,14.6,,0.6,3.0,7.4,17.5
min_fiat,18758.0,15.0,16.1,,10.1,12.5,15.2,19.5
min_biat,18758.0,14.8,16.0,,10.1,12.5,15.2,20.2
max_fiat,18758.0,13.8,14.9,-inf,9.3,10.8,13.1,18.8
max_biat,18758.0,13.7,14.7,-inf,8.6,10.4,13.0,17.5
mean_fiat,18758.0,13.6,15.2,-inf,4.7,9.3,13.3,19.1
mean_biat,18758.0,13.3,14.7,-inf,-inf,9.1,13.1,18.4
pps,18758.0,7.6,9.8,-inf,0.9,2.4,4.6,13.8


Does not look great.

The data has a lot of 0 and negative values. It does not fit in how the features are described in the paper.

And while `std_* == 0` could (no) be a valid value, everything else looks just bad.

In [7]:
some_missing_cond = (ds.orig <= 0).apply("any", axis=1)
some_missing = ds.orig.loc[some_missing_cond]
d.describe(some_missing)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,15329.0,16.0,16.5,-inf,12.3,16.1,16.5,20.2
total_fiat,15329.0,13.4,14.7,,1.7,4.2,8.1,17.4
total_biat,15329.0,13.4,14.7,,-inf,3.0,7.8,17.5
min_fiat,15329.0,14.8,16.2,,10.0,11.4,13.7,19.5
min_biat,15329.0,14.6,16.1,,7.4,11.1,13.7,20.2
max_fiat,15329.0,13.8,15.0,-inf,8.1,9.9,12.6,18.8
max_biat,15329.0,13.7,14.8,-inf,5.6,9.8,12.6,17.5
mean_fiat,15329.0,13.4,15.3,-inf,-inf,8.4,11.1,19.1
mean_biat,15329.0,13.0,14.7,-inf,-inf,8.1,11.1,18.4
pps,15329.0,7.8,9.9,-inf,1.0,3.3,4.6,13.8


In [8]:
ds.class_fraction(some_missing)

Unnamed: 0,pop,pop_f,f
BROWSING (1),1558,10.2,62.3
CHAT (2),490,3.2,55.1
STREAMING (3),407,2.7,84.4
MAIL (4),228,1.5,91.6
VOIP (5),2820,18.4,99.8
P2P (6),956,6.2,95.6
FT (7),893,5.8,87.7
VPN-VOIP (8),2265,14.8,99.7
VPN-CHAT (9),587,3.8,49.1
VPN-STREAMING (10),368,2.4,77.5


In [9]:
none_missing = ds.orig.loc[~some_missing_cond]
none_missing

Unnamed: 0,duration,total_fiat,total_biat,min_fiat,min_biat,max_fiat,max_biat,mean_fiat,mean_biat,pps,...,std_flowiat,min_active,mean_active,max_active,std_active,min_idle,mean_idle,max_idle,std_idle,cls
0,9368711.0,16.0,4.0,1564818.0,1549373.0,1.902053e+05,2.032905e+05,3.898224e+05,3.703237e+05,10.353612,...,2.676002e+05,1871488.0,1.983656e+06,2195089.0,1.832197e+05,1234883.0,1420565.0,1523088.0,1.610965e+05,2
1,7340238.0,18.0,4.0,1567554.0,1527893.0,1.656870e+05,1.869148e+05,3.172675e+05,3.043707e+05,11.580006,...,2.214629e+05,1491627.0,3.572433e+06,5653239.0,2.942704e+06,1131498.0,1324636.0,1517774.0,2.731384e+05,2
3,4978735.0,19.0,8.0,2492050.0,2457286.0,2.395432e+05,2.765964e+05,6.124353e+05,6.283396e+05,8.034169,...,4.369597e+05,1710925.0,2.382905e+06,3054885.0,9.503232e+05,1346073.0,1894031.5,2441990.0,7.749303e+05,2
4,11838189.0,19.0,10.0,3094089.0,3093543.0,2.437665e+05,2.959547e+05,5.997218e+05,6.256327e+05,7.602514,...,4.361296e+05,1747431.0,2.400446e+06,3240696.0,6.232744e+05,1394455.0,1983227.0,3042717.0,7.259878e+05,2
5,11771793.0,30.0,6.0,3149632.0,2964504.0,2.440080e+05,2.551673e+05,6.328574e+05,6.075414e+05,8.155087,...,4.356770e+05,1443143.0,2.872733e+06,3563174.0,9.651035e+05,1065834.0,1933295.0,2964504.0,1.008645e+06,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17968,10006782.0,3999253.0,3999290.0,6007321.0,6007291.0,5.003287e+06,5.003290e+06,1.419918e+06,1.419871e+06,0.599593,...,2.830650e+06,3999290.0,5.003290e+06,6007291.0,1.419871e+06,3999052.0,5003067.5,6007083.0,1.419892e+06,6
17969,14012396.0,6008673.0,6008650.0,8003530.0,8003564.0,7.006102e+06,7.006107e+06,1.410577e+06,1.410617e+06,0.428192,...,3.901489e+06,6008650.0,7.006107e+06,8003564.0,1.410617e+06,6008491.0,7005931.0,8003371.0,1.410593e+06,6
18196,8712389.0,1990162.0,1988563.0,6608031.0,6609078.0,4.299096e+06,4.298820e+06,3.265326e+06,3.267197e+06,0.688674,...,2.763989e+06,1990162.0,4.299096e+06,6608031.0,3.265326e+06,1875414.0,4185148.0,6494882.0,3.266457e+06,6
18261,14253939.0,7002496.0,6995059.0,7006795.0,7042458.0,7.004646e+06,7.018758e+06,3.039852e+03,3.351615e+04,0.420936,...,3.599758e+06,7002496.0,7.004646e+06,7006795.0,3.039852e+03,6790373.0,6794091.5,6797810.0,5.258753e+03,6


In [10]:
ds.class_fraction(none_missing)

Unnamed: 0,pop,pop_f,f
BROWSING (1),942,27.5,37.7
CHAT (2),400,11.7,44.9
STREAMING (3),75,2.2,15.6
MAIL (4),21,0.6,8.4
VOIP (5),6,0.2,0.2
P2P (6),44,1.3,4.4
FT (7),125,3.6,12.3
VPN-VOIP (8),6,0.2,0.3
VPN-CHAT (9),609,17.8,50.9
VPN-STREAMING (10),107,3.1,22.5


It does not help that this is applied to more than a half of the rows

We could try to take `none_missing` as the base and continue with it. But it is very unbalances in terms of classes population. In such a case we will have to remove some categories. At least *VOIP.

We also could do something with the missing values.

Like substitute some features with binary features (present/not present)

In [11]:
val_types_orig = h.ValTypes(ds.orig.drop(columns="cls"))
val_types_orig.unique_types

duration            [1, 0]
total_fiat      [1, -1, 0]
total_biat      [1, -1, 0]
min_fiat           [1, -1]
min_biat           [1, -1]
max_fiat            [1, 0]
max_biat            [1, 0]
mean_fiat           [1, 0]
mean_biat           [1, 0]
pps                 [1, 0]
bps                 [1, 0]
min_flowiat     [1, -1, 0]
max_flowiat        [1, -1]
mean_flowiat        [1, 0]
std_flowiat         [1, 0]
min_active         [-1, 1]
mean_active         [0, 1]
max_active         [-1, 1]
std_active          [0, 1]
min_idle           [-1, 1]
mean_idle           [0, 1]
max_idle           [-1, 1]
std_idle            [0, 1]
dtype: object

In [12]:
val_types_orig.types_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,count
variable,value,Unnamed: 2_level_1
bps,0,532
bps,1,18226
duration,0,532
duration,1,18226
max_active,-1,11355
max_active,1,7403
max_biat,0,3637
max_biat,1,15121
max_fiat,0,2641
max_fiat,1,16117


In [13]:
no_dur = ds.orig.loc[ds.orig.duration == 0]
d.describe(no_dur)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,532.0,-inf,-inf,-inf,-inf,-inf,-inf,-inf
total_fiat,532.0,,-inf,,,,,
total_biat,532.0,,-inf,,,,,
min_fiat,532.0,,-inf,,,,,
min_biat,532.0,,-inf,,,,,
max_fiat,532.0,-inf,-inf,-inf,-inf,-inf,-inf,-inf
max_biat,532.0,-inf,-inf,-inf,-inf,-inf,-inf,-inf
mean_fiat,532.0,-inf,-inf,-inf,-inf,-inf,-inf,-inf
mean_biat,532.0,-inf,-inf,-inf,-inf,-inf,-inf,-inf
pps,532.0,-inf,-inf,-inf,-inf,-inf,-inf,-inf


In [14]:
ds.class_fraction(no_dur)

Unnamed: 0,pop,pop_f,f
BROWSING (1),,,
CHAT (2),2.0,0.4,0.2
STREAMING (3),31.0,5.8,6.4
MAIL (4),4.0,0.8,1.6
VOIP (5),6.0,1.1,0.2
P2P (6),7.0,1.3,0.7
FT (7),156.0,29.3,15.3
VPN-VOIP (8),6.0,1.1,0.3
VPN-CHAT (9),33.0,6.2,2.8
VPN-STREAMING (10),1.0,0.2,0.2


`duration == 0` should probably not be considered a valid input.

If there is no information about the flow, how can we call it a flow?

TODO: Mostly FT and VPN-P2P. Can it be used?

In [15]:
dur = ds.orig.loc[ds.orig.duration != 0]
ds.class_fraction(dur)

Unnamed: 0,pop,pop_f,f
BROWSING (1),2500,13.7,100.0
CHAT (2),888,4.9,99.8
STREAMING (3),451,2.5,93.6
MAIL (4),245,1.3,98.4
VOIP (5),2820,15.5,99.8
P2P (6),993,5.4,99.3
FT (7),862,4.7,84.7
VPN-VOIP (8),2265,12.4,99.7
VPN-CHAT (9),1163,6.4,97.2
VPN-STREAMING (10),474,2.6,99.8


In [16]:
h.ValTypes(dur, ignore=["cls"]).unique_types

duration               [1]
total_fiat      [1, -1, 0]
total_biat      [1, -1, 0]
min_fiat           [1, -1]
min_biat           [1, -1]
max_fiat            [1, 0]
max_biat            [1, 0]
mean_fiat           [1, 0]
mean_biat           [1, 0]
pps                    [1]
bps                    [1]
min_flowiat     [1, 0, -1]
max_flowiat            [1]
mean_flowiat           [1]
std_flowiat         [1, 0]
min_active         [-1, 1]
mean_active         [0, 1]
max_active         [-1, 1]
std_active          [0, 1]
min_idle           [-1, 1]
mean_idle           [0, 1]
max_idle           [-1, 1]
std_idle            [0, 1]
dtype: object

TODO: how about using only `["duration", "flowPktsPerSecond", "flowBytesPerSecond", "max_flowiat", "mean_flowiat"]`. Those are present whenever duration is not zero. Which is - in the most of the columns.

In [17]:
val_types = h.ValTypes(dur, ignore=["cls"] + ds.mostly_present)
val_types.unique_types

total_fiat     [1, -1, 0]
total_biat     [1, -1, 0]
min_fiat          [1, -1]
min_biat          [1, -1]
max_fiat           [1, 0]
max_biat           [1, 0]
mean_fiat          [1, 0]
mean_biat          [1, 0]
min_flowiat    [1, 0, -1]
std_flowiat        [1, 0]
min_active        [-1, 1]
mean_active        [0, 1]
max_active        [-1, 1]
std_active         [0, 1]
min_idle          [-1, 1]
mean_idle          [0, 1]
max_idle          [-1, 1]
std_idle           [0, 1]
dtype: object

TODO: maybe throw away some of small-population types

TODO: inspect feature that turn both into 0 and -1, does not make much sense to me.
If it is only one of them - then it is probably a default for unknown, two of them are strange

In [18]:
print(val_types.impls_df.shape)

(143, 4)


In [19]:
for g in val_types.eq_groups:
    print(g)

{('mean_idle', 0), ('mean_active', 0), ('max_active', -1), ('min_active', -1), ('max_idle', -1), ('min_idle', -1)}
{('std_active', 1), ('std_idle', 1)}
{('total_biat', -1), ('max_biat', 0), ('min_biat', -1)}
{('mean_active', 1), ('min_idle', 1), ('mean_idle', 1), ('max_idle', 1), ('max_active', 1), ('min_active', 1)}
{('std_active', 0), ('std_idle', 0)}
{('min_biat', 1), ('max_biat', 1)}
{('max_fiat', 0), ('min_fiat', -1), ('total_fiat', -1)}
{('max_fiat', 1), ('min_fiat', 1)}


mean_fiat == 0 is not grouped with anything

Substituting some things with flags

In [20]:
flags = dur.copy()
flags["has_active"] = flags["max_active"] > 0
flags["has_std_active"] = flags["std_active"] > 0
flags = flags.drop(columns=["max_active", "max_idle", "mean_active", "mean_idle", "min_active", "min_idle", "std_active", "std_idle"])
# flags["has_total_fiat"] = flags["total_fiat"] >= 0
# flags["has_total_biat"] = flags["total_biat"] >= 0
flags["has_fiat"] = flags["max_fiat"] > 0
flags["has_biat"] = flags["max_biat"] > 0
flags = flags.drop(columns=["total_fiat", "total_biat", "max_fiat", "max_biat", "min_fiat", "min_biat"])

flags["has_min_flowiat"] = flags["min_flowiat"] >= 0
flags["has_std_flowiat"] = flags["min_flowiat"] > 0
flags = flags.drop(columns=["min_flowiat", "std_flowiat"])
flags["has_mean_fiat"] = flags["mean_fiat"] > 0
flags["has_mean_biat"] = flags["mean_biat"] > 0
flags = flags.drop(columns=["mean_fiat", "mean_biat"])
flags

Unnamed: 0,duration,pps,bps,max_flowiat,mean_flowiat,cls,has_active,has_std_active,has_fiat,has_biat,has_min_flowiat,has_std_flowiat,has_mean_fiat,has_mean_biat
0,9368711.0,10.353612,4802.688438,1523088.0,97590.739583,2,True,True,True,True,True,True,True,True
1,7340238.0,11.580006,4340.186245,1517774.0,87383.785714,2,True,True,True,True,True,True,True,True
2,4644225.0,11.412022,4161.512416,1079974.0,89312.019231,2,True,False,True,True,True,True,True,True
3,4978735.0,8.034169,3918.666087,2441990.0,127659.871795,2,True,True,True,True,True,True,True,True
4,11838189.0,7.602514,2802.540152,3042717.0,133013.359551,2,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18753,73240.0,27.307482,6116.876024,73240.0,73240.000000,6,False,False,False,False,True,True,False,False
18754,52083.0,38.400246,8601.655051,52083.0,52083.000000,6,False,False,False,False,True,True,False,False
18755,67923.0,29.445107,6595.703959,67923.0,67923.000000,6,False,False,False,False,True,True,False,False
18756,313588.0,6.377795,1428.626095,313588.0,313588.000000,6,False,False,False,False,True,True,False,False


In [21]:
val_types_flags = h.ValTypes(flags, ignore=["cls"] + ds.mostly_present)
val_types_flags.unique_types

has_active         [0, 1]
has_std_active     [0, 1]
has_fiat           [1, 0]
has_biat           [1, 0]
has_min_flowiat    [1, 0]
has_std_flowiat    [1, 0]
has_mean_fiat      [1, 0]
has_mean_biat      [1, 0]
dtype: object

In [22]:
val_types_flags.eq_groups

[]

In [23]:
flags.drop(columns=["cls"] + ds.mostly_present).value_counts().reset_index()

Unnamed: 0,has_active,has_std_active,has_fiat,has_biat,has_min_flowiat,has_std_flowiat,has_mean_fiat,has_mean_biat,count
0,False,False,True,True,True,True,True,True,6859
1,True,True,True,True,True,True,True,True,3429
2,True,False,True,True,True,True,True,True,2196
3,False,False,False,False,True,True,False,False,1819
4,True,False,True,True,True,True,False,False,955
5,False,False,True,False,True,True,False,False,796
6,False,False,True,True,True,False,True,True,518
7,False,False,True,False,True,True,True,False,314
8,True,True,True,True,True,False,True,True,246
9,True,False,True,True,True,False,True,True,144
