In [6]:
from math import floor
import math
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
import numpy as np
import common # type: ignore
from sklearn.feature_selection import VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector
from IPython.display import display, Markdown, Latex, HTML
import matplotlib.pyplot as plt
import matplotlib.colors as mpcolors
from sqlalchemy import create_engine, text
import quantiphy as qq
import warnings
import enum
import os
import importlib
import latex
from latex import dm, pp, ptime, Cites, AC
from latex import Tables, Table, Figure, Figures
from latex import is_latex, set_latex, unset_latex

# Threshold passed as the third positional argument to common.Slot in the cells below.
EPS_TH = 0.5
# Dataset name used to filter the PCAP table and to build the slot configurations.
DATASET = "CTU-13"
# Handles provided by the project-local `common` module.
database = common.Database()
dataset = common.Dataset()

# Cache of common.Slot objects shared by the plotting helpers (ploo, slot_boxplot, slot_hist).
slots = {}


In [2]:
import os
import importlib
import latex
importlib.reload(latex)
import common
importlib.reload(common)

from latex import dm, pp, ptime, Cites, AC
from latex import Tables, Table, Figure, Figures
from latex import is_latex, set_latex, unset_latex
from enum import Enum


def TH(i="i"):
    """Return the ordinal "$<i>$-th" with the index ``i`` typeset in math mode."""
    return f"${i}$-th"


class Packet:
    """LaTeX symbols for the fields of a single DNS packet.

    Every accessor renders its symbol with a capture index ``i`` and a packet
    index ``j`` as ``sym_{i_{j}}``; passing ``j=None`` drops the inner index
    and yields ``sym_{i}``.
    """

    SYMBOL = "d"
    TS = "\\bar{t}"
    TTS = "t"
    DN = "d"
    FNREQ = "fn"
    TYPE = "qr"
    WL = "w"
    NN = "\\varepsilon^{\\alpha}"

    @staticmethod
    def print(sym, i, j="j"):
        """Attach the subscript ``i`` (or ``i_{j}`` when ``j`` is given) to ``sym``."""
        subscript = i if j is None else f"{i}_{{{j}}}"
        return f"{sym}_{{{subscript}}}"

    @staticmethod
    def timestamp(i="i", j="j"):
        """Symbol of the raw packet timestamp."""
        return Packet.print(Packet.TS, i, j)

    @staticmethod
    def ttimestamp(i="i", j="j"):
        """Symbol of the shifted packet timestamp."""
        return Packet.print(Packet.TTS, i, j)

    @staticmethod
    def domain(i="i", j="j"):
        """Symbol of the packet's domain name."""
        return Packet.print(Packet.DN, i, j)

    @staticmethod
    def fnreq(i="i", j="j"):
        """Symbol of the packet's request number."""
        return Packet.print(Packet.FNREQ, i, j)

    @staticmethod
    def packet(i="i", j="j"):
        """Symbol of the packet itself."""
        return Packet.print(Packet.SYMBOL, i, j)

    @staticmethod
    def type(i="i", j="j"):
        """Symbol of the packet type (request / response)."""
        return Packet.print(Packet.TYPE, i, j)

    @staticmethod
    def nn(i="i", j="j"):
        """Symbol of the neural-network prediction for the packet."""
        return Packet.print(Packet.NN, i, j)

    @staticmethod
    def wl(i="i", j="j"):
        """Symbol of the packet's whitelisting rank."""
        return Packet.print(Packet.WL, i, j)

class SETS(Enum):
    """LaTeX symbols of the sets used in the notation; ``str()`` yields the bare symbol."""

    DS = "\\mathcal{D}"

    def __str__(self):
        # Render as the raw LaTeX string instead of the default "SETS.DS".
        return self._value_

class PCAP:
    """LaTeX symbols for the per-capture quantities.

    Every accessor renders its symbol subscripted by the capture index ``i``
    as ``sym_{i}``.
    """

    SYMBOL = "p"
    MW = "m"
    _N = "N^q"
    _R = "N^r"
    _QR = "N^{qr}"
    _U = "N^u"
    DURATION = "D"
    # Bold version of the packet symbol, used for the packet list of a capture.
    LIST = "\\mathbf{%s}" % Packet.SYMBOL

    # Declared static like Packet.print / SLOT.print so it also works when
    # called through an instance, not only through the class.
    @staticmethod
    def print(sym, i):
        """Attach the subscript ``i`` to ``sym``."""
        return "%s_{%s}" % (sym, i)

    @staticmethod
    def malware(i="i"):
        """Symbol of the malware that infected the capture."""
        return PCAP.print(PCAP.MW, i)

    @staticmethod
    def N(i="i"):
        """Symbol of the number of DNS requests."""
        return PCAP.print(PCAP._N, i)

    @staticmethod
    def QR(i="i"):
        """Symbol built from ``N^{qr}``."""
        return PCAP.print(PCAP._QR, i)

    @staticmethod
    def R(i="i"):
        """Symbol of the number of DNS responses."""
        return PCAP.print(PCAP._R, i)

    @staticmethod
    def U(i="i"):
        """Symbol of the number of unique domain names."""
        return PCAP.print(PCAP._U, i)

    @staticmethod
    def duration(i="i"):
        """Symbol of the capture duration."""
        return PCAP.print(PCAP.DURATION, i)

    @staticmethod
    def pcap(i="i"):
        """Symbol of the capture itself."""
        return PCAP.print(PCAP.SYMBOL, i)

    @staticmethod
    def list(i="i"):
        """Symbol of the capture's packet list."""
        return PCAP.print(PCAP.LIST, i)

SLOT_SYMBOL = "s"
class SLOT:
    """LaTeX symbols for the time-slot quantities, subscripted by the slot index ``k``."""

    SYMBOL = SLOT_SYMBOL
    DURATION = "%s^d" % SLOT_SYMBOL
    Q = "%s^q" % SLOT_SYMBOL
    U = "%s^u" % SLOT_SYMBOL
    PP = "%s^\\mu" % SLOT_SYMBOL
    NUMBER = "N^%s" % SLOT_SYMBOL

    @staticmethod
    def print(sym, k):
        """Attach the subscript ``k`` to ``sym``."""
        return "%s_{%s}" % (sym, k)

    @staticmethod
    def duration(k="k"):
        """Symbol of the slot duration; it is the same for every slot, so ``k`` is ignored."""
        return SLOT.DURATION

    @staticmethod
    def slot(k="k"):
        """Symbol of the ``k``-th slot."""
        return SLOT.print(SLOT.SYMBOL, k)

    @staticmethod
    def q(k="k"):
        """Symbol of the number of requests in slot ``k``."""
        return SLOT.print(SLOT.Q, k)

    @staticmethod
    def u(k="k"):
        """Symbol of the number of uniques in slot ``k``."""
        return SLOT.print(SLOT.U, k)

    @staticmethod
    def pp(k="k"):
        """Symbol of the number of positives in slot ``k``."""
        return SLOT.print(SLOT.PP, k)

    @staticmethod
    def number(k="k"):
        """Symbol of the total number of slots; independent of ``k``."""
        return SLOT.NUMBER

    @staticmethod
    def domain(k="k"):
        """Time interval covered by slot ``k``.

        The index is interpolated from ``k`` (previously the literal letter
        "k" was always emitted); the default output is unchanged.
        """
        return f"[ {k} \\cdot {SLOT.duration()}, ({k}+1) \\cdot {SLOT.duration()} ]"
    

In [3]:
dm(f"""

## Notation

We define our dataset as ${SETS.DS} = \\left\\{{ {PCAP.pcap()} | i < N \\right\\}}$,
where ${PCAP.pcap()}$ is the {TH()} capture.

Every ${PCAP.pcap()}$ is composed by:
- ${PCAP.malware()}$, the malware which infected it.
- ${PCAP.N()}$, the number of DNS requests.
- ${PCAP.R()}$, the number of DNS responses.
- ${PCAP.U()}$, the number of unique domain names that appeared in it.
- ${PCAP.duration()}$, the duration in seconds, i.e. the difference between the first and the last packet timestamps.
- ${PCAP.list()} = \\left\\{{ {Packet.packet()} | j < {PCAP.N()} \\right\\}}$, the chronologically ordered list of the DNS packets.

Each DNS packet ${Packet.packet()} \\in {PCAP.pcap()}$, is composed by:
- ${Packet.timestamp()}$, the timestamp of the DNS packet.
- ${Packet.ttimestamp()}$, the shifted timestamp of the DNS packet (defined later).
- ${Packet.fnreq()}$, the request number of the DNS packet.
- ${Packet.domain()}$, the textual representation of the domain name.
- ${Packet.type()}$, the DNS packet type, which could be request or response.
- ${Packet.wl()}$, the whitelisting rank, i.e. the position at which ${Packet.domain()}$ appears in a whitelist; a lower rank means the domain is less likely to represent a malicious actor.
- ${Packet.nn()}$, the prediction probability of the $\\alpha$ {AC.LSTM} neural network.



""")





## Notation

We define our dataset with $\mathcal{D} = \left\{ p_{i} | i < N \right\}$.
where $p_{i}$ is the $i$-th capture.

Every $p_{i}$ is composed by:
- $m_{i}$, the malware which infected it.
- $N^q_{i}$, the number of DNS requests.
- $N^r_{i}$, the number of DNS responses.
- $N^u_{i}$, the number of unique domain names apperead in it.
- $D_{i}$, the duration in seconds, i.e. the difference between the first and the last packet timestamps.
- $\mathbf{d}_{i} = \left\{ d_{i_{j}} | j < N^q_{i} \right\}$, the chronologically ordered list of the DNS packets.

Each DNS packet $d_{i_{j}} \in p_{i}$, is composed by:
- $\bar{t}_{i_{j}}$, the timestamp of the DNS packet.
- $t_{i_{j}}$, the shifted timestamp of the DNS packet (defined later).
- $fn_{i_{j}}$, the request number of the DNS packet.
- $d_{i_{j}}$, the textual representation of the domain name.
- $qr_{i_{j}}$, the DNS packet type, which could be request or response.
- $w_{i_{j}}$, the whitelisting rank, i.e. the position where $d_{i_{j}}$ appear in a whitelist, lower means that domain do not rapresent malicious actors.
- $\varepsilon^{\alpha}_{i_{j}}$, the prediction probability of the $\alpha$ _LSTM_ neural network.





In [4]:

dm(f"""

## Time slot analysis

Since we want to analyze the captures during their evolution, comparing for example the number
of requests in the same period of time, we need to synchronize each capture.
To do so we simply define a new timestamp for each packet:

$${Packet.ttimestamp()} = {Packet.timestamp()} - {Packet.timestamp(j=0)}$$

In that way, $\\forall i$ we obtain ${Packet.ttimestamp(j=0)} = 0$.

We can now split the timeline in the so-called _slots_. We define:

- $[0,\\: \\max{{{PCAP.duration()}}}]$, the timeline domain.

- ${SLOT.duration()}$, the slot duration.

- ${SLOT.number()} = \\left\\lceil \\max{{{PCAP.duration()}}} / {SLOT.duration()} \\right\\rceil$, the total number of slots.

Therefore, we obtain:
$$S = \\left\\{{ s_k \\: | \\: 0 \\le k < {SLOT.number()} \\right\\}}$$

where the time domain of {TH("k")} slot ${SLOT.slot()}$ is:

$$ s_k^t = \\left[ \\: k \\cdot {SLOT.duration()}, \\: (k+1) \\cdot {SLOT.duration()} \\: \\right]$$

For each slot, we calculate the following metrics:

- ${SLOT.q()}$, the number of requests performed during $s^t_k$.

- ${SLOT.u()}$, the number of unique domain names requested during $s^t_k$.

- ${SLOT.pp()}$, the number of positives performed during $s^t_k$.

If we want to restrict the metric to the {TH()} {AC.PCAP} ${PCAP.pcap()}$, we use the following notation:
$$s^{{q/u/\\mu}}_{{k,i}}$$


""")



## Time slot analysis

Since we want to analyze the captures during their evolution, comparing for example the number
of requests in the same period of time, we need to synchronize each capture.
To do so we simply define a new timestamp for each packet:

$$t_{i_{j}} = \bar{t}_{i_{j}} - \bar{t}_{i_{0}}$$

In that way, $\forall i$ we obtain $t_{i_{0}} = 0$.

We can now split the timeline in the so-called _slots_. We define:

- $[0,\: \max{D_{i}}]$, the timeline domain.

- $s^d$, the slot duration.

- $N^s = \left\lceil \max{D_{i}} / s^d \right\rceil$, the total number of slots.

Therefore, we obtain:
$$S = \left\{ s_k \: | \: 0 \le k < N^s \right\}$$

where the time domain of $k$-th slot $s_{k}$ is:

$$ s_k^t = \left[ \: k \cdot s^d, \: (k+1) \cdot s^d \: \right]$$

For each slot, we calculate the following metrics:

- $s^q_{k}$, the number of requests performed during $s^t_k$.

- $s^u_{k}$, the number of uniques performed during $s^t_k$.

- $s^\mu_{k}$, the number of positives performed during $s^t_k$.

If we want to limitate the metric to the $i$-th _PCAP_ $p_{i}$, we use the following notation:
$$s^{q/u/\mu}_{k,i}$$




In [5]:
# Largest TIME_S and TIME_S_TRANSLATED over all messages of the captures in DATASET.
# The dataset name is sent as a bound parameter instead of being formatted into
# the SQL text, so quoting is handled by the driver.
time_translation = database.conn.execute(
    text("""
    SELECT MAX(M.TIME_S) TIME_S,  MAX(M.TIME_S_TRANSLATED) TIME_S_TRANSLATED
    FROM MESSAGE M JOIN PCAP ON M.PCAP_ID = PCAP.ID
    WHERE PCAP.DATASET = :dataset
"""),
    {"dataset": DATASET},
).all()[0]._mapping

In [None]:
# One-hour slots for the whole dataset.
slot = common.Slot(database, 60 * 60, EPS_TH, DATASET, onlyfirsts=False)

# "q" summed per slot.
q_per_slot = slot.groupsum("q", use_timestamps=True)

q_tot = q_per_slot.sum()
slot_max = len(q_per_slot)
# NOTE: despite the name, this is the slot index at one quarter of the timeline.
slot_median = int(slot_max / 4)

# Cumulative share (in percent) of requests made up to each slot.
q_per_slot_cumperc = 100 * q_per_slot.cumsum() / q_tot

# Requests up to and including slot `slot_median`, and the remainder.
q_left50 = q_per_slot.cumsum().iloc[slot_median]
q_right50 = q_tot - q_left50
q_left50_perc = 100 * q_left50 / q_tot
q_right50_perc = 100 * q_right50 / q_tot

In [None]:
# tab_slot_distribution_cum_perc = Table(
#     slot_distribution_cum_perc_desc.map(lambda x: qq.Quantity(x).render(prec=2)).to_frame().T,
#     Tables.SLOTS,
#     f"Distribution of the slots through time."
# )

# Cumulative percentage of requests per slot, split at `slot_median` into a
# darker left area and a lighter right area, each annotated with its share.
fig, ax = plt.subplots(figsize=(8, 3))

ax.set_xlabel("$k$")
ax.set_ylabel(
    f"$\\frac{{\\sum^k_{{x=0}} {SLOT.q(k='x')}}}{{Q}}$",
    labelpad=16.0,
    rotation="horizontal",
    fontsize='x-large',
)
ax.text(slot_median / 2, 50, f"{q_left50_perc:.2f}%\n{qq.Quantity(q_left50).render(prec=2)}")
ax.text((slot_max + slot_median) / 2, 50, f"{q_right50_perc:.2f}%\n{qq.Quantity(q_right50).render(prec=2)}")

ax.fill_between(np.arange(slot_median + 1), q_per_slot_cumperc.iloc[:slot_median + 1], alpha=0.4, color="blue")
ax.fill_between(np.arange(slot_median, slot_max), q_per_slot_cumperc.iloc[slot_median:slot_max], alpha=0.2, color="blue")
ax.axvline(slot_median, color="black", lw=1)

# Ticks at every 10% of the timeline plus the last slot; reused by later cells.
slot_ticks = [math.floor(slot_max * (0.1 * tenth)) for tenth in range(10)] + [slot_max - 1]
ax.set_xticks(slot_ticks)

fig_slot = Figure(fig, ax, Figures.SLOTS,
                  "Percentage of requests made up to slot k.")
fig_slot.ycaption = -.08

plt.close()

In [None]:
# Requests per slot and capture: one row per slot, one column per pcap_id.
# Rows are shifted down by one position (the first row becomes all zeros).
q_slot_pcap = (
    slot.df[["q", "pcap_id", "slotnum"]]
    .groupby(["pcap_id", "slotnum"])
    .sum()
    .unstack()
    .T
    .fillna(0)
    .reset_index(level=0, drop=True)
    .shift(fill_value=0)
)
# Append a copy of the last row and renumber the rows from 0.
last_row = q_slot_pcap.iloc[-1]
q_slot_pcap = pd.concat([q_slot_pcap, last_row.to_frame().T], ignore_index=True)

fig = plt.figure(figsize=(8,3))
ax = fig.add_subplot(1, 1, 1)
ax = q_slot_pcap.plot(fig=fig, kind="bar", width=1, stacked=True, logy=False, legend=False, ax=ax)
ax.set_xticks(slot_ticks)
ax.set_xlabel("$k$")
ax.set_ylabel(f"${SLOT.q()}$", rotation="horizontal", labelpad=16.0, )
ax.axvline(slot_median, color="black", linewidth=0.4)

# The caption uses SLOT.q() so it matches the notation section and the y-label
# (it previously read "$q^s_k$", a symbol that is never defined).
fig_slot_pcap = Figure(fig, ax, Figures.SLOTS_PCAP,
                       f"Each bar indicates the requests number ${SLOT.q()}$ of the $k$-th slot.\nThe color indicates the capture which produced the requests.")
fig_slot_pcap.ycaption = -0.2

plt.close()

In [None]:
# Requests per slot and `dga` class: one row per slot, one column per class.
# Rows are shifted down by one position and the last row is repeated, as for
# the per-capture figure.
q_slot_dga = slot.df.copy()
q_slot_dga = q_slot_dga[["q", "dga", "slotnum"]].groupby(["dga", "slotnum"]).sum()
q_slot_dga = q_slot_dga.unstack().T.fillna(0).reset_index(level=0, drop=True)
q_slot_dga = q_slot_dga.shift(fill_value=0)
last_row = q_slot_dga.iloc[-1]
q_slot_dga = pd.concat([q_slot_dga, last_row.to_frame().T], ignore_index=True)

# Top row: whole timeline, one panel per class. Bottom row: zoom on the first slots.
fig, axs = plt.subplots(2, 3, figsize=(12,3), sharey=True)
zoom = 20
axs[0][1].set_ylim(0, q_slot_dga.max(axis=1).max())
for dga in range(3):
    axs[1][dga].set_xlim(-0.5, zoom + 0.5)

for slotnum, row in q_slot_dga.iterrows():
    for dga in range(3):
        # slotnum - 1 undoes the one-row shift applied above.
        axs[0][dga].bar(slotnum-1, row[dga], width=0.5, align="center", label=dga, color=["blue","orange","red"][dga])
        if slotnum <= zoom + 1:
            axs[1][dga].bar(slotnum-1, row[dga], width=0.5, align="center", label=dga, color=["blue","orange","red"][dga])

plt.close()

# The original passed (ax2, ax3, ax4), names that are not defined anywhere in
# this notebook, so the cell raised NameError on a fresh kernel.
# NOTE(review): the top row is passed to keep the three-axes tuple shape;
# confirm what latex.Figure expects for multi-panel figures.
# NOTE(review): the caption text was copied from the per-capture figure, but
# here the colour encodes the `dga` class; consider rewording it.
fig_slot_dga = Figure(fig, tuple(axs[0]), Figures.SLOTS_DGA, "Each bar indicates the requests number $q^s_k$ of the $k$-th slot.\nThe color indicates the capture which produced the requests.")
fig_slot_dga.show()

In [None]:

dm(f"""
#### Whole timeline slot analysis - ~90% of requests in the first 25% of the slots

We set ${SLOT.duration()}  =  $ {ptime(slot.SEC_PER_SLOT / 3600)}, obtaining
${SLOT.number()}={slot.df["slotnum"].max()}$.

In Figure {Figures.SLOTS.ref()}, showing the ${SLOT.q()}$ distribution, we
can observe that:

- The {100 * q_left50 / q_tot:.2f}% ({qq.Quantity(q_left50).render(prec=2)}) of
requests are performed within the first { slot_median / slot_max *
100:.0f}% of the slots.

- The {100 * q_right50 / q_tot:.2f}% ({qq.Quantity(q_right50).render(prec=2)})
of requests are performed within the last {100 - slot_median / slot_max *
100:.0f}% of the slots.

Furthermore, as we can see in Figure {Figures.SLOTS_DGA.ref()}, the {AC.NIC}
slots are relegated just to the first slot.


""")


# Display the three slot figures built in the previous cells.
fig_slot.show()
fig_slot_pcap.show()
fig_slot_dga.show()

### Time slot requests ratio

In [None]:

def slot_boxplot(slots, slot_hours, slot_range, qty_unit, col="q", showfliers=True, onlyfirsts=False):
    """Draw, for every slot in ``slot_range``, one box per ``dga`` class (0, 1, 2).

    Parameters
    ----------
    slots : dict
        Cache of ``common.Slot`` objects. Entries are keyed by
        ``(slot_hours, onlyfirsts)`` so that two calls differing only in
        ``onlyfirsts`` do not share a stale Slot.
    slot_hours : number
        Slot duration in hours.
    slot_range : sequence
        ``(first, end)`` slot numbers; ``end`` is excluded.
    qty_unit : str
        Unit passed to ``qq.Quantity`` for the x tick labels.
    col : str
        Column whose per-row values form the boxes.
    showfliers : bool
        Forwarded to ``Axes.boxplot``.
    onlyfirsts : bool
        Forwarded to ``common.Slot``.

    Returns
    -------
    The ``Figure`` wrapper built around the matplotlib figure.
    """
    cache_key = (slot_hours, onlyfirsts)
    if cache_key not in slots:
        slots[cache_key] = common.Slot(database, slot_hours * 60 * 60, EPS_TH, DATASET, onlyfirsts=onlyfirsts)
    slot_df = slots[cache_key].df

    in_range = slot_df[(slot_df.slotnum >= slot_range[0]) & (slot_df.slotnum < slot_range[1])]
    # Named aggregation keeps the "q" total and the list of `col` values apart
    # even when col == "q" (a dict spec would collapse the two entries).
    per_group = (
        in_range.groupby(["dga", "slotnum"])
        .agg(q_total=("q", "sum"), samples=(col, lambda s: s.to_list()))
        .reset_index()
        .pivot(index="slotnum", columns="dga", values=["q_total", "samples"])
    )

    W = 2
    XOffset = 6
    colors = ["blue", "orange", "red"]
    offsets = [-XOffset, 0, XOffset]

    samples = []
    totals = []
    positions = []
    widths = []
    facecolors = []
    edgecolors = []
    xticks_pos = []
    xticks_label = []
    for slotnum, row in per_group.iterrows():
        center = slotnum * (4 * XOffset)
        xticks_pos.append(center)
        xticks_label.append(qq.Quantity(1 + slotnum, units=qty_unit).render(prec=1))
        for dga in range(3):
            values = row.get(("samples", dga))
            total = row.get(("q_total", dga))
            # A (slot, dga) pair without rows comes out of the pivot as NaN:
            # draw it as an empty box with a total of 0.
            samples.append(values if isinstance(values, list) else [])
            totals.append(0 if total is None or pd.isna(total) else total)
            facecolors.append(mpcolors.to_rgba(colors[dga], 0.2))
            edgecolors.append(colors[dga])
            positions.append(center + offsets[dga])
            widths.append(W)

    fig = plt.figure(figsize=(18,8))
    ax = fig.add_subplot()
    boxs = ax.boxplot(samples,
                      manage_ticks=False,
                      notch=False,
                      positions=positions,
                      patch_artist=True,
                      showmeans=True,
                      widths=widths,
                      meanline=False,
                      showfliers=showfliers
                    )
    ax.set_xticks(xticks_pos, xticks_label)

    for box, facecolor, edgecolor in zip(boxs["boxes"], facecolors, edgecolors):
        box.set_facecolor(facecolor)
        box.set_edgecolor(edgecolor)

    # Under each box: sum of `col`, total "q", and their ratio in percent.
    for position, values, total in zip(positions, samples, totals):
        col_sum = sum(values)
        share = 100 * col_sum / total if total else float("nan")
        ax.text(position, -20, f"{col_sum}\n{total}\n{share:.2f}%", ha="center")

    ax.set_ylim(bottom=-8)
    # Placeholder caption; callers overwrite `.caption` on the returned object.
    figure = Figure(fig, ax, Figures.SLOTS_DGA, "ciao")

    plt.close()
    return figure

# fig2 = slot_boxplot(slots, 1, (0,14), "hr")
# fig2.show()
# slots = {}
# fig1 = slot_boxplot(slots, 1, (0,7), "hour", col="poswl_nn1",
#                     showfliers=False, onlyfirsts=True)

# fig1.caption = "Number of alarms for the first 10 hour"
# fig1.show()


In [14]:
import common
import latex
importlib.reload(common)
importlib.reload(latex)

def ploo(config, dn, nn, wl: bool):
    """Plot negatives and positives per slot and ``dga`` class as stacked bars.

    Parameters
    ----------
    config
        Key into the module-level ``slots`` cache; on a miss it is passed to
        ``common.Slot(database, config)``.
    dn
        ``"dn"`` selects the ``q`` column as total, any other value selects
        ``bdn``. It is also part of the negative/positive column names.
    nn
        Suffix of the negative/positive column names.
    wl : bool
        When true the ``negwl_*`` / ``poswl_*`` columns are used, otherwise
        ``neg_*`` / ``pos_*``.

    Returns
    -------
    tuple
        ``(df, figure)``: the per-slot sums, with one column per
        (metric, dga) pair, and the ``Figure`` wrapper. Only slots 0..20 are drawn.
    """
    if config not in slots:
        slots[config] = common.Slot(database, config)

    tot = "q" if dn == 'dn' else 'bdn'
    kind = f"{'wl' if wl else ''}_{dn}{nn}"
    labelneg = f"neg{kind}"
    labelpos = f"pos{kind}"
    df = slots[config].df.groupby(["dga", "slotnum"]).agg({
        tot: "sum",
        labelneg: "sum", labelpos: "sum"
    }).unstack(0).fillna(0)

    fig, axs = plt.subplots(figsize=(20, 4), tight_layout=True)

    # Sanity check: negatives + positives should add up to the total for every
    # class. The original computed this and discarded the result; it is now
    # reported as a warning so that an inconsistency is not silently ignored.
    mismatched = [
        dga for dga in range(3)
        if not (df[(tot, dga)] == (df[(labelneg, dga)] + df[(labelpos, dga)])).all()
    ]
    if mismatched:
        warnings.warn(f"{labelneg} + {labelpos} != {tot} for dga classes {mismatched}")

    W = 1
    colors = ["blue", "orange", "red"]
    offsets = [-W, 0, W]
    ticks_label = []
    ticks_pos = []
    for slotnum, row in df.iterrows():
        if not (slotnum >= 0 and slotnum <= 20):
            continue
        center = slotnum * 4*W
        ticks_label.append(slotnum)
        ticks_pos.append(center)
        for dga in range(3):
            neg, pos = row[(labelneg, dga)], row[(labelpos, dga)]
            x = center + offsets[dga]
            # Light bar for negatives, opaque bar stacked on top for positives.
            axs.bar(x, neg, color=mpcolors.to_rgba(colors[dga], 0.2))
            rects = axs.bar(x, pos, bottom=neg, color=mpcolors.to_rgba(colors[dga], 1))
            if (pos+neg) > 0:
                axs.bar_label(rects, labels=[qq.Quantity(int(pos)).render(prec=1)], padding=10)
            axs.bar_label(rects, labels=[qq.Quantity(int(pos+neg)).render(prec=1)], padding=0)

    axs.set_xticks(ticks_pos, ticks_label)

    fig1 = Figure(fig,axs, Figures.SLOTS_PCAP, f"({labelneg}, {labelpos})\n{slots[config].config}.")

    plt.close()
    return df, fig1

# The four one-hour configurations: onlyfirsts off/on crossed with the
# whitelist column DN/BDN, in that order.
configs = [
    common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=10000, DATASET=DATASET, onlyfirsts=only_first, WL_COL=wl_col)
    for only_first in (False, True)
    for wl_col in ("DN", "BDN")
]

In [28]:

dm("""
***
#### NOTA su BDN

> In questa sezione quando si parla di _"Whitelisting dei BDN"_ non si intende applicare NN ai BDN, ma **evitare** di applicare NN ai DN aventi BDN whitelisted.

Infatti:

> NN non viene mai applicato al solo BDN.


Quindi:

- l'analisi dei soli BDNFP implicherebbe **analizzare solo i BDN ed evitare di analizzare tutti i DN ad essi associati**.

- whitelistare un BDN implica **whitelistare tutti i DN ad esso associati**.

- è chiaro che se ci fidassimo al 100% di un BDN allora whitelistarlo sarebbe corretto.


#### Analisi dei falsi positivi considerando solo i BDN unici all'interno di uno slot.

Si:

- (ONLYFIRST=\\*, BDN, WL=True, RANK=\\*, NN=*):

    - Il numero di BDN è piuttosto basso (qualche centinaia per ora).

    - Il numero di FalsiPositivi è bassissimo.
    
    - Il valore di ONLYFIRST non influisce in quanto BDN implica il conteggio di BDN unici.
    
    - Il valore di WL fa sì che ci sia un cambiamento enorme nel numero di Falsi Positivi.

    - Il valore di RANK è ininfluente in quanto con BDN si sceglie automaticamente RANK=BDN.
    
    - Il valore di EPS è stato analizzato.

""")

# BDN counts with whitelisting applied, onlyfirsts=True.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=1000, DATASET=DATASET, onlyfirsts=True, WL_COL="BDN")
df, fig = ploo(config, dn="bdn", nn=1, wl=True)
fig.show()

# Same plot with onlyfirsts=False.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=1000, DATASET=DATASET, onlyfirsts=False, WL_COL="BDN")
df, fig = ploo(config, dn="bdn", nn=1, wl=True)
fig.show()


***
#### NOTA su BDN

> In questa sezione quando si parla di _"Whitelisting dei BDN"_ non si intende applicare NN ai BDN, ma **evitare** di applicare NN ai DN aventi BDN whitelisted.

Infatti:

> NN non viene mai applicato al solo BDN.


Quindi:

- l'analisi dei soli BDNFP implicherebbe **analizzare solo i BDN ed evitare di analizzare tutti i DN ad essi associati**.

- whitelistare un BDN implica **whitelistare tutti i DN ad esso associati**.

- è chiaro che se ci fidassimo al 100% di un BDN allora whitelistarlo sarebbe corretto.


#### Analisi dei falsi positivi considerando solo i BDN unici all'interno di uno slot.

Si:

- (ONLYFIRST=\*, BDN, WL=True, RANK=\*, NN=*):

    - Il numero di BDN è piuttosto basso (qualche centinaia per ora).

    - Il numero di FalsiPositivi è bassissimo.
    
    - Il valore di ONLYFIRST non influisce in quanto BDN implica il conteggio di BDN unici.
    
    - Il valore di WL fa sì che ci sia un cambiamento enorme nel numero di Falsi Positivi.

    - Il valore di RANK è ininfluente in quanto con BDN si sceglie automaticamente RANK=BDN.
    
    - Il valore di EPS è stato analizzato.



![](images/slots-pcap/20240626_190733715659.svg "Example")

![](images/slots-pcap/20240626_190733924572.svg "Example")

In [31]:

dm("""
***
#### Analisi dei falsi positivi considerando solo i BDN unici all'interno di uno slot.

- (ONLYFIRST=*, BDN, WL=NONE, NN=\\*):

    - In questo caso il numero di FP è esiguo, ma rimane il problema dei sottodomini utilizzati per scopi malevoli

- (ONLYFIRST=True, DN, WL=NONE, NN=\\*):

    - In questo caso il numero di FP può raggiungere 100 FP/ora.

    
- (ONLYFIRST=True, DN, WL=DN, NN=\\*):

    - Con WL_TH=10e3 non cambia nulla.
    
    - Con WL_TH=10e6 nemmeno.


- (ONLYFIRST=True, BDN, WL=DN, NN=\\*):

    - Con WL_TH=10e3 non cambia nulla.
    
    - Con WL_TH=10e6 si riduce di qualche decina il numero di FP/ora.

""")
# BDN counts without whitelisting.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=1000, DATASET=DATASET, onlyfirsts=True, WL_COL="BDN")
df, fig = ploo(config, dn="bdn", nn=1, wl=False)
fig.show()

# DN counts without whitelisting.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=1000, DATASET=DATASET, onlyfirsts=True, WL_COL="BDN")
df, fig = ploo(config, dn="dn", nn=1, wl=False)
fig.show()
# DN counts with whitelisting on the DN column.
# NOTE(review): this configuration and the next one are identical (WL_TH=10e6),
# while the text above discusses both WL_TH=10e3 and WL_TH=10e6 - confirm
# whether one of the two runs was meant to use 10e3.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=10e6, DATASET=DATASET, onlyfirsts=True, WL_COL="DN")
df, fig = ploo(config, dn="dn", nn=1, wl=True)
fig.show()
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=10e6, DATASET=DATASET, onlyfirsts=True, WL_COL="DN")
df, fig = ploo(config, dn="dn", nn=1, wl=True)
fig.show()
# DN counts with whitelisting on the BDN column.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=10e6, DATASET=DATASET, onlyfirsts=True, WL_COL="BDN")
df, fig = ploo(config, dn="dn", nn=1, wl=True)
fig.show()



***
#### Analisi dei falsi positivi considerando solo i BDN unici all'interno di uno slot.

- (ONLYFIRST=*, BDN, WL=NONE, NN=\*):

    - In questo caso il numero di FP è esiguo, ma rimane il problema dei sottodomini utilizzati per scopi malevoli

- (ONLYFIRST=True, DN, WL=NONE, NN=\*):

    - In questo caso il numero di FP può raggiungere 100 FP/ora.

    
- (ONLYFIRST=True, DN, WL=DN, NN=\*):

    - Con WL_TH=10e3 non cambia nulla.
    
    - Con WL_TH=10e6 nemmeno.


- (ONLYFIRST=True, BDN, WL=DN, NN=\*):

    - Con WL_TH=10e3 non cambia nulla.
    
    - Con WL_TH=10e6 si riduce di qualche decina il numero di FP/ora.



![](images/slots-pcap/20240626_191450364849.svg "Example")

![](images/slots-pcap/20240626_191450637950.svg "Example")

![](images/slots-pcap/20240626_191450894545.svg "Example")

![](images/slots-pcap/20240626_191451124186.svg "Example")

![](images/slots-pcap/20240626_191451394981.svg "Example")

In [34]:

dm("""
***
#### Whitelisting potente solo senza onlyfirsts (riduce i FP/ora di un fattore ~10).

""")
# onlyfirsts=False: BDN counts without whitelisting.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=1000, DATASET=DATASET, onlyfirsts=False, WL_COL="BDN")
df, fig = ploo(config, dn="bdn", nn=1, wl=False)
fig.show()

# onlyfirsts=False: DN counts with whitelisting on the BDN column.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=1000, DATASET=DATASET, onlyfirsts=False, WL_COL="BDN")
df, fig = ploo(config, dn="dn", nn=1, wl=True)
fig.show()



***
#### Whitelisting potente solo senza onlyfirsts (riduce i FP/ora di un fattore ~10).



![](images/slots-pcap/20240626_192233162220.svg "Example")

![](images/slots-pcap/20240626_192233401230.svg "Example")

In [57]:

# Reference configuration (DN, onlyfirsts, no whitelisting); `df` feeds the
# counts quoted in the conclusions text below.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=0, DATASET=DATASET, onlyfirsts=True, WL_COL="DN")
df, fig = ploo(config, dn="dn", nn=1, wl=False)

dm(f"""
***
#### Conclusioni

- Non utilizzare ONLYFIRSTS sarebbe da stupidi.

- Considerare solo i BDN è utile se-e-solo-se è sicuro al 100% whitelistare un
  BDN, e quindi tutti i DN ad esso associati.

- Non considereremo il caso in cui si considerano solo i BDN, anche se il numero
  di FP/ora è molto basso, per il motivo precedente, e per l'esiguo numero.

- Il whitelisting aiuta relativamente, di qualche decina/ora, considerando i DN,
  se-e-solo-se la soglia WL_TH è 10e6 e facendo il Whitelisting-One-To-Many dei
  BDN, nel caso Whitelisting-One-To-One dei DN non si avrebbero migliorie.

- C'è anche da dire che il WHITELISTING dipende sia dalla lista sia
  dall'ordine/rank. Lo ometterei, avendo qui dimostrato che influisce solo per
  qualche decina se-e-solo-se onlyfirsts è abilitato.

- Quindi le analisi verranno effettuate considerando DN, WHITELISTING non
  utilizzato.

La configurazione con cui si effettueranno i confronti sarà:

> (DN, ONLYFIRSTS, NO-WHITELISTING)

#### Obbiettivo

> Ideare un IDS basato su questo NN che riduca a zero il numero di Falsi
Positivi - {int(df[("pos_dn1", 0)].sum())} su un totale di {int(df[("q",
0)].sum())} prodotti in {(df[("q", 0)] > 0).sum()} ore - mantenendo la capacità
di rilevare le minacce presenti.

""")

# Display the figure returned by ploo for the reference configuration.
fig.show()





***
#### Conclusioni

- Non utilizzare ONLYFIRSTS sarebbe da stupidi.

- Considerare solo i BDN è utili se-e-solo-se è sicuro al 100% whitelistare un
  BDN, e quindi tutti i DN ad esso associati.

- Non considereremo il caso in cui si considerano solo i BDN, anche se il numero
  di FP/ora è molto basso, per il motivo precedente, e per l'esiguo numero.

- Il whitelisting aiuta relativamente, di qualche decina/ora, considerando i DN,
  se-e-solo-se la soglia WL_TH è 10e6 e facendo il Whitelisting-One-To-Many dei
  BDN, nel caso Whitelisting-One-To-One dei DN non si avrebbero migliorie.

- C'è anche da dire che il WHITELISTING dipende sia dalla lista sia
  dall'ordine/rank. Lo ometterei, avendo qui dimostrato che influisce solo per
  qualche decina se-e-solo-se onlyfirsts è abilitato.

- Quindi le analisi verranno effettuate considerando DN, WHITELISTING non
  utilizzato.

La configurazione con cui si effettueranno i confronti sarà:

> (DN, ONLYFIRSTS, NO-WHITELISTING)

#### Obbiettivo

> Ideare un IDS basato su questo NN che riduca a zero il numero di Falsi
Positivi - 330 su un totale di 15017 prodotti in 7 ore - mantenendo la capacità
di rilevare le minacce presenti.



![](images/slots-pcap/20240626_193444514952.svg "Example")

In [None]:
dm("""

- Se usassimo (ONLYFIRST, BDN, WL, RANK=BDN, EPS=DN), allora otterremmo un numero veramente basso di FalsiPositivi.

- Allo stesso tempo però, alcuni BDN whitelisted potrebbero essere comunque utilizzati per comportamenti malevoli, per cui sarebbe meglio non utilizzare questa combinazione (BDN, WL, RANK_BDN, EPS_DN).

- Se invece adottassimo la seguente configurazione (BDN, !WL, RANK_BDN, EPS_DN), otterremmo risultati simili al caso (DN, WL, RANK_BDN, EPS_DN)

- 

""")

# BDN counts without whitelisting, onlyfirsts=False.
config = common.SlotConfig(SEC_PER_SLOT=1 * 60 * 60, TH=0.999, WL_TH=1000, DATASET=DATASET, onlyfirsts=False, WL_COL="DN")
df, fig = ploo(config, dn="bdn", nn=1, wl=False)
fig.show()

In [None]:
import common
import latex
importlib.reload(common)
importlib.reload(latex)

# Twelve-hour slots; sums of the request and neg/pos columns per (slot, dga class).
slot = common.Slot(database, 12 * 60 * 60, EPS_TH, WLTH=1000, DATASET=DATASET, onlyfirsts=True)
df = (
    slot.df.groupby(["dga", "slotnum"])
    .agg(dict.fromkeys(("q", "negwl_nn1", "poswl_nn1", "neg_nn1", "pos_nn1"), "sum"))
    .unstack(0)
    .fillna(0)
)

fig, axs = plt.subplots()

def TEST_NEGPLUSPOS_EQ_Q(df):
    """Print whether neg_nn1 + pos_nn1 equals q in every slot for all three dga classes."""
    checks = [
        (df[("q", dga)] == (df[("neg_nn1", dga)] + df[("pos_nn1", dga)])).all()
        for dga in range(3)
    ]
    print(all(checks))

W = 1
ticks_label = []
ticks_pos = []
for slotnum, row in df.iterrows():
    ticks_label.append(slotnum)
    ticks_pos.append(slotnum * 6*W)
    for dga in range(3):
        # Opaque bar for negwl_nn1, then a translucent bar for q at the same x.
        axs.bar(slotnum * 6*W + [-W, 0, W][dga], row[("negwl_nn1", dga)], color=["blue","orange","red"][dga])
        axs.bar(slotnum * 6*W + [-W, 0, W][dga], row[("q", dga)], color=mpcolors.to_rgba(["blue","orange","red"][dga], 0.2))
    # Stop after slot 10 has been drawn.
    if slotnum >= 10:
        break

axs.set_xticks(ticks_pos, ticks_label)

fig1 = Figure(fig,axs, Figures.SLOTS_PCAP, "Number of alarms for the first 10 hour")
fig1.show()

plt.close()

In [None]:
# Box plots of the alarm columns, with and without whitelisting ("poswl_nn1" /
# "pos_nn1") and with onlyfirsts on and off. The four runs were copy-pasted;
# they are now one loop. The caption said "first 10 hour" although
# slot_range=(0, 7) with one-hour slots covers the first 7 hours.
for alarm_col, onlyfirsts in [
    ("poswl_nn1", True),
    ("poswl_nn1", False),
    ("pos_nn1", True),
    ("pos_nn1", False),
]:
    # Start from an empty cache on every run, as the original cells did.
    slots = {}
    fig1 = slot_boxplot(slots, 1, (0,7), "hour", col=alarm_col,
                        showfliers=False, onlyfirsts=onlyfirsts)

    fig1.caption = "Number of alarms for the first 7 hours"
    fig1.show()



# slots = {}
# fig1 = slot_boxplot(slots, 24, (0,7), "day",
#                     showfliers=False, onlyfirsts=True)

# fig1.caption = "We can see that the fliers are diffuclt to handle"
# fig1.show()

# slots = {}
# fig1 = slot_boxplot(slots, 24, (0,7), "day",
#                     showfliers=False, onlyfirsts=False)

# fig1.caption = "We can see that the fliers are diffuclt to handle"
# fig1.show()

# slots = {}
# fig1 = slot_boxplot(slots, 24, (0,7), "day",
#                     showfliers=False, onlyfirsts=True)

# fig1.caption = "We can see that the fliers are diffuclt to handle"
# fig1.show()

# slots = {}
# fig1 = slot_boxplot(slots, 24, (0,7), "day",
#                     showfliers=False, onlyfirsts=False)

# fig1.caption = "We can see that the fliers are diffuclt to handle"
# fig1.show()

In [None]:

def slot_hist(slots, slot_hours, slot_range, qty_unit, col="q", showfliers=True, onlyfirsts=False):
    """Stacked bars of ``q`` minus ``pos_nn1`` and ``pos_nn1`` per slot, for dga classes 0 and 2.

    Parameters
    ----------
    slots : dict
        Cache of ``common.Slot`` objects. Entries are keyed by
        ``(slot_hours, onlyfirsts)`` so that two calls differing only in
        ``onlyfirsts`` do not share a stale Slot.
    slot_hours : number
        Slot duration in hours.
    slot_range : sequence
        ``(first, end)`` slot numbers; ``end`` is excluded.
    qty_unit, col, showfliers
        Accepted for signature parity with ``slot_boxplot`` but not used here;
        the plotted columns are always ``q`` and ``pos_nn1``.
    onlyfirsts : bool
        Forwarded to ``common.Slot``.

    Returns
    -------
    The ``Figure`` wrapper built around the matplotlib figure.
    """
    cache_key = (slot_hours, onlyfirsts)
    if cache_key not in slots:
        slots[cache_key] = common.Slot(database, slot_hours * 60 * 60, EPS_TH, DATASET, onlyfirsts=onlyfirsts)
    slot = slots[cache_key]

    q_slot = slot.df.copy()
    q_slot = q_slot[(q_slot.slotnum >= slot_range[0]) & (q_slot.slotnum < slot_range[1])]
    q_slot = q_slot[["q", "pos_nn1", "dga", "slotnum"]]

    q_slot = q_slot.groupby(["dga","slotnum"]).sum()
    q_slot = q_slot.unstack(0).fillna(0)

    # Turn "q" into "requests that are not positives" so the two stacked
    # segments add up to the original q.
    q_slot[("q",0)] -= q_slot[("pos_nn1",0)]
    q_slot[("q",2)] -= q_slot[("pos_nn1",2)]

    fig = plt.figure(figsize=(14,10))
    ax = fig.add_subplot()
    for dga in [0,2]:
        bottom = np.zeros(q_slot.shape[0])
        tmp = q_slot.xs(dga, level=1, axis=1)
        for cc in ["q", "pos_nn1"]:
            rects = ax.bar(
                # Class 0 is drawn left of the slot position, class 2 right of it.
                x=tmp[cc].index.values + (0.2 if dga else -0.2),
                height=tmp[cc].values,
                width=0.3,
                label=cc,
                bottom=bottom
            )
            bottom += tmp[cc].values
            ax.bar_label(rects, labels=[qq.Quantity(v).render(prec=2) for v in rects.datavalues], padding=3, label_type="center")
    # Placeholder caption; callers overwrite `.caption` on the returned object.
    fig = Figure(fig, ax, Figures.SLOTS_DGA, "ciao")
    ax.legend()
    plt.close()
    return fig
    
# fig2 = slot_boxplot(slots, 1, (0,14), "hr")
# fig2.show()
# Daily slots: stacked histogram first, then the box plot of the same column.
fig1 = slot_hist(slots, 24, (0,7), "day", col="pos_nn1",
                    showfliers=True, onlyfirsts=True)
fig1.caption = "We can see that the fliers are diffuclt to handle"
fig1.show()


# The result was assigned to `fig12` while the next lines used `fig2`, which
# is not defined anywhere (NameError on a fresh kernel).
fig2 = slot_boxplot(slots, 24, (0,7), "day", col="pos_nn1",
                    showfliers=True, onlyfirsts=True)
fig2.caption = "We can see that the fliers are diffuclt to handle"
fig2.show()


In [None]:
# slots = {}
# Daily histogram with onlyfirsts=True; reuses whatever is already cached in `slots`.
fig1 = slot_hist(slots, 24, (0,7), "day", col="pos_nn1",
                    showfliers=True, onlyfirsts=True)

fig1.caption = "We can see that the fliers are diffuclt to handle"
fig1.show()

# Start from an empty cache before switching to onlyfirsts=False.
slots = {}
fig1 = slot_boxplot(slots, 24, (0,7), "day", col="pos_nn1",
                    showfliers=True, onlyfirsts=False)

fig1.caption = "We can see that the fliers are diffuclt to handle"
fig1.show()

In [None]:

pp(f"""
#### Conclusion

The data set is unbalanced towards the infected class in many ways:

- The number of requests.

- The capture duration and so the number of slots.

- The number of requests per time.

> Therefore we decide to ignore the last {100 - slot_median / slot_max *
100:.0f}% of the slots, i.e. we keep only the first {slot_median} of {slot_max} slots.
""")