In [1]:
from math import floor
import math
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
import numpy as np
import common # type: ignore
from sklearn.feature_selection import VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector
from IPython.display import display, Markdown, Latex, HTML
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, text
import quantiphy as qq
import warnings
import enum

# Slot length in seconds: 12 hours per slot.
SEC_PER_SLOT = 12 * 60 * 60
# Threshold forwarded to common.Slot below; its exact meaning is defined in
# the project-local `common` module (not visible here) — TODO confirm.
EPS_TH = 0.5
# Dataset name: passed to common.Slot and used to filter PCAP.DATASET in SQL.
DATASET = "CTU-13"
# Project-local helpers; `database.conn` is used later to run SQL queries.
database = common.Database()
dataset = common.Dataset()
# Slot helper exposing `.df` and `.groupsum(...)`, which the cells below use.
slot = common.Slot(database, SEC_PER_SLOT, EPS_TH, DATASET)


In [2]:
import os
import importlib
import latex
importlib.reload(latex)

from latex import dm, pp, ptime, Cites, AC
from latex import Tables, Table, Figure, Figures
from latex import is_latex, set_latex, unset_latex

In [3]:

# Narrative cell introducing the slot notation. The text has no interpolated
# values, so a raw string replaces the f-string: LaTeX braces and backslashes
# no longer need to be escaped. The slot size is written as $s$ in the formula
# to agree with the sentence that defines it.
dm(r"""

### Time slot analysis for uniques

We want to analyse the number of requests in a time window, here called _slot_,
during the evolution of all captures. To achieve this, since the captures
start at different times, we need to set the starting time of each capture at
the same time. To obtain this, in the Packet-table, we translate all the
timestamps in such a way that the first packet timestamp of each capture is 0 s.


### Distribution of slots

#### Notation

We indicate with $t^s_k$ the timestamp of the $k$-th slot:

$$t^s_k = s \cdot k$$

where $s$ is the time slot size.

With $u^s_{k}$ we indicate the number of uniques belonging to the $k$-th slot,
i.e. performed between
$t^s_{k-1}$ and $t^s_{k}$,
with ($t^s_{-1} = 0$).

Finally, with $N^s = \lceil t^{max} / s \rceil$ we indicate the total number of slots.

""")



### Time slot analysis for uniques

We want to analyse the number of requests in a time window, here called _slot_,
during the evolution of all captures. To achieve this, since the captures
start at different times, we need to set the starting time of each capture at
the same time. To obtain this, in the Packet-table, we translate all the
timestamps in such a way that the first packet timestamp of each capture is 0 s.


### Distribution of slots

#### Notation

We indicate with $t^s_k$ the timestamp of the $k$-th slot:

$$t^s_k = s \cdot k$$

where $s$ is the time slot size.

With $u^s_{k}$ we indicate the number of uniques belonging to the $k$-th slot,
i.e. performed between
$t^s_{k-1}$ and $t^s_{k}$,
with ($t^s_{-1} = 0$).

Finally, with $N^s = \lceil t^{max} / s \rceil$ we indicate the total number of slots.



In [4]:
# Largest original and translated timestamps over all messages whose capture
# belongs to DATASET. The dataset name is passed as a bound parameter instead
# of being formatted into the SQL text, so quoting is handled by the driver.
# The query is an aggregate without GROUP BY, hence exactly one row: .one()
# makes that expectation explicit.
time_translation = (
    database.conn.execute(
        text("""
    SELECT MAX(M.TIME_S) TIME_S,  MAX(M.TIME_S_TRANSLATED) TIME_S_TRANSLATED
    FROM MESSAGE M JOIN PCAP ON M.PCAP_ID = PCAP.ID
    WHERE PCAP.DATASET = :dataset
"""),
        {"dataset": DATASET},
    )
    .one()
    ._mapping
)

In [5]:
# Per-slot totals of the "u" column as returned by slot.groupsum
# (presumably one value per slot — confirm against common.Slot).
q_per_slot = slot.groupsum("u", use_timestamps=True)
q_cumulative = q_per_slot.cumsum()

# Number of slots and the split point used by the analysis below.
# NOTE: despite its name, slot_median is the index at one quarter of the
# slot range, not the median.
slot_max = q_per_slot.shape[0]
slot_median = slot_max // 4
q_tot = q_per_slot.sum()

# Amount accumulated up to and including slot_median, and the remainder.
q_left50 = q_cumulative.iloc[slot_median]
q_right50 = q_tot - q_left50

# The same quantities expressed as percentages of the grand total.
q_per_slot_cumperc = 100 * q_cumulative / q_tot
q_left50_perc = 100 * q_left50 / q_tot
q_right50_perc = 100 * q_right50 / q_tot

In [6]:
# Figure: cumulative percentage of the per-slot totals up to slot k, with the
# areas before and after `slot_median` shaded with different opacity.
fig, ax = plt.subplots(figsize=(8, 3))

ax.set_xlabel("$k$")
ax.set_ylabel(
    "$\\sum^k_{j=0}{\\:\\frac{q_j}{q_{tot}}}$",
    labelpad=16.0,
    rotation="horizontal",
    fontsize="x-large",
)

# Annotate each side of the split with its share and its absolute amount.
left_label = f"{q_left50_perc:.2f}%\n{qq.Quantity(q_left50).render(prec=2)}"
right_label = f"{q_right50_perc:.2f}%\n{qq.Quantity(q_right50).render(prec=2)}"
ax.text(slot_median / 2, 50, left_label)
ax.text((slot_max + slot_median) / 2, 50, right_label)

# Shade the cumulative curve on both sides of the split and mark the split.
left_x = np.arange(slot_median + 1)
right_x = np.arange(slot_median, slot_max)
ax.fill_between(left_x, q_per_slot_cumperc.iloc[: slot_median + 1], alpha=0.4, color="blue")
ax.fill_between(right_x, q_per_slot_cumperc.iloc[slot_median:slot_max], alpha=0.2, color="blue")
ax.axvline(slot_median, color="black", lw=1)

# Ticks at every tenth of the slot range plus the last slot index; the list
# is reused by the following figures.
slot_ticks = [math.floor(slot_max * (0.1 * i)) for i in range(10)]
slot_ticks.append(slot_max - 1)
ax.set_xticks(slot_ticks)

fig_slot = Figure(fig, ax, Figures.SLOTS, "Percentage of uniques made up to slot k.")
fig_slot.ycaption = -0.04

# Close the pyplot figure so the cell itself renders no inline plot.
plt.close()

In [7]:
# Per-capture totals of "u" for every slot: rows are slots, columns captures.
q_slot_pcap = (
    slot.df[["u", "pcap_id", "slotnum"]]
    .groupby(["pcap_id", "slotnum"])
    .sum()
    .unstack()
    .T
    .fillna(0)
    .reset_index(level=0, drop=True)
)
# Move every slot one position down (an all-zero row becomes position 0).
# shift() discards the final row, so it is saved *before* shifting and
# appended afterwards; taking it after the shift would duplicate the
# penultimate slot and silently lose the last one.
last_row = q_slot_pcap.iloc[-1]
q_slot_pcap = q_slot_pcap.shift(fill_value=0)
q_slot_pcap = pd.concat([q_slot_pcap, last_row.to_frame().T], ignore_index=True)

# Stacked bars: one bar per slot, one colour per capture.
fig = plt.figure(figsize=(8,3))
ax = fig.add_subplot(1, 1, 1)
ax = q_slot_pcap.plot(fig=fig, kind="bar", width=1, stacked=True, logy=False, legend=False, ax=ax)
ax.set_xticks(slot_ticks)
ax.set_xlabel("$k$")
ax.set_ylabel("$q^s_k$", rotation="horizontal", labelpad=16.0, )
ax.axvline(slot_median, color="black", linewidth=0.4)

fig_slot_pcap = Figure(fig, ax, Figures.SLOTS_PCAP,
                       "Each bar indicates the uniques number $q^s_k$ of the $k$-th slot.\nThe color indicates the capture which produced the requests.")
fig_slot_pcap.ycaption = -0.2

# Close the pyplot figure so the cell itself renders no inline plot.
plt.close()

In [8]:
# Per-class totals of "u" for every slot: rows are slots, columns are the
# values of the "dga" column.
q_slot_dga = (
    slot.df[["u", "dga", "slotnum"]]
    .groupby(["dga", "slotnum"])
    .sum()
    .unstack()
    .T
    .fillna(0)
    .reset_index(level=0, drop=True)
)
# Move every slot one position down (an all-zero row becomes position 0).
# shift() discards the final row, so it is saved *before* shifting and
# appended afterwards; taking it after the shift would duplicate the
# penultimate slot and silently lose the last one.
last_row = q_slot_dga.iloc[-1]
q_slot_dga = q_slot_dga.shift(fill_value=0)
q_slot_dga = pd.concat([q_slot_dga, last_row.to_frame().T], ignore_index=True)


# Stacked bars on a log scale: one bar per slot, one colour per class.
fig = plt.figure(figsize=(8,3))
ax = fig.add_subplot(1, 1, 1)
ax = q_slot_dga.plot(fig=fig, kind="bar", width=1, stacked=True, logy=True , legend=True, ax=ax)
ax.set_xticks(slot_ticks)
ax.set_xlabel("$k$")
ax.set_ylabel("$q^s_k$", rotation="horizontal", labelpad=16.0)
# NOTE(review): the labels assume exactly two "dga" columns ordered
# not-infected first — confirm against the data.
ax.legend(["not-infected", "infected"])
ax.axvline(slot_median, color="black", linewidth=0.4)
plt.close()

# The caption describes the class colouring shown in the legend (it previously
# repeated the per-capture caption of the other figure).
fig_slot_dga = Figure(fig, ax, Figures.SLOTS_DGA, "Each bar indicates the uniques number $q^s_k$ of the $k$-th slot.\nThe color indicates the class (not-infected or infected) of the requests.")
fig_slot_dga.ycaption = -0.2


In [12]:

# Narrative summary of the slot analysis, followed by the three figures.
# The slots before `slot_median` are kept, so the ignored tail counts
# slot_max - slot_median slots (the text previously reported the kept ones).
# NOTE(review): $N^s$ is rendered from the maximum "slotnum" value while the
# closing sentence uses slot_max (row count of q_per_slot) — confirm that the
# two are meant to agree.
slots_ignored = slot_max - slot_median

dm(f"""
#### Analysis

We set $s =  {ptime(SEC_PER_SLOT / 3600)}$, obtaining
$N^s={slot.df["slotnum"].max()}$.

In Figure {Figures.SLOTS.ref()}, showing the $q_{{s_k}}$ distribution, we
can observe:

- The {100 * q_left50 / q_tot:.2f}% ({qq.Quantity(q_left50).render(prec=2)}) of
requests are performed within the first {100 * slot_median / slot_max:.1f}% of the slots.
- The {100 * q_right50 / q_tot:.2f}% ({qq.Quantity(q_right50).render(prec=2)})
of requests are within the last {100 * (1 - slot_median / slot_max):.1f}% slots.

Furthermore, as we can see in Figure {Figures.SLOTS_DGA.ref()}, the {AC.NIC}
slots are relegated just to the first slot.


#### Conclusion

The data set is unbalanced towards the infected class in many ways:

- The number of uniques.

- The capture duration and so the number of slots.

- the number of uniques per time.

> Therefore we decide to ignore the last {100 - slot_median / slot_max * 100:.0f}% slots, corresponding to {slots_ignored} of {slot_max} slots.

""")


fig_slot.show()
fig_slot_pcap.show()
fig_slot_dga.show()


#### Analysis

We set $s =  12 hr$, obtaining
$N^s=142$.

In Figure _[slots]_, showing the $q_{s_k}$ distribution, we
can observe:

- The 79.31% (286k) of
requests are performed within the first 25.0% of the slots.
- The 20.69% (74.6k)
of requests are within the last 75.0% slots.

Furthermore, as we can see in Figure _[slots-dga]_, the _NIC_
slots are relegated just to the first slot.


#### Conclusion

The data set is unbalanced towards the infected class in many ways:

- The number of requests.

- The capture duration and so the number of slots.




- the number of requests per time.

> Therefore we decide to ignore the last 75% slots, corresponding to 108 of 144 slots.



![](slots.svg "Example")

![](slots-pcap.svg "Example")

![](slots-dga.svg "Example")