In [1]:
from math import floor
import math
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
import numpy as np
import common # type: ignore
from sklearn.feature_selection import VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector
from IPython.display import display, Markdown, Latex, HTML
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, text
import quantiphy as qq
import warnings
import enum

# SEC_PER_SLOT = 12 * 60 * 60
# EPS_TH = 0.5
# Name of the dataset whose captures are selected from the PCAP table
# (used in the WHERE clause of the query that builds DF).
DATASET = "CTU-13"
# Project database handle; its `engine` attribute is handed to pandas
# when the captures are loaded.
database = common.Database()
# dataset = common.Dataset()
# slot = common.Slot(database, SEC_PER_SLOT, EPS_TH, DATASET)

def dm(x):
    """Render ``x`` as Markdown in the notebook output area.

    Args:
        x: Markdown source text to display.

    Returns:
        None.
    """
    rendered = Markdown(x)
    display(rendered)

In [None]:
import os

# LaTeX-oriented output is enabled by the mere presence of the TO_LATEX
# environment variable; its value is not inspected.
TO_LATEX = "TO_LATEX" in os.environ

In [11]:

# Names of the two capture classes, interpolated into the generated prose.
LI = "infected"
LNI = "not-infected"

class Label(enum.Enum):
    """Member-less base enum for items that can be labelled and referenced.

    ``label()`` and ``ref()`` read ``self.prefix`` and ``self._``; this base
    class does not set them, so subclasses have to assign both on every
    member (typically in their own ``__init__``).
    """

    def __init__(self, *args, **kwargs):
        # Nothing to initialise here; subclasses provide `prefix` and `_`.
        return None

    def label(self):
        """Return the anchor text that defines this item.

        With ``TO_LATEX`` truthy this is the ``@@label`` macro built from the
        prefix and the slug; otherwise it is the slug in square brackets.
        """
        return f"@@label@:{self.prefix}:{self._}:@" if TO_LATEX else f"[{self._}]"

    def ref(self):
        """Return the text that refers to this item.

        With ``TO_LATEX`` truthy this is the ``@@ref`` macro built from the
        prefix and the slug; otherwise it is the bracketed slug in italics.
        """
        return f"@@ref@:{self.prefix}:{self._}:@" if TO_LATEX else f"_[{self._}]_"

class AC(enum.Enum):
    """Acronyms used in the text; each value is the expanded form."""

    DNS = "Domain Name System"
    CC = "Command&Control"
    DGA = "Domain Name Generation"
    MCFP = "Malware Capture Facility Project"
    PCAP = "Packet CAPture"

    def __str__(self):
        """Return the ``@@ac`` macro when ``TO_LATEX`` is truthy, else the italic name."""
        return f"@@ac@:{self.name}:@" if TO_LATEX else f"_{self.name}_"

class Cites(enum.Enum):
    """Bibliography keys cited in the text."""

    CTU_SME_11 = 0
    DOS_DONTS = 1
    STSPH = 2

    def __init__(self, *args, **kwargs):
        # Citation key: the member name with underscores turned into hyphens
        # (case is kept), e.g. CTU_SME_11 -> "CTU-SME-11".
        self._ = "-".join(self.name.split("_"))

    def __str__(self):
        """Return the ``@@cite`` macro when ``TO_LATEX`` is truthy, else the bracketed key in italics."""
        return f"@@cite@:{self._}:@" if TO_LATEX else f"_[{self._}]_"

class Tables(Label):
    """Labels of the tables produced by the notebook (prefix ``tab``)."""

    TOTAL_Q = 0
    AVERAGE_Q = 1
    Q_PER_S = 2
    DURATION = 3

    def __init__(self, *args, **kwargs):
        # Slug derived from the member name, e.g. TOTAL_Q -> "total-q".
        self._ = "-".join(self.name.lower().split("_"))
        self.prefix = "tab"

class Figures(Label):
    """Labels of the figures produced by the notebook (prefix ``fig``)."""

    Q_PER_S = 0
    DURATION = 1

    def __init__(self, *args, **kwargs):
        # Slug derived from the member name, e.g. Q_PER_S -> "q-per-s".
        self._ = "-".join(self.name.lower().split("_"))
        self.prefix = "fig"

class Figure:
    """A figure bundled with its label and caption.

    Attributes are stored as given: ``fig`` (the object that is saved),
    ``axs``, ``label`` (must offer ``label()`` and a ``_`` slug) and
    ``caption``.
    """

    def __init__(self, fig, axs, label, caption):
        self.fig, self.axs = fig, axs
        self.label, self.caption = label, caption

    def show(self, ycaption=-0.1):
        """Save the figure to disk and emit the markup that includes it.

        The file is named after the label slug, with a ``pdf`` extension when
        ``TO_LATEX`` is truthy and ``svg`` otherwise.

        Args:
            ycaption: vertical position of the caption text drawn on the
                figure; only used when ``TO_LATEX`` is ``False``.
        """
        extension = "pdf" if TO_LATEX else "svg"
        fname = f"{self.label._}.{extension}"
        if TO_LATEX is not False:
            self._show_latex(fname)
        else:
            self._show_markdown(fname, ycaption)

    def _show_markdown(self, fname, ycaption):
        # Title and caption are drawn on the figure itself before saving.
        self.fig.suptitle(self.label.label())
        self.fig.text(.5, ycaption, self.caption, ha='center')
        self.fig.savefig(fname, bbox_inches="tight")
        dm(f'![]({fname} "Example")')

    def _show_latex(self, fname):
        # Emits a figure environment using the notebook's @@ macro notation.
        self.fig.savefig(fname)
        dm("@@begin@:figure:@")
        dm("@@centering")
        dm(f"@@includegraphics@qwidth=@@textwidthq@ @:{fname}:@")
        dm(f"@@caption@:{self.caption}:@")
        dm(self.label.label())
        dm("@@end@:figure:@")

class Table:
    """A data frame bundled with its label and caption.

    The frame passed in is copied, so later changes to the caller's frame do
    not affect the table.
    """

    def __init__(self, df, label, caption):
        self.df = df.copy()
        self.label, self.caption = label, caption

    def show(self, width=None):
        """Display the table as styled HTML, or as LaTeX-oriented markup.

        Args:
            width: CSS ``min-width`` for the HTML table; ignored unless
                ``TO_LATEX`` is ``False``.
        """
        if TO_LATEX is not False:
            self.show_latex()
        else:
            self.show_md(width)

    def show_md(self, width):
        """Display the table as centred HTML with the caption underneath."""
        styler = self.df.style
        styler = styler.set_caption(f"<i>{self.label.label()}</i>: {self.caption}")
        styler = styler.set_table_styles(
            [{"selector": "caption", "props": "caption-side: bottom; font-size:1em;"}],
            overwrite=False,
        )
        if width:
            styler = styler.set_table_attributes(
                f'style="table-layout: auto; min-width: {width};"'
            )
        markup = f'<div style="display: flex; justify-content: center;">{styler.to_html()}</div>'
        display(HTML(markup))

    def show_latex(self):
        """Emit a table environment (``@@`` macro notation) around the Markdown table."""
        dm("@@begin@:table:@")
        dm("@@centering")
        dm(self.df.to_markdown())
        dm(f"@@caption@:{self.caption}:@")
        dm(self.label.label())
        dm("@@end@:table:@")



In [3]:

# "Dataset" chapter of the report. Acronyms and citations are interpolated
# through the AC / Cites enums so that they render either as Markdown or as
# LaTeX-oriented macros depending on TO_LATEX.
dm(
f"""
# Dataset

## Data set requirements

Network data sets for malware detection based on {AC.DNS} are very limited. As noted in {Cites.CTU_SME_11}, not all malware behaves in the same way
and the choice of malware used to infect a machine is of great importance during the design phase of the dataset.

In the design of the {Cites.CTU_SME_11} network data set, the malicious activity has been chosen according
to its capacity to generate network traffic, otherwise it has not been included in the data set generation.

In addition, since our work is based on malware trying to establish a connection with the {AC.DNS} server, we have two further requirements:
- the malware must establish a connection with the {AC.CC} server,
- it must use {AC.DGA} algorithms.

Hence, these requirements reduce the number of data sets compatible with our experiment.

## Data set used

The data set used makes use of network traffic captures provided by the {AC.MCFP} developed by {Cites.STSPH}, a repository of
captured network traffic generated by infected or not-infected machines.

The project provides hundreds of captures, divided into the so-called normal ones, which means {LNI}, and the {LI} ones.

For each capture, we have:
- a {AC.PCAP} file.
- The malware if the capture is infected.
- Other files generated by network analysis tools like Argus.

Given this repository, we need to check for each capture if it would be compatible with our purpose. The compatibility check
consists of analysing for each capture the amount of {AC.DNS} traffic:
- In the case of a {LNI} capture, we can only hope that the amount of traffic is as large as possible.
- For an {LI} capture, we check if the malware produces {AC.DGA} traffic.

Therefore, for each capture we performed the following steps:
1. Given the {AC.PCAP}, we filter out all the packets except the {AC.DNS} ones.
2. We check, in this filtered {AC.PCAP}, the presence of a {AC.DNS} query which looks like a {AC.DGA} malware.
3. Given the amount of {AC.DNS} queries:
    - If it is low or the queries generated are not {AC.DGA}, we discard the capture.
    - Otherwise we insert each {AC.DNS} packet of the capture into a relational database.

## Database

In order to achieve better performance during our experiments and analysis, instead of relying on sparse CSV files, we implement a relational database.
The principal tables of the database are:
- PCAP-table, indicating each @capture. It is related to the Malware-table.
- Malware-table, indicating the malware which infected one or more captures included in the PCAP-table.
- Packet-table, indicating each DNS-packet. Each packet is related to its parent PCAP.
- DN-table (Domain Name table), indicating each domain name that appeared in Packet-table, avoiding duplicates.
It is related to one or more records of Packet-table.
- NN-table (Neural Network table), indicating each @LSTM neural network used to predict {AC.DGA} domain name.
- DN-NN-table, a many to many relationship which relates each domain name of DN-table to a neural network of NN-table,
including the prediction value $\\varepsilon_i = O_i(d_j)$ where $i$ indicates the NN record, and $j$ the DN record.

Using this methodology, we avoid:
- Duplication of work, the prediction for each packet will be performed just one time for each neural network.
- Duplication of data, the information about the same domain name - and its predictions - will not be duplicated for each time it appears.

Further advantages are:
- Saving data store memory.
- Use of SQL language.
- Better data management compared to a test-bed made of *sparse* CSV.
- Each capture inserted into the Database follows the same data processing steps and fits into the database data structure.

""")


# Dataset

## Data set requirements

Network data set for malware detection based on @@ac@:DNS:@ are very limited. As noted in @@cite@:CTU-SME-11:@, not all malware behaves in the same way
and the choice of malware used to infect a machine is of great importance during the design phase of the dataset.

In the design of the @@cite@:CTU-SME-11:@ network data set, the malicious activity has been chosen accordingly
to its capacity to generate network traffic, otherwise it has not been included in the data set generation.

In addition, since our work is based on malware trying to establish a connection with the @@ac@:DNS:@ server, we have two further requirements:
- the malware must establish a connection with the @@ac@:CC:@ server,
- it must use @@ac@:DGA:@ algorithms.
Hence, this requirements reduce the number of data sets compatible with our experiment.

## Data set used

The data set used make use of network traffic capture provided by the @@ac@:MCFP:@ developed by Cites.STSPH, a repository of
captured network generated by infected or not-infected machines.

The project provides hundreds of captures, diveded by the so-called normal, which means not-infected, and the infected ones.

For each capture, we have:
- a @@ac@:PCAP:@ file.
- The malware if the capture is infected.
- Other files generated by network analysis tools like Argus.

Given this repository, we need to check for each capture if it would be compatible with our purpose. The compatibility check
consists of analysing for each capture the amount of @@ac@:DNS:@ traffic:
- In the case of a not-infected capture, we can only hope that the amount of traffic is the greater possible.
- For an infected capture, we check if the malware produce @@ac@:DGA:@ traffic.

Therefore, for each capture we performed the following steps:
1. Given the @@ac@:PCAP:@, we filter out all the packets except the @@ac@:PCAP:@ ones.
2. Checking in this filtered @@ac@:PCAP:@, the presence of a @@ac@:DNS:@ query which looks like a @@ac@:DGA:@ malware.
3. Given the amount of @@ac@:DNS:@ queries:
    - If it is low or the query generated are not @@ac@:DGA:@, we discard the capture.
    - Otherwise we insert each @@ac@:DNS:@ packet of the capture into a relational database.

## Database

In order to achieve better performance during our experiments and analysis, instead of releaving in sparse csv files, we implement a relational database.
The principal table of the database are:
- PCAP-table, indicating each @capture. It is related to the Malware-table.
- Malware-table, indicating the malware which infected one or more captures included in the PCAP-table.
- Packet-table, indicating each DNS-packet. Each packet is related to its parent PCAP.
- DN-table (Domain Name table), indicating each domain name apperead in Packet-table, avoiding duplicates.
It is related to one or more records of Packet-table.
- NN-table (Neural Network table), indicating each @LSTM neural network used to predict @@ac@:DGA:@ domain name.
- DN-NN-table, a many to many relationship which relates each domain name of DN-table to a neural network of NN-table,
including the prediction value $\varepsilon_i = O_i(d_j)$ where $i$ indicate the NN record, and $j$ the the DN record.

Using this methodology, we avoid:
- Duplication of work, the prediction for each packet will be performed just one time for each neural network.
- Duplication of data, the information about the same domain name - and its predictions - will not be duplicated for each time it appears.
Further advantages are:
- Saving data store memory.
- Use of SQL language.
- Better data management related to a test-bed made of *sparse* CSV.
- Each capture insterted into the Database follow the same data processing steps and fits into the database data structure.



In [4]:

def pp(rows):
    """Display ``rows`` as Markdown.

    Args:
        rows: either a list of strings, which are joined with newlines, or
            any other value, which is passed to ``dm`` unchanged.
    """
    content = "\n".join(rows) if isinstance(rows, list) else rows
    dm(content)

def ptime(hours):
    """Format a duration given in hours using the most readable unit.

    Durations below one hour are rendered in minutes, below 24 hours in
    hours, and from 24 hours upwards in days, always with ``prec=1``.

    Args:
        hours: duration in hours.

    Returns:
        The rendered quantity, or ``None`` when ``hours`` satisfies none of
        the comparisons (e.g. NaN).
    """
    if hours < 1:
        amount, unit = hours * 60, "min"
    elif hours < 24:
        amount, unit = hours, "hr"
    elif hours >= 24:
        amount, unit = hours / 24, "days"
    else:
        # Reached only by values for which every comparison is false.
        return None
    return qq.Quantity(amount, units=unit).render(prec=1)


## Data set analysis

In [5]:
# Load every capture of the selected dataset together with the DGA code of the
# malware it is linked to. The dataset name is sent as a bound parameter
# instead of being spliced into the SQL text.
DF = pd.read_sql(
    text("""
    SELECT
        PCAP.*,
        MW.DGA
    FROM PCAP 
    JOIN MALWARE AS MW
    ON MW.ID = PCAP.MALWARE_ID
    WHERE PCAP.DATASET = :dataset
"""),
    database.engine,
    params={"dataset": DATASET},
)

# Turn the numeric DGA code into the class name used in the rest of the
# notebook; codes other than 0 and 2 are left unchanged.
DF["dga"] = DF["dga"].replace({0: LNI, 2: LI})

In [6]:

# Per-class totals: number of captures plus the summed "q" and "u" columns,
# every cell rendered as a human-readable quantity.
total_q = Table(
    DF.rename(columns={"id": "count"})[["count", "q", "u", "dga"]]
    .groupby("dga")
    .agg({"count": "count", "q": "sum", "u": "sum"})
    .rename_axis(None)
    .map(lambda value: qq.Quantity(value).render(prec=2)),
    Tables.TOTAL_Q,
    "Amount of requests for each class.",
)

total_q.show('400px')

pp(
f"""
The final data set is composed by {DF.shape[0]} captures.
Of these, {total_q.df.loc["infected", "count"]} are {LI} and 
{total_q.df.loc["not-infected", "count"]} {LNI}.

Table {total_q.label.ref()} shows the amount of requests for each class.
"""
)



The final data set is composed by 50 captures.
Of these, 33 are infected and 
17 not-infected.

Table @@ref@:total-q:@ shows the amount of requests for each class.


@@begin@:table:@

|              |   count | q     | u    |
|:-------------|--------:|:------|:-----|
| infected     |      33 | 13.1M | 113k |
| not-infected |      17 | 299k  | 29k  |

@@label@:total-q:@

@@caption@:Amount of requests for each class.:@

@@end@:table:@

In [7]:

# Per-class averages: the summed "q" and "u" columns divided by the number of
# captures of the class, every cell rendered as a human-readable quantity.
average_q = Table(
    DF.rename(columns={"id": "count"})[["count", "q", "u", "dga"]]
    .groupby("dga")
    .agg({"count": "count", "q": "sum", "u": "sum"})
    .rename_axis(None)
    .assign(
        q=lambda totals: totals["q"] / totals["count"],
        u=lambda totals: totals["u"] / totals["count"],
    )
    .map(lambda value: qq.Quantity(value).render(prec=2)),
    Tables.AVERAGE_Q,
    "Average number of requests and uniques per capture.",
)

average_q.show(width='300px')

@@begin@:table:@

|              |   count | q     | u     |
|:-------------|--------:|:------|:------|
| infected     |      33 | 396k  | 3.42k |
| not-infected |      17 | 17.6k | 1.71k |

@@label@:average-q:@

@@caption@:Average number of requests and uniques per @capture.:@

@@end@:table:@

In [12]:

# Query rate of every capture: number of queries divided by the duration.
q_per_s_df = DF.assign(**{"q/s": DF["q"] / DF["duration"]})

# (class, colour) pairs; the order also fixes the order of the legend entries.
class_colors = ((LI, "#FF000066"), (LNI, "#0000FF66"))

fig, axs = plt.subplots(1, 2, figsize=(7, 2))

# Left panel: density estimate of q/s, one curve per class.
for cls, color in class_colors:
    q_per_s_df.loc[q_per_s_df["dga"] == cls, ["q/s"]].plot(
        kind="kde", ax=axs[0], color=color, legend=False
    )
# The legend is created while only the two density curves exist, so the
# labels map onto them in plotting order.
fig.legend([LI, LNI])

# Right panel: histogram of q/s, one series per class.
for cls, color in class_colors:
    q_per_s_df.loc[q_per_s_df["dga"] == cls, ["q/s"]].plot.hist(
        ax=axs[1], bins=30, color=color, legend=False
    )

fig_q_per_s = Figure(fig, axs, Figures.Q_PER_S, "Histogram and density distribution of $q/s$ per class.")

# Descriptive statistics of q/s per class. The table only covers queries, so
# the caption no longer mentions uniques.
q_per_s = Table(
    q_per_s_df[["q/s", "dga"]]
    .groupby("dga")
    .describe()
    .droplevel(0, axis=1)
    .rename_axis(None)
    .map(lambda value: qq.Quantity(value).render(prec=2)),
    Tables.Q_PER_S,
    caption="Summary statistics of the queries per second ($q/s$) grouped by the infection class."
)

fig_q_per_s.show()

q_per_s.show()

plt.close(fig)

pp(f"""
If we consider the **duration**, we will have the unbalancing ratios shown in Table {q_per_s.label.ref()}.

We can note that:

- the {LNI} $q/s$ are more _sparse_ relatively to the {LI} ones.
- the {LI} $q/s$ average is lower than the {LNI} one.

The problem is that the duration is very different for the two kinds of captures.
"""
)

@@begin@:figure:@

@@centering

@@includegraphics@qwidth=@@textwidthq@ @:q-per-s.pdf:@

@@label@:fig:q-per-s:@

@@caption{Histogram and density distribution of $q/s$ per class.}

@@end@:figure:@

@@begin@:table:@

@@centering

|              |   count |   mean |   std | min   | 25%   | 50%   | 75%   |   max |
|:-------------|--------:|-------:|------:|:------|:------|:------|:------|------:|
| infected     |      33 |   1.17 |  2.46 | 1.87m | 13.3m | 108m  | 764m  | 10.1  |
| not-infected |      17 |   2.49 |  2.58 | 310m  | 838m  | 1.77  | 2.91  |  9.92 |

@@label@:tab:q-per-s:@

@@caption@:Average number of queries/uniques per second grouped by the infection class.:@

@@end@:table:@


If we consider the **duration**, we will have the unbalancing ratios showed in Table @@ref@:tab:q-per-s:@.

We can note that:

- the not-infected $q/s$ are more _sparse_ relatevely to the infected ones.
- the infected $q/s$ average is lower than the not-infected one.

The problem is that the duration is very different for the two kind of captures.


In [10]:
pp("#### Captures duration")

# TO_LATEX is deliberately NOT reassigned here: the output mode is decided
# once, from the environment, and overriding it in this cell would silently
# switch every cell executed afterwards to LaTeX output.

SEC_PER_HOUR = 60 * 60

# One histogram per class. Each panel has its own time unit for the x axis:
# (axis index, class, hours per x unit, colour, x label, x ticks).
duration_panels = (
    (0, LNI, 1, "#0000FF66", "1 Hour", [0, 1, 2, 3, 4, 5, 6, 7]),
    (1, LI, 12, "#FF000066", "12 Hours", [0, 25, 50, 75, 100, 125, 150]),
)

fig, axs = plt.subplots(1, 2, figsize=(7, 2))
for idx, cls, hours_per_unit, color, xlabel, xticks in duration_panels:
    # Duration of the captures of this class, expressed in the panel's unit.
    scaled = DF.loc[DF["dga"] == cls, ["duration"]] / (hours_per_unit * SEC_PER_HOUR)
    scaled.plot.hist(ax=axs[idx], bins=30, legend=False, title=cls, color=color)
    axs[idx].set_xlabel(xlabel)
    axs[idx].set_xticks(xticks)
# The y label of the left panel serves both panels.
axs[1].set_ylabel(None)

# Kept under its own name so that `fig` stays the matplotlib figure.
fig_duration = Figure(fig, axs, Figures.DURATION, "Distribution of the duration of all the captures per class.")

fig_duration.show(-0.2)

plt.close(fig)

# Descriptive statistics of the duration (in hours) per class, without the
# count column, every cell rendered by ptime in its most readable unit.
tab_duration = Table(
    DF[["duration", "dga"]]
    .assign(duration=DF["duration"] / SEC_PER_HOUR)
    .groupby("dga")
    .describe()
    .drop(columns=("duration", "count"))
    .map(ptime)
    .droplevel(0, axis=1)
    .rename_axis(None),
    Tables.DURATION,
    "Summary statistics of the capture duration per class.",
)

tab_duration.show()

# The figures quoted in the text are read from the table so that prose and
# table cannot drift apart when the data changes.
duration_stats = tab_duration.df

pp(
f"""
The capture duration is highly unbalanced. As we can see in Table {tab_duration.label.ref()}, precisely:

- The maximum duration for {LNI} is just {duration_stats.loc[LNI, "max"]} compared to the {duration_stats.loc[LI, "max"]} of the {LI} ones.
- The average duration for {LNI} is just {duration_stats.loc[LNI, "mean"]} while the {LI} one is {duration_stats.loc[LI, "mean"]}.
""")


#### Captures duration

@@begin@:figure:@

@@centering

@@includegraphics@qwidth=@@textwidthq@@:duration.pdf:@

@@label@:duration:@

@@caption{Distribution of the duration of all the captures per class.}

@@end@:figure:@


The capture duration is higly unbalanced. As we can see in Table @@ref@:duration:@ precisely:

- The maximum duration for not-infected is just 6.5 hours respect the 71 days of the infected ones.
- The average duration for not-infected is just 2.4 hours while the infected one is 18 days.


@@begin@:table:@

| dga          | mean    | std     | min     | 25%      | 50%     | 75%     | max     |
|:-------------|:--------|:--------|:--------|:---------|:--------|:--------|:--------|
| infected     | 18 days | 17 days | 1.4 min | 5.6 days | 13 days | 30 days | 71 days |
| not-infected | 2.4 hr  | 1.8 hr  | 19 min  | 1.1 hr   | 1.9 hr  | 3.9 hr  | 6.5 hr  |

@@label@:duration:@

@@caption@:bo:@

@@end@:table:@