# DGA Detection - Data Processing

In [1]:
# Upgrade pip itself, then install the listed third-party packages via the
# %pip line magic.
# NOTE(review): package versions are unpinned — consider pinning them so a
# fresh "Restart & Run All" reproduces the same environment.
%pip install --upgrade pip
%pip install pandas scikit-learn scipy numpy alibi tldextract pyarrow

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.1
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import tldextract
import re

from scipy.stats import entropy
from sklearn.model_selection import train_test_split 
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

## Data Preprocessing

We derive additional features from each domain name (entropy, character counts, label statistics and subdomain length) for training our unsupervised models.

In [83]:
def encode_domain(df):
    """
    Decode a byte-string domain into a text string.

    Parameters
    ----------
    df : mapping-like row with a "Domain" entry (e.g. the Series passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The same row, with "Domain" replaced by its decoded ``str`` value when it
    was ``bytes``/``bytearray``; any other value is left untouched.
    """
    raw_domain = df["Domain"]

    if isinstance(raw_domain, (bytes, bytearray)):
        # latin-1 maps every byte value to a code point, so this never raises.
        # Do NOT wrap the bytes in str(): str(b"x") yields the repr "b'x'",
        # which would pollute every downstream feature with the b'' wrapper.
        df["Domain"] = raw_domain.decode("latin-1")

    return df

def fqdn_entropy(df):
    """
    Add the Shannon entropy (in bits) of the domain's characters.

    Parameters
    ----------
    df : mapping-like row with a "Domain" entry (e.g. the Series passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The same row with an "Entropy" entry added. Rows whose domain is missing
    (None/NaN) or empty are returned unchanged.
    """
    domain = df["Domain"]

    # Missing values read from CSV arrive as NaN, which is truthy, so a plain
    # `not domain` check is not enough to skip them.
    if pd.isna(domain) or not domain:
        return df

    # Count every distinct character in a single pass; this also covers
    # characters outside the 0-255 code point range.
    _, char_counts = np.unique(list(domain), return_counts=True)
    probabilities = char_counts / len(domain)

    df["Entropy"] = entropy(probabilities, base=2)

    return df

    
def count(df):
    """
    Add character-count features for the domain.

    Adds FQDN_full_count (total length), FQDN_upper_count, FQDN_lower_count,
    FQDN_numeric_count and FQDN_special_count (characters not matched by the
    regex class ``\\w``, so underscores are NOT counted as special).

    Parameters
    ----------
    df : mapping-like row with a "Domain" entry (e.g. the Series passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The same row with the count entries added. Rows whose domain is missing
    (None/NaN) or empty are returned unchanged.
    """
    domain = df["Domain"]

    # Missing values read from CSV arrive as NaN, which is truthy, so a plain
    # `not domain` check is not enough to skip them.
    if pd.isna(domain) or not domain:
        return df

    df["FQDN_full_count"] = len(domain)
    df["FQDN_upper_count"] = sum(1 for c in domain if c.isupper())
    df["FQDN_lower_count"] = sum(1 for c in domain if c.islower())
    df["FQDN_numeric_count"] = sum(1 for c in domain if c.isdigit())
    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    df["FQDN_special_count"] = len(domain) - len(re.findall(r"\w", domain))

    return df


def subdomain(df):
    """
    Add the length of the domain's subdomain part.

    The domain is split with ``tldextract.extract`` and only the length of its
    ``subdomain`` component is stored, as "Subdomain_length".

    Parameters
    ----------
    df : mapping-like row with a "Domain" entry (e.g. the Series passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The same row with "Subdomain_length" added. Rows whose domain is missing
    (None/NaN) or empty are returned unchanged.
    """
    domain = df["Domain"]

    # Missing values read from CSV arrive as NaN, which is truthy, so a plain
    # `not domain` check is not enough to skip them.
    if pd.isna(domain) or not domain:
        return df

    parsed_domain = tldextract.extract(domain)

    df["Subdomain_length"] = len(parsed_domain.subdomain)

    return df


def labels(df):
    """
    Add statistics about the dot-separated labels of the domain.

    Adds Labels_length (number of labels), Labels_max (length of the longest
    label) and Labels_average (mean label length).

    Parameters
    ----------
    df : mapping-like row with a "Domain" entry (e.g. the Series passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The same row with the label entries added. Rows whose domain is missing
    (None/NaN) or empty are returned unchanged.
    """
    domain = df["Domain"]

    # Missing values read from CSV arrive as NaN, which is truthy, so a plain
    # `not domain` check is not enough to skip them.
    if pd.isna(domain) or not domain:
        return df

    # Named `parts` so the local does not shadow this function's own name.
    parts = domain.split(".")
    label_lengths = [len(part) for part in parts]

    df["Labels_length"] = len(parts)
    df["Labels_max"] = max(label_lengths)
    # Mean label length. This used to store the plain sum of the lengths;
    # NOTE(review): any model fitted on the old values needs to be refitted.
    # split() on a non-empty string always yields at least one part, so the
    # division is safe.
    df["Labels_average"] = sum(label_lengths) / len(parts)

    return df

Testing feature processing

In [12]:
# Smoke test: split a known multi-label hostname with tldextract and print
# the parsed parts.
test = b'aaainfotech.googlepages.com'
parsec_domain = tldextract.extract(test.decode('ascii'))  # decode() already returns str
print(parsec_domain)

ExtractResult(subdomain='aaainfotech', domain='googlepages', suffix='com')


### Test on heiCLOUD DNS Data

In [None]:
# Load the heiCLOUD DNS query export, then run each feature function over the
# frame row by row (axis=1), in the same order as before.
df_heicloud = pd.read_csv("data/heicloud_dns_queries_30d_2023-06-19.txt")

for feature_fn in (fqdn_entropy, count, labels, subdomain):
    df_heicloud = df_heicloud.apply(feature_fn, axis=1)

In [None]:
# Display the frame as the cell's output.
df_heicloud

In [None]:
# Columns selected from df_heicloud, in the order they are passed to the
# classifier.
FEATURE_COLUMNS = [
    'Entropy',
    'FQDN_full_count',
    'FQDN_upper_count',
    'FQDN_lower_count',
    'FQDN_numeric_count',
    'FQDN_special_count',
    'Labels_length',
    'Labels_max',
    'Labels_average',
    'Subdomain_length',
]

# NOTE(review): `clf` is not defined in the visible part of this notebook; a
# fitted estimator must be created or loaded before this cell can run on a
# fresh kernel.
X = df_heicloud[FEATURE_COLUMNS].to_numpy()
y_pred = clf.predict(X)

# y_pred is aligned with row positions, so select with a boolean mask.
# Passing positions to the label-based .loc picks the wrong rows as soon as
# the index is not the default 0..n-1 range.
malicious = df_heicloud[np.asarray(y_pred) == 1]

malicious["Domain"].to_csv("malicious.csv")