# DGA Detection - Data Processing
# Exploratory Data Analysis (EDA)

In [1]:
%%capture
%pip install --upgrade pip
%pip install polars scikit-learn scipy numpy alibi tldextract pyarrow

In [34]:
import polars as pl
import numpy as np
import tldextract
import re

from scipy.stats import entropy
from sklearn.model_selection import train_test_split 
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

## CIC Bell DNS 2021 data set

Write out all domain names.

In [35]:
files = ["CICBellDNS2021_CSV_benign.csv", "CICBellDNS2021_CSV_malware.csv", "CICBellDNS2021_CSV_phishing.csv", "CICBellDNS2021_CSV_spam.csv"]

domains = {}
for file in files:
    with open(f"../data/cic/{file}") as f:
        domain_file = []
        for line in f:
            txt = line.replace(' ', '')
            x = re.split(",(?![^\[\]]*(?:\])|[^()]*\))", txt)
            if "Domain" != x[4]:
                domain_file.append(x[4])
        domains[file] = domain_file
        print(domain_file[:5])
    

["b'google.com.'", "b'google.com.'", "b'www.google.com.'", "b'www.google.com.'", "b'facebook.com.'"]
["b'dicrophani.com.'", "b'dionneg.com.'", "b'vipprojects.cn.'", "b'hhj3.cn.'", "b'hhj3.cn.'"]
["b'programafidelidadeitacard2.cf.'", "b'programafidelidadeitacard2.cf.'", "b'professorjosesilveira.com.'", "b'www.ksylitol.com.'", "b'www.ksylitol.com.'"]
["b'0900259.com.'", "b'koitera.net.'", "b'koitera.com.'", "b'pc.koitera.com.'", "b'0901360.com.'"]


In [37]:
for key in domains:
    pl.DataFrame(domains[key]) #.to_csv(f"{key}")

In [52]:
import tldextract
import re
import numpy as np
from scipy.stats import entropy

def encode_domain(df):
    """
    Encode domain to string
    """
    domain = str(df["Domain"].decode('latin-1').encode("utf-8"))
    df["Domain"] = domain
    
    return df

def fqdn_entropy(df):
    if not df["Domain"]:
        return df
    
    domain = df["Domain"]
    
    pk = [domain.count(chr(x)) / len(domain) for x in range(256)]
    pk = np.array(pk)

    df["Entropy"] = entropy(pk, base=2)

    return df

    
def count(df):
    """
    fqdn_count, upper_count, lower_count, numeric_count, special_count
    """
    if not df["Domain"]:
        return df

    domain = df["Domain"]

    df["FQDN_full_count"] = len(domain)
    df["FQDN_upper_count"] = sum(1 for c in domain if c.isupper())
    df["FQDN_lower_count"] = sum(1 for c in domain if c.islower())
    df["FQDN_numeric_count"] = sum(1 for c in domain if c.isdigit())
    df["FQDN_special_count"] = len(df["Domain"]) - len(re.findall("[\w]", domain))

    return df


def subdomain(df):
    """
    subdomain_length, sld, subdomain
    """
    if not df["Domain"]:
        return df

    domain = df["Domain"]

    parsec_domain = tldextract.extract(domain)

    df["Subdomain_length"] = len(parsec_domain.subdomain)

    return df


def labels(df):
    """
    labels, labels_max, labels_average, longest_word
    """
    # if not df["Domain"]:
    #     return df

    labels = df["Domain"].split(".")
    df["Labels_length"] = len(labels)
    df["Labels_max"] = len(max(labels, key=len))
    df["Labels_average"] = sum(len(c) for c in labels)

    return df

In [54]:
for key in domains:
    data = pl.DataFrame(domains[key])
    data = data.rename({"column_0": "Domain"})
    encode_domain(data)
    labels(data)
    print(data)

AttributeError: 'Series' object has no attribute 'decode'