In [None]:
# Install the third-party packages this notebook relies on into the current environment.
# NOTE(review): pandas is imported further down but is not installed here —
# presumably it is already available in the environment; confirm.
!pip install keras
!pip install -U scikit-learn
!pip install tldextract
!pip install numpy
!pip install h5py



In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Activation, Embedding

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score

import tldextract
import numpy as np

import json

In [None]:
def open_data(path):
  """Read a text file with one domain per line into a single-column DataFrame.

  Args:
    path: Location of the text file to read.

  Returns:
    A pandas DataFrame with a single ``domain`` column holding every line of
    the file with surrounding whitespace (including the newline) stripped.
  """
  # The context manager closes the file handle even if reading fails;
  # a bare open(path).readlines() leaves the handle to the garbage collector.
  with open(path) as source:
    lines = source.readlines()
  df = pd.DataFrame(lines, columns=["domain"])
  df['domain'] = df['domain'].str.strip()
  return df

# Read the legitimate and the DGA domain lists, one DataFrame per file.
legit_domains, dga_domains = (
  open_data(source_path)
  for source_path in ("./data/dataset_legit.txt", "./data/dataset_dga.txt")
)

# Show both frames, legitimate domains first.
for loaded_frame in (legit_domains, dga_domains):
  print(loaded_frame)

                           domain
0                        v2ai.top
1      healthcareaustralia.com.au
2                   roblox.com.ee
3                globalsign.cloud
4          searchlenderquotes.com
...                           ...
29995                pimalink.com
29996           theherocompany.co
29997      hendricktoyotaapex.com
29998            springer-sbm.com
29999              spicyonion.com

[30000 rows x 1 columns]
                       domain
0                 jdpggtz.biz
1            euufeagdjngs.com
2              ontutotu.bazar
3        fcffccdccmoaanle.com
4                 yhcdrbr.net
...                       ...
29995            zlxyejhtl.eu
29996  ede17f480038a.feedback
29997          ywvusoyd.bazar
29998        2a2jnbunhkl.life
29999      rimgrejshtvucsf.ir

[30000 rows x 1 columns]


In [None]:
# Regex matching labels that still contain a backtick or a dot.
# Written as a raw string so the backslashes reach the regex engine untouched
# instead of being parsed as (invalid) Python string escapes; the pattern value
# is the same as before.
UNWANTED_LABEL_PATTERN = r'\`|\.'

# Despite its name, the 'tld' column stores the `domain` field returned by
# tldextract.extract (e.g. 'v2ai' for 'v2ai.top'), not the suffix.
# NOTE(review): the printed output shows 'bazar' extracted from 'ontutotu.bazar',
# presumably because that suffix is unknown to tldextract — confirm whether such
# rows should be kept for training.
legit_domains['tld'] = [tldextract.extract(d).domain for d in legit_domains['domain']]
dga_domains['tld'] = [tldextract.extract(d).domain for d in dga_domains['domain']]

# Keep only the rows whose extracted label does not match the unwanted pattern.
legit_domains = legit_domains[~legit_domains['tld'].str.contains(UNWANTED_LABEL_PATTERN)]
dga_domains = dga_domains[~dga_domains['tld'].str.contains(UNWANTED_LABEL_PATTERN)]
print(legit_domains)
print(dga_domains)

                           domain                  tld
0                        v2ai.top                 v2ai
1      healthcareaustralia.com.au  healthcareaustralia
2                   roblox.com.ee               roblox
3                globalsign.cloud           globalsign
4          searchlenderquotes.com   searchlenderquotes
...                           ...                  ...
29995                pimalink.com             pimalink
29996           theherocompany.co       theherocompany
29997      hendricktoyotaapex.com   hendricktoyotaapex
29998            springer-sbm.com         springer-sbm
29999              spicyonion.com           spicyonion

[30000 rows x 2 columns]
                       domain               tld
0                 jdpggtz.biz           jdpggtz
1            euufeagdjngs.com      euufeagdjngs
2              ontutotu.bazar             bazar
3        fcffccdccmoaanle.com  fcffccdccmoaanle
4                 yhcdrbr.net           yhcdrbr
...                       

In [None]:
# Remove fully duplicated rows, then tag every remaining row with its class:
# 0 for legitimate domains, 1 for DGA domains.
legit_domains = legit_domains.drop_duplicates().assign(label=0)
dga_domains = dga_domains.drop_duplicates().assign(label=1)

# Show both labelled frames, legitimate domains first.
for labelled_frame in (legit_domains, dga_domains):
  print(labelled_frame)

                           domain                  tld  label
0                        v2ai.top                 v2ai      0
1      healthcareaustralia.com.au  healthcareaustralia      0
2                   roblox.com.ee               roblox      0
3                globalsign.cloud           globalsign      0
4          searchlenderquotes.com   searchlenderquotes      0
...                           ...                  ...    ...
29995                pimalink.com             pimalink      0
29996           theherocompany.co       theherocompany      0
29997      hendricktoyotaapex.com   hendricktoyotaapex      0
29998            springer-sbm.com         springer-sbm      0
29999              spicyonion.com           spicyonion      0

[30000 rows x 3 columns]
                       domain               tld  label
0                 jdpggtz.biz           jdpggtz      1
1            euufeagdjngs.com      euufeagdjngs      1
2              ontutotu.bazar             bazar      1
3        f

In [None]:
# Stack both classes into one frame with a fresh 0..n-1 index.
all_domains = pd.concat([legit_domains, dga_domains], ignore_index=True)
# Shuffle all rows so the two classes are interleaved. The fixed random_state
# makes the shuffle repeatable after a kernel restart.
all_domains = all_domains.sample(frac=1, random_state=42).reset_index(drop=True)

# Inputs are the extracted labels; targets are the 0/1 class labels.
X, Y = all_domains['tld'], all_domains['label']
print(X, Y)

0                    dxl
1                 vpnkit
2            dentalsuite
3             unmissions
4                imvrgxw
              ...       
58606         wupjajeoqu
58607      873421a3f6795
58608    mkgijapdgenkihd
58609          chantelle
58610        tacticalrmm
Name: tld, Length: 58611, dtype: object 0        0
1        0
2        0
3        0
4        1
        ..
58606    1
58607    1
58608    1
58609    0
58610    0
Name: label, Length: 58611, dtype: int64


In [None]:
# Map every character that occurs in the labels to a positive integer id,
# starting at 1 so that id 0 stays unused by real characters.
# The character set is sorted first: iterating a bare set of strings can yield a
# different order in each Python process, which would change the char -> id
# assignment from one kernel session to the next.
valid_chars = {x: idx + 1 for idx, x in enumerate(sorted(set(''.join(X))))}

# Number of distinct ids: one per character plus the unused id 0.
max_feature = len(valid_chars) + 1
# Length of the longest label, as a plain Python int.
max_len = max(len(x) for x in X)
print(f"{max_feature = }")
print(f"{max_len = }")

max_feature = 38
max_len = 48


In [None]:
# Replace each label by the list of its character ids, then pad every list
# to a common length of max_len (the printed output shows zeros on the left).
encoded_labels = [[valid_chars[ch] for ch in label] for label in X]
X = pad_sequences(encoded_labels, maxlen=max_len)
print(X)

[[ 0  0  0 ... 12  3 21]
 [ 0  0  0 ... 37  5 29]
 [ 0  0  0 ...  5 29 10]
 ...
 [ 0  0  0 ...  5 17 12]
 [ 0  0  0 ... 21 21 10]
 [ 0  0  0 ... 14 19 19]]


In [None]:
# Hold out 20% of the samples for evaluation. The fixed random_state makes the
# split repeatable after a kernel restart, so reported metrics can be compared
# between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train)
print(X_test)
print(Y_train)
print(Y_test)

[[ 0  0  0 ... 29  2 35]
 [ 0  0  0 ... 27 31 14]
 [ 0  0  0 ... 22 27 19]
 ...
 [ 0  0  0 ...  5  3 12]
 [ 0  0  0 ... 26 26 21]
 [ 0  0  0 ... 28 16 35]]
[[ 0  0  0 ...  5 15 24]
 [ 0  0  0 ... 31  5 21]
 [ 0  0  0 ...  5 29 28]
 ...
 [ 0  0  0 ...  2 20  3]
 [ 0  0  0 ...  3 25 17]
 [ 0  0  0 ... 11 19  9]]
18959    1
27330    1
45133    1
52120    0
45662    0
        ..
55274    1
23398    0
18280    0
17026    0
33204    0
Name: label, Length: 46888, dtype: int64
44074    0
12737    0
37846    0
48236    1
5644     0
        ..
919      1
14073    0
21350    1
14497    1
29218    1
Name: label, Length: 11723, dtype: int64


In [None]:
# Character-level binary classifier: embedding -> LSTM -> dropout -> one sigmoid unit.
model = Sequential([
  Embedding(max_feature, 128, input_length=max_len),  # one 128-d vector per character id
  LSTM(128),
  Dropout(0.5),
  Dense(1),
  Activation("sigmoid"),  # squashes the single output into [0, 1]
])
model.compile(optimizer="rmsprop", loss="binary_crossentropy")

In [None]:
# Train for five passes over the training split in a single fit call.
# This replaces a Python loop that called fit five times with epochs=1 and never
# used its loop variable; one call avoids repeating the per-call setup and
# reports the epochs as 1/5 .. 5/5.
model.fit(X_train, Y_train, batch_size=16, epochs=5)



In [None]:
# Score the held-out split with the trained model. The model ends in a single
# sigmoid unit, so each row is presumably one value in [0, 1] — confirm the
# output shape before indexing into it.
probs = model.predict(X_test)



In [None]:
# Threshold the model outputs at 0.5 and flatten them to a 1-D vector of 0/1 labels.
predicted_labels = (probs > 0.5).astype(int).ravel()

# labels=[0, 1] keeps the matrix 2x2 even if one class is absent, so the
# four-way unpacking below cannot fail. ravel() yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(Y_test, predicted_labels, labels=[0, 1]).ravel()

print('TP: %d\nTN: %d\nFP: %d\nFN: %d\n' % (tp, tn, fp, fn))
# FP rate = FP / all actual negatives; FN rate = FN / all actual positives
# (the denominator for FN rate is fn + tp, not fn + tn).
print('FP rate: %.3f%%\nFN rate: %.3f%%\n' % (fp / (fp + tn) * 100, fn / (fn + tp) * 100))

# The ratios are scaled by 100 so they agree with the '%' sign in the format string.
print('Sensitivity: %.3f%%\nSpecificity: %.3f%%\nAccuracy: %.3f%%\n' % (
    tp / (tp + fn) * 100,
    tn / (tn + fp) * 100,
    (tp + tn) / (tp + tn + fp + fn) * 100
))

# AUC is a score between 0 and 1, not a percentage, so no '%' sign is printed.
print("AUC: %.3f" % roc_auc_score(Y_test, probs))

TP: 5149
TN: 5793
FP: 175
FN: 606

FP rate: 2.932%
FN rate: 9.470%

Sensitivity: 0.895%
Specificity: 0.971%
Accuracy: 0.933%

AUC: 0.983%


In [None]:
# Save models and config
# The full model and a weights-only file are written side by side.
# NOTE(review): some Keras releases require weight files to end in `.weights.h5` —
# confirm against the installed version before changing the path.
model.save("./model/dga_detection.v1.keras")
model.save_weights("./model/dga_detection.v1.h5")

# Everything needed to encode a domain exactly as it was encoded for training.
conf = {
    "valid_chars": valid_chars,
    "max_len": int(max_len),  # int() keeps the value JSON-serializable if it is a numpy integer
    "max_feature": max_feature
}
# The context manager flushes and closes the file; passing a bare open(...) to
# json.dump leaves the handle open and the content possibly unflushed.
with open("./config.json", "w") as config_file:
  json.dump(conf, config_file)

In [None]:
# Reload the saved model from disk so inference uses the persisted artifact.
loaded_model = load_model("./model/dga_detection.v1.keras")

def predict(domains, threshold=0.5):
  """Classify domain names as legitimate (0) or DGA-generated (1).

  Args:
    domains: Iterable of domain name strings, e.g. ``["wikipedia.com"]``.
    threshold: Model outputs strictly greater than this value are reported
      as 1; defaults to 0.5.

  Returns:
    An integer numpy array with one row per input domain and a single column
    holding 0 or 1. An empty input yields an array of shape ``(0, 1)``.
  """
  domains = list(domains)
  if not domains:
    # Nothing to score; skip the model call, which is not meant for empty batches.
    return np.empty((0, 1), dtype=int)

  def char_id(ch):
    # Exact match first (unchanged behavior for known characters), then the
    # lower-case form since domain names are case-insensitive. Characters never
    # seen during training fall back to 0, the padding id, instead of raising KeyError.
    return valid_chars.get(ch, valid_chars.get(ch.lower(), 0))

  encoded = [
    [char_id(ch) for ch in tldextract.extract(name).domain]
    for name in domains
  ]
  padded = pad_sequences(encoded, maxlen=max_len)

  predicted = loaded_model.predict(padded)

  return (predicted > threshold).astype(int)

# Quick check on two well-known domains and one random-looking one.
print(predict(["wikipedia.com", "youtube.com", "qwfoppqws.net"]))

[[0]
 [0]
 [1]]
