In [51]:
import numpy as np
import pandas as pd

## Load the training data set

In [52]:
# Load the labelled training set. header=None tells pandas that no row of the
# file is a header, so the two columns are named explicitly here.
df = pd.read_csv('training.csv', header=None, names=['domain', 'dga'])

In [53]:
df.head()

Unnamed: 0,domain,dga
0,m644136d0.tmodns.net,0
1,dfg.ca.gov,0
2,c4w6wpg81xsbopy8a67.ddns.net,1
3,ace.ojom-mobile.de,0
4,pub.3gppnetwork.org.mcdonaldswifi.internal,0


In [54]:
df.dga.value_counts()

dga
0    166
1     34
Name: count, dtype: int64

In [55]:
df.head()

Unnamed: 0,domain,dga
0,m644136d0.tmodns.net,0
1,dfg.ca.gov,0
2,c4w6wpg81xsbopy8a67.ddns.net,1
3,ace.ojom-mobile.de,0
4,pub.3gppnetwork.org.mcdonaldswifi.internal,0


## Feature learning: Use RNN

First, derive the character vocabulary and the character-to-token mapping from the training data set.

In [56]:
# Build the character vocabulary from the training domains.

domains = df.domain.tolist()

# Every distinct character that occurs in any training domain, in sorted order.
vocab = sorted({ch for name in domains for ch in name})
vocab_size = len(vocab)
print(f"Vocab size {vocab_size}")

# Lookup tables in both directions: character -> token id and token id -> character.
# Token ids are simply the positions of the characters in the sorted vocabulary.
char2token = dict(zip(vocab, range(vocab_size)))
token2char = dict(enumerate(vocab))
print(char2token)

Vocab size 38
{'-': 0, '.': 1, '0': 2, '1': 3, '2': 4, '3': 5, '4': 6, '5': 7, '6': 8, '7': 9, '8': 10, '9': 11, 'a': 12, 'b': 13, 'c': 14, 'd': 15, 'e': 16, 'f': 17, 'g': 18, 'h': 19, 'i': 20, 'j': 21, 'k': 22, 'l': 23, 'm': 24, 'n': 25, 'o': 26, 'p': 27, 'q': 28, 'r': 29, 's': 30, 't': 31, 'u': 32, 'v': 33, 'w': 34, 'x': 35, 'y': 36, 'z': 37}


In [57]:
# e.g., the tokenized representation of 'opentext.com' would be
# (one token id per character, looked up in char2token)
[char2token[c] for c in 'opentext.com']

[26, 27, 16, 25, 31, 16, 35, 31, 1, 14, 26, 24]

In [58]:
import torch

In [59]:
# function to convert a domain name to tokens represented as one-hot vectors 

def domain2tensor(domain):
    """One-hot encode a domain name, one row per character.

    Args:
        domain: the domain string; every character must be a key of the
            module-level ``char2token`` mapping (a missing one raises KeyError).

    Returns:
        A float tensor of shape ``(len(domain), 1, vocab_size)`` that is zero
        everywhere except for a single 1 per character, placed at that
        character's token id.
    """
    token_ids = [char2token[ch] for ch in domain]
    one_hot = torch.zeros(len(token_ids), 1, vocab_size)
    if token_ids:
        # Set all (position, token) entries at once instead of looping.
        positions = torch.arange(len(token_ids))
        one_hot[positions, 0, torch.tensor(token_ids)] = 1
    return one_hot

In [None]:
# NOTE(review): `ts` is not used in this cell or in the other cells visible here.
import torchshow as ts
# one-hot tensor of a domain, e.g.,
x1 = domain2tensor('test.com')
print(x1)
# Last expression of the cell: displays the tensor's size.
x1.size()

In [61]:
def load_test_data():
    """Read the test domains and their labels into a single DataFrame.

    'test1.txt' supplies the ``domain`` column and 'test1label.txt' supplies
    the ``dga`` column; both are read as header-less, newline-terminated
    files and placed side by side (aligned on the row index).

    Returns:
        A DataFrame with the columns ``domain`` and ``dga``.
    """
    column_for_file = {'test1.txt': 'domain', 'test1label.txt': 'dga'}
    frames = [
        pd.read_csv(path, lineterminator='\n', header=None, names=[column])
        for path, column in column_for_file.items()
    ]
    return pd.concat(frames, axis=1)

In [62]:
# Define NN architecture - Batch

import torch.nn as nn

class rnnModel(nn.Module):
    """Recurrent classifier: an ``nn.RNN`` followed by a linear layer and softmax.

    Args:
        vocab_size: accepted for interface compatibility; not used by the
            layers, whose input width is ``embedding_dim``.
        embedding_dim: size of each input vector fed to the RNN.
        num_hid_neurons: hidden-state size of the RNN.
        num_out: number of output classes.
        nun_hid_layers: number of stacked RNN layers (parameter name kept
            as-is because callers pass it by keyword).
    """

    def __init__(self, vocab_size, embedding_dim, num_hid_neurons, num_out, nun_hid_layers=1):
        super().__init__()

        # batch_first=True: inputs and outputs are (batch, seq, feature).
        self.nn = nn.RNN(embedding_dim, num_hid_neurons, nun_hid_layers, batch_first=True)

        # Fully connected layer; sized by num_out instead of a hard-coded 2.
        self.fc = nn.Linear(num_hid_neurons, num_out)
        # Normalize over the class axis (the last one). dim=1 would normalize
        # over the sequence axis of the (batch, seq, classes) output, which for
        # a length-1 sequence makes every output exactly 1.0.
        self.softmax = nn.Softmax(dim=-1)
        self.num_out = num_out
        self.nun_hid_layers = nun_hid_layers
        self.num_hid_neurons = num_hid_neurons

    # define the forward function
    def forward(self, input, hidden):
        """Run the RNN over ``input`` and classify every time step.

        Args:
            input: tensor of shape (batch, seq, embedding_dim).
            hidden: previous hidden state of shape
                (nun_hid_layers, batch, num_hid_neurons), or None to start
                from zeros.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            (batch, seq, num_out) and holds class probabilities that sum to 1
            along the last axis, and ``hidden`` is the updated hidden state.
        """
        if hidden is None:
            # Create the initial state on the same device/dtype as the input.
            hidden = self.init_hidden(input.size(0), device=input.device, dtype=input.dtype)

        output, hidden = self.nn(input, hidden)
        output = self.fc(output)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Return an all-zero initial hidden state for ``batch_size`` sequences.

        ``device`` and ``dtype`` are optional and default to PyTorch's defaults.
        """
        return torch.zeros(
            self.nun_hid_layers, batch_size, self.num_hid_neurons,
            device=device, dtype=dtype,
        )

In [63]:
from torch.utils.data import Dataset
import torch.nn.functional as F

class DGADataSet(Dataset):
    """Dataset yielding (padded one-hot domain, two-element target) pairs.

    Args:
        dataframe: frame with a ``domain`` column and a ``dga`` label column.
        padded_size: length along the first axis that every encoded domain
            is padded to.
    """

    def __init__(self, dataframe, padded_size):
        self.Xtr = dataframe["domain"]
        self.ytr = dataframe["dga"]
        self.padded_size = padded_size

    def __len__(self):
        """Number of samples (rows of the ``domain`` column)."""
        return len(self.Xtr)

    def __getitem__(self, index):
        """Return the ``index``-th sample as ``(padded, target)``.

        ``padded`` is the one-hot encoding of the domain with zero rows added
        in front; ``target`` is the float tensor ``[label, 1 - label]``.
        """
        encoded = domain2tensor(self.Xtr.iloc[index])
        n_missing = self.padded_size - len(encoded)
        # F.pad takes (before, after) pairs starting from the LAST axis, so
        # the third pair adds n_missing zero rows in front of the first axis.
        padded = F.pad(encoded, pad=(0, 0, 0, 0, n_missing, 0))

        label = self.ytr.iloc[index]
        target = torch.Tensor([float(label), float(1 - label)])
        return padded, target

In [64]:
# select loss function, optimizer
# The RNN input width is the number of entries in char2token.
dict_size = len(char2token)
# Hidden-state size of each RNN layer.
n_hidden = 256

# Two stacked RNN layers and two output values per step.
model = rnnModel(vocab_size=vocab_size, embedding_dim=dict_size, num_hid_neurons=n_hidden, num_out=2, nun_hid_layers=2)

# Define hyperparameters
n_epochs = 50
lr=1e-5

# Define Loss, Optimizer
# NOTE(review): nn.CrossEntropyLoss applies log-softmax to its input itself,
# while rnnModel.forward already returns softmax output - confirm this
# pairing is intended.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Pad every domain to the length of the longest training domain so that
# samples can be stacked into batches.
maxlen = len(max(df.domain, key=len))
print("The longest string has {} characters, total rows {}".format(maxlen, len(df.domain)))
batch_size = 100
dataset = DGADataSet(df, maxlen)
train_data_loader = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
losses = []  # loss of the last batch of each epoch
model.train()
for epoch in range(1, n_epochs + 1):
    for batch_x, target in train_data_loader:
        # Reset gradients for EVERY batch. Clearing them only once per epoch
        # lets gradients of earlier batches accumulate into later steps.
        optimizer.zero_grad()

        # Feed the sequence one character at a time, carrying the hidden
        # state forward; only the output of the final step is scored.
        hidden = None
        for i in range(batch_x.size(1)):
            step_x = batch_x[:, i, :, :]
            output, hidden = model(step_x, hidden)

        output = output.view(-1, 2)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

    losses.append(loss.item())
    print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
    print("Loss: {:.4f}".format(loss.item()))