In [1]:
pip install --upgrade tensorflow


Usage:   
  C:\Users\Mictlantecuhtli\Anaconda3\python.exe -m pip <command> [options]

Commands:
  install                     Install packages.
  download                    Download packages.
  uninstall                   Uninstall packages.
  freeze                      Output installed packages in requirements format.
  list                        List installed packages.
  show                        Show information about installed packages.
  check                       Verify installed packages have compatible dependencies.
  config                      Manage local and global configuration.
  search                      Search PyPI for packages.
  wheel                       Build wheels from your requirements.
  hash                        Compute hashes of package archives.
  completion                  A helper command used for command completion.
  help                        Show help for commands.

General Options:
  -h, --help                  Show help.
  --isolated   

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Input, LSTM, Dropout, Embedding, Dense, Conv1D, MaxPooling1D

### Loading Data

We define some functions to read the training data from a labelled set of 100000 genuine and fake domain names and load it into a tensorflow dataset. This also returns a character-index embedding, which is used later on for predictions.

In [0]:
def load_data():
  domains = pd.read_csv('https://github.com/sudo-rushil/CNN-LSTM_Domain_Classifier/blob/master/domains.csv')
  domains.drop(['RootObject.subclass'], axis=1, inplace=True)
  columns = {'RootObject.class': 'pred', 'RootObject.domain': 'domain'}
  domains.rename(columns=columns, inplace=True)
  
  for i in range(domains.shape[0]):
    if domains['pred'][i] == 'legit':
      domains['pred'][i] = 0
    else:
      domains['pred'][i] = 1

  return domains[['domain', 'pred']]

def shuffle_data(domains):
  iter_len = int(domains.shape[0]/2)
  for i in range(iter_len):
    if i % 2 == 0:
      domain, pred = tuple(domains.iloc[i])
      sdomain, spred = tuple(domains.iloc[i + iter_len])
      domains['domain'][i] = sdomain
      domains['pred'][i] = spred
      domains['domain'][i + iter_len] = domain
      domains['pred'][i + iter_len] = pred
  return domains

def load_tf_dataset(domains):
  vocab = sorted(set(''.join(domains['domain'].to_list())))
  char2idx = {u:i for i, u in enumerate(vocab)}
  idx2char = np.array(vocab)

  lines = []
  for i, line in enumerate(domains.iloc[:, 0]):
    lines.append([char2idx[c] for c in line])
  
  tensor = tf.keras.preprocessing.sequence.pad_sequences(lines, padding='post')
  targets = np.array(domains.iloc[:, 1], dtype=np.int32)

  data = tf.data.Dataset.from_tensor_slices(tensor)
  pred = tf.data.Dataset.from_tensor_slices(targets)
  dataset = tf.data.Dataset.zip((data, pred))
  
  return dataset, (char2idx, idx2char)

In [0]:
domains = load_data()
domains = shuffle_data(domains)
dataset, mappings = load_tf_dataset(domains)
char2idx, idx2char = mappings

In [0]:
dataset = dataset.shuffle(domains.shape[0]).batch(1000, drop_remainder=True)

### Building the Model

Here, we build the model using Tensorflow's Keras API. It uses an architecture adapted from B. Yu *et al.*, 2018, which first embeds each character as a 128-dimensional vector, passes it through a 1D convolutional layer, runs it through an LSTM, and classifies it with a single dense layer.

The 82 in the input dimension comes from the fact that the maximum domain length in the training data is 82 characters, and the 38 comes from the 38 possible characters in a domain name.

We train the model for 20 epochs, using Adam.

In [0]:
domain_input = Input(shape=(82,), dtype='int32', name='domain_input')
embedding = Embedding(input_dim=38, output_dim=128, input_length=82, 
                      batch_input_shape=[1000, None])(domain_input)
conv = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu', strides=1)(embedding)
pool = MaxPooling1D(pool_size=2, padding='same')(conv)
lstm = LSTM(64, return_sequences=False)(pool)
output = Dense(1, activation='sigmoid')(lstm)
model = tf.keras.Model(inputs=domain_input, outputs=output)

In [0]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
EPOCHS = 20
accuracy = []
losses = []

for i in range(EPOCHS):
  history = model.fit(dataset)

  accuracy.append(history.history['accuracy'])
  losses.append(history.history['loss'])

  model.save_weights('domain_class_v1_checkpoint')



### Getting Predictions

These two functions allow us to input a domain name and get the model's prediction of whether it is genuine or fake. It uses the character-index embedding defined above when loading the tensorflow data.

In [0]:
'''
Function to get the prediction score from the model for an input domain.
'''

def get_raw_prediction(domain_name):
  name_vec = [char2idx[c] for c in domain_name]
  vec = np.zeros((1, 82))
  vec[0, :len(domain_name)] = name_vec

  prediction = model(vec).numpy().sum()

  return prediction

In [3]:
'''
Function to print the prediction for an input domain.
'''

def get_prediction(domain_name):
  name_vec = [char2idx[c] for c in domain_name]
  vec = np.zeros((1, 82))
  vec[0, :len(domain_name)] = name_vec

  prediction = nm(vec).numpy().sum()

  if prediction < 0.5:
    print('The domain {} is genuine with probability {}'.format(domain_name, round(1 - prediction, 2)))
  else:
    print('The domain {} is fake with probability {}'.format(domain_name, round(prediction, 2)))

### Testing the model
From here, we load some new data to test the model. The fake examples (tdata1) come from the Bambenek DGA feed, and the 
real examples come from Alexa's top one million websites.

Tests run on the first 10000 domains in each test dataset, after shuffling, gave a 98.7% prediction rate for fake domains and 98.8% for real domains.

In [3]:
tdata1 = pd.read_csv('https://osint.bambenekconsulting.com/feeds/dga-feed.txt', index_col=False, names=['domain', 'junk', 'junk2'], skiprows=15)
tdata1 = tdata1.drop(['junk', 'junk2'], axis=1)
tdata2 = pd.read_csv('https://github.com/sudo-rushil/CNN-LSTM_Domain_Classifier/blob/master/top-1m.csv', names=['domain'], index_col=0)

In [134]:
Correct_Fake = 0
tdata1 = tdata1.sample(frac=1)

for i in range(10000):
  pred = get_raw_prediction(tdata1['domain'][i])
  if pred >=  0.5:
    Correct_Fake += 1

print(Correct_Fake/10000)

0.9869


In [135]:
Correct_Real = 0
tdata2 = tdata2.sample(frac=1)

for i in range(1, 10001):
  pred = get_raw_prediction(tdata2['domain'][i])
  if pred <  0.5:
    Correct_Real += 1

print(Correct_Real/10000)

0.988


In [None]:
get_prediction('wikipedia')

### Saving and Loading the Model

As this model was originally trained on Google Colab, it was saved through Keras serialization as an .h5 file. The predict_domain.py function allows one to pass individual domains to get a prediction. Alternatively, one can load the domain_classifier.h5 file to get the original model.