# Loading Data

Here, we will create the dataset by combining a domain dataset from Splunk, the Bambenek DGA feed, and Alexa's top 1 million domains. We then save this dataset so we can train the model on it later.

In [None]:
import numpy as np
import pandas as pd
from random import randint

In [None]:
def load_data(path='/content/drive/My Drive/domains.csv'):
  """Load the labelled domain CSV and encode its class column as 0/1.

  The file must provide the columns ``RootObject.domain`` and
  ``RootObject.class``; every other column (for example
  ``RootObject.subclass``) is discarded.

  Args:
    path: Location of the CSV file; anything ``pandas.read_csv`` accepts
      (path, URL or file-like object). Defaults to the original Google
      Drive location.

  Returns:
    A new DataFrame with the columns ``domain`` and ``pred``. ``pred`` is
    an integer column holding 0 where the class equals ``'legit'`` and 1
    for every other value.

  Raises:
    KeyError: If one of the two required columns is missing.
  """
  raw = pd.read_csv(path)
  renamed = raw.rename(
      columns={'RootObject.class': 'pred', 'RootObject.domain': 'domain'})
  domains = renamed[['domain', 'pred']].copy()

  # One vectorised comparison replaces the per-row chained assignment
  # (domains['pred'][i] = ...), which is slow, emits chained-assignment
  # warnings and does not write through when Copy-on-Write is enabled.
  domains['pred'] = (domains['pred'] != 'legit').astype(int)

  return domains

In [None]:
# Source 1: the labelled CSV handled by load_data(), with its rows shuffled.
domains = load_data().sample(frac=1)

# Source 2: the Bambenek DGA feed. The first 15 lines are skipped (presumably
# the feed's comment header - confirm against the live feed) and only the
# first field of each row is kept.
domains_2 = pd.read_csv(
    'https://osint.bambenekconsulting.com/feeds/dga-feed.txt',
    names=['domain', 'junk', 'junk2'],
    skiprows=15,
    index_col=False,
)
domains_2 = domains_2.drop(columns=['junk', 'junk2'])

# Source 3: the top-1m list. The file's first column is consumed as the index
# and then thrown away, leaving just the 'domain' column.
domains_3 = pd.read_csv(
    '/content/drive/My Drive/top-1m.csv',
    names=['domain'],
    index_col=0,
)
domains_3 = domains_3.reset_index(drop=True)

# Every feed row is labelled 1 and every top-1m row is labelled 0.
pred_2 = np.ones(len(domains_2), dtype=int)
pred_3 = np.zeros(len(domains_3), dtype=int)
domains_2['pred'] = pred_2
domains_3['pred'] = pred_3

# Stack the three sources under a fresh 0..n-1 index.
domain_data = pd.concat(
    [domains, domains_2, domains_3],
    ignore_index=True,
    sort=True,
)

In [None]:
def strip(domain_name):
  """Reduce a dotted domain name to a single label where possible.

  The name is lower-cased and split on '.'. With exactly two labels the
  first one is returned, with exactly three labels the middle one is
  returned, and for any other label count the whole lower-cased name is
  returned unchanged.
  """
  lowered = domain_name.lower()
  labels = lowered.split('.')

  # Number of labels -> index of the label to keep.
  keep_at = {2: 0, 3: 1}.get(len(labels))
  if keep_at is None:
    return lowered
  return labels[keep_at]

In [None]:
def preprocess(domains, K=0.25):
  """Apply strip() to a random fraction of the 'domain' column, in place.

  Args:
    domains: DataFrame with a 'domain' column of strings. It is modified
      in place.
    K: Fraction of the rows to process. int(K * number_of_rows) distinct
      rows are chosen at random; values above 1 are clamped to every row
      and values that select no row leave the frame untouched.

  Returns:
    The same DataFrame object that was passed in.
  """
  r = domains.shape[0]
  t = min(int(K * r), r)
  if t <= 0:
    return domains

  # Sample row *positions* without replacement. The previous
  # randint(0, r) included r itself (one past the last row) and could
  # pick the same row several times, so fewer than t rows were processed.
  positions = np.random.choice(r, size=t, replace=False)

  names = domains['domain']
  stripped = []
  for i, pos in enumerate(positions):
    stripped.append(strip(names.iat[pos]))
    if i % 1000 == 0:
      print('logging at ' + str(i))

  # A single positional write replaces the chained assignment
  # domains['domain'][idx] = ..., which is not guaranteed to reach the
  # original frame and does not depend on the index labels being 0..r-1.
  domains.iloc[positions, domains.columns.get_loc('domain')] = stripped

  return domains

In [None]:
# Run preprocess on the combined dataset with its default K (0.25). The return
# value is not rebound here, so later cells rely on preprocess having modified
# domain_data in place.
preprocess(domain_data)

In [None]:
# Shuffle all rows once more and renumber them 0..n-1 so the three sources
# are interleaved in the saved file.
domain_data = (
    domain_data
    .sample(frac=1)
    .reset_index(drop=True)
)

# Persist the final dataset without the index column.
domain_data.to_csv(
    '/content/drive/My Drive/domain_data.csv',
    index=False,
)