In [1]:
import time
import sagemaker
import boto3
import re
import os
import json
import gzip
#import tldextract
from sklearn.metrics import classification_report, confusion_matrix
import random
import io
import pandas
from io import StringIO
import tensorflow as tf
from tensorflow.keras import Input 
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding,LSTM,Dropout,Dense,Activation
from tensorflow.keras.activations import sigmoid
from sklearn.model_selection import train_test_split
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import matplotlib.pyplot as plt
import math
from collections import Counter
import numpy as np

from sklearn.utils import shuffle
import datetime
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

gpus = tf.config.list_logical_devices('GPU')
strategy = tf.distribute.MirroredStrategy(gpus)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


In [3]:
max_vocab = 40
max_sequence_length=40

In [4]:
def read_input_data(path):
    df = pandas.read_csv(path)
    return df
    
    

In [5]:
def split_dataset(df, train_size=0.98):
    X_train, X_rem = train_test_split(df, train_size=train_size)
    X_valid, X_test = train_test_split(X_rem, test_size=0.5)
    return X_train,X_valid,X_test

In [6]:
def is_in_alexa1m(domain,in_alexa_domains):
    return ((domain in in_alexa_domains))

def entropy(s):
        p, lns = Counter(s), float(len(s))
        return -sum( count/lns * math.log(count/lns, 2) for count in p.values())

In [7]:
def create_features(df,path_to_store_processed_file):
    vectorizer_non_dga_domains = CountVectorizer(analyzer='char', ngram_range=(1,4), min_df=1e-4, max_df=1.0)
    vectorizer_words = CountVectorizer(analyzer='char', ngram_range=(1,4), min_df=1e-4, max_df=1.0)
    word_df = pd.read_csv('./data/words.txt', names=['word'], header=None, dtype={'word': np.str}, encoding='utf-8')
    word_df = word_df[word_df['word'].map(lambda x: str(x).isalpha())]
    word_df = word_df.applymap(lambda x: str(x).strip().lower())
    word_df = word_df.dropna()
    word_df = word_df.drop_duplicates()
    counts_matrix = vectorizer_words.fit_transform(word_df['word'])
    weight_words = np.log10(counts_matrix.sum(axis=0).getA1())

    alexa1m = pandas.read_csv("./data/top-1m.csv",header=None,usecols=[1], names=['domain'])

    alexa_domains = alexa1m['domain']
    in_alexa_domains = set(alexa_domains) & set(df['domain'])


    non_dga_domains = df[df['is_dga']==0]
    t = vectorizer_non_dga_domains.fit_transform(non_dga_domains['domain'])
    weight_non_dga_grams = np.log10(t.sum(axis=0).getA1())
    
    x1= weight_words * vectorizer_words.transform(df['domain']).T 
    x2= weight_non_dga_grams * vectorizer_non_dga_domains.transform(df['domain']).T 



    X_train_2_word_grams =x1
    X_train_2_non_dga_grams = x2

    print ("1. Done adding ngram features")
    X_train_2_entropy =   df['domain'].map(lambda x: entropy(x)) 
    print ("2. Done adding entropy")
    X_train_2_len = df['domain'].map(lambda x: len(x))
    print ("3. Done adding length of domain")
    print("Number of training set domains present in alexa domains ",len(in_alexa_domains))
    X_train_2_alexa = df['domain'].map(lambda x: is_in_alexa1m(x,in_alexa_domains))
    print ("4. Done adding domain present in alexa domains")
    X_train_2_alexa = X_train_2_alexa.astype(int)
    X_train_2 = np.c_[df['domain'],df['is_dga'],X_train_2_word_grams,X_train_2_non_dga_grams,X_train_2_entropy,X_train_2_len,X_train_2_alexa] #
    print ("5. Done appending features")

    print (X_train_2.shape)
    processed_df = pd.DataFrame(X_train_2,columns = 
                 ['domain','is_dga','word_grams','non_dga_grams','entropy','len','in_alexa'])
    processed_df.to_csv(path_to_store_processed_file)
    print ("6. Done creating csv file")
    return processed_df

In [8]:
def prep_text(tokenizer,texts):
    text_sequences = tokenizer.texts_to_sequences(texts)
    return sequence.pad_sequences(text_sequences, maxlen=max_sequence_length)

In [9]:
def prepare_for_training(df):
    X_train, X_valid, X_test = split_dataset(df, train_size=0.98)
    train_df = create_features(X_train,"./data/processed_dga_train_dataset.csv")
    test_df = create_features(X_test,"./data/processed_dga_test_dataset.csv")
    X_train_input1 = train_df[['domain','is_dga']]
    X_train_input2 = train_df[['word_grams','non_dga_grams','entropy','len','in_alexa']]
    tokenizer = text.Tokenizer(num_words=max_vocab,char_level=True)
    tokenizer.fit_on_texts(X_train_input1["domain"])
    X_train_domains = prep_text(tokenizer,X_train_input1['domain'])
    X_test_input1 = test_df[['domain','is_dga']]
    X_test_input2 = test_df[['word_grams','non_dga_grams','entropy','len','in_alexa']]
    X_test_domains = prep_text(tokenizer,X_test_input1['domain'])
    return X_train_domains,X_train_input1,X_train_input2,X_test_domains,X_test_input1,X_test_input2

In [10]:
def decay(epoch):
      if epoch < 3:
        return 1e-3
      elif epoch >= 3 and epoch < 7:
        return 1e-4
      else:
        return 1e-5

In [17]:
def train(df):
    checkpoint_dir = './wide_new/training_checkpoints'
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
    X_train_domains,X_train_input1,X_train_input2,X_test_domains,X_test_input1,X_test_input2 = prepare_for_training(df)

    
    start_time = time.time()

    returns_wide={}
    with strategy.scope():
        input1 = tf.keras.Input(shape = [40], name="deep_input")
        input2 = tf.keras.Input(shape= [5],name="wide_input")
        embedding = Embedding(input_dim=40,output_dim=256, input_length=40)(input1)
        L = LSTM(256, dropout=0.5)(embedding)
        concat = tf.keras.layers.concatenate([input2,L])
        output = Dense(1, name="output",activation = "sigmoid")(concat)
        model = tf.keras.Model(inputs=[input1,input2],outputs=[output])
        model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

        print (model.summary())

        class PrintLR(tf.keras.callbacks.Callback):
              def on_epoch_end(self, epoch, logs=None):
                print('\nLearning rate for epoch {} is {}'.format(epoch + 1, model.optimizer.lr.numpy()))

        callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir='./wide_new/logs', histogram_freq=1),
        tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                            save_weights_only=True),
        tf.keras.callbacks.EarlyStopping(monitor='loss',min_delta=0.0001,restore_best_weights=True),
        tf.keras.callbacks.LearningRateScheduler(decay),
        PrintLR()
        ]
        callbacks_ = callbacks
        returns_wide = {}

        model_epochs = 20
        model_batch_size = 1000

        print(type(X_train_domains))
        print(type(X_train_input2))
        print(type(X_test_domains))
        print(type(X_test_input2))
    returns_wide['fit_history'] = model.fit(x=[X_train_domains,tf.convert_to_tensor(X_train_input2.to_numpy(), dtype=tf.float32)],
                                       y=tf.convert_to_tensor(X_train_input1['is_dga'].to_numpy(), dtype=tf.float32), 
                                       verbose=2, 
                                       epochs=model_epochs, 
                                       batch_size=model_batch_size,
                                       #validation_data = ([X_test_domains,X_test_input2.to_numpy().astype(np.float32)], test_df['is_dga'].to_numpy().astype(np.float32)),
                                       callbacks=callbacks_)

    returns_wide['model_epochs'] = model_epochs
    returns_wide['model'] = model
    returns_wide['model_batch_size'] = model_batch_size
    returns_wide['model_loss_acc'] = model.evaluate([X_test_domains,X_test_input2.to_numpy().astype(np.float32)], test_df['is_dga'].to_numpy().astype(np.float32))
    lstm_wide_time = (time.time() - start_time)
    print("--- Training time in %s seconds ---" % lstm_wide_time)
    return returns_wide

In [None]:
def train_dga():
    df = read_input_data("./data/dga_valid_data.csv")[:1000]
    returns_wide = train(df)
    print (returns_wide['model'])
    print (returns_wide['model_loss_acc'])
train_dga()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  word_df = pd.read_csv('./data/words.txt', names=['word'], header=None, dtype={'word': np.str}, encoding='utf-8')


1. Done adding ngram features
2. Done adding entropy
3. Done adding length of domain
Number of training set domains present in alexa domains  2
4. Done adding domain present in alexa domains
5. Done appending features
(980, 7)
6. Done creating csv file
1. Done adding ngram features
2. Done adding entropy
3. Done adding length of domain
Number of training set domains present in alexa domains  0
4. Done adding domain present in alexa domains
5. Done appending features
(10, 7)
6. Done creating csv file
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 deep_input (InputLayer)        [(None, 40)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 40, 256)      10240       ['deep_input[0][0

In [None]:
print (type(X_train_domains))
