In [1]:
# Install tldextract, which encode_fqdn() below uses to split each FQDN.
# NOTE(review): no version is pinned here, so a rerun may install a different release.
!pip install tldextract

You are using pip version 10.0.1, however version 20.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
import json
import tldextract
import numpy as np
import pandas as pd
import boto3
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer

# Notebook-wide settings: the S3 bucket and key prefix used by the cells below,
# and the execution role handed to the SageMaker estimator.
bucket, prefix = 'dga-storage', 'sagemaker/xgboost_dga'
role = get_execution_role()

In [3]:
# Read the source CSV (it carries the `domain` and `domain_type` columns used
# below) directly from the bucket into a DataFrame.
data_key = 'data.csv'
data_location = f's3://{bucket}/{data_key}'
data = pd.read_csv(data_location)

In [4]:
VALID_CHARS = 'abcdefghijklmnopqrstuvwxyz0123456789-_\n'
LOOKUP_TABLE = None
# Every encoded row has exactly this many entries (shorter labels are zero-padded).
ENCODED_LENGTH = 63


def encode_fqdn(fqdn='www.google.com'):
    """Encode the domain label of ``fqdn`` as a fixed-width list of character codes.

    The label is whatever ``tldextract.extract(fqdn).domain`` returns. Each
    character is replaced by its 1-based position in ``VALID_CHARS``; the
    result is right-padded with ``'0'`` up to ``ENCODED_LENGTH`` entries.

    Parameters
    ----------
    fqdn : str
        Fully qualified domain name to encode.

    Returns
    -------
    list of str
        Exactly ``ENCODED_LENGTH`` decimal strings. ``'0'`` marks padding and
        also any character that is not in ``VALID_CHARS``.
    """
    global LOOKUP_TABLE
    if not LOOKUP_TABLE:
        # Built lazily on first call. Codes start at 1 so that 0 stays free for padding.
        LOOKUP_TABLE = {char: code for code, char in enumerate(VALID_CHARS, start=1)}

    # Lower-case first: the table only holds lower-case letters, and an
    # upper-case character would otherwise raise KeyError.
    domain = tldextract.extract(fqdn).domain.lower()

    # Truncate so an over-long label cannot produce a row wider than the rest,
    # and map unknown characters to 0 instead of aborting the whole encoding loop.
    rvalue = [str(LOOKUP_TABLE.get(char, 0)) for char in domain[:ENCODED_LENGTH]]
    rvalue.extend(['0'] * (ENCODED_LENGTH - len(rvalue)))
    return rvalue


In [5]:
# Replace the `domain_type` values with integer codes.
# NOTE(review): factorize numbers values in order of first appearance, so the
# code assigned to each class presumably depends on row order — confirm the
# mapping matches whatever is used at evaluation time.
domain_type_codes = pd.factorize(data['domain_type'])[0]
data['domain_type'] = domain_type_codes


In [None]:
# Encode every domain into one feature row, then assemble a frame with the
# label in the first column followed by the encoded characters.
data_features = [encode_fqdn(domain) for domain in data['domain']]

labels = data['domain_type']
data_features_df = pd.DataFrame(data_features)
full_dataset = pd.concat([labels, data_features_df], axis=1)


In [None]:
# Shuffle once with a fixed seed, then cut the rows 70% / 20% / 10% into
# train / validation / test.
shuffled_dataset = full_dataset.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(full_dataset))
validation_end = int(0.9 * len(full_dataset))

# Positional slicing yields the same three pieces as np.split(frame, [a, b])
# without asking NumPy to split a DataFrame, which goes through the deprecated
# DataFrame.swapaxes on recent pandas.
train_data = shuffled_dataset.iloc[:train_end]
validation_data = shuffled_dataset.iloc[train_end:validation_end]
test_data = shuffled_dataset.iloc[validation_end:]

# Written without header or index; only train and validation are saved here.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

In [None]:
# Upload the two local CSVs under <prefix>/train/ and <prefix>/validation/.
# NOTE(review): upload_file is not expected to return anything useful, so
# s3_input_train / s3_input_validation most likely end up as None — confirm
# before passing them to a training job.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
train_key = os.path.join(prefix, 'train/train.csv')
validation_key = os.path.join(prefix, 'validation/validation.csv')

s3_input_train = s3_bucket.Object(train_key).upload_file('train.csv')
s3_input_validation = s3_bucket.Object(validation_key).upload_file('validation.csv')

In [None]:
# Region name -> XGBoost container image. Only these four regions are listed,
# so the lookup below raises KeyError anywhere else.
containers = {
    'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
    'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
    'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
    'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest',
}

sess = sagemaker.Session()

training_image = containers[boto3.Session().region_name]
output_location = 's3://{}/{}/output'.format(bucket, prefix)

xgb = sagemaker.estimator.Estimator(
    training_image,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=sess,
)

# NOTE(review): with objective 'multi:softmax' the model presumably emits class
# labels rather than probabilities — confirm that is what evaluation expects.
hyperparameters = dict(
    eta=0.3,
    objective='multi:softmax',
    num_class=2,
    num_round=1000,
)
xgb.set_hyperparameters(**hyperparameters)


In [None]:
# From here on `train_data` / `validation_data` hold S3 prefixes (strings), not
# the DataFrame splits that carried these names earlier in the notebook.
s3_uri_template = 's3://{}/{}/{}'
train_data = s3_uri_template.format(bucket, prefix, 'train')
validation_data = s3_uri_template.format(bucket, prefix, 'validation')

# One CSV input channel per prefix.
train_channel = sagemaker.session.s3_input(train_data, content_type='csv')
valid_channel = sagemaker.session.s3_input(validation_data, content_type='csv')

# Launch training with both channels.
channels = {'train': train_channel, 'validation': valid_channel}
xgb.fit(channels)

In [None]:
# Deploy the fitted estimator on a single ml.m4.xlarge instance and keep the
# returned predictor for the scoring cells below.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)


In [None]:
# Configure the predictor: requests are serialized with csv_serializer and
# labelled as text/csv. No deserializer is set; predict() below decodes the
# response itself via .decode('utf-8').
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = None

def predict(data, rows=500):
    """Score ``data`` against the deployed endpoint in batches.

    The input is split with ``np.array_split`` so that no request carries more
    than ``rows`` rows. Each response is decoded as UTF-8 text, and all
    responses are parsed together as one comma-separated list of numbers.

    Parameters
    ----------
    data : array-like exposing ``shape``
        Feature rows to score, one sample per row.
    rows : int, default 500
        Upper bound on the number of rows sent per request.

    Returns
    -------
    numpy.ndarray
        1-D float array of the parsed predictions; empty when ``data`` has no
        rows (the endpoint is not called in that case).
    """
    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score: avoid sending an empty request to the endpoint.
        return np.array([])

    # Same batch count as before, capped at n_rows so array_split can never
    # produce an empty batch (which happened when rows was 1).
    n_batches = min(int(n_rows / float(rows) + 1), n_rows)

    responses = [
        xgb_predictor.predict(batch).decode('utf-8')
        for batch in np.array_split(data, n_batches)
    ]
    # Join once rather than growing a string inside the loop.
    return np.fromstring(','.join(responses), sep=',')


In [None]:
# Score the separate assignment set with the deployed model.
new_test = pd.read_csv('assignment.csv')

# NOTE(review): these codes are produced independently of the training labels;
# factorize numbers values by first appearance, so 0/1 here may not mean the
# same classes as during training — confirm before trusting the metrics.
new_test['threat'] = pd.factorize(new_test['threat'])[0]

# Same per-domain encoding as used for the training data.
data_features = [encode_fqdn(domain) for domain in new_test['domain']]

data_features_df = pd.DataFrame(data_features)
labels = new_test['threat']
final_test = pd.concat([labels, data_features_df], axis=1)

# `.values` replaces DataFrame.as_matrix(), which newer pandas no longer has.
# Column 0 is the label, so only the remaining columns are sent for scoring.
predictions = predict(final_test.values[:, 1:])
predictions

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt

def calc_specificity(y_true, y_pred, thresh):
    """Return specificity: the share of actual negatives that are predicted negative.

    A sample is an actual negative when its label is 0. It is predicted
    negative when ``y_pred <= thresh``, the exact complement of the
    ``y_pred > thresh`` rule this notebook uses for positive predictions, so a
    score equal to the threshold is no longer dropped from both classes.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = negative).
    y_pred : array-like
        Predicted scores, same length as ``y_true``.
    thresh : float
        Decision threshold.

    Returns
    -------
    float
        True negatives divided by actual negatives, or NaN when ``y_true``
        contains no negatives.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    is_negative = y_true == 0
    n_negative = np.count_nonzero(is_negative)
    if n_negative == 0:
        # Specificity is undefined without any actual negatives.
        return float('nan')

    n_true_negative = np.count_nonzero((y_pred <= thresh) & is_negative)
    return n_true_negative / n_negative

# Evaluate the endpoint's predictions against the assignment-set labels.
thresh = 0.5
y_true = final_test['threat']
y_pred = predictions

# Threshold once and reuse the result for every label-based metric, so they
# all share one decision rule with the confusion matrix.
y_pred_binary = np.where(y_pred > thresh, 1, 0)

c_mat = confusion_matrix(y_true, y_pred_binary)  # rows: actual, columns: predicted
auc = round(roc_auc_score(y_true, y_pred), 4)
accuracy = round(accuracy_score(y_true, y_pred_binary), 4)
recall = round(recall_score(y_true, y_pred_binary), 4)
precision = round(precision_score(y_true, y_pred_binary), 4)
specificity = round(calc_specificity(y_true, y_pred, thresh), 4)

print(f'AUC is: {auc}')
print(f'Accuracy is: {accuracy}')
print(f'Recall is: {recall}')
print(f'Precision is: {precision}')
print(f'Specificity is: {specificity}')
# The confusion matrix used to be computed and then discarded; report it too.
print(f'Confusion matrix:\n{c_mat}')
