In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler, Normalizer
import seaborn as sns
import matplotlib.pyplot as plt
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file whose
# name contains the substring 'csv'.
for dirname, _, filenames in os.walk('/kaggle/input'):
    matching_paths = [os.path.join(dirname, entry) for entry in filenames if 'csv' in entry]
    for csv_path in matching_paths:
        print(csv_path)

# Any results you write to the current directory are saved as output.

# Introduction
* [The UNSW-NB15 dataset description](https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/)
* [Feature visualization and preprocessing](https://www.kaggle.com/khairulislam/unsw-nb15-eda)
* [Feature importance using RandomForest classifier](https://www.kaggle.com/khairulislam/unsw-nb15-feature-importance)
* [Performance with other classifiers](https://www.kaggle.com/khairulislam/unsw-nb15-anomaly-detection)

# Utils

In [None]:
def run_lgb(x, y, tr_idx, val_idx, param, num_round=100):
    """Train a LightGBM booster on one train/validation split.

    Args:
        x: Feature table; rows are selected positionally via ``.iloc``.
        y: Labels aligned with ``x``; also selected via ``.iloc``.
        tr_idx: Positional indices of the training rows.
        val_idx: Positional indices of the validation rows.
        param: Parameter dict handed to ``lgb.train`` unchanged.
        num_round: Number of boosting rounds passed to ``lgb.train``.

    Returns:
        Whatever ``lgb.train`` returns (the trained booster).
    """
    # NOTE(review): newer LightGBM releases may expect early stopping / logging
    # to be configured through callbacks instead of these keyword arguments —
    # confirm against the installed version.
    train_set = lgb.Dataset(x.iloc[tr_idx], y.iloc[tr_idx])
    valid_set = lgb.Dataset(x.iloc[val_idx], y.iloc[val_idx])
    booster = lgb.train(
        param,
        train_set,
        num_round,
        valid_sets=[valid_set],
        early_stopping_rounds=50,
        verbose_eval=200,
        feval=lgb_f1_score,
    )
    return booster

def false_alarm_rate(y_true, y_pred):
    """Return (FP + FN) / (TP + TN + FP + FN) for binary predictions.

    The counts are read from the top-left 2x2 corner of
    ``metrics.confusion_matrix(y_true, y_pred)``, with row 0 / column 0
    treated as the negative class. As written, the value is the share of
    samples that were misclassified.
    """
    cm = metrics.confusion_matrix(y_true, y_pred)
    negative_row, positive_row = cm[0], cm[1]
    true_neg, false_pos = negative_row[0], negative_row[1]
    false_neg, true_pos = positive_row[0], positive_row[1]
    misclassified = false_pos + false_neg
    total = true_pos + true_neg + false_pos + false_neg
    return misclassified / total

# Module-level tag read by plot_roc() (legend text and PDF file name) and by
# results() (key under which predictions are stored in `data`). Notebook cells
# reassign it before each run; an empty string makes results() skip storing.
label = "Train"

def plot_roc(y_true, y_prob):
    """Plot a single ROC curve, save it as ``<label>.pdf`` and display it.

    Reads the module-level ``label`` for both the legend entry and the
    output file name.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted scores/probabilities for the positive class.
    """
    fpr, tpr, _ = metrics.roc_curve(y_true, y_prob)
    auc = metrics.roc_auc_score(y_true, y_prob)
    plt.plot(fpr, tpr, label=label+", auc= %0.2f" % auc)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc=0)
    # Save BEFORE showing: once show() has rendered the figure, pyplot may no
    # longer have it as the current figure, and savefig() would then write an
    # empty canvas.
    plt.savefig(label+".pdf")
    plt.show()
    
    
# Collected (y_true, y_prob) pairs keyed by the run label; filled by results()
# and read back by the ROC plotting cells.
data = {}


def results(y_test, y_prob, threshold=0.5):
    """Print classification metrics for one run and remember its predictions.

    Scores are binarised with ``y_prob >= threshold``. Accuracy, precision,
    recall, F1, false-positive rate, the notebook's "false alarm rate"
    ((FP + FN) / total) and ROC AUC are printed. When the module-level
    ``label`` is non-empty, ``(y_test, y_prob)`` is stored in ``data[label]``.

    Args:
        y_test: Ground-truth binary labels.
        y_prob: Predicted positive-class scores; any array-like (a plain list
            is accepted and converted to an ndarray).
        threshold: Decision cut-off applied to ``y_prob``. Defaults to 0.5.
    """
    # A plain Python list does not support `>=` against a float, so coerce first.
    y_prob = np.asarray(y_prob)
    y_pred = np.where(y_prob >= threshold, 1, 0)

    acc = metrics.accuracy_score(y_test, y_pred)
    pre = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)  # also called detection rate or true positive rate
    f1 = metrics.f1_score(y_test, y_pred)
    print(f"Acc {acc}, Precision {pre}, Recall {rec}, F1-score {f1}")

    cm = metrics.confusion_matrix(y_test, y_pred)
    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]
    # false positive rate
    FPR = FP/(FP+TN)
    # false alarm rate (as defined in this notebook: share of misclassified samples)
    FAR = (FP+FN)/(TP+TN+FP+FN)
    AUC = metrics.roc_auc_score(y_test, y_prob)

    print("FPR {0}, FAR {1}, AUC {2}".format(FPR, FAR, AUC))
    # print(metrics.classification_report(y_test, y_pred))
    # plot_roc(y_test, y_prob)
    if label != "":
        data[label] = (y_test, y_prob)

    
def test_run(x_train, y_train, x_test, y_test, param, num_round=2000):
    start = time.clock()
    
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_validation = lgb.Dataset(x_test, y_test)
    clf = lgb.train(param, lgb_train, num_round, valid_sets=[lgb_validation], early_stopping_rounds=50, verbose_eval=200, feval=lgb_f1_score)
    # clf = lgb.train(param, lgb_train, 2000, valid_sets=[lgb_validation], early_stopping_rounds=50, verbose_eval=200)
    y_prob = clf.predict(x_test, num_iteration=clf.best_iteration)
    
    print()
    results(y_test, y_prob)
    print("Time spent {0}".format(time.clock() - start))
    return y_prob
    
def cross_validation(X, Y, param, kf, num_round=2000):
    """Run k-fold cross-validation and report metrics over all held-out rows.

    For every ``(train, validation)`` index pair produced by ``kf.split(X, Y)``
    a model is trained with ``run_lgb`` and used to score the validation rows.
    The per-fold predictions and labels are concatenated in fold order and
    passed to ``results`` once, followed by the elapsed time.

    Args:
        X: Feature table, indexed positionally via ``.iloc``.
        Y: Labels aligned with ``X``, indexed positionally via ``.iloc``.
        param: Parameter dict forwarded to ``run_lgb``.
        kf: Splitter object exposing ``split(X, Y)``.
        num_round: Number of boosting rounds forwarded to ``run_lgb``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the supported
    # high-resolution timer for measuring elapsed time.
    start = time.perf_counter()
    y_probs = []
    y_vals = []

    for tr_idx, val_idx in kf.split(X, Y):
        clf = run_lgb(X, Y, tr_idx, val_idx, param, num_round)
        x_val, y_val = X.iloc[val_idx], Y.iloc[val_idx]
        y_prob = clf.predict(x_val, num_iteration=clf.best_iteration)

        y_probs.extend(y_prob)
        y_vals.extend(y_val)

    print()
    results(y_vals, np.asarray(y_probs))
    print("Time spent {0}".format(time.perf_counter() - start))

In [None]:
# Read the preprocessed train/test CSV files from the Kaggle input directory.
root = '../input/data-preprocessing/'
train, test = (pd.read_csv(root + csv_name) for csv_name in ('train.csv', 'test.csv'))

# Split each frame into its feature columns and the 'label' target column.
x_train = train.drop(columns=['label'])
y_train = train['label']
x_test = test.drop(columns=['label'])
y_test = test['label']

# Train data

In [None]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from tqdm import tqdm_notebook as tqdm

def lgb_accuracy(preds, data):
    """Custom evaluation function: accuracy of the rounded predictions.

    Args:
        preds: Predicted scores; rounded with ``np.round`` to get hard labels.
        data: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        The tuple ``('acc', accuracy, True)``; the trailing ``True`` is the
        "higher is better" flag.
    """
    hard_labels = np.round(preds)
    score = metrics.accuracy_score(data.get_label(), hard_labels)
    return 'acc', score, True

def lgb_f1_score(preds, data):
    """Custom evaluation function: F1 score of the rounded predictions.

    Args:
        preds: Predicted scores; rounded with ``np.round`` because the F1
            metric needs hard labels rather than probabilities.
        data: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        The tuple ``('f1', f1, True)``; the trailing ``True`` is the
        "higher is better" flag.
    """
    hard_labels = np.round(preds)
    score = metrics.f1_score(data.get_label(), hard_labels)
    return 'f1', score, True

In [None]:
# Shared cross-validation settings: fold count, RNG seed and boosting rounds.
folds, seed, num_round = 10, 1, 2000
# Shuffled, stratified splitter seeded for reproducible folds.
kf = StratifiedKFold(random_state=seed, shuffle=True, n_splits=folds)

In [None]:
# label = ''
# param = {
#     'objective': 'binary', 
#     'learning_rate': 0.1, 
#     "boost_from_average":True,
#     "metric": 'binary_logloss' # 'auc'
# }
# start = time.clock()
# # test_run( x_train, y_train, x_train, y_train, param)
# clf = lgb.train(param, lgb.Dataset(x_train, y_train), 2000, valid_sets=[lgb.Dataset(x_train, y_train)], early_stopping_rounds=50, verbose_eval=200)
# y_prob = clf.predict(x_train, num_iteration=clf.best_iteration)
# print()
# results(y_train, y_prob)
# print("Time spent {0}".format(time.clock() - start))


# y_prob = clf.predict(x_test, num_iteration=clf.best_iteration)
# print()
# results(y_test, y_prob)

## Ten-fold cross validation

In [None]:
# LightGBM settings used for the cross-validation runs on the training set.
param = {
    'objective': 'binary', 
    'learning_rate': 0.1, 
    "boost_from_average":True,
    "metric": 'binary_logloss' # 'auc'
}
# results() stores this run's (y_true, y_prob) in `data` under the current
# global `label`, so the label must be assigned before cross_validation() runs.
label = "train_ten"
# `kf` is whichever splitter was created last; when the cells run top to bottom
# that is the 10-fold splitter built from `folds`.
cross_validation(x_train, y_train, param, kf, num_round=num_round)

## Five-fold cross validation

In [None]:
# Set the storage key first: results() files the predictions under the global `label`.
label = 'train_five'
# Replace the global splitter with a 5-fold one; `param` is reused from the previous cell.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
cross_validation(x_train, y_train, param, kf, num_round=num_round)

# Test data

In [None]:
# label = ''
# param = {
#     'objective': 'binary', 
#     'learning_rate': 0.1, 
#     "boost_from_average":True,
#     "metric": 'binary_logloss' # 'auc'
# }
# y_prob = test_run(x_test, y_test, x_test, y_test, param)

## Validate on test data
Here the model trained on the training data is validated using the test data.

In [None]:
# Train on the training split and evaluate on the test split; results() stores
# the predictions under the global `label` set here.
# NOTE(review): test_run() passes the test split to lgb.train as the
# early-stopping validation set and then scores predictions on that same split,
# so the reported numbers may be optimistic — confirm this is intended.
label = "train_test"
param = {
    'objective': 'binary',
    'learning_rate': 0.05, 
    'boost_from_average':True,
    'is_unbalance':True,
    "metric": 'binary_logloss' # 'auc'
}
y_prob = test_run(x_train, y_train, x_test, y_test, param, num_round=num_round)

In [None]:
# y_pred = np.where(y_prob >= 0.5, 1, 0)
# print(metrics.confusion_matrix(y_test, y_pred))

# target_names = ['Normal', 'Anomaly']
# cm = metrics.confusion_matrix(y_test, y_pred)
# # Normalize
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# plt.rc('font', size=20) 
# fig, ax = plt.subplots(figsize=(10,10))
# sns.heatmap(cmn, annot=True, fmt='.2f', xticklabels=target_names, yticklabels=target_names)

# plt.ylabel('Actual')
# plt.xlabel('Predicted')
 
# plt.show(block=False)

## Ten-fold cross validation

In [None]:
# Ten-fold cross-validation on the test set alone; results() stores the
# predictions under the global `label` set here.
label = 'test_ten'
param = {
    'objective': 'binary',
    'learning_rate': 0.1, 
    "boost_from_average":True,
    # 'is_unbalance':True,
    # "feature_fraction":0.5,
    "metric": 'binary_logloss' # 'auc'
}
# Rebuild the splitter explicitly: an earlier cell reassigned `kf` to 5 folds.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cross_validation(x_test, y_test, param, kf, num_round=num_round)

## Five-fold cross validation

In [None]:
# Five-fold cross-validation on the test set; `param` is reused from the
# previous cell and the predictions are stored under 'test_five'.
label = 'test_five'
# This reassigns the global `kf`; any later cell that reads `kf` gets 5 folds.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
cross_validation(x_test, y_test, param, kf, num_round=num_round)

# Combined data
Here we combine the train and test sets, then evaluate ten-fold cross-validation performance on the combined data.

In [None]:
# Stack the train and test rows into one frame (original row indices are kept),
# then separate the feature columns from the 'label' target.
total = pd.concat([train, test], axis=0)
X = total.drop(columns=['label'])
Y = total['label']

In [None]:
# LightGBM settings for the cross-validation run on the combined data.
param = {
    'objective': 'binary',
    'learning_rate': 0.1, 
    "boost_from_average":True,
    # 'is_unbalance':True,
    # "bagging_fraction":0.8,
    "feature_fraction":0.5,
    # "bagging_freq":1,
    "metric": 'binary_logloss' # 'auc'
}
# results() stores the predictions under the global `label`, so set it first.
label = 'combined_ten'
# Rebuild the splitter: the preceding five-fold cell leaves `kf` with 5 splits,
# but this run is labelled, described and later plotted as ten-fold.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cross_validation(X, Y, param, kf, num_round=num_round)

In [None]:
# plt.figure(dpi=1200)
# Overlay the ten-fold ROC curves of every dataset stored in `data`.
for value in ['Train', 'Test', 'Combined']:
    y_true, y_score = data[value.lower()+'_ten']
    fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
    auc = metrics.roc_auc_score(y_true, y_score)
    plt.plot(fpr, tpr, label=value+", auc= %0.4f" % auc)

# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'r--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")

# Write the PDF before show() so the saved file contains the drawn figure.
plt.savefig('roc_ten.pdf')
plt.show()

In [None]:
# Overlay the five-fold ROC curves of the train and test runs stored in `data`.
for value in ['Train', 'Test']:
    y_true, y_score = data[value.lower()+'_five']
    fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
    auc = metrics.roc_auc_score(y_true, y_score)
    plt.plot(fpr, tpr, label=value+", auc= %0.4f" % auc)

# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'r--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")

# Write the PDF before show() so the saved file contains the drawn figure.
plt.savefig('roc_five.pdf')
plt.show()

In [None]:
# ROC curve of the model trained on the training set and scored on the test set.
y_true, y_score = data['train_test']
fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
auc = metrics.roc_auc_score(y_true, y_score)
plt.plot(fpr, tpr, label="Test, auc= %0.4f" % auc)

# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'r--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
# Write the PDF before show() so the saved file contains the drawn figure.
plt.savefig('roc_test.pdf')
plt.show()


In [None]:
from IPython.display import IFrame, display
# Embed the saved ROC figure in the notebook output. 'roc_test.pdf' is written
# by the cell that plots data['train_test'], so that cell has to run first.
filepath = "roc_test.pdf"
# Last expression of the cell: the IFrame object is rendered as the cell output.
IFrame(filepath, width=700, height=400)