In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import h5py

from sklearn import model_selection, feature_selection, utils, ensemble, linear_model, metrics


In [6]:
# Load the two pre-processed feature tables, join them column-wise on a fresh
# positional index, and split the result into features (X) and target (y).
print("Import data")

X = pd.read_hdf('model_pp1.h5', key='data').reset_index(drop=True)
X2 = pd.read_hdf('model_pp2.h5', key='data').reset_index(drop=True)

# Index-aligned join of both tables; 'window_id' is not used as a feature.
X = X.join(X2).drop('window_id', axis=1)

# pop() returns the label column and removes it from X in a single step.
y = X.pop('Label_<lambda>')

# NOTE: allow_pickle=True unpickles arbitrary objects -- only load a
# labels.npy file that comes from a trusted source.
labels = np.load("labels.npy", allow_pickle=True)

print(X.columns.values)
print(labels)

# Position of the botnet label inside `labels` (the positive class later on).
botnet_matches = np.where(labels == 'flow=From-Botne')
print(botnet_matches[0][0])


Import data
['Sport_nunique' 'DstAddr_nunique' 'Dport_nunique' 'Dur_sum' 'Dur_mean'
 'Dur_std' 'Dur_max' 'Dur_median' 'TotBytes_sum' 'TotBytes_mean'
 'TotBytes_std' 'TotBytes_max' 'TotBytes_median' 'SrcBytes_sum'
 'SrcBytes_mean' 'SrcBytes_std' 'SrcBytes_max' 'SrcBytes_median'
 'Sport_RU' 'DstAddr_RU' 'Dport_RU']
['flow=Background' 'flow=From-Norma' 'flow=To-Backgro' 'flow=From-Backg'
 'flow=To-Normal-' 'flow=Normal-V52' 'flow=From-Botne']
6


In [7]:
# Number of independent train/test repetitions.
nb_prediction = 50

# Seed the global NumPy RNG so the per-run seeds drawn below are reproducible.
np.random.seed(seed=123456)
tab_seed = np.random.randint(0, 1000000000, nb_prediction)
print(tab_seed)

# One float64 slot per run for each metric; the evaluation loop fills them in.
# Each name gets its own array (no aliasing between metrics).
tab_train_precision, tab_train_recall, tab_train_fbeta_score = (
    np.zeros(nb_prediction) for _ in range(3))

tab_test_precision, tab_test_recall, tab_test_fbeta_score = (
    np.zeros(nb_prediction) for _ in range(3))


[545331265  64051946 930796018 636193841  44994104 883990699 632376047
 123822635 544385883 780062752 370319575 553050788 864905352 385976778
 387642634 926825740 528719691 508226068 184796139 357437743 528333490
 581730774 850389862 904355447 555943458 826610574 471925446 441274154
 302315714 753964207 438316612 682070622 983063469 876566388 115035351
 603291380 407092436 139062484 590788013 387585019 441951173 260506609
 787917988 628041163 776749475 385983453 140379815 317878160 123195836
 263152482]


In [10]:
# Repeat the train/evaluate experiment once per seed and record precision,
# recall and F-score of the botnet class on training and test data.

# Binary target: True where the label code equals the position of
# 'flow=From-Botne' in `labels`. It does not depend on the loop variable,
# so it is computed once instead of on every iteration.
y_bin6 = y == np.where(labels == 'flow=From-Botne')[0][0]

# Pin the class order of the metric arrays so that index 1 is always the
# True (botnet) class. Without it scikit-learn only reports the classes that
# occur in y_true/y_pred; a split with no positive sample and no positive
# prediction would yield length-1 arrays and `[1]` would raise IndexError.
class_order = [False, True]

for i in range(0, nb_prediction):
    seed = tab_seed[i]

    # NOTE(review): the split is not stratified, so the handful of positive
    # samples is distributed randomly between train and test (the recorded
    # output shows 5-10 positives in the test set). Consider
    # `stratify=y_bin6`; not changed here because it alters the results.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y_bin6, test_size=0.33, random_state=seed)

    # Draw 10x the training-set size from the training data.
    # NOTE(review): no `stratify` is passed, so this presumably keeps the
    # class ratio roughly as-is rather than rebalancing it -- confirm intent.
    X_train_new, y_train_new = utils.resample(
        X_train, y_train, n_samples=X_train.shape[0]*10, random_state=seed)

    print(i)
    print("y_train", np.unique(y_train_new, return_counts=True))
    print("y_test", np.unique(y_test, return_counts=True))

    clf = ensemble.GradientBoostingClassifier(
        loss='exponential', learning_rate=0.1, n_estimators=100, max_depth=4, random_state=seed, verbose=0)
    clf.fit(X_train_new, y_train_new)

    # Training scores are measured on the resampled set the model was fit on.
    y_pred_train = clf.predict(X_train_new)
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(
        y_train_new, y_pred_train, labels=class_order)
    tab_train_precision[i] = precision[1]
    tab_train_recall[i] = recall[1]
    tab_train_fbeta_score[i] = fbeta_score[1]

    # Test scores are measured on the untouched held-out split.
    y_pred_test = clf.predict(X_test)
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(
        y_test, y_pred_test, labels=class_order)
    tab_test_precision[i] = precision[1]
    tab_test_recall[i] = recall[1]
    tab_test_fbeta_score[i] = fbeta_score[1]


0
y_train (array([False,  True]), array([412261,    129], dtype=int64))
y_test (array([False,  True]), array([20307,     5], dtype=int64))
1
y_train (array([False,  True]), array([412279,    111], dtype=int64))
y_test (array([False,  True]), array([20307,     5], dtype=int64))
2
y_train (array([False,  True]), array([412279,    111], dtype=int64))
y_test (array([False,  True]), array([20306,     6], dtype=int64))
3
y_train (array([False,  True]), array([412283,    107], dtype=int64))
y_test (array([False,  True]), array([20305,     7], dtype=int64))
4
y_train (array([False,  True]), array([412282,    108], dtype=int64))
y_test (array([False,  True]), array([20307,     5], dtype=int64))
5
y_train (array([False,  True]), array([412313,     77], dtype=int64))
y_test (array([False,  True]), array([20304,     8], dtype=int64))
6
y_train (array([False,  True]), array([412332,     58], dtype=int64))
y_test (array([False,  True]), array([20302,    10], dtype=int64))
7
y_train (array([False,  T

In [None]:
# Print, for every metric, its mean, its standard deviation and the raw
# per-run values -- first for the training data, then for the test data.
summary_sections = (
    ("Train", (
        ("precision = ", tab_train_precision),
        ("recall = ", tab_train_recall),
        ("fbeta_score = ", tab_train_fbeta_score),
    )),
    ("Test", (
        ("precision = ", tab_test_precision),
        ("recall = ", tab_test_recall),
        ("fbeta_score = ", tab_test_fbeta_score),
    )),
)

for section_title, section_rows in summary_sections:
    print(section_title)
    for row_label, run_values in section_rows:
        print(row_label, run_values.mean(), run_values.std(), run_values)
