In [1]:
#This repo is to share the code shared during the TechGig webinar. 
#We have used a dataset taken from Canadian Institute of CyberSecurity. 
#We cannot share the data here; please contact a.habibi.l@unb.ca to obtain the dataset.
import pandas as pd
import numpy as np

In [2]:
datapath = 'SelectedFeatures-10s-TOR-NonTOR.csv'

In [3]:
# Read data from csv
dataframe = pd.read_csv(datapath,low_memory=False)

In [4]:
# Normalise the data
def dfNormalize(df):
    for feature_name in df.columns:
        df.loc[:,feature_name]= pd.to_numeric(df.loc[:,feature_name], errors='coerce').fillna(0)
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()   
        if (max_value - min_value) > 0:
            df.loc[:,feature_name] = (df.loc[:,feature_name] - min_value) / (max_value - min_value)
        else:
            df.loc[:,feature_name] = (df.loc[:,feature_name]- min_value)    
    return df

In [5]:
# Randomly permute the data
print dataframe.shape
dataframe = dataframe.reindex(np.random.permutation(dataframe.index)).copy()
print(dataframe.describe())
print (list(dataframe))

(67834, 29)
        Source Port   Destination Port      Protocol   Flow Duration  \
count  67834.000000       67834.000000  67834.000000    6.783400e+04   
mean   37912.753324       11566.395967     12.167291    2.991884e+06   
std    20986.077326       18374.765123      5.459410    4.063005e+06   
min       21.000000          21.000000      6.000000    0.000000e+00   
25%    19305.000000         137.000000      6.000000    4.435975e+04   
50%    43677.000000         443.000000     17.000000    4.108570e+05   
75%    54685.000000       16311.000000     17.000000    7.325550e+06   
max    65534.000000       65514.000000     17.000000    1.000000e+07   

        Flow IAT Mean   Flow IAT Std   Flow IAT Max   Flow IAT Min  \
count    6.783400e+04   6.783400e+04   6.783400e+04   6.783400e+04   
mean     3.155927e+05   2.209662e+05   8.983857e+05   1.924432e+05   
std      6.988069e+05   6.409506e+05   1.738476e+06   5.780313e+05   
min      0.000000e+00   0.000000e+00   0.000000e+00  -2.255

In [6]:
keys = dataframe.keys()
# Feature Selection, Dropping Source IP, Source Port, Destination IP and Destination Port as it 
# specific to each organisation network and generic model should not contain them. 
data_to_process = dataframe[keys[4:len(keys) - 1]].copy()
#data_to_process = dataframe[[' Source Port',' Destination Port', ' Flow Duration', ' Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean','Fwd IAT Mean','Bwd IAT Mean','Active Mean','Idle Mean','label']].copy()
# do a data normalization
x_normalised = dfNormalize(data_to_process)
print(x_normalised.describe())

           Protocol   Flow Duration   Flow Bytes/s   Flow Packets/s  \
count  67834.000000    67834.000000   6.783400e+04     67834.000000   
mean       0.560663        0.299188   4.225052e-04         0.001101   
std        0.496310        0.406300   9.373635e-03         0.016519   
min        0.000000        0.000000   0.000000e+00         0.000000   
25%        0.000000        0.004436   1.744247e-07         0.000002   
50%        1.000000        0.041086   2.218592e-06         0.000005   
75%        1.000000        0.732555   3.613572e-05         0.000034   
max        1.000000        1.000000   1.000000e+00         1.000000   

        Flow IAT Mean   Flow IAT Std   Flow IAT Max   Flow IAT Min  \
count    67834.000000   67834.000000   67834.000000   67834.000000   
mean         0.031600       0.031363       0.089855       0.019491   
std          0.069971       0.090973       0.173880       0.057865   
min          0.000000       0.000000       0.000000       0.000000   
25%       

In [7]:
# get the train and test data
x_train = x_normalised.sample(frac=0.8, replace=True)
x_test = x_normalised.drop(x_train.index)

# change the labels and affix them
change_labels = lambda x: 1 if x == 'nonTOR' else 0
y_train = dataframe['label'].apply(change_labels).loc[x_train.index]
y_test = dataframe['label'].apply(change_labels).loc[x_test.index]

In [8]:
# Figure the Feature dimensions so that it can be used in Deep Neural Net later
feature_dim = x_train.shape[1]
print feature_dim
print x_train[y_train==0].shape

24
(6415, 24)


In [9]:
# Just to check the y_train
#print y_train[0:5]

In [10]:
# Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
lr=LogisticRegressionCV() 
lr.fit(x_train, y_train)
y_predict = lr.predict(x_test)
target_names = ['class 0 - NonTor', 'class 1 - Tor']
print(classification_report(y_test, y_predict, target_names=target_names))
print("Accuracy = {:.2f}".format(lr.score(x_test, y_test.values)*100))

                  precision    recall  f1-score   support

class 0 - NonTor       0.77      0.72      0.74      3609
   class 1 - Tor       0.96      0.97      0.97     26857

     avg / total       0.94      0.94      0.94     30466

Accuracy = 94.04


In [11]:
# Deep Neural Net Implementation using Keras and TensorFlow
import tensorflow as tf
sess= tf.Session()
from keras import backend as K
reload(K)
K.set_session(sess)
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils
hidden_layers = 10
neurons_num = 128
model = Sequential()
model.add(Dense(feature_dim, input_dim= feature_dim, kernel_initializer='normal', activation='relu'))
for _ in range(0, hidden_layers-1):
    model.add(Dense(neurons_num, kernel_initializer='normal', activation='relu'))
model.add(Dense(1,kernel_initializer='normal', activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
print model.summary()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 128)               3200      
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_5 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_6 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_7 (Dense)              (None, 128)               16512     
__________

In [12]:
# Deep Neural Net Implementation using Keras and TensorFlow
# Compute the accuracies and visualise using TensorBoard
from keras.callbacks import TensorBoard
from time import time
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
model.fit(x_train,y_train, epochs=10, batch_size=100, verbose=2, callbacks=[tensorboard],validation_split=0.1)
scores = model.evaluate(x_test, y_test, verbose=2)
#loss, accuracy = model.evaluate(x_test, y_test)
print("\nTest %s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
scores_0 = model.evaluate(x_test[y_test==0], y_test[y_test==0])
print("\nTest %s for class 0: %.2f%%" % (model.metrics_names[1], scores_0[1]*100))
scores_1 = model.evaluate(x_test[y_test==1], y_test[y_test==1])
print("\nTest %s for class 1: %.2f%%" % (model.metrics_names[1], scores_1[1]*100))

Train on 48840 samples, validate on 5427 samples
Epoch 1/10
 - 3s - loss: 0.1949 - acc: 0.9171 - val_loss: 0.1508 - val_acc: 0.9355
Epoch 2/10
 - 3s - loss: 0.1439 - acc: 0.9410 - val_loss: 0.1371 - val_acc: 0.9403
Epoch 3/10
 - 3s - loss: 0.1407 - acc: 0.9404 - val_loss: 0.1359 - val_acc: 0.9364
Epoch 4/10
 - 3s - loss: 0.1373 - acc: 0.9430 - val_loss: 0.1322 - val_acc: 0.9432
Epoch 5/10
 - 3s - loss: 0.1327 - acc: 0.9440 - val_loss: 0.1315 - val_acc: 0.9438
Epoch 6/10
 - 3s - loss: 0.1322 - acc: 0.9456 - val_loss: 0.1282 - val_acc: 0.9475
Epoch 7/10
 - 3s - loss: 0.1329 - acc: 0.9443 - val_loss: 0.1299 - val_acc: 0.9436
Epoch 8/10
 - 3s - loss: 0.1284 - acc: 0.9462 - val_loss: 0.1321 - val_acc: 0.9444
Epoch 9/10
 - 3s - loss: 0.1238 - acc: 0.9475 - val_loss: 0.1256 - val_acc: 0.9488
Epoch 10/10
 - 3s - loss: 0.1182 - acc: 0.9482 - val_loss: 0.1161 - val_acc: 0.9495

Test acc: 94.84%

Test acc for class 0: 71.90%

Test acc for class 1: 97.93%


In [13]:
# Good practice to clear Keras session
K.clear_session()