In [17]:
#import statements
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from pathlib import Path
import os
import sys
from sklearn.model_selection import train_test_split
from helper_methods import print_schema, encode_numeric_zscore, encode_text_dummy, print_column, encode_text_index, to_xy, plot_losses
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt 

### Define Helper Methods

In [18]:
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

# Plot an ROC. pred - the predictions, y - the expected output.
def plot_roc(pred,y, file_path):
    fpr, tpr, thresholds = roc_curve(y, pred)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.savefig(file_path)
    plt.close()

### Setup Environment
Create folders for putting test output in

In [19]:
base_path=os.path.join(os.getcwd(), 'test-output/')

iteration='iteration-4'
full_path = os.path.join(base_path, iteration)
try:
        os.mkdir(base_path)
except Exception as e:
     print(f"Probably safe to ignore the following error: \n{e}")
try:
    os.mkdir(full_path)
except Exception as e:
    print(f"{e}\nExiting to protect previous work.")
    sys.exit(0)
    

Probably safe to ignore the following error: 
[WinError 183] Cannot create a file when that file already exists: 'c:\\Users\\User\\csc180\\network-traffic-analyzer\\test-output/'


### Read Datasets
simple one-shot

In [20]:
#read from train/test datasets
training_dataframe = pd.read_csv(os.path.join(os.getcwd(), 'data/UNSW_NB15_training-set.csv'))
testing_dataframe = pd.read_csv(os.path.join(os.getcwd(), 'data/UNSW_NB15_testing-set.csv'))

### Clean and Merge
Drop missing values, drop unique columns, etc

In [21]:
#drop all entries that have a missing value anywhere in the set
training_dataframe = training_dataframe.replace('-', np.nan).dropna()
testing_dataframe = testing_dataframe.replace('-', np.nan).dropna()


In [22]:
testing_dataframe.shape

(35179, 45)

In [23]:
#get one instance of each value from each dataframe
training_unique = training_dataframe["proto"].unique()
testing_unique = testing_dataframe["proto"].unique()

#get only values from each numpy array
common_values = np.intersect1d(training_unique, testing_unique)

#make a filtered dataframe containing only values in common_values
training_dataframe = training_dataframe[training_dataframe['proto'].isin(common_values)]
testing_dataframe = testing_dataframe[testing_dataframe['proto'].isin(common_values)]

#same for service
training_unique = training_dataframe["service"].unique()
testing_unique = testing_dataframe["service"].unique()
common_values = np.intersect1d(training_unique, testing_unique)
training_dataframe = training_dataframe[training_dataframe['service'].isin(common_values)]
testing_dataframe = testing_dataframe[testing_dataframe['service'].isin(common_values)]

#same for state
training_unique = training_dataframe["state"].unique()
testing_unique = testing_dataframe["state"].unique()
common_values = np.intersect1d(training_unique, testing_unique)
training_dataframe = training_dataframe[training_dataframe['state'].isin(common_values)]
testing_dataframe = testing_dataframe[testing_dataframe['state'].isin(common_values)]


In [24]:
testing_dataframe.shape

(35178, 45)

In [25]:
print_schema(training_dataframe)
print_schema(testing_dataframe)

~~~~~~dataframe schema~~~~~~
Dataframe shape: (81159, 45) | Dataframe length: 81159
Column labels: 
Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')
Dataframe head: 
    id       dur proto service state  spkts  dpkts  sbytes  dbytes  \
3    4  1.681642   tcp     ftp   FIN     12     12     628     770   
11  12  2.093085   tcp    smtp   FIN     62     28   56329    2212   
15  16  0.000002   udp    snmp   INT      2      0     138       0   
17  18  

### Encode Data

One-hot encode relevant rows


In [26]:
# one-hot encode non-y features of training dataset
encode_text_dummy(training_dataframe, 'proto')
encode_text_dummy(training_dataframe, 'service')
encode_text_dummy(training_dataframe, 'state')

# one-hot encode attack-category column (y)
training_dataframe.drop('attack_cat', inplace=True, axis=1)
# encode_text_index(training_dataframe, 'attack_cat')
# one-hot encode non-y features of testing dataset
encode_text_dummy(testing_dataframe, 'proto')
encode_text_dummy(testing_dataframe, 'service')
encode_text_dummy(testing_dataframe, 'state')

# one-hot encode attack-category column (y)
testing_dataframe.drop('attack_cat', inplace=True, axis=1)
# encode_text_index(testing_dataframe, 'attack_cat')


In [27]:
print_schema(training_dataframe)
print_schema(testing_dataframe)


~~~~~~dataframe schema~~~~~~
Dataframe shape: (81159, 59) | Dataframe length: 81159
Column labels: 
Index(['id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl',
       'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit',
       'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat',
       'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src',
       'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'label', 'proto-tcp',
       'proto-udp', 'service-dhcp', 'service-dns', 'service-ftp',
       'service-ftp-data', 'service-http', 'service-irc', 'service-pop3',
       'service-radius', 'service-smtp', 'service-snmp', 'service-ssh',
       'service-ssl', 'state-CON', 'state-FIN', 'state-INT', 'state-REQ'],
      dtype='object')
Dataframe head: 
    id       dur  spkts  dpkts  sbytes 

Z-Score relevant rows

In [28]:
for column in training_dataframe.columns:
    if column not in ['service', 'srcip', 'dstip', 'proto', 'state','service', 'attack_cat', 'label', 'is_ftp_login', 'is_sm_ips_ports', 'Stime', 'Ltime']:
        encode_numeric_zscore(training_dataframe, column)
for column in testing_dataframe.columns:
    if column not in ['service', 'srcip', 'dstip', 'proto', 'state','service', 'attack_cat', 'label', 'is_ftp_login', 'is_sm_ips_ports', 'Stime', 'Ltime']:
        encode_numeric_zscore(testing_dataframe, column)


In [29]:
print_schema(training_dataframe)
print_schema(testing_dataframe)

~~~~~~dataframe schema~~~~~~
Dataframe shape: (81159, 59) | Dataframe length: 81159
Column labels: 
Index(['id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl',
       'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit',
       'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat',
       'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src',
       'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'label', 'proto-tcp',
       'proto-udp', 'service-dhcp', 'service-dns', 'service-ftp',
       'service-ftp-data', 'service-http', 'service-irc', 'service-pop3',
       'service-radius', 'service-smtp', 'service-snmp', 'service-ssh',
       'service-ssl', 'state-CON', 'state-FIN', 'state-INT', 'state-REQ'],
      dtype='object')
Dataframe head: 
          id       dur     spkts     dp

In [30]:
x_train,y_train=to_xy(training_dataframe, 'label')
x_test, y_test=to_xy(testing_dataframe, 'label')

### Create Fully Connected Model
Design the model's architecture. 

In [31]:
# TODO implement a more robust training setup, implement model tabulation 

checkpointer = ModelCheckpoint(filepath=os.path.join(full_path, "fcn-best_weights.keras"), verbose=0, save_best_only=True) # save best model
for i in range(5):
        print(f"training FCN: {i}")
        model = Sequential()
        model.add(Dense(8, input_dim=x_train.shape[1], activation='relu'))
        model.add(Dense(8, activation='relu'))
        model.add(Dense(2, activation='sigmoid')) # single output w/o activation function b/c regression
        model.compile(loss='binary_crossentropy', optimizer=SGD(learning_rate=0.01, momentum=0.01))
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=10, verbose=2, mode='min', restore_best_weights=True)
        history = model.fit(x_train, y_train, validation_data=(x_test, y_test),batch_size=64, callbacks=[monitor, checkpointer], verbose=2, epochs=100)
        plot_losses(history, full_path, i)

training FCN: 0
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


1269/1269 - 3s - 2ms/step - loss: 0.2271 - val_loss: 0.3107
Epoch 2/100
1269/1269 - 2s - 2ms/step - loss: 0.0844 - val_loss: 0.2738
Epoch 3/100
1269/1269 - 2s - 2ms/step - loss: 0.0706 - val_loss: 0.2775
Epoch 4/100
1269/1269 - 2s - 2ms/step - loss: 0.0645 - val_loss: 0.2810
Epoch 5/100
1269/1269 - 3s - 2ms/step - loss: 0.0602 - val_loss: 0.2957
Epoch 6/100
1269/1269 - 2s - 2ms/step - loss: 0.0567 - val_loss: 0.3404
Epoch 7/100
1269/1269 - 2s - 2ms/step - loss: 0.0539 - val_loss: 0.3654
Epoch 8/100
1269/1269 - 2s - 2ms/step - loss: 0.0517 - val_loss: 0.3806
Epoch 9/100
1269/1269 - 2s - 2ms/step - loss: 0.0500 - val_loss: 0.4177
Epoch 10/100
1269/1269 - 2s - 2ms/step - loss: 0.0486 - val_loss: 0.4398
Epoch 11/100
1269/1269 - 2s - 2ms/step - loss: 0.0476 - val_loss: 0.4548
Epoch 12/100
1269/1269 - 3s - 2ms/step - loss: 0.0468 - val_loss: 0.4681
Epoch 12: early stopping
Restoring model weights from the end of the best epoch: 2.
training FCN: 1
Epoch 1/100
1269/1269 - 3s - 2ms/step - loss:

### Perform Evaluation
Make charts, generate reports, compute scores

In [32]:
# load model, do prediction, do evaulation
model.load_weights(os.path.join(full_path, "fcn-best_weights.keras"))
prediction = model.predict(x_test)
pred_roc = prediction[:,1]
y_true = np.argmax(y_test, axis=1)

plot_roc(pred_roc, y_true, os.path.join(full_path, 'fcn-roc-curve.png'))
try:
        pred = np.argmax(prediction, axis=1)
        # confusion matrix
        cm = confusion_matrix(y_true, pred)
        plt.figure()
        plot_confusion_matrix(cm, ['normal', 'malicious'])
        plt.savefig(os.path.join(full_path, 'fcn-confusion-matrix.png'))
        plt.close()
except Exception as e:
        print(f"error creating confusion matrix plot:\n{e} ")
try:
        with open(os.path.join(full_path, 'fcn-metrics.txt'), 'x') as file:
                prediction=np.argmax(prediction, axis=1)
                y_true = np.argmax(y_test, axis=1)
                accuracy_score = metrics.accuracy_score(y_true, prediction)
                precision_score = metrics.precision_score(y_true, prediction, average='weighted')
                recall_score = metrics.recall_score(y_true, prediction, average= "weighted")
                fl_score = metrics.f1_score(y_true, prediction, average= "weighted")
                file.write(f"Accuracy Score: {accuracy_score}\n")
                file.write(f"Precision Score: {precision_score}\n")
                file.write("Recall score: {}\n".format(recall_score))
                file.write("F1 score: {}\n".format(fl_score))
                log_loss = metrics.log_loss(y_test, prediction)
                print("Log loss score: {}\n".format(log_loss))                
                file.write(metrics.classification_report(y_true, prediction))
                file.write("\nNumpy array of predictions\n")
                file.write(np.array_str(prediction[0:5]))
                file.write("y_test:\n")
                file.write(np.array2string(y_test[0:5]))
except OSError as e:
        print(f"Error while writing model metrics: \n{e}")

[1m1100/1100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 898us/step
Log loss score: 2.9436982257926427

