<a href="https://colab.research.google.com/github/samantha96/GCN-for-network-attack/blob/main/GCN_for_network_attack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import itertools
import random

import matplotlib.pyplot as plt

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch as T
# All tensors and the model are placed on this device via .to(device); CPU only here.
device = T.device("cpu")

In [None]:
#import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the two pipe-delimited feature tables: one for source IPs of unknown
# origin and one for source IPs known to be bots (labelled 0 / 1 further down).
srcIP = pd.read_csv("/home/ml/bnn/data/unknownSrcIP17-5m.csv", sep = '|',low_memory=False)
#len(srcIP)
srcIPBot = pd.read_csv("/home/ml/cnn/data/botSrcIP17.csv", sep = '|',low_memory=False)

# Raw row counts of each table, before labelling and de-duplication.
print(len(srcIP))
print(len(srcIPBot))

In [None]:
# Binary target column: 0 for rows from the unknown table, 1 for rows from the bot table.
srcIP['label'] = 0
srcIPBot['label'] = 1

## Clean the fields
#srcIP.srcIP= srcIP.srcIP.str.replace('.*&','')
#srcIP.head(3)

In [None]:
# Stack the unknown and bot rows into one frame and renumber the rows 0..n-1,
# discarding the per-file index values.
srcIP = pd.concat([srcIP, srcIPBot]).reset_index(drop=True)

print(len(srcIP))
srcIP.head(2)

In [None]:
# Select the srcIP column of each class once; both the raw and the unique
# counts below are derived from these two Series.
_bot_ips = srcIP.loc[srcIP["label"] == 1, 'srcIP']
_unknown_ips = srcIP.loc[srcIP["label"] == 0, 'srcIP']

# Raw row counts per class.
numBotSrcIP = len(_bot_ips)
print(f'Number of BotSrcIP: {numBotSrcIP}')
numUnknownSrcIP = len(_unknown_ips)
print(f'Number of UnknownSrcIP: {numUnknownSrcIP}')

# Distinct source IPs per class.
numUniBotSrcIP = len(_bot_ips.unique())
print(f'Number of Unique BotSrcIP: {numUniBotSrcIP}')
numUniUnknownSrcIP = len(_unknown_ips.unique())
print(f'Number of Unique UnknownSrcIP: {numUniUnknownSrcIP}')

In [None]:
### Use unique data
# Collapse repeated rows into one row per (srcIP, label) pair by averaging the
# feature columns. numeric_only=True restricts the average to numeric columns,
# so a stray text column is left out instead of making mean() raise on
# pandas >= 2.0.
# NOTE(review): an IP that occurs under both labels still yields two rows
# (one per label) -- confirm that this is intended.
#srcIP = srcIP.rename(columns = {'srciptypid': 'label'})
srcIP = srcIP.groupby(['srcIP','label']).mean(numeric_only=True)
# Turn 'label' back into a regular column; 'srcIP' remains the index.
srcIP = srcIP.reset_index('label')
print(len(srcIP))
srcIP.head(2)

In [None]:
# Remove the 'keyid' column so it is not part of the feature matrix
# (presumably a record identifier -- verify against the CSV schema).
srcIP = srcIP.drop(columns='keyid')
srcIP.head(2)

In [None]:
# pop() removes the 'label' column from srcIP (leaving only feature columns)
# and its values are kept separately as a NumPy array of targets.
labels = np.array(srcIP.pop('label'))
labels

In [None]:
# Set random seed to ensure reproducible runs
RSEED = 60

# 30% examples in test data
# stratify=labels asks the splitter to keep the 0/1 class proportions the same
# in the train and test partitions.
train, test, train_labels, test_labels = train_test_split(srcIP, labels, 
                                                          stratify = labels,
                                                          test_size = 0.3, 
                                                          random_state = RSEED)

In [None]:
# Train and Test dataset information
# For each split: print the feature-matrix shape, then a two-column table of
# (label value, number of rows with that label).
print(f'Train dataset: {train.shape}')
unique, counts = np.unique(train_labels, return_counts=True)
print (np.asarray((unique, counts)).T)
print(f'Test dataset: {test.shape}')
unique, counts = np.unique(test_labels, return_counts=True)
print (np.asarray((unique, counts)).T)

In [None]:
# Display the first two rows of the training features.
train.head(2)

## Torch Neural Network

In [None]:
def acc_coarse(model, ds, threshold=0.5):
  """Compute the accuracy of a binary classifier over an entire dataset.

  Args:
    model: callable that maps the predictor tensor to one output per row.
    ds: dataset supporting len() and ds[:], where ds[:] returns a dict with
      the keys 'predictors' and 'target'.
    threshold: outputs greater than or equal to this value count as the
      positive class. Defaults to 0.5.

  Returns:
    A tuple (acc, targets, oupts): the fraction of rows whose thresholded
    output equals the target, the target tensor, and the raw model outputs.
  """
  everything = ds[:]                 # materialise the whole dataset once
  inpts = everything['predictors']   # all rows
  targets = everything['target']     # all target 0s and 1s
  with T.no_grad():                  # inference only, no gradient tracking
    oupts = model(inpts)             # all computed outputs
  pred_y = oupts >= threshold        # boolean tensor of predicted classes
  num_correct = T.sum(targets == pred_y).item()
  acc = num_correct / len(ds)        # true division yields a Python float
  return acc, targets, oupts

In [None]:
class Classifier(T.nn.Module):
  """Fully connected binary classifier with two tanh hidden layers.

  The network is n_inputs -> n_hid1 -> n_hid2 -> 1, and the single output is
  passed through a sigmoid so it lies in (0, 1).

  Args:
    n_inputs: number of input features per row (default 27).
    n_hid1: width of the first hidden layer (default 10).
    n_hid2: width of the second hidden layer (default 5).
  """

  def __init__(self, n_inputs=27, n_hid1=10, n_hid2=5):
    super().__init__()
    self.hid1 = T.nn.Linear(n_inputs, n_hid1)
    self.hid2 = T.nn.Linear(n_hid1, n_hid2)
    self.oupt = T.nn.Linear(n_hid2, 1)

    # Xavier-uniform weights and zero biases, applied in layer order so the
    # random stream is consumed exactly as before for a given seed.
    for layer in (self.hid1, self.hid2, self.oupt):
      T.nn.init.xavier_uniform_(layer.weight)
      T.nn.init.zeros_(layer.bias)

  def forward(self, x):
    """Return the sigmoid output for a batch x with n_inputs columns."""
    z = T.tanh(self.hid1(x))
    z = T.tanh(self.hid2(z))
    z = T.sigmoid(self.oupt(z))
    return z

In [None]:
class ProcessDataset(T.utils.data.Dataset):
  """In-memory dataset of float32 predictors and a single-column target.

  Args:
    data_file: object exposing `.values` (e.g. a pandas DataFrame) holding
      one row of numeric predictors per sample.
    label_file: sliceable sequence (e.g. NumPy array) of targets, one per row.
    num_rows: if given, only the first `num_rows` samples are loaded;
      None (the default) loads everything.
  """

  def __init__(self, data_file, label_file, num_rows=None):
    features = data_file.values
    targets = label_file
    if num_rows is not None:
      # Honour the row limit on predictors and targets alike.
      features = features[:num_rows]
      targets = targets[:num_rows]

    self.x_data = T.tensor(features, dtype=T.float32).to(device)
    # Targets are stored as a column so they line up with the model's
    # [n, 1] output.
    self.y_data = T.tensor(targets, dtype=T.float32).to(device).reshape(-1, 1)

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    """Return {'predictors', 'target'} for an int, slice or tensor index."""
    if T.is_tensor(idx):
      idx = idx.tolist()
    preds = self.x_data[idx, :]
    lbl = self.y_data[idx, :]
    return {'predictors': preds, 'target': lbl}

In [None]:
#### Plot Confusion Matrix

from sklearn.metrics import confusion_matrix
import itertools
from matplotlib.colors import Normalize
import matplotlib.pyplot as plt
#import matplotlib.cm as cm

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : 2-D NumPy array of counts, indexed [row, column]; rows are drawn on
        the 'True label' axis and columns on the 'Predicted label' axis.
    classes : sequence of tick labels, in the same order as the rows/columns.
    normalize : if True, each row is divided by its row sum before printing
        and plotting, and cells are formatted with two decimals.
    title : figure title.
    cmap : matplotlib colormap used for the cell colours.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Row-normalized values lie in [0, 1], so they need their own colour
    # range; raw counts keep the fixed 0..200 scale used so far.
    color_norm = Normalize(0, 1) if normalize else Normalize(0, 200)

    plt.figure(figsize = (8, 8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap, norm=color_norm)
    #plt.imshow(cm, interpolation='bilinear', cmap=cmap, norm=Normalize(0,100))
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)

    fmt = '.2f' if normalize else 'd'
    # Cells above a quarter of the largest value are written in green,
    # the rest in red.
    thresh = cm.max() / 4.

    # Labeling the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
                 horizontalalignment="center",
                 color="Green" if cm[i, j] > thresh else "Red")

    # grid(None) only toggles the current state; False switches it off
    # regardless of the active style.
    plt.grid(False)
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)
    # Run the layout last so the axis labels are included in the fit.
    plt.tight_layout()

In [None]:
%%time 
# Main
# 0. get started
print("\nStart using PyTorch \n")
# Fix both RNGs so weight initialisation and batch shuffling are repeatable.
T.manual_seed(1)
np.random.seed(1)

# 1. create Dataset and DataLoader objects
print("Creating the train and test DataLoader ")

train_ds = ProcessDataset(train, train_labels)  # all rows
test_ds = ProcessDataset(test, test_labels)

bat_size = 30000
train_ldr = T.utils.data.DataLoader(train_ds,
  batch_size=bat_size, shuffle=True)

# 2. create neural network
model = Classifier().to(device)
# Report the layer widths actually built instead of a hand-written string,
# which previously claimed a 10-10 hidden layout for a 10-5 network.
print("Creating %d-(%d-%d)-%d binary NN classifier " % (
    model.hid1.in_features, model.hid1.out_features,
    model.hid2.out_features, model.oupt.out_features))

# 3. train network
print("\nPreparing training")
model = model.train()  # set training mode
lrn_rate = 0.002
loss_obj = T.nn.BCELoss()  # binary cross entropy
optimizer = T.optim.SGD(model.parameters(),lr=lrn_rate)
max_epochs = 200
ep_log_interval = 10
epoch_losses = []  # one entry per epoch, plotted after training

print("Loss function: " + str(loss_obj))
print("Optimizer: SGD")
print("Learn rate: ", lrn_rate)
print("Batch size: ", bat_size)
print("Max epochs: " + str(max_epochs))

print("\nStarting training")
for epoch in range(0, max_epochs):
    epoch_loss = 0.0  # sum of the per-batch losses over this epoch

    for batch in train_ldr:
        X = batch['predictors']  # [batch, n_features] inputs
        Y = batch['target']      # [batch, 1] targets
        oupt = model(X)          # [batch, 1] sigmoid outputs

        loss_val = loss_obj(oupt, Y)   # scalar tensor
        # .item() keeps a plain float so the autograd graph is not retained.
        epoch_loss += loss_val.item()

        optimizer.zero_grad()  # reset all gradients
        loss_val.backward()    # compute all gradients
        optimizer.step()       # update all weights

    epoch_losses.append(epoch_loss)
    if epoch % ep_log_interval == 0:
        print("epoch = %4d   loss = %0.4f" % (epoch, epoch_loss))

print("Training Done ")

In [None]:
import os

#4. evaluate model
model = model.eval()  # switch to evaluation mode
# acc_coarse returns (accuracy, targets, raw outputs); targets and outputs of
# the test set are kept for the analysis cells below.
acc_train,train_target,train_prob = acc_coarse(model, train_ds)
print("\nAccuracy on train data = %0.2f%%" % (acc_train * 100))
acc_test,test_target,test_prob = acc_coarse(model, test_ds)
print("Accuracy on test data = %0.2f%%" % (acc_test * 100))

# 5. save model
print("\nSaving trained model state_dict \n")
path = "./Models/saveCNNModel.pth"
# T.save does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(path), exist_ok=True)
T.save(model.state_dict(), path)

In [None]:
# Sanity check: the test targets and the model outputs should have the same length.
print(len(test_target))
len(test_prob)

In [None]:
# Collect, per test row, the true label (cast to int) and the model output
# returned by acc_coarse for the test set.
results = pd.DataFrame(test_target.numpy().astype(int),columns=['trueLabel'])
results['predProb'] = test_prob.numpy()

In [None]:
#Predict based on the threshold of the probability
probCut = 0.5
# Vectorised thresholding: 1 where the predicted probability reaches probCut.
results['predLabel'] = (results['predProb'] >= probCut).astype(int)
print(results['trueLabel'].sum())
print(results['predLabel'].sum())

# confusion_matrix expects (y_true, y_pred): rows are true labels, columns are
# predictions. labels=[1, 0] puts the bot class first so the matrix lines up
# with the tick labels below, and always yields a 2x2 result even when one
# class is missing from the data.
cm = confusion_matrix(results['trueLabel'], results['predLabel'], labels=[1, 0])
plot_confusion_matrix(cm, classes = ['BotSrcIP', 'UnknowSrcIP'],
                      title = "Test CM, Prob = " + str(probCut) + ", Batch Size = " + str(bat_size))

In [None]:
# Repeat the split summary next to the results: shape of each partition and a
# two-column table of (label value, row count).
print(f'Train dataset: {train.shape}')
unique, counts = np.unique(train_labels, return_counts=True)
print (np.asarray((unique, counts)).T)
print(f'Test dataset: {test.shape}')
unique, counts = np.unique(test_labels, return_counts=True)
print (np.asarray((unique, counts)).T)

In [None]:
# Count test rows whose predicted probability falls inside progressively wider
# bands around 0.5 (both bounds inclusive).
_prob = results["predProb"]

numMidProb20 = int(((_prob >= 0.4) & (_prob <= 0.6)).sum())
print("# of IPs in the probability range (0.4 - 0.6):", numMidProb20)

numMidProb60 = int(((_prob >= 0.2) & (_prob <= 0.8)).sum())
print("# of IPs in the probability range (0.2 - 0.8):", numMidProb60)

numMidProb80 = int(((_prob >= 0.1) & (_prob <= 0.9)).sum())
print("# of IPs in the probability range (0.1 - 0.9):", numMidProb80)

In [None]:
# Histogram Plot
from matplotlib.patches import Rectangle

fig, ax = plt.subplots(figsize=(10,10))
# True label -> bar colour and legend text. The legend text lives in its own
# name so the global `labels` array holding the targets is not overwritten.
colors = {0:'green', 1:'red'}
legend_labels = {0: "Unknown", 1: "Bot"}

# One histogram series per true class, drawn side by side.
#plt.hist(results.predProb, color="green")
plt.hist([results.loc[results.trueLabel == cls, 'predProb'] for cls in colors],
         color=list(colors.values()))

#create legend
handles = [Rectangle((0,0),1,1,color=c,ec="k") for c in colors.values()]
plt.legend(handles, [legend_labels[cls] for cls in colors])

#plt.pie(results.predProb)
plt.xlabel('Predict Probability')
plt.ylabel('Frequency')
plt.title("Validate Predicted Probability Distribution")
plt.show()

In [None]:
import sklearn.metrics as metrics

# ROC curve of the test predictions and the area under it.
fpr, tpr, threshold = metrics.roc_curve(results['trueLabel'], results['predProb'])
roc_auc = metrics.auc(fpr, tpr)

# The batch size in the title comes from the training setting rather than a
# hard-coded number, which no longer matched the run.
plt.title(f'Receiver Operating Curve, bs={bat_size} in Distinct')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
## Plot errors
# Per-epoch training loss collected in epoch_losses during training. The
# title reports the learning rate and batch size actually used instead of
# hard-coded values that had drifted from the run.
fig, ax = plt.subplots(1, 1)
ax.set_title(f'Training Loss vs Epoch, lr={lrn_rate}, bs={bat_size}, Distinct')
ax.set_xlabel('Epoch')
ax.set_ylabel('Training Loss', color = "r", fontsize=15)
ax.plot(epoch_losses, 'r')
#ax2 = ax.twinx()
#ax2.plot(epochError[:100], 'g')
#ax2.set_ylabel("Testing Error",color="g",fontsize=15)
plt.draw()