In [4]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import socket
import glob
import os
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import torch
import torch.nn as nn
import torch.optim as optim
import torchbnn as bnn
from torch.utils.data import TensorDataset, DataLoader
import time
import warnings
warnings.filterwarnings("ignore")

In [5]:
if socket.gethostname() == 'Rohits-MacBook-Pro.local':
    rootdir = '/Users/rohitchanne/Documents/capstone/data/data_parquet/'
else: 
    rootdir = '' # Enter your hone dir here

In [6]:
list_of_files = filter( os.path.isfile, glob.glob(rootdir + '*') )
files_with_size = [ file_path for file_path in list_of_files ]

In [7]:
dfs_parquet = {}
for file_path in files_with_size:
    if 'parquet' in file_path:
        file_name = file_path.split('/')[-1]
        df_name = file_name.split('_')[0]
        print(f'Reading Data File: {file_name}')       
        dfs_parquet[df_name] = pd.read_parquet(file_path, engine='pyarrow')

Reading Data File: Wednesday-21-02-2018_TrafficForML_CICFlowMeter_clean.parquet
Reading Data File: Friday-23-02-2018_TrafficForML_CICFlowMeter_clean.parquet
Reading Data File: Thuesday-20-02-2018_TrafficForML_CICFlowMeter_clean.parquet
Reading Data File: Thursday-22-02-2018_TrafficForML_CICFlowMeter_clean.parquet
Reading Data File: Friday-16-02-2018_TrafficForML_CICFlowMeter_clean.parquet
Reading Data File: Wednesday-28-02-2018_TrafficForML_CICFlowMeter_clean.parquet
Reading Data File: Wednesday-14-02-2018_TrafficForML_CICFlowMeter_clean.parquet
Reading Data File: Thursday-15-02-2018_TrafficForML_CICFlowMeter_clean.parquet
Reading Data File: Thursday-01-03-2018_TrafficForML_CICFlowMeter_clean.parquet
Reading Data File: Friday-02-03-2018_TrafficForML_CICFlowMeter_clean.parquet


In [8]:
for k,df in dfs_parquet.items():
    print(k)
    df['is_allowed'] = df['Label'] == 'Benign'
    del df['Label']
    print(f'DF:{k}, Shape{df.shape}')

Wednesday-21-02-2018
DF:Wednesday-21-02-2018, Shape(1048575, 80)
Friday-23-02-2018
DF:Friday-23-02-2018, Shape(1048575, 80)
Thuesday-20-02-2018
DF:Thuesday-20-02-2018, Shape(7948748, 84)
Thursday-22-02-2018
DF:Thursday-22-02-2018, Shape(1048575, 80)
Friday-16-02-2018
DF:Friday-16-02-2018, Shape(1048574, 80)
Wednesday-28-02-2018
DF:Wednesday-28-02-2018, Shape(613071, 80)
Wednesday-14-02-2018
DF:Wednesday-14-02-2018, Shape(1048575, 80)
Thursday-15-02-2018
DF:Thursday-15-02-2018, Shape(1048575, 80)
Thursday-01-03-2018
DF:Thursday-01-03-2018, Shape(331100, 80)
Friday-02-03-2018
DF:Friday-02-03-2018, Shape(1048575, 80)


In [9]:
from sklearn.preprocessing import MinMaxScaler
cat_cols = ['Dst Port']

In [10]:
df_train_rnn = pd.concat([dfs_parquet['Friday-02-03-2018'], 
                      dfs_parquet['Friday-16-02-2018'], 
                      dfs_parquet['Friday-23-02-2018'],
                      dfs_parquet['Thursday-01-03-2018'],
                      dfs_parquet['Thursday-15-02-2018'],
                      dfs_parquet['Thursday-22-02-2018']
                     ]
                    )
#Transforming timestamp and category columns and also scaling data using minmax scalar
df_train_rnn['Timestamp'] = pd.to_datetime(df_train_rnn['Timestamp'])
df_train_rnn['Date'] = pd.to_datetime(df_train_rnn['Timestamp']).dt.date
df_train_rnn['TS_relative'] = (df_train_rnn['Timestamp'].astype(int) - 
                             pd.to_datetime(df_train_rnn['Date']).astype(int))/ 10**9
df_train_rnn = df_train_rnn.drop(['Timestamp'], axis = 1)
df_train_rnn = df_train_rnn.drop(['Date'], axis = 1)
df_train_rnn.replace([np.inf, -np.inf], np.nan, inplace=True)
df_train_rnn.fillna(0, inplace=True)
df_train_rnn[cat_cols] = df_train_rnn[cat_cols].astype('category')
X_train = df_train_rnn.drop(['is_allowed'], axis = 1)
y_train = df_train_rnn['is_allowed']*1 
stdsc = MinMaxScaler()
stdsc.fit(X_train)
X_train_scaled = stdsc.transform(X_train)
x_for_nn =  torch.from_numpy(X_train_scaled).float()
x_for_nn.shape
y_for_nn = torch.from_numpy(y_train.to_numpy())
y_for_nn.shape

torch.Size([5573974])

In [11]:
num_classes = 2
num_epochs = 2
batch_size = 32768
learning_rate = 0.001

input_size = 79
sequence_length = 28
hidden_size = 128
num_layers = 2
device = torch.device("cpu")

# RNN model: GRU Implementation

In [12]:
class GRUNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, drop_prob=0.2):
        super(GRUNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        
    def forward(self, x, h):
        out, h = self.gru(x, h)
        out = self.fc(self.relu(out[:,-1]))
        return out, h

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device)
        return hidden

model = GRUNet(input_dim= input_size, hidden_dim = hidden_size, output_dim = 1, n_layers = num_layers).to(device)

In [13]:
x_for_nn.shape

torch.Size([5573974, 79])

In [14]:
batch_size = 32768
train_data = TensorDataset(x_for_nn, y_for_nn)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)

In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  
model.train()
for epoch in range(num_epochs):
        start_time = time.process_time()
        h = model.init_hidden(batch_size)
        print(h.shape)
        avg_loss = 0.
        counter = 0
        for x, y in train_loader:
            counter += 1
            h = h.data
            model.zero_grad() 
            print(x.shape)
            x = x.view([batch_size, -1, input_size]).to(device)
            y = y.to(device)
            y = y.unsqueeze(1)
            out, h = model(x.float(), h)
            loss = criterion(out, y.float())
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            print(f"Epoch {epoch}, Batch {counter}...... Average Loss for Epoch: {avg_loss/counter}")
            _, predicted = torch.max(out.data, 1)
            print(f'F1: {f1_score(y, predicted)}')
            print(f'Accuracy: {accuracy_score(y, predicted)}')
            print(f'Precision: {precision_score(y, predicted)}')
            print(f'Recall: {recall_score(y, predicted)}') 
        current_time = time.process_time()

torch.Size([2, 32768, 128])
torch.Size([32768, 79])
Epoch 0, Batch 1...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18890380859375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 2...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.188323974609375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 3...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.188995361328125
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 4...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.1844482421875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 5...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18426513671875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 6...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18536376953125
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 7...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18585205078125
Precision: 0.0
Recall: 0.0
torch.Size([

torch.Size([32768, 79])
Epoch 0, Batch 61...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.183349609375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 62...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18475341796875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 63...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.188079833984375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 64...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18157958984375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 65...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18853759765625
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 66...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.1890869140625
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 67...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.185638427734375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Ba

torch.Size([32768, 79])
Epoch 0, Batch 121...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.185943603515625
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 122...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18560791015625
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 123...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.184112548828125
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 124...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.1826171875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 125...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18603515625
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 126...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.184417724609375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 0, Batch 127...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.185760498046875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 

torch.Size([32768, 79])
Epoch 1, Batch 10...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18792724609375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 11...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18280029296875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 12...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.1864013671875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 13...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18548583984375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 14...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18609619140625
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 15...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18359375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 16...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18389892578125
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 17

torch.Size([32768, 79])
Epoch 1, Batch 70...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.182952880859375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 71...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18218994140625
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 72...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18560791015625
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 73...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18408203125
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 74...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.184783935546875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 75...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.183197021484375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 76...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.1806640625
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch

torch.Size([32768, 79])
Epoch 1, Batch 130...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.18499755859375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 131...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.185302734375
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 132...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.185028076171875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 133...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.187225341796875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 134...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.187225341796875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 135...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.1856689453125
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoch 1, Batch 136...... Average Loss for Epoch: 0.0
F1: 0.0
Accuracy: 0.1815185546875
Precision: 0.0
Recall: 0.0
torch.Size([32768, 79])
Epoc

In [16]:
x_for_nn.shape[0]

5573974

In [17]:
batch_size=x_for_nn.shape[0]
train_loader = DataLoader(train_data, shuffle=True, batch_size=x_for_nn.shape[0], drop_last=True)

### Running with complete data

In [None]:
num_epochs = 4
for epoch in range(num_epochs):
        start_time = time.process_time()
        h = model.init_hidden(batch_size)
        print(h.shape)
        avg_loss = 0.
        counter = 0
        for x, y in train_loader:
            counter += 1
            h = h.data
            model.zero_grad() 
            print(x.shape)
            x = x.view([batch_size, -1, input_size]).to(device)
            y = y.to(device)
            y = y.unsqueeze(1)
            out, h = model(x.float(), h)
            loss = criterion(out, y.float())
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            print(f"Epoch {epoch}...... Average Loss for Epoch: {avg_loss/counter}")
            _, predicted = torch.max(out.data, 1)
            print(f'F1: {f1_score(y, predicted)}')
            print(f'Accuracy: {accuracy_score(y, predicted)}')
            print(f'Precision: {precision_score(y, predicted)}')
            print(f'Recall: {recall_score(y, predicted)}')            
        current_time = time.process_time()

torch.Size([2, 5573974, 128])
torch.Size([5573974, 79])


# RNN model: LSTM Implementation

In [None]:
class LSTMNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, drop_prob=0.2):
        super(LSTMNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        
    def forward(self, x, h):
        out, h = self.lstm(x, h)
        out = self.fc(self.relu(out[:,-1]))
        return out, h
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [None]:
model = LSTMNet(input_dim= input_size, hidden_dim = hidden_size, output_dim = 1, n_layers = num_layers).to(device)

In [None]:
num_epochs = 4
for epoch in range(num_epochs):
        start_time = time.process_time()
        h = model.init_hidden(batch_size)
        #print(h.shape)
        avg_loss = 0.
        counter = 0
        for x, y in train_loader:
            counter += 1
            h = tuple([e.data for e in h])
            model.zero_grad() 
            print(x.shape)
            x = x.view([batch_size, -1, input_size]).to(device)
            y = y.to(device)
            y = y.unsqueeze(1)
            out, h = model(x.float(), h)
            loss = criterion(out, y.float())
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            print(f"Epoch {epoch}...... Average Loss for Epoch: {avg_loss/counter}")
            _, predicted = torch.max(out.data, 1)
            print(f'F1: {f1_score(y, predicted)}')
            print(f'Accuracy: {accuracy_score(y, predicted)}')
            print(f'Precision: {precision_score(y, predicted)}')
            print(f'Recall: {recall_score(y, predicted)}')            
        current_time = time.process_time()