In [1]:
import os
import torch

import numpy as np
import torch.optim as optim
import torch.nn.functional as F

from torch import nn
from tqdm import tqdm

In [2]:
END_TOKEN = '\x04'
TRAIN_PART = 0.7
EPOCHS = 10
LR = 1e-3
TH = 0.5
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# DEVICE = 'cpu'
print(DEVICE)

cuda:0


In [3]:
normal_1_path = '../data/normal_train.txt'
normal_2_path = '../data/normal_test.txt'
abnormal_path = '../data/abnormal_test.txt'

with open(normal_1_path) as f:
    normal_raw_data = f.readlines()
with open(normal_2_path) as f:
    normal_raw_data += f.readlines()
with open(abnormal_path) as f:
    abnormal_raw_data = f.readlines()

In [4]:
tokens = set(''.join(normal_raw_data) + ''.join(abnormal_raw_data))
print(len(tokens))
tokens.add(END_TOKEN)
print(len(tokens))
print(tokens)

80
81
{'V', '1', ')', '~', 'r', 'z', 'I', ';', '&', 'J', 'j', '+', 'i', 'w', 'o', '7', 'h', '4', 'n', '=', '2', 'O', 'c', 'e', 't', 'x', 'q', 'U', 'R', '%', 'C', 'S', 'A', '6', '.', ':', 'd', 'E', '5', 'W', '8', 'F', 'N', 'B', '?', ',', '_', 'v', ' ', '0', 'l', '/', 'P', 'm', 'X', 'M', '3', 'H', 'G', '9', 'u', '\n', 'K', 'L', 'b', '-', 'Y', 'Q', '*', 'a', 'y', 'D', 'g', 'T', 'p', 's', 'Z', '(', 'k', 'f', '\x04'}


In [5]:
int2char = dict(enumerate(tokens))
char2int = {char: idx for idx, char in int2char.items()}

In [6]:
normal_samples, abnormal_samples = [], []

start_idx = 0
for idx, line in enumerate(normal_raw_data):
    if line == 'Connection: close\n':
        if normal_raw_data[idx + 1] != '\n':
            sample_as_chars = ''.join(normal_raw_data[start_idx:idx + 4]) + END_TOKEN
            start_idx = idx + 5
        else:
            sample_as_chars = ''.join(normal_raw_data[start_idx:idx + 1]) + END_TOKEN
            start_idx = idx + 3
        
        # convert from text to nums
        sample_as_nums = np.array([char2int[char] for char in sample_as_chars])
        normal_samples.append(sample_as_nums)

start_idx = 0 
for idx, line in enumerate(abnormal_raw_data):
    if line == 'Connection: close\n':
        if abnormal_raw_data[idx + 1] != '\n':
            sample_as_chars = ''.join(abnormal_raw_data[start_idx:idx + 4]) + END_TOKEN
            start_idx = idx + 5
        else:
            sample_as_chars = ''.join(abnormal_raw_data[start_idx:idx + 1]) + END_TOKEN
            start_idx = idx + 3
        
        # convert form text to nums
        sample_as_nums = np.array([char2int[char] for char in sample_as_chars])
        abnormal_samples.append(sample_as_nums)

print(len(normal_samples))  # must be 72000
print(len(abnormal_samples))  # must be 25065

72000
25065


In [7]:
samples = normal_samples + abnormal_samples
labels = np.hstack([np.zeros(len(normal_samples)), np.ones(len(abnormal_samples))])

print(len(samples))
print(len(labels))

97065
97065


In [8]:
train_indices = []
test_indices = []

for i in np.arange(len(samples)):
    if np.random.uniform() < TRAIN_PART:
        train_indices.append(i)
    else:
        test_indices.append(i)

train_indices = np.array(train_indices)
test_indices = np.array(test_indices)

In [9]:
def sample2one_hot(sample, size):
    """
    Convert the array of number to one hot vector view.
    
    :param sample: Array of ints.
    :param size: Number of unique nums.
    :return: 2D-array.
    """
    one_hot = np.zeros((len(sample), size), dtype=int)
    one_hot[np.arange(len(sample)), sample] = 1
    return one_hot

In [10]:
class TrafficRNN(nn.Module):
    def __init__(self, tokens, n_hidden=128, n_layers=1):
        super().__init__()
        self.tokens = tokens
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        self.int2char = dict(enumerate(tokens))
        self.char2int = {char: idx for idx, char in int2char.items()}
        self.lstm = nn.LSTM(input_size=len(tokens),
                            hidden_size=n_hidden,
                            num_layers=n_layers,
                            batch_first=True)
        self.fc = nn.Linear(n_hidden, 1)
        self.sigmoid = nn.Sigmoid()
        
        self.h = torch.zeros((n_layers, 1, n_hidden), dtype=torch.float).to(DEVICE)
        self.c = torch.zeros((n_layers, 1, n_hidden), dtype=torch.float).to(DEVICE)
        
    def forward(self, x):
        r_output, (self.h, self.c) = self.lstm(x, (self.h, self.c))
        
        out = r_output[0, -1, :] 
        
        out = self.fc(out)

        out = self.sigmoid(out)
        
        self.h = torch.zeros((self.n_layers, 1, self.n_hidden), dtype=torch.float).to(DEVICE)
        self.c = torch.zeros((self.n_layers, 1, self.n_hidden), dtype=torch.float).to(DEVICE)
        
        return out

model = TrafficRNN(tokens)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

model.to(DEVICE)

TrafficRNN(
  (lstm): LSTM(81, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [12]:
for ep in np.arange(EPOCHS):
    # train
    np.random.shuffle(train_indices)
    model.train()
    running_loss = 0.0
    correct = 0
    for i in tqdm(train_indices):
        optimizer.zero_grad()

        sample = (sample2one_hot(samples[i], len(tokens))[np.newaxis, :]).astype(np.float32)
        sample = torch.from_numpy(sample).to(DEVICE)
        outputs = model(sample)
        
        predict = 1 if outputs.cpu().detach().numpy()[0] > TH else 0
        label = np.array([labels[i]])
        if (predict == label[0]):
            correct += 1

        label = torch.from_numpy(label.astype(np.float32)).to(DEVICE)
        loss = criterion(outputs, label)
        running_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    print('Train loss:', running_loss / len(train_indices))
    print('Train acc:', correct / len(train_indices))

    with torch.no_grad():
        model.eval()
        running_loss = 0.0
        correct = 0
        for i in tqdm(test_indices):
            sample = (sample2one_hot(samples[i], len(tokens))[np.newaxis, :]).astype(np.float32)
            sample = torch.from_numpy(sample).to(DEVICE)
            outputs = model(sample)

            predict = 1 if outputs.cpu().detach().numpy()[0] > TH else 0
            label = np.array([labels[i]])
            if (predict == label[0]):
                correct += 1

            label = torch.from_numpy(label.astype(np.float32)).to(DEVICE)
            loss = criterion(outputs, label)
            running_loss += loss.item()

        print('Test loss:', running_loss / len(test_indices))
        print('Test acc:', correct / len(test_indices))

100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [26:50<00:00, 41.11it/s]


Train loss: 0.5253691067119441
Train acc: 0.7652926118810133


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:35<00:00, 86.73it/s]


Test loss: 0.47501169470806054
Test acc: 0.8069506032793647


100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [26:44<00:00, 42.36it/s]


Train loss: 0.5017537831217956
Train acc: 0.7854473769382411


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:34<00:00, 86.94it/s]


Test loss: 0.5031334791955766
Test acc: 0.7973600082499742


100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [26:47<00:00, 42.28it/s]


Train loss: 0.2614541349045705
Train acc: 0.901241651219584


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:34<00:00, 82.37it/s]


Test loss: 0.04777140833373064
Test acc: 0.9851156715135265


100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [26:58<00:00, 40.51it/s]


Train loss: 0.04021682767886484
Train acc: 0.9886868508547386


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:34<00:00, 86.93it/s]


Test loss: 0.03044730303962651
Test acc: 0.9916812759960125


100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [26:50<00:00, 42.19it/s]


Train loss: 0.02967481574451139
Train acc: 0.9921440550798835


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:34<00:00, 86.95it/s]


Test loss: 0.027950393021996204
Test acc: 0.9930906465917294


100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [27:02<00:00, 41.89it/s]


Train loss: 0.0914470885708789
Train acc: 0.9700767940683203


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:34<00:00, 86.95it/s]


Test loss: 0.022170421756764667
Test acc: 0.9936750197655633


100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [26:59<00:00, 41.98it/s]


Train loss: 0.01983095460267925
Train acc: 0.9950127990113867


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:34<00:00, 86.91it/s]


Test loss: 0.016158768070071477
Test acc: 0.9961156371386339


100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [26:56<00:00, 42.04it/s]


Train loss: 0.038788355369756174
Train acc: 0.9861270485773972


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:34<00:00, 86.96it/s]


Test loss: 0.02309484223323164
Test acc: 0.9935375201952494


100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [27:01<00:00, 41.93it/s]


Train loss: 0.016352763060085658
Train acc: 0.9960278930179186


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:34<00:00, 82.44it/s]


Test loss: 0.01576615210237359
Test acc: 0.9963562613866832


100%|████████████████████████████████████████████████████████████████████████████| 67974/67974 [26:58<00:00, 42.01it/s]


Train loss: 0.020291787594423853
Train acc: 0.9959984700032365


100%|████████████████████████████████████████████████████████████████████████████| 29091/29091 [05:34<00:00, 86.95it/s]


Test loss: 0.01852947968484151
Test acc: 0.9958406379980063
