In [None]:
import os
import torch

import numpy as np
import torch.optim as optim
import torch.nn.functional as F

from torch import nn
from tqdm import tqdm

In [None]:
# Character appended to the end of every sample before it is encoded.
END_TOKEN = '\x04'
# Probability with which a sample is assigned to the training split.
TRAIN_PART = 0.7
# Number of passes over the training indices.
EPOCHS = 10
# Learning rate handed to the Adam optimizer.
LR = 1e-3
# Seed for every random number generator the notebook relies on.
SEED = 42
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# DEVICE = 'cpu'

# Seed both libraries up front so the train/test split, the per-epoch shuffle
# and the weight initialisation are repeatable after "Restart & Run All".
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
normal_1_path = '../data/normal_train.txt'
normal_2_path = '../data/normal_test.txt'
abnormal_path = '../data/abnormal_test.txt'

# Both "normal" files are pooled, in order, into a single list of lines.
normal_raw_data = []
for source_path in (normal_1_path, normal_2_path):
    with open(source_path) as f:
        normal_raw_data.extend(f.readlines())

# The abnormal file is kept in its own list of lines.
with open(abnormal_path) as f:
    abnormal_raw_data = f.readlines()

In [None]:
# Collect every distinct character that occurs in either data set.
unique_chars = set(''.join(normal_raw_data) + ''.join(abnormal_raw_data))
print(len(unique_chars))
unique_chars.add(END_TOKEN)
print(len(unique_chars))

# A set of strings iterates in a different order on every interpreter start,
# which would silently change the char <-> int mapping between sessions.
# Sorting pins the vocabulary order so encodings stay stable across runs.
tokens = sorted(unique_chars)
print(tokens)

In [None]:
# Index -> character lookup, numbered in the iteration order of `tokens`.
int2char = {idx: char for idx, char in enumerate(tokens)}
# Character -> index lookup, the inverse of the table above.
char2int = dict(zip(int2char.values(), int2char.keys()))

In [None]:
def split_into_samples(raw_lines):
    """
    Cut a raw traffic dump into per-request samples encoded as integer arrays.

    Every 'Connection: close' line closes one sample. When the line right
    after it is not blank, the three lines following the header are part of
    the sample too and the next sample starts five lines after the header
    (presumably a request with a body - confirm against the data format).
    Otherwise the sample ends at the header and the next one starts three
    lines after it. END_TOKEN is appended to each sample before its
    characters are mapped through char2int.

    :param raw_lines: List of text lines as returned by readlines().
    :return: List of 1D integer arrays, one per sample.
    """
    encoded_samples = []
    n_lines = len(raw_lines)
    start_idx = 0
    for idx, line in enumerate(raw_lines):
        if line != 'Connection: close\n':
            continue

        # Guard the look-ahead so a dump that ends on the header line does not
        # raise IndexError; such a sample is treated as having no extra lines.
        has_extra_lines = idx + 1 < n_lines and raw_lines[idx + 1] != '\n'
        if has_extra_lines:
            stop_idx, next_start = idx + 4, idx + 5
        else:
            stop_idx, next_start = idx + 1, idx + 3

        sample_as_chars = ''.join(raw_lines[start_idx:stop_idx]) + END_TOKEN
        start_idx = next_start

        # convert from text to nums
        encoded_samples.append(np.array([char2int[char] for char in sample_as_chars]))

    return encoded_samples


normal_samples = split_into_samples(normal_raw_data)
abnormal_samples = split_into_samples(abnormal_raw_data)

print(len(normal_samples))  # must be 72000
print(len(abnormal_samples))  # must be 25065

In [None]:
# Normal samples come first, abnormal samples after them.
samples = normal_samples + abnormal_samples

# Label 0 marks a normal sample, label 1 an abnormal one (same order as `samples`).
labels = np.zeros(len(samples))
labels[len(normal_samples):] = 1

print(len(samples))
print(len(labels))

In [None]:
# Each sample index goes to the training split with probability TRAIN_PART,
# otherwise to the test split; one uniform draw is made per sample.
train_indices, test_indices = [], []

for sample_idx in np.arange(len(samples)):
    bucket = train_indices if np.random.uniform() < TRAIN_PART else test_indices
    bucket.append(sample_idx)

train_indices = np.array(train_indices)
test_indices = np.array(test_indices)

In [None]:
def sample2one_hot(sample, size):
    """
    Build the one-hot matrix for a sequence of integer codes.

    Row k of the result is all zeros except for a single 1 in column
    sample[k].

    :param sample: Sequence of ints, each a valid column index.
    :param size: Width of the encoding, i.e. number of distinct codes.
    :return: Integer 2D-array of shape (len(sample), size).
    """
    rows = np.arange(len(sample))
    encoded = np.zeros((rows.size, size), dtype=int)
    # Pair every row with its code so exactly one cell per row is switched on.
    encoded[rows, sample] = 1
    return encoded

In [None]:
class TrafficRNN(nn.Module):
    """
    LSTM classifier that turns one one-hot encoded sequence into a score in (0, 1).

    The recurrent state starts from zeros on every call, so each sequence is
    scored independently of the ones seen before it. Only a batch of exactly
    one sequence is supported because the initial state is built with a batch
    dimension of 1.

    :param tokens: Iterable of vocabulary symbols; its length is the LSTM input size.
    :param n_hidden: Size of the LSTM hidden state.
    :param n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, tokens, n_hidden=512, n_layers=3):
        super().__init__()
        self.tokens = tokens
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        self.int2char = dict(enumerate(tokens))
        # Invert the model's own table instead of a notebook-level global so
        # the two lookups can never disagree.
        self.char2int = {char: idx for idx, char in self.int2char.items()}
        self.lstm = nn.LSTM(input_size=len(tokens),
                            hidden_size=n_hidden,
                            num_layers=n_layers,
                            batch_first=True)
        self.fc = nn.Linear(n_hidden, 1)
        self.sigmoid = nn.Sigmoid()

        # Kept as attributes for compatibility; forward() rebuilds them on the
        # device of its input, so no global device constant is needed here.
        self.h, self.c = self._zero_state(torch.device('cpu'))

    def _zero_state(self, device, dtype=torch.float):
        """Return a fresh all-zero (hidden, cell) state pair for a batch of one."""
        shape = (self.n_layers, 1, self.n_hidden)
        return (torch.zeros(shape, dtype=dtype, device=device),
                torch.zeros(shape, dtype=dtype, device=device))

    def forward(self, x):
        """
        Score a single sequence.

        :param x: Float tensor of shape (1, seq_len, len(tokens)).
        :return: Tensor of shape (1,) holding the sigmoid of the linear layer
                 applied to the LSTM output at the last time step.
        """
        # Always start from a zero state located where the input lives.
        self.h, self.c = self._zero_state(x.device, x.dtype)
        r_output, _ = self.lstm(x, (self.h, self.c))

        # Only the last time step of the (single) sequence feeds the classifier.
        out = r_output[0, -1, :]
        out = self.fc(out)
        out = self.sigmoid(out)

        return out

# Build the network and move it to the target device in one step; the
# optimizer is created afterwards over the same parameter objects.
model = TrafficRNN(tokens).to(DEVICE)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# Bare expression so the cell still displays the model summary.
model

In [None]:
for ep in range(EPOCHS):
    # train
    np.random.shuffle(train_indices)
    model.train()
    epoch_loss = 0.0
    for i in tqdm(train_indices):
        optimizer.zero_grad()

        # Encode the sample selected by the loop index (not always sample 0)
        # and add a leading batch axis of size one.
        sample = sample2one_hot(samples[i], len(tokens))[np.newaxis, :].astype(np.float32)
        sample = torch.from_numpy(sample).to(DEVICE)

        # BCELoss needs a float target with the same shape as the model output.
        label = torch.tensor([labels[i]], dtype=torch.float32, device=DEVICE)

        outputs = model(sample)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean loss over the epoch; max() avoids dividing by zero on an empty split.
    print('epoch {}: mean train loss {:.4f}'.format(ep, epoch_loss / max(len(train_indices), 1)))
        