# Description
This notebook contains 3 parts: common, training and evaluation. The common part must be run first. The training and evaluation parts can then be run independently of each other.

# Common part
The following code is used by both the training and evaluation parts.

In [None]:
# common imports
import os
import torch

import numpy as np

from torch import nn
from sklearn.metrics import f1_score

from tqdm import tqdm_notebook as tqdm

In [None]:
# common params
# Decision threshold applied to the model's sigmoid output: a score above TH
# is treated as class 1, anything else as class 0.
TH = 0.5
# Marker character appended to the end of every sample (ASCII 0x04).
END_TOKEN = '\x04'
# Use the first CUDA device when available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

In [None]:
# network definition
class TrafficRNN(nn.Module):
    """Character-level LSTM that scores one sequence with a single probability.

    Parameters
    ----------
    tokens : iterable of str
        Vocabulary of characters. It is stored unchanged in ``self.tokens``.
        When a ``set``/``frozenset`` is given, the index assigned to each
        character is derived from the sorted vocabulary, so the mapping is
        the same in every interpreter run (set iteration order is not).
        Ordered containers keep their own order.
    hidden_size : int
        Number of features in the LSTM hidden state.
    num_layers : int
        Number of stacked LSTM layers.

    Attributes
    ----------
    int2char : dict
        Index -> character mapping.
    char2int : dict
        Character -> index mapping (inverse of ``int2char``).
    """

    def __init__(self, tokens, hidden_size=128, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.tokens = tokens
        # Iterating a set of strings yields a different order from one Python
        # process to the next (string hashing is randomized). A model rebuilt
        # from a checkpointed set would then encode characters differently
        # from the model that was trained, so fix the order explicitly.
        if isinstance(tokens, (set, frozenset)):
            ordered_tokens = sorted(tokens)
        else:
            ordered_tokens = list(tokens)
        self.int2char = dict(enumerate(ordered_tokens))
        self.char2int = {char: idx for idx, char in self.int2char.items()}

        self.lstm = nn.LSTM(input_size=len(tokens),
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a single one-hot encoded sequence.

        Parameters
        ----------
        x : torch.Tensor
            Tensor of shape ``(1, seq_len, len(tokens))``; only the first
            element of the batch dimension is used.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(1,)`` holding the sigmoid score.
        """
        # Every call starts from a zero hidden/cell state. Omitting the state
        # lets the LSTM create the zeros itself on the input's device, so no
        # state tensors need to be kept (and reset) on the module.
        r_output, _ = self.lstm(x)
        out = r_output[0, -1, :]  # take just last output of LSTM
        out = self.fc(out)
        return self.sigmoid(out)


In [None]:
def sample2one_hot(sample, size):
    """Encode a sequence of integer indices as rows of one-hot vectors.

    Parameters
    ----------
    sample : array-like
        Sequence of integer indices, one per position.
    size : int
        Width of every one-hot vector (number of columns in the result).

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(sample), size)`` in which row ``k``
        has a single 1 at column ``sample[k]``.

    """
    n_rows = len(sample)
    encoded = np.zeros((n_rows, size), dtype=int)
    row_ids = np.arange(n_rows)
    # one (row, column) pair per position of the input sequence
    encoded[row_ids, sample] = 1
    return encoded

# Training part
The following code processes the [CSIC 2010](http://www.isi.csic.es/dataset/) dataset and trains an LSTM-based classifier. The best models are saved to the chosen directory together with plots of the training process.

In [None]:
# imports section
import torch.optim as optim
import torch.nn.functional as F

from sklearn.utils.class_weight import compute_class_weight

from datetime import datetime as dtm

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# directory where the resplitted dataset is written
SAVE_DATASET_TO = '../data/resplitted'
# directory where trained models and plots of the training process are written
SAVE_TO = '../trained_models'

for p in [SAVE_TO, SAVE_DATASET_TO]:
    try:
        # makedirs creates missing parent directories too, and exist_ok=True
        # makes the call a no-op when the directory is already there, so no
        # separate (race-prone) existence check is needed.
        os.makedirs(p, exist_ok=True)
    except OSError as e:
        # best effort: report the problem and carry on, the later save step
        # will fail loudly if the directory is really unusable
        print('ERROR:', e)

In [None]:
# params and constants section
# Probability with which each sample is assigned to the train split
# (the remaining samples go to the test split).
TRAIN_PART = 0.65
# Number of passes over the train split.
EPOCHS = 12
# Size of the LSTM hidden state passed to TrafficRNN.
LSTM_HIDDEN_SIZE = 128
# Number of stacked LSTM layers passed to TrafficRNN.
LSTM_NUM_LAYERS = 1
# Learning rate of the Adam optimizer.
LR = 9e-4

In [None]:
# dataset loading
normal_1_path = '../data/original/normal_train.txt'
normal_2_path = '../data/original/normal_test.txt'
abnormal_path = '../data/original/abnormal_test.txt'

# both "normal" files are concatenated into a single list of lines
normal_raw_data = []
for normal_path in (normal_1_path, normal_2_path):
    with open(normal_path) as f:
        normal_raw_data.extend(f.readlines())

with open(abnormal_path) as f:
    abnormal_raw_data = f.readlines()

In [None]:
# tokens creation
# the vocabulary is every distinct character seen in the raw data ...
tokens = set(''.join(normal_raw_data + abnormal_raw_data))
print(len(tokens))
# ... plus the end-of-sample marker
tokens.add(END_TOKEN)
print(len(tokens))
print(tokens)

In [None]:
# dataset processing
def _split_requests(raw_lines):
    """Cut a raw dump into one string per request, each ending with END_TOKEN.

    A request ends at its 'Connection: close' line. When the line that
    follows is not blank, three more lines are kept as part of the request
    (presumably a trailing header, a blank line and the body -- confirm
    against the dataset); otherwise the request ends right there.
    """
    requests = []
    start_idx = 0
    n_lines = len(raw_lines)
    for idx, line in enumerate(raw_lines):
        if line != 'Connection: close\n':
            continue
        # guard the look-ahead so a dump that ends right on the
        # 'Connection: close' line does not raise IndexError
        has_tail = idx + 1 < n_lines and raw_lines[idx + 1] != '\n'
        if has_tail:
            end_idx, next_start = idx + 4, idx + 5
        else:
            end_idx, next_start = idx + 1, idx + 3
        requests.append(''.join(raw_lines[start_idx:end_idx]) + END_TOKEN)
        start_idx = next_start
    return requests


# create the model for getting char2int function
model = TrafficRNN(tokens, LSTM_HIDDEN_SIZE, LSTM_NUM_LAYERS)
model.to(DEVICE)

normal_as_chars = _split_requests(normal_raw_data)
abnormal_as_chars = _split_requests(abnormal_raw_data)
# text form of every sample, normal ones first, in the same order as the
# numeric samples built below
samples_as_chars = normal_as_chars + abnormal_as_chars

# convert from text to nums
char2int = model.char2int
normal_samples = [np.array([char2int[char] for char in sample_as_chars])
                  for sample_as_chars in normal_as_chars]
abnormal_samples = [np.array([char2int[char] for char in sample_as_chars])
                    for sample_as_chars in abnormal_as_chars]

print(len(normal_samples))  # must be 72000
print(len(abnormal_samples))  # must be 25065

In [None]:
# dataset resplitting
samples = normal_samples + abnormal_samples
# label 0 for every normal sample, label 1 for every abnormal one
labels = np.concatenate([np.zeros(len(normal_samples)), np.ones(len(abnormal_samples))])

# draw one uniform number per sample; a draw below TRAIN_PART sends the
# sample to the train split, anything else to the test split
all_indices = np.arange(len(samples))
goes_to_train = np.random.uniform(size=len(samples)) < TRAIN_PART

train_indices = all_indices[goes_to_train]
test_indices = all_indices[~goes_to_train]

In [None]:
# saving resplitted datasets
for name, indices in (('train', train_indices), ('test', test_indices)):
    samples_file = os.path.join(SAVE_DATASET_TO, name + '_samples.txt')
    labels_file = os.path.join(SAVE_DATASET_TO, name + '_labels.txt')

    # every sample is followed by an empty line
    with open(samples_file, 'w') as f:
        f.writelines(samples_as_chars[idx] + '\n\n' for idx in indices)

    # one label per line, in the same order as the samples
    with open(labels_file, 'w') as f:
        f.writelines(str(labels[idx]) + '\n' for idx in indices)

In [None]:
# additional tools initialization
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# Per-class weights computed from the train labels; they are used to scale
# each sample's loss. The arguments are passed by keyword because recent
# scikit-learn releases no longer accept them positionally.
cw = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=labels[train_indices]
).astype(np.float32)
class_weights = torch.tensor(cw).to(DEVICE)

In [None]:
def plot_metric(metric_name, train_history, test_history, save_to):
    """Draw the train/test curves of one metric and save them as a JPEG.

    The picture is written to ``<save_to>/<metric_name in lower case>.jpg``
    and the current figure is cleared afterwards.
    """
    plt.grid(True)
    plt.xlabel('Epochs')
    plt.ylabel(metric_name + ' score')
    plt.title(metric_name + ' score dynamic')

    # train curve first, test curve second -- the legend relies on this order
    curves = ((train_history, 'orange'), (test_history, 'blue'))
    for history, color in curves:
        plt.plot(np.arange(len(history)), history, color=color)
    plt.legend(('Train', 'Test'))

    target = os.path.join(save_to, metric_name.lower() + '.jpg')
    plt.savefig(target, dpi=350)
    plt.clf()

In [None]:
# train loop
def _save_checkpoint(metric_tag, epoch):
    """Save the current model/optimizer state and the latest test metrics.

    The file goes to SAVE_TO under a timestamped name that ends with
    '<metric_tag>.pth'. Metrics are stored as plain Python numbers so the
    checkpoint does not depend on NumPy scalar types.
    """
    model_name = dtm.now().strftime('%Y_%m_%d__%H_%M_%S_%f_') + metric_tag + '.pth'
    torch.save({
        'epoch': int(epoch),
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': float(test_losses[-1]),
        'accuracy': float(test_accs[-1]),
        'f1': float(test_f1[-1]),
        'tokens': tokens,
        'hidden_size': LSTM_HIDDEN_SIZE,
        'num_layers': LSTM_NUM_LAYERS
        }, os.path.join(SAVE_TO, model_name))


def _score_sample(i):
    """Run the model on sample ``i``.

    Returns the hard prediction (0 or 1, thresholded at TH) and the BCE loss
    multiplied by the weight of the sample's class.
    """
    sample = (sample2one_hot(samples[i], len(tokens))[np.newaxis, :]).astype(np.float32)
    sample = torch.from_numpy(sample).to(DEVICE)
    outputs = model(sample)

    predict = 1 if outputs.cpu().detach().numpy()[0] > TH else 0

    label = torch.from_numpy(np.array([labels[i]], dtype=np.float32)).to(DEVICE)
    loss = criterion(outputs, label)
    weight = class_weights[label.long()].view_as(label)
    return predict, loss * weight


best_train_loss, best_test_loss = np.inf, np.inf
best_train_acc, best_test_acc = 0.0, 0.0
best_train_f1, best_test_f1 = 0.0, 0.0
train_losses, test_losses = [], []
train_accs, test_accs = [], []
train_f1, test_f1 = [], []
test_labels = labels[test_indices]

for ep in range(EPOCHS):
    #
    # train
    #
    print('EPOCH #', ep)
    np.random.shuffle(train_indices)  # shuffle only train indices
    model.train()
    running_loss = 0.0
    correct = 0
    pd_labels = []
    for i in tqdm(train_indices):
        optimizer.zero_grad()

        predict, weighted_loss = _score_sample(i)
        if predict == labels[i]:
            correct += 1
        pd_labels.append(predict)
        running_loss += weighted_loss.item()

        weighted_loss.backward()
        optimizer.step()
    train_losses.append(running_loss / len(train_indices))
    train_accs.append(correct / len(train_indices))
    # pd_labels were collected in the (shuffled) order of train_indices
    train_f1.append(f1_score(labels[train_indices], pd_labels))

    best_train_loss = min(best_train_loss, train_losses[-1])
    best_train_acc = max(best_train_acc, train_accs[-1])
    best_train_f1 = max(best_train_f1, train_f1[-1])

    print('Train loss: {}, (best: {})'.format(train_losses[-1], best_train_loss))
    print('Train acc: {}, (best: {})'.format(train_accs[-1], best_train_acc))
    print('Train F1: {}, (best: {})'.format(train_f1[-1], best_train_f1))

    #
    # test
    #
    with torch.no_grad():
        model.eval()
        running_loss = 0.0
        correct = 0
        pd_labels = []
        for i in tqdm(test_indices):
            predict, weighted_loss = _score_sample(i)
            if predict == labels[i]:
                correct += 1
            pd_labels.append(predict)
            running_loss += weighted_loss.item()

    test_losses.append(running_loss / len(test_indices))
    test_accs.append(correct / len(test_indices))
    test_f1.append(f1_score(test_labels, pd_labels))

    # a checkpoint is written whenever a test metric matches or beats its
    # best value so far (ties are saved as well)
    if test_losses[-1] <= best_test_loss:
        best_test_loss = test_losses[-1]
        _save_checkpoint('loss', ep)

    if test_accs[-1] >= best_test_acc:
        best_test_acc = test_accs[-1]
        _save_checkpoint('acc', ep)

    if test_f1[-1] >= best_test_f1:
        best_test_f1 = test_f1[-1]
        _save_checkpoint('f1', ep)

    print('Test loss: {}, (best: {})'.format(test_losses[-1], best_test_loss))
    print('Test acc: {}, (best: {})'.format(test_accs[-1], best_test_acc))
    print('Test F1: {}, (best: {})'.format(test_f1[-1], best_test_f1))

    # plotting
    plot_metric('Loss', train_losses, test_losses, SAVE_TO)
    plot_metric('Accuracy', train_accs, test_accs, SAVE_TO)
    plot_metric('F1', train_f1, test_f1, SAVE_TO)

# Evaluation part
The following code evaluates a trained model and demonstrates how to use models trained by the code above.

In [None]:
# params section
# Checkpoint file to evaluate.
MODEL_PATH = '../trained_models/model.pth'
# Text file with the samples to classify.
SAMPLES_PATH = '../data/resplitted/test_samples.txt'
# Text file with one label per line, read in the same order as the samples.
LABELS_PATH = '../data/resplitted/test_labels.txt'

In [None]:
# model loading
# map_location remaps the stored tensors onto the device in use, so a
# checkpoint saved on a GPU can also be loaded on a CPU-only machine
cp = torch.load(MODEL_PATH, map_location=DEVICE)
# rebuild the network with the vocabulary and sizes stored in the checkpoint
model = TrafficRNN(cp['tokens'], hidden_size=cp['hidden_size'], num_layers=cp['num_layers'])
model.load_state_dict(cp['model_state_dict'])
model.to(DEVICE)
model.eval()

print('Checkpoint epoch:', cp['epoch'])
print('Checkpoint accuracy:', cp['accuracy'])
print('Checkpoint F1:', cp['f1'])

In [None]:
# read data
with open(SAMPLES_PATH) as f:
    lines = f.readlines()

samples = []
start_idx = 0
end_marker = END_TOKEN + '\n'
for idx, line in enumerate(lines):
    # a line holding only the end token closes the current sample
    if line != end_marker:
        continue
    sample_as_chars = ''.join(lines[start_idx:idx + 1])[:-1]  # remove last end of line
    samples.append(np.array([model.char2int[char] for char in sample_as_chars]))
    # skip the empty line that follows the end-token line
    start_idx = idx + 2

with open(LABELS_PATH) as f:
    labels = np.array(f.readlines(), dtype=np.float32)

print(len(samples))
print(len(labels))

In [None]:
# predictions generation
predicts = []
vocab_size = len(model.tokens)
with torch.no_grad():
    for encoded in tqdm(samples):
        # one-hot encode and add a leading batch axis of size 1
        one_hot = (sample2one_hot(encoded, vocab_size)[np.newaxis, :]).astype(np.float32)
        batch = torch.from_numpy(one_hot).to(DEVICE)
        score = model(batch).cpu().detach().numpy()[0]
        # scores above the threshold are class 1, everything else class 0
        predicts.append(1 if score > TH else 0)

In [None]:
# compare the metrics stored in the checkpoint with freshly computed ones
hits = np.sum(np.asarray(predicts) == labels)
calculated_accuracy = hits / len(labels)
calculated_f1 = f1_score(labels, predicts)

print('Logged accuracy of model:', cp['accuracy'])
print('Calculated accuracy of model:', calculated_accuracy)
print('\nLogged F1:', cp['f1'])
print('Calculated F1:', calculated_f1)