# Pipeline

In [24]:
# Run configuration.
# Every option list enumerates the available choices; the `*_id` directly below
# it is the index of the choice used for this run.

# --- Archive selection: these values are combined into the zip file name ---
sentence_types = ["f", "i"]
sentence_type_id = 1

word_types = ["b", "p"]
word_type_id = 1

hp_filter_sizes = [0, 100, 150]
hp_filter_size_id = 1

burst_timeouts = [1_000, 30_000, 50_000]
burst_timeout_id = 1

sentence_timeouts = [500_000, 1_000_000, 15_000_000]
sentence_timeout_id = 2

# --- Years of data: `training_year_ids` indexes into `training_years` ---
training_years = [2017, 2018, 2019, 2022]
training_year_ids = [0, 1]
validation_year = 2022
validation_year_id = 0

# --- Token feature and bucketization ---
# `token_features` names the sub-directory read from each archive.
token_features = ["log_bytes", "avg_bytes"]
token_feature_id = 0

# Number of bucket edges handed to np.linspace.
buckets = [5, 20, 50]
bucket_id = 2

# Upper bucket edge per feature; `max_id` indexes whichever list matches the
# selected token feature.
log_maxs = [15, 16]
avg_maxs = [1500, 2500]
max_id = 0

# --- Sequence / document limits ---
seq_lengths = [128, 512]
seq_length_id = 1

doc_lengths = [100]
doc_length_id = 0

# --- Model size ---
embedding_lengths = [128, 256, 768]
embedding_length_id = 1

num_attention_heads = [4, 8, 12]
num_attention_heads_id = 1

num_hidden_layers = [4, 8, 12]
num_hidden_layers_id = 1

# --- Options not referenced by the cells shown in this section ---
labeling_granularity = ["sentence", "document", "host"]
labeling_granularity_id = 1

ngram_sizes = [2, 3, 4, 5]
ngram_size_id = 1

percentiles = [0.95, 0.99]
percentile_id = 0

# --- Stage switches ---
filter_singletons = True
do_training = True
do_validation1 = True
do_validation2 = True
do_validation3 = False


### 1. Dataset Loading

In [25]:
import zipfile
import tqdm

def read_zip_archive(zip_file_path, contents, names, ips):
    """Collect the CSV files of the selected token feature from one zip archive.

    Only members whose name ends in ``.csv`` and whose parent directory is
    named ``token_features[token_feature_id]`` are read. CSV members stored at
    the archive root have no parent directory and are skipped.

    Args:
        zip_file_path: Path of the zip archive to open.
        contents: List; the UTF-8 decoded text of each selected file is appended.
        names: List; the base name (without ``.csv``) of each selected file is appended.
        ips: Set; the first two ``-``-separated fields of each base name are added.

    Returns:
        The same ``contents``, ``names`` and ``ips`` objects, updated in place.

    Raises:
        IndexError: If a selected file's base name contains no ``-``.
    """
    wanted_feature = token_features[token_feature_id]
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for file_name in tqdm.tqdm(zip_ref.namelist()):
            if not file_name.endswith('.csv'):
                continue
            path_parts = file_name.split('/')
            # A member without a parent directory cannot belong to any feature;
            # indexing [-2] on it would raise IndexError.
            if len(path_parts) < 2 or path_parts[-2] != wanted_feature:
                continue
            name = path_parts[-1].split('.csv')[0]
            name_fields = name.split('-')
            ip1 = name_fields[0]
            ip2 = name_fields[1]
            names.append(name)
            ips.add(ip1)
            ips.add(ip2)
            # Read the whole member as text; the content is assumed to be UTF-8.
            with zip_ref.open(file_name) as file:
                contents.append(file.read().decode('utf-8'))
    return contents, names, ips


# Accumulators shared by all years; read_zip_archive extends them in place.
contents, names, ips = [], [], set()

# Archives built with word type 'p' carry a literal 'x' in the burst-timeout
# slot of their file name instead of a number.
burst_timeout = 'x' if word_types[word_type_id] == 'p' else str(burst_timeouts[burst_timeout_id])

for year_id in training_year_ids:
    print('Reading year', training_years[year_id])
    path = (
        f"config_{sentence_types[sentence_type_id]}{word_types[word_type_id]}"
        f"_{hp_filter_sizes[hp_filter_size_id]}"
        f"_{burst_timeout}"
        f"_{sentence_timeouts[sentence_timeout_id]}.zip"
    )
    # Each year lives in its own directory named after the year.
    contents, names, ips = read_zip_archive(f"{training_years[year_id]}/{path}", contents, names, ips)


Reading year 2017


100%|██████████| 27045/27045 [00:01<00:00, 21397.21it/s]


Reading year 2018


100%|██████████| 66815/66815 [00:02<00:00, 26307.94it/s]


### 2. Statistics

In [26]:
# Unused for now

### 3. Preprocessing I

In [27]:
# TODO: Delete singletons when `filter_singletons` is enabled (the flag is set in the config cell but not applied yet)

### 4. Bucketization

In [28]:
import numpy as np

# Bucket count and value range for the selected token feature.
num_buckets = buckets[bucket_id]
min_buckets = 0
# The upper edge comes from the list that matches the feature; both lists are
# indexed by the same `max_id`.
max_buckets = (log_maxs if token_features[token_feature_id] == 'log_bytes' else avg_maxs)[max_id]

# `num_buckets` evenly spaced edges from min_buckets to max_buckets (inclusive).
bucket_boundaries = np.linspace(start=min_buckets, stop=max_buckets, num=num_buckets)

def bucket_id_from_decimal(decimal, boundaries=None):
    """Map a feature value to its token id.

    Token ids 0 - 4 are reserved for special tokens, so bucket ids start at 5.

    Args:
        decimal: Value (or array of values) to bucketize.
        boundaries: Monotonic bucket edges; defaults to the module-level
            ``bucket_boundaries``.

    Returns:
        ``np.digitize`` index shifted by 4, i.e. a value in
        ``5 .. len(boundaries) + 4``.
    """
    if boundaries is None:
        boundaries = bucket_boundaries
    # np.digitize returns 0 for values below the first edge; shifted by 4 that
    # would collide with the MASK token (id 4). Clamp such values into the
    # first real bucket instead.
    return np.maximum(np.digitize(decimal, boundaries), 1) + 4

# Convert every raw file into a document: a list of sentences, each a list of
# bucket token ids. Rows are newline-separated, values are space-separated.
max_sentences = doc_lengths[doc_length_id]
max_tokens = seq_lengths[seq_length_id]

corpus = []
for content in tqdm.tqdm(contents):
    document = []
    corpus.append(document)
    # Slicing keeps exactly `max_sentences` rows and `max_tokens` value
    # positions; the previous `index > limit` checks let one extra through.
    for row in content.split('\n')[:max_sentences]:
        sentence = [
            bucket_id_from_decimal(float(value))
            for value in row.split(' ')[:max_tokens]
            if value != ''  # consecutive/trailing separators yield empty fields
        ]
        document.append(sentence)

100%|██████████| 46927/46927 [00:56<00:00, 836.77it/s] 


### 5. Preprocessing II

In [29]:
# TODO: Deduplicate sentences.
# For now no duplicate detection happens: each document is only truncated to
# its first `doc_lengths[doc_length_id]` sentences (previously a hard-coded 100).
deduplicated_corpus = [document[:doc_lengths[doc_length_id]] for document in corpus]

### 6. Validation Split

In [30]:
from sklearn.model_selection import train_test_split
import numpy as np

# train_test_split already shuffles (its default) and is seeded by
# `random_state`. The extra unseeded, in-place np.random.shuffle made the split
# differ on every run and reordered `deduplicated_corpus` on each re-execution.
train_corpus, validation_corpus = train_test_split(deduplicated_corpus, test_size=0.1, random_state=42)

### 7.1. BERT Preprocessing

Special Tokens:
- **0:** PAD token
- **1:** CLS token (beginning of a sentence)
- **2:** SEP token (end of sentence)
- **3:** EMPTY token (empty sentence / end-of-document sentence)
- **4:** MASK token for MLM objective

In [31]:
from tqdm import tqdm
from bert_dataset import BERTDataset
from torch.utils.data import DataLoader
from bert_model import BERT, BERTLM
from bert_trainer import BERTTrainer

if do_training:
    # Ids 0-4 are the special tokens, hence the bucket id range passed to the
    # dataset is 5 .. 4 + num_buckets.
    shared_dataset_args = dict(
        seq_length=seq_lengths[seq_length_id],
        min_bucket=5,
        max_bucket=4 + num_buckets,
    )
    # Only the training set is built with augment=True.
    train_data = BERTDataset(train_corpus, augment=True, **shared_dataset_args)
    eval_data = BERTDataset(validation_corpus, **shared_dataset_args)

  0%|          | 0/42234 [00:00<?, ?it/s]

100%|██████████| 42234/42234 [00:10<00:00, 4018.52it/s]
100%|██████████| 4693/4693 [00:00<00:00, 83816.43it/s]


In [42]:
# Inspect a single encoded training example to sanity-check the dataset output.
train_data[31]

{'input_ids': tensor([ 1, 31, 31, 39, 39, 37, 31, 29, 17, 32, 31, 31, 33, 32, 32, 31,  4, 32,
          2,  4, 32, 39, 39,  4, 31, 31, 32, 22, 31, 31, 33,  2,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0

In [22]:
from transformers import BertConfig
from transformers import BertForPreTraining

# Size the model from the shared configuration so the architecture always
# matches the values written to hyperparameters.json (depth and head count
# were previously hard-coded to 8).
model_config = BertConfig(
    vocab_size=num_buckets + 5,  # 5 special tokens (ids 0-4) + bucket ids 5 .. num_buckets + 4
    max_position_embeddings=seq_lengths[seq_length_id],
    hidden_size=embedding_lengths[embedding_length_id],
    intermediate_size=4 * embedding_lengths[embedding_length_id],
    num_hidden_layers=num_hidden_layers[num_hidden_layers_id],
    num_attention_heads=num_attention_heads[num_attention_heads_id],
)
model = BertForPreTraining(config=model_config)

In [23]:
from transformers import Trainer, TrainingArguments

import os
model_path = "checkpoints"
# Create the checkpoint root on first use so the listing below cannot fail.
os.makedirs(model_path, exist_ok=True)

# Collect the trailing numeric id of each existing run folder ("model_<id>");
# entries without a numeric suffix (stray files, hidden files) are ignored.
folders = os.listdir(model_path)
checkpoint_ids = [
    int(suffix)
    for suffix in (folder.rsplit('_', 1)[-1] for folder in folders)
    if suffix.isdecimal()
]

# Deliberately not stored in `max_id`: that name is the config index into
# log_maxs / avg_maxs and must survive for re-runs of the bucketization cell.
next_checkpoint_id = max(checkpoint_ids, default=0) + 1
model_path = os.path.join(model_path, f"model_{next_checkpoint_id}")
# exist_ok tolerates an already-present folder without hiding real errors
# (e.g. permissions), unlike the former bare `except: pass`.
os.makedirs(model_path, exist_ok=True)

# Settings for the Hugging Face Trainer, collected in one mapping.
trainer_settings = dict(
    # Where checkpoints are written; an existing directory may be overwritten.
    output_dir=model_path,
    overwrite_output_dir=True,
    # Schedule: evaluate, log and save a checkpoint every 1000 steps.
    evaluation_strategy="steps",
    logging_steps=1000,
    save_steps=1000,
    # Optimisation: 3 epochs, batches of 32 per device, 8 accumulation steps
    # per weight update.
    num_train_epochs=3,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=128,
    # Model selection: lowest eval_loss wins, best model is reloaded at the
    # end, and at most 3 checkpoints are kept on disk.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    save_total_limit=3,
)
training_args = TrainingArguments(**trainer_settings)

trainer = Trainer(model=model, args=training_args, train_dataset=train_data, eval_dataset=eval_data)

In [24]:
# Run training with the Trainer configured above; the returned TrainOutput is
# shown as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
1000,1.1357,0.581934
2000,0.5693,0.534101
3000,0.5347,0.513446
4000,0.521,0.50112
5000,0.5095,0.495093
6000,0.5043,0.491159
7000,0.4972,0.48377
8000,0.4941,0.483563
9000,0.4903,0.481723
10000,0.49,0.478434


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


TrainOutput(global_step=23214, training_loss=0.5197988940505409, metrics={'train_runtime': 8606.5213, 'train_samples_per_second': 690.562, 'train_steps_per_second': 2.697, 'total_flos': 7393731414650880.0, 'train_loss': 0.5197988940505409, 'epoch': 3.0})

In [25]:
# Evaluate on the Trainer's eval dataset; `result` is the metrics dict whose
# 'eval_loss' entry is used when naming and saving the model.
result = trainer.evaluate()

In [26]:
import os

models_path = "models"

# Make sure the output root exists so listing it cannot fail on a first run.
os.makedirs(models_path, exist_ok=True)

# list all folders in models
folders = os.listdir(models_path)

# extract the trailing numeric id of each "model_<loss>_<id>" folder;
# entries without a numeric suffix (stray files, hidden files) are ignored
ids = [
    int(suffix)
    for suffix in (folder.rsplit('_', 1)[-1] for folder in folders)
    if suffix.isdecimal()
]

# Highest id in use (0 when none). Deliberately not stored in `max_id`: that
# name is the config index into log_maxs / avg_maxs.
latest_model_id = max(ids, default=0)

# evaluation loss, formatted to 3 decimal places for the folder name
eval_loss = "{:.3f}".format(result['eval_loss'])

dir_name = f"model_{eval_loss}_{latest_model_id + 1}"
path = os.path.join(models_path, dir_name)
os.makedirs(path, exist_ok=True)

# save the model
trainer.save_model(path)

import json
# Snapshot of the settings of this run, written next to the saved model.
# Trainer settings are read back from `training_args` so the file cannot drift
# from what was actually used ('eval_batch_size' used to be recorded as 32
# although evaluation ran with 128).
hyperparameters = {
    'sentence_type': sentence_types[sentence_type_id],
    'word_type': word_types[word_type_id],
    'hp_filter_size': hp_filter_sizes[hp_filter_size_id],
    # Archives with word type 'p' use 'x' instead of a burst timeout in their
    # file name, so no numeric timeout applies to them; record null then.
    'burst_timeout': None if word_types[word_type_id] == 'p' else burst_timeouts[burst_timeout_id],
    'sentence_timeout': sentence_timeouts[sentence_timeout_id],
    'num_buckets': buckets[bucket_id],
    'bucket_min': min_buckets,
    'bucket_max': max_buckets,
    'bucket_boundaries': bucket_boundaries.tolist(),
    'seq_length': seq_lengths[seq_length_id],
    'doc_length': doc_lengths[doc_length_id],
    'embedding_length': embedding_lengths[embedding_length_id],
    'train_batch_size': training_args.per_device_train_batch_size,
    'eval_batch_size': training_args.per_device_eval_batch_size,
    'vocab_size': num_buckets + 5,
    'max_position_embeddings': seq_lengths[seq_length_id],
    'hidden_size': embedding_lengths[embedding_length_id],
    'intermediate_size': 4 * embedding_lengths[embedding_length_id],
    'num_hidden_layers': num_hidden_layers[num_hidden_layers_id],
    'num_attention_heads': num_attention_heads[num_attention_heads_id],
    'years': [training_years[year_id] for year_id in training_year_ids],
    'feature': token_features[token_feature_id],
    'num_train_epochs': training_args.num_train_epochs,
    'per_device_train_batch_size': training_args.per_device_train_batch_size,
    'gradient_accumulation_steps': training_args.gradient_accumulation_steps,
    'per_device_eval_batch_size': training_args.per_device_eval_batch_size,
    'logging_steps': training_args.logging_steps,
    'save_steps': training_args.save_steps,
    'greater_is_better': training_args.greater_is_better,
    'metric_for_best_model': training_args.metric_for_best_model,
    'load_best_model_at_end': training_args.load_best_model_at_end,
    'save_total_limit': training_args.save_total_limit
}

# Keep the final evaluation metrics alongside the settings that produced them.
hyperparameters['evaluation_result'] = result

# Write to json file
with open(os.path.join(path, 'hyperparameters.json'), 'w') as f:
    json.dump(hyperparameters, f, indent=4)
