In [1]:
import time
import torch
import sentencepiece as spm
from gpt_model import Transformer, ModelConfig
from utils import generate, evaluate, create_datasets, InfiniteDataLoader

In [2]:
# NN params
VOCAB_SIZE = 512
MAX_LOG_EVENT_LENGTH = 50
TOKENS_IN_BLOCK_SIZE = 50
EMBEDDING_SIZE = 256
BATCH_SIZE = 32

---
## Tokenization

In [3]:
# Remove non-ASCII characters from a file
def remove_non_ascii(input_file_path, output_file_path):
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
         open(output_file_path, 'w', encoding='utf-8') as outfile:
        count = 0
        for line in infile:
            # Remove non-ASCII characters from each line
            cleaned_line = ''.join(char for char in line if ord(char) < 128)
            outfile.write(cleaned_line)
            count += 1
            if count == 2_000_000:
                break

# Example usage
input_file_path = 'datasets\\applogcat.log'
output_file_path = 'datasets\\applogcat_clean.log'
#remove_non_ascii(input_file_path, output_file_path)

In [4]:
# dataset file
file_path = "datasets/applogcat_clean.log"

In [5]:
spm_training_args = f"--input={file_path} --model_prefix=sentencepiece_bpe --vocab_size={VOCAB_SIZE} " \
                    f"--model_type=bpe --unk_piece=[UNK] --pad_piece=[PAD] --bos_piece=[CLS] --eos_piece=[SEP] " \
                    f"--user_defined_symbols=[MASK] --hard_vocab_limit=false"
spm.SentencePieceTrainer.Train(spm_training_args)
sp = spm.SentencePieceProcessor()
sp.Load("sentencepiece_bpe.model")
example_line = '[CLS]12-19 11:59:59.296  3262  3381 I QCNEJ   : |CORE| NOTIFY_FEATURE_STATUS received[SEP]'
encoded_ids = sp.EncodeAsIds(example_line)
encoded_tokens = sp.EncodeAsPieces(example_line)
decoded_text = sp.DecodeIds(encoded_ids)
print(f'Encoded IDs: {encoded_ids}')
print(f'Encoded tokens: {encoded_tokens}')
print(f'Decoded text: {decoded_text}')
tokenizer = sp

Encoded IDs: [109, 446, 496, 476, 493, 433, 434, 8, 4, 433, 92, 445, 92, 445, 176, 445, 451, 22, 31, 30, 26, 18, 23, 24, 454, 159, 478, 161, 478, 158, 54, 492, 476, 444, 480, 493]
Encoded tokens: ['▁[', 'C', 'L', 'S', ']', '1', '2', '-19', '▁1', '1', ':5', '9', ':5', '9', '.2', '9', '6', '▁3262', '▁3381', '▁I', '▁QCNEJ', '▁:', '▁|', 'CORE', '|', '▁NOTIFY', '_', 'FEATURE', '_', 'STATUS', '▁received', '[', 'S', 'E', 'P', ']']
Decoded text: [CLS]12-19 11:59:59.296 3262 3381 I QCNEJ : |CORE| NOTIFY_FEATURE_STATUS received[SEP]


---
## Model Training

In [6]:
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)

Using device:  cpu


In [7]:
train_dataset, test_dataset = create_datasets(file_path, tokenizer, MAX_LOG_EVENT_LENGTH)
batch_loader = InfiniteDataLoader(train_dataset, batch_size=BATCH_SIZE)

Training set size: 1945863
Test set size: 2000
LogEventDataset init 1945863
LogEventDataset init 2000


In [8]:
# Initialize the model
my_config = ModelConfig(block_size = TOKENS_IN_BLOCK_SIZE, 
                        vocab_size = tokenizer.GetPieceSize())
print(f"model config: {my_config}")
model = Transformer(my_config).to(device)
print(f"model #params: {sum(p.numel() for p in model.parameters())}")

model config: ModelConfig(block_size=50, vocab_size=512, n_embd=256, n_embd2=256, n_head=4, n_layer=4)
number of parameters: 3.30M
model #params: 3434496


In [9]:
loss_history = []
test_loss_history = []
optimizer = torch.optim.AdamW(model.parameters(), lr= 1e-3, betas=(0.9, 0.99), eps=1e-8)

In [10]:
# training loop
best_loss = None
NUM_STEPS = 50_000
for step in range(NUM_STEPS):
    t0 = time.time()
    # get the next batch, ship to device, and unpack it to input and target
    batch = batch_loader.next()
    batch = [t.to(device) for t in batch]
    X, Y = batch
    logits, loss = model(X, Y)

    # calculate the gradient, update the weights
    model.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    t1 = time.time()

    # logging
    if step % 100 == 0:
        print(f"step {step} | loss {loss.item():.4f} | step time {(t1-t0)*1000:.2f}ms")
    loss_history.append(loss.item())
    
    # evaluate the model
    if step > 0 and step % 500 == 0 or step == NUM_STEPS - 1:
        train_loss = evaluate(model, device, train_dataset, batch_size=100, max_batches=10)
        test_loss  = evaluate(model, device, test_dataset,  batch_size=100, max_batches=10)
        test_loss_history.append((step, test_loss))
        print(f"step {step} train loss: {train_loss} test loss: {test_loss}")
        # save the model to disk if it has improved
        if best_loss is None or test_loss < best_loss:
            print(f"test loss {test_loss} is the best so far")
            best_loss = test_loss

step 0 | loss 6.3996 | step time 616.57ms
step 100 | loss 2.1888 | step time 548.39ms
step 200 | loss 1.6799 | step time 556.27ms
step 300 | loss 1.3382 | step time 543.41ms
step 400 | loss 1.4693 | step time 556.31ms
step 500 | loss 1.3059 | step time 542.48ms
step 500 train loss: 1.3014250993728638 test loss: 1.2810940742492676
test loss 1.2810940742492676 is the best so far
step 600 | loss 0.9587 | step time 555.83ms
step 700 | loss 1.2161 | step time 581.46ms
step 800 | loss 1.3576 | step time 603.27ms
step 900 | loss 1.0837 | step time 532.28ms
step 1000 | loss 1.0855 | step time 627.80ms
step 1000 train loss: 1.1220816373825073 test loss: 1.115871787071228
test loss 1.115871787071228 is the best so far
step 1100 | loss 0.9005 | step time 581.10ms
step 1200 | loss 1.0183 | step time 532.46ms
step 1300 | loss 1.1493 | step time 542.39ms
step 1400 | loss 1.3055 | step time 519.83ms
step 1500 | loss 0.8425 | step time 547.51ms
step 1500 train loss: 1.0536514520645142 test loss: 1.018

In [11]:
example_line = '12-19 1'
tokenizer.EncodeAsIds(example_line)

[9, 4]

In [16]:
example_of_start = torch.tensor([1]).reshape(1,-1)
example_of_start.to(device)
model.to(device) 
for _ in range(10):
    res = ""
    gen_ids = generate(model, device, example_of_start, max_new_tokens=MAX_LOG_EVENT_LENGTH, do_sample=True, temperature=1, top_k=20).reshape(-1).tolist()
    res = tokenizer.DecodeIds(gen_ids)
    print(res)

12-19 12:09:34.514 3262 3381 W QCNEJ : |CORE| UNKOWN Unsolicited Event 8: (ST_REString_ize_id=1660)7707
12-19 12:03:34.495 3262 3381 I QCNEJ : |CORE| received protobuf msg:OpertupdateCapClient lase. receivedeleteP7
12-19 12:08:59.321 3262 3381 W QCNEJ : |CORE| UNKOWN Unsolicited Event 8d in changecess3KSIONelveDData:
12-19 13:00:49.611 31521 14922 D HwActivityManagerService: handleANRFilterFIFO,uid = 10110cmd = 19
12-19 12:11:33.241 3262 3381 I QCNEJ : |CORE| received protobuf msg:Oper: Setting ...siIME_De7
12-19 12:06:27.097 3262 3381 I QCNEJ : |CORE:COM:RCVR| Disconnected from 'cnd' socket bemoniCache peerkify308 for nc
12-19 12:08:50.417 3262 3381 I QCNEJ : |CORE:COM:RCVR| Disconnected from 'cnd' socketmIOExceptionmsched size_subId''bc
12-19 12:27:41.472 3262 3371 W QCNEJ : |CORE:COM:SNDR| IOException java.io.IOException: socket closedC
12-19 12:01:49.766 2605 16893 D PhoneInterfaceManager: [PhoneIntfMgr] getDataEnabled: subId=0 phoneId=07 portontent=false,calling aus
12-19 12:26:26

In [13]:
# save the model
torch.save(model.state_dict(), "model.pth")