In [1]:
import os
import re

import numpy as np
import pandas as pd
from tokenizers import BertWordPieceTokenizer, SentencePieceBPETokenizer
from tqdm import tqdm, tqdm_pandas
from transformers import BertTokenizerFast

In [2]:
# Read the raw train / test splits from disk.
TRAIN_CSV = './dataset/train.csv'
TEST_CSV = './dataset/test.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

## Preprocessing

In [3]:
def clean(x):
    """Normalize one raw log line into lowercase, letter-only text.

    Steps, in order:
      1. Lowercase the text.
      2. Replace literal backslash-n sequences and commas with a space.
      3. Drop every character that is not a-z, Hangul (ㄱ-ㅣ, 가-힣) or
         whitespace. This also removes all digits, so no separate digit
         pass is needed.
      4. Delete the substrings jan/feb/oct/dec/mar/nov/sep.
      5. Collapse whitespace runs to one space and strip both ends.

    Args:
        x: Raw log text. Non-string values (e.g. ``None`` or a float NaN
            standing in for a missing value) are treated as empty text
            instead of raising ``AttributeError``.

    Returns:
        str: The cleaned text; ``''`` for non-string input.
    """
    if not isinstance(x, str):
        return ''

    x = x.lower()
    # Matches the two-character sequence backslash + 'n', not a real newline
    # (real newlines are handled by the whitespace collapse below).
    x = re.sub(r'(\\n)', ' ', x)
    x = re.sub(r',', ' ', x)
    x = re.sub(r'[^a-zㄱ-ㅣ가-힣\s]+', '', x)
    # NOTE(review): no word boundaries, so these letters are also stripped
    # from inside longer words (e.g. 'decode' -> 'ode'), and apr/may/jun/
    # jul/aug are not covered -- confirm this is intended.
    x = re.sub(r'jan|feb|oct|dec|mar|nov|sep', '', x)
    x = re.sub(r'\s+', ' ', x)
    return x.strip()

In [4]:
%%time

train_df['clean_log'] = train_df['full_log'].map(lambda x: clean(x))
test_df['clean_log'] = test_df['full_log'].map(lambda x: clean(x))

CPU times: user 1min 20s, sys: 314 ms, total: 1min 20s
Wall time: 1min 20s


In [5]:
# Preview the first five rows to compare the raw log with its cleaned form.
train_df.head(n=5)

Unnamed: 0,id,level,full_log,clean_log
0,0,0,"Sep 24 10:02:22 localhost kibana: {""type"":""err...",localhost kibana typeerror timestamptz tagswar...
1,1,0,Feb 8 16:21:00 localhost logstash: [2021-02-0...,localhost logstash t info logstashoutputselast...
2,2,0,"Jan 13 01:50:40 localhost kibana: {""type"":""err...",localhost kibana typeerror timestamptz tagswar...
3,3,0,"Jan 4 10:18:31 localhost kibana: {""type"":""err...",localhost kibana typeerror timestamptz tagswar...
4,4,1,type=SYSCALL msg=audit(1603094402.016:52981): ...,typesyscall msgaudit archce syscall successyes...


In [11]:
# Count, for every distinct cleaned log, how many training rows carry each level.
level_counts = train_df.pivot_table(
    index='clean_log',
    values='id',
    columns='level',
    aggfunc='count',
    fill_value=0,
)

# Convert the counts into per-log level shares by dividing each row by its total.
# Dividing the whole table in one expression builds fresh float columns, instead
# of writing float ratios into the integer count columns through .iloc (an
# implicit dtype upcast that recent pandas releases deprecate).
new_train = level_counts.div(level_counts.sum(axis=1), axis=0).reset_index()

In [6]:
# Dump one cleaned training log per line; this file is the corpus the tokenizer
# is trained on. The directory is created first so the cell does not fail when
# ./vocab is missing, and the encoding is pinned to UTF-8 because clean() keeps
# Hangul characters that the platform default encoding may not represent.
os.makedirs('./vocab', exist_ok=True)
with open('./vocab/corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(sent + '\n' for sent in train_df['clean_log'])

## Tokenize

In [9]:
# Train a WordPiece vocabulary on the cleaned-log corpus.
bert_wordpiece_tokenizer = BertWordPieceTokenizer()

train_options = dict(
    files='./vocab/corpus.txt',
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    show_progress=True,
    wordpieces_prefix="##",
)
bert_wordpiece_tokenizer.train(**train_options)

# Persist the vocabulary; the value returned by save_model is the cell output.
bert_wordpiece_tokenizer.save_model(directory='./vocab/', prefix='bert_version_1')

['./vocab/bert_version_1-vocab.txt']

In [10]:
# Sanity check: report how many entries the trained vocabulary holds.
vocab = bert_wordpiece_tokenizer.get_vocab()
vocab_size = len(vocab)
print(vocab_size)

30000


In [11]:
# Build the tokenizer used for encoding from the vocabulary file saved above.
vocab_path = './vocab/bert_version_1-vocab.txt'
tokenizer = BertTokenizerFast(vocab_file=vocab_path)

## Encoding

In [12]:
# Keep the full (non-de-duplicated) id -> cleaned log mapping for the test set.
clean_test = test_df[['id', 'clean_log']]
clean_test.to_pickle('./dataset/clean_test.pkl')

In [13]:
# Encode each distinct cleaned log only once: drop repeated `clean_log` values
# and renumber the remaining rows from 0.
unique_logs = test_df[['clean_log']].drop_duplicates()
test_drop_df = unique_logs.reset_index(drop=True)

In [14]:
# (rows, columns) remaining after de-duplication.
dedup_shape = test_drop_df.shape
print(dedup_shape)

(124092, 1)


In [15]:
def Encoding(x, max_length=256):
    """Tokenize one cleaned log line into fixed-length model inputs.

    Relies on the notebook-level ``tokenizer`` object. The text is encoded
    with special tokens added, truncated to ``max_length`` and padded up to
    ``max_length``, so every result has the same length.

    Args:
        x: Text to encode.
        max_length: Length each sequence is truncated / padded to. Defaults
            to 256, the value that was previously hard-coded.

    Returns:
        pd.Series: Two elements, in order: the ``input_ids`` list and the
        ``attention_mask`` list produced by the tokenizer.
    """
    output = tokenizer.encode_plus(
        x,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
    )

    return pd.Series([output['input_ids'], output['attention_mask']])

In [16]:
# Enable `progress_apply`, then encode every training log; the two values
# returned per row become the `input_ids` and `attention_mask` columns.
tqdm.pandas()
train_encoded = train_df['clean_log'].progress_apply(Encoding)
train_df[['input_ids', 'attention_mask']] = train_encoded

100%|██████████| 53917/53917 [00:20<00:00, 2673.66it/s]


In [17]:
# Enable `progress_apply`, then encode every distinct test log; the two values
# returned per row become the `input_ids` and `attention_mask` columns.
tqdm.pandas()
test_encoded = test_drop_df['clean_log'].progress_apply(Encoding)
test_drop_df[['input_ids', 'attention_mask']] = test_encoded

100%|██████████| 124092/124092 [00:48<00:00, 2556.55it/s]


## Save

In [18]:
# Write the label plus encoded inputs for train, and the encoded inputs for the
# de-duplicated test logs.
# NOTE(review): the test pickle omits `clean_log`, so its rows can only be
# matched back to ./dataset/clean_test.pkl by repeating the same
# de-duplication -- confirm the downstream code does this.
train_columns = ['level', 'input_ids', 'attention_mask']
test_columns = ['input_ids', 'attention_mask']

train_df[train_columns].to_pickle('./dataset/clean_train.pkl')
test_drop_df[test_columns].to_pickle('./dataset/clean_drop_test.pkl')