In [1]:
import os
import re
import pandas as pd
import numpy as np 
import csv
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training and test CSV files into DataFrames.
train_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
]

In [3]:
def clean(x):
    """Normalize one raw log line into lowercase, letter-only, space-separated text.

    The string is lowercased and then a fixed sequence of regex substitutions
    is applied in order:

    1. digits are replaced by the placeholder ``N``;
    2. a literal backslash followed by ``n`` becomes a space;
    3. commas become spaces;
    4. every run of characters that is not an ASCII letter, a Hangul
       character (``ㄱ-ㅣ``, ``가-힣``) or whitespace is deleted;
    5. the ``N`` placeholders are deleted (the input was lowercased first,
       so any ``N`` present at this point came from step 1);
    6. the substrings ``jan``, ``feb``, ``oct``, ``dec``, ``mar``, ``nov``
       and ``sep`` are deleted;
    7. whitespace runs collapse to a single space.

    NOTE(review): step 6 has no word boundaries, so those letter sequences
    are also removed from inside longer words (``market`` -> ``ket``), and
    only seven month abbreviations are listed. Left unchanged here so the
    text stays consistent with whatever was used to train the saved model.

    Args:
        x: Raw log line (``str``).

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    # (pattern, replacement) pairs; order matters.
    substitutions = (
        (r'[0-9]', 'N'),
        (r'(\\n)', ' '),
        (r',', ' '),
        (r'[^a-zA-Zㄱ-ㅣ가-힣\s]+', ''),
        (r'N', ''),
        (r'jan|feb|oct|dec|mar|nov|sep', ''),
        (r'\s+', ' '),
    )
    text = x.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [4]:
%%time

# Clean every raw log line; `clean` is passed directly instead of through a lambda.
train_df['clean_log'] = train_df['full_log'].map(clean)
test_df['clean_log'] = test_df['full_log'].map(clean)

CPU times: user 1min 26s, sys: 232 ms, total: 1min 27s
Wall time: 1min 27s


In [5]:
def truncated_string(x, max_length):
    """Keep at most ``max_length`` space-separated tokens from ``x``.

    When ``x`` has more than ``max_length`` tokens, the result is the first
    ``ceil(max_length / 2)`` tokens followed by the last
    ``floor(max_length / 2)`` tokens, so both ends of the line are kept.
    For even ``max_length`` this is exactly the original head/tail split.

    Args:
        x: Text whose tokens are separated by single spaces.
        max_length: Maximum number of tokens to keep.

    Returns:
        ``x`` unchanged when it already fits, otherwise the truncated text.
        An empty string when ``max_length`` is not positive.
    """
    if max_length <= 0:
        return ''

    tokens = x.split(' ')
    if len(tokens) <= max_length:
        return ' '.join(tokens)

    tail_len = max_length // 2
    head_len = max_length - tail_len
    # Slice the tail from an explicit start index: `tokens[-0:]` would return
    # the whole list when tail_len is 0 and defeat the truncation.
    head = tokens[:head_len]
    tail = tokens[len(tokens) - tail_len:]
    return ' '.join(head + tail)

In [6]:
# Limit every cleaned log to at most 32 tokens (head + tail) for the classifier input.
train_df['sample_word'] = train_df['clean_log'].apply(truncated_string, max_length=32)
test_df['sample_word'] = test_df['clean_log'].apply(truncated_string, max_length=32)

In [7]:
# Prefix every level with '__label__' for the fastText training files.
# Any prefix already present is stripped first, so re-running this cell does
# not stack prefixes ('__label____label__0').
train_df['level'] = '__label__' + train_df['level'].astype(str).str.replace('__label__', '', regex=False)

# Stratified 70/30 train/validation split with a fixed seed.
trn_df, val_df = train_test_split(train_df, stratify=train_df['level'], test_size=0.3, random_state=42)

In [8]:
# Write "<level> <sample_word>" lines, space-separated, without header, index or quoting.
# NOTE(review): escapechar=" " is presumably there so the unquoted writer accepts
# text fields that contain the space separator -- confirm the resulting files.
for _split_df, _out_path in (
    (trn_df, './dataset/train.txt'),
    (val_df, './dataset/valid.txt'),
):
    _split_df[['level', 'sample_word']].to_csv(
        _out_path,
        index=False,
        sep=' ',
        header=False,
        quoting=csv.QUOTE_NONE,
        quotechar="",
        escapechar=" ",
    )

## Modeling

In [56]:
import fasttext

# Load a previously saved fastText model from disk.
# NOTE(review): no cell in this notebook creates this file -- confirm where it is trained.
MODEL_PATH = "./model_log.bin"
model = fasttext.load_model(MODEL_PATH)



In [57]:
# Preview the first rows of the validation split.
val_df.head()

Unnamed: 0,id,level,full_log,clean_log,sample_word,label
154958,154958,0,Jan 22 06:01:11 localhost logstash: [2021-01-2...,localhost logstash t warn logstashoutputselast...,localhost logstash t warn logstashoutputselast...,0
389760,389760,0,Sep 24 14:59:04 localhost logstash: [2020-09-2...,localhost logstash t errorlogstashoutputselast...,localhost logstash t errorlogstashoutputselast...,0
87830,87830,0,Jan 30 07:41:06 localhost logstash: { 3974 ruf...,localhost logstash rufusscheduler intercepted ...,localhost logstash rufusscheduler intercepted ...,0
265430,265430,1,type=SYSCALL msg=audit(1611896236.412:526401):...,typesyscall msgaudit archce syscall successyes...,typesyscall msgaudit archce syscall successyes...,1
210995,210995,1,type=SYSCALL msg=audit(1611886056.087:49868): ...,typesyscall msgaudit archce syscall successyes...,typesyscall msgaudit archce syscall successyes...,1


In [11]:
def predict_case(model, x, mode='train'):
    """Predict the level of one preprocessed log line.

    Args:
        model: Object whose ``predict(x)`` returns a pair whose first item is
            a sequence of label strings, best label first (the loaded
            fastText model in this notebook).
        x: Preprocessed text of a single log line.
        mode: Only ``'train'`` triggers a prediction.

    Returns:
        The top label as a string with its ``'__label__'`` prefix removed
        (e.g. ``'3'`` or ``'10'``), or ``None`` when ``mode`` is not
        ``'train'``.
    """
    if mode != 'train':
        # Same result as the original, which fell off the end of the function.
        return None

    top_label = model.predict(x)[0][0]
    prefix = '__label__'
    if top_label.startswith(prefix):
        # Drop the whole prefix rather than keeping only the last character,
        # so labels longer than one character survive intact.
        return top_label[len(prefix):]
    # Unknown label format: fall back to the original last-character behaviour.
    return top_label[-1]
    

In [58]:
# The split was made after 'level' was turned into '__label__<n>' strings, so
# convert it back to integers before comparing with the integer predictions.
# Safe to re-run: values that are already integers pass through unchanged.
val_df['level'] = val_df['level'].astype(str).str.replace('__label__', '', regex=False).astype('int')
val_df['label'] = val_df['sample_word'].apply(lambda x: predict_case(model, x)).astype('int')

In [59]:
# Class distribution of the true levels in the validation split.
val_df['level'].value_counts()

0    100220
1     39755
3      1242
5       666
2         4
4         3
6         2
Name: level, dtype: int64

In [60]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Compare true levels with predicted labels on the validation split.
y_true, y_pred = val_df['level'], val_df['label']

print('Acc:', accuracy_score(y_true, y_pred))
# Macro averaging weights every class equally, regardless of its support.
for metric_title, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-Score:', f1_score),
):
    print(metric_title, metric_fn(y_true, y_pred, average='macro'))

Acc: 0.9977518112367153
Precision: 0.5654465320254051
Recall: 0.5606129352870989
F1-Score: 0.5629942924123867


In [61]:
# Per-class precision / recall / F1 and support on the validation split.
print(classification_report(val_df['level'], val_df['label']))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    100220
           1       1.00      1.00      1.00     39755
           2       0.00      0.00      0.00         4
           3       0.99      0.99      0.99      1242
           4       0.00      0.00      0.00         3
           5       0.97      0.94      0.96       666
           6       0.00      0.00      0.00         2

    accuracy                           1.00    141892
   macro avg       0.57      0.56      0.56    141892
weighted avg       1.00      1.00      1.00    141892



In [53]:
# Run the validation sample through the same clean -> truncate(32) pipeline as train/test.
valid_df = pd.read_csv('./dataset/validation_sample.csv')
valid_df['clean_log'] = valid_df['full_log'].map(clean)
valid_df['sample_word'] = valid_df['clean_log'].apply(truncated_string, max_length=32)

# Plain Python list of texts for batch prediction.
valid_text = valid_df['sample_word'].tolist()

In [54]:
# Preview the preprocessed validation sample.
valid_df.head()

Unnamed: 0,full_log,clean_log,sample_word
0,type=ANOM_PROMISCUOUS msg=audit(1600402733.466...,typeanompromiscuous msgaudit devenps prom oldp...,typeanompromiscuous msgaudit devenps prom oldp...
1,"oscap: msg: ""xccdf-result"", scan-id: ""00016007...",oscap msg xccdfresult scanid content ssgcentos...,oscap msg xccdfresult scanid content ssgcentos...
2,Sep 22 10:56:19 localhost kernel: Out of memor...,localhost kernel out of memory kill process pr...,localhost kernel out of memory kill process pr...


In [55]:
# Show float arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# k=-1: request every label with its probability for each text (7 per text in the output below).
model.predict(valid_text, k=-1)

([['__label__1',
   '__label__3',
   '__label__6',
   '__label__4',
   '__label__2',
   '__label__0',
   '__label__5'],
  ['__label__5',
   '__label__3',
   '__label__6',
   '__label__4',
   '__label__2',
   '__label__0',
   '__label__1'],
  ['__label__0',
   '__label__1',
   '__label__5',
   '__label__6',
   '__label__2',
   '__label__4',
   '__label__3']],
 [array([1., 0., 0., 0., 0., 0., 0.], dtype=float32),
  array([0.811, 0.186, 0.001, 0.001, 0.001, 0.   , 0.   ], dtype=float32),
  array([0.957, 0.011, 0.011, 0.006, 0.006, 0.005, 0.004], dtype=float32)])

In [22]:
# Predict one label per test row, in row order, using predict_case's default mode.
test_df['label'] = [predict_case(model, text) for text in test_df['sample_word']]

In [23]:
# Distribution of the predicted labels on the test set.
test_df['label'].value_counts()

0    1002648
1     396749
3      12998
5       6453
2         34
4         34
Name: label, dtype: int64

In [24]:
# Load the sample submission and set its 'level' column to the integer predictions.
# The assignment aligns on the row index of test_df and the submission frame.
submission_df = pd.read_csv('./dataset/sample_submission.csv').assign(
    level=test_df['label'].astype('int')
)

In [25]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('./fasttext_baseline.csv', index=False)