In [164]:
import os
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, log_loss, recall_score, f1_score, classification_report

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [34]:
train_df = pd.read_csv('./dataset/train.csv')
test_df = pd.read_csv('./dataset/test.csv')

In [35]:
def clean(x):
    x = x.lower()
    x = re.sub(f'[0-9]', 'N', x)
    x = re.sub(r'(\\n)', ' ', x)
    x = re.sub(r',', ' ', x)
    x = re.sub(r'[^a-zA-Zㄱ-ㅣ가-힣\s]+', '',x)
    x = re.sub(r'N', '', x)
    x = re.sub(r'jan|feb|oct|dec|mar|nov|sep', '', x)
    x = re.sub(r'\s+[a-zA-Z]\s+', ' ', x) 
    x = re.sub(r'\s+', ' ', x)
    return x.strip()

In [37]:
%%time

train_df['clean_log'] = train_df['full_log'].map(lambda x: clean(x))
test_df['clean_log'] = test_df['full_log'].map(lambda x: clean(x))

CPU times: user 1min 42s, sys: 176 ms, total: 1min 42s
Wall time: 1min 42s


In [38]:
def truncated_string(x, max_length):
    arr = []
    idx = max_length//2
    token_list = x.split(' ')
    if len(token_list)>max_length:
        arr+=token_list[:idx]
        arr+=token_list[-idx:]
    else:
        arr+=token_list
    return ' '.join(arr)

In [39]:
train_df['sample_word'] = train_df['clean_log'].map(lambda x: truncated_string(x, 64))
test_df['sample_word'] = test_df['clean_log'].map(lambda x: truncated_string(x, 64))

In [181]:
train_df.head()

Unnamed: 0,id,level,full_log,clean_log,sample_word
0,0,0,"Sep 24 10:02:22 localhost kibana: {""type"":""err...",localhost kibana typeerror timestamptz tagswar...,localhost kibana typeerror timestamptz tagswar...
1,1,0,Feb 8 16:21:00 localhost logstash: [2021-02-0...,localhost logstash info logstashoutputselastic...,localhost logstash info logstashoutputselastic...
2,2,0,"Jan 13 01:50:40 localhost kibana: {""type"":""err...",localhost kibana typeerror timestamptz tagswar...,localhost kibana typeerror timestamptz tagswar...
3,3,0,"Jan 4 10:18:31 localhost kibana: {""type"":""err...",localhost kibana typeerror timestamptz tagswar...,localhost kibana typeerror timestamptz tagswar...
4,4,1,type=SYSCALL msg=audit(1603094402.016:52981): ...,typesyscall msgaudit archce syscall successyes...,typesyscall msgaudit archce syscall successyes...


In [208]:
train_drop_df = train_df[['level','clean_log']].drop_duplicates().reset_index(drop=True)
train_drop_df['doc_idx'] = train_drop_df.index

In [209]:
train_drop_df[['clean_log', 'doc_idx']].to_csv('sample.txt', sep = '\t', index=False, header=False)

In [210]:
train_drop_df.head(10)

Unnamed: 0,level,clean_log,doc_idx
0,0,localhost kibana typeerror timestamptz tagswar...,0
1,0,localhost logstash info logstashoutputselastic...,1
2,1,typesyscall msgaudit archce syscall successyes...,2
3,1,typesyscall msgaudit archce syscall successyes...,3
4,0,localhost logstash warn logstashoutputselastic...,4
5,0,localhost kibana typelog timestamptz tagswarni...,5
6,1,typesyscall msgaudit archce syscall successyes...,6
7,0,localhost logstash warn logstashoutputselastic...,7
8,0,localhost logstash rufusscheduler intercepted ...,8
9,1,typesyscall msgaudit archce syscall successyes...,9


In [211]:
class Doc2VecCorpus:
    def __init__(self, fname):
        self.fname = fname
    def __iter__(self):
        with open(self.fname, encoding='utf-8') as f:
            for doc in f:
                text, doc_idx = doc.split('\t')
                yield TaggedDocument(
                    words = text.split(' '), 
                    tags = ['Doc_%s' % doc_idx])

In [212]:
doc2vec_corpus = Doc2VecCorpus('./sample.txt')

In [213]:
model = Doc2Vec(vector_size = 128, 
                window = 2, 
                dm = 1,
                min_count = 2, 
                negative = 5,
                workers = 6, 
                seed = 42)

In [214]:
model.build_vocab(doc2vec_corpus)

In [215]:
%%time

model.train(doc2vec_corpus, total_examples=model.corpus_count, epochs=30)
model.save("d2v.model")

CPU times: user 2min 54s, sys: 28 s, total: 3min 22s
Wall time: 2min 21s


In [195]:
sample = model.dv[4]
model.dv.most_similar([sample])

[('Doc_4\n', 0.9999998807907104),
 ('Doc_7\n', 0.9872230291366577),
 ('Doc_9169\n', 0.9686447381973267),
 ('Doc_28775\n', 0.9613114595413208),
 ('Doc_22314\n', 0.956368625164032),
 ('Doc_20649\n', 0.9459691047668457),
 ('Doc_7537\n', 0.9424419403076172),
 ('Doc_552\n', 0.9419867396354675),
 ('Doc_8963\n', 0.9391908645629883),
 ('Doc_44001\n', 0.9324362874031067)]

In [197]:
train_features = []

for i in tqdm(range(len(train_drop_df))):
    train_features.append(model.dv[i])
    
train_features = np.array(train_features)

  0%|          | 0/54248 [00:00<?, ?it/s]

In [206]:
train_features.shape

(54248, 128)

In [200]:
train_level = train_drop_df['level']

In [201]:
TEST_SIZE = 0.3
RANDOM_SEED = 42

train_x, eval_x, train_y, eval_y = train_test_split(train_features, train_level,
                                                    stratify = train_level,
                                                    shuffle = True,
                                                    test_size=TEST_SIZE, 
                                                    random_state=RANDOM_SEED)

clf = LogisticRegression(random_state=42,
                         solver='newton-cg',
                         max_iter=1000,
                         n_jobs=-1)

clf.fit(train_x, train_y)
clf_pred = clf.predict_proba(eval_x)

In [202]:
print("CV Accuracy score: {:<8.5f}".format(accuracy_score(eval_y, np.argmax(clf_pred, axis=1))))
print("CV Precision score: {:<8.5f}".format(precision_score(eval_y, np.argmax(clf_pred, axis=1), average='macro')))
print("CV Recall score: {:<8.5f}".format(recall_score(eval_y, np.argmax(clf_pred, axis=1), average='macro')))
print("CV F1 score: {:<8.5f}".format(f1_score(eval_y, np.argmax(clf_pred, axis=1), average='macro')))

CV Accuracy score: 0.95177 
CV Precision score: 0.39832 
CV Recall score: 0.28049 
CV F1 score: 0.29618 


  _warn_prf(average, modifier, msg_start, len(result))


In [205]:
print(classification_report(eval_y, np.argmax(clf_pred, axis=1)))

              precision    recall  f1-score   support

           0       0.64      0.68      0.66       922
           1       0.97      0.99      0.98     15004
           2       0.00      0.00      0.00         1
           3       0.62      0.05      0.09       157
           4       0.00      0.00      0.00         1
           5       0.56      0.24      0.34       189
           6       0.00      0.00      0.00         1

    accuracy                           0.95     16275
   macro avg       0.40      0.28      0.30     16275
weighted avg       0.95      0.95      0.95     16275



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [185]:
param = {'boosting_type':'gbdt',
         'max_depth':7, 
         'num_leaves':31,
         'objective': 'multi-class',
         'n_estimators':1000, 
         'learning_rate':0.01, 
         'subsample':0.8, 
         'colsample_bytree':0.8,
         'num_class': 7,
#          'reg_alpha':0.5, 
#          'reg_lambda':1.0, 
         'random_state':42
        }

In [204]:
from lightgbm import LGBMClassifier

oof_lgb = np.zeros((len(train_drop_df), 7))
# lgb_pred = np.zeros((len(test_df), 7))
# eval_results = np.zeros((3, 7))

# valid_df = pd.read_csv('./dataset/validation_sample.csv')
# valid_df['clean_log'] = valid_df['full_log'].map(lambda x: clean(x))
# valid_df['first_word'] = valid_df['clean_log'].map(lambda x: truncated_string(x, 64))
# valid_text=valid_df['first_word'].to_list()
# valid_features=vectorizer3.transform(valid_text)


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_, (trn_idx, val_idx) in enumerate(skf.split(train_features, train_level)):
    print("\nfold num_: {}".format(fold_))
    y_train, y_valid = train_level[trn_idx], train_level[val_idx]
    X_train, X_valid = train_features[trn_idx], train_features[val_idx]
    
    clf = LGBMClassifier(**param)
    clf.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=50,
            verbose=50)
    
    oof_lgb[val_idx] = clf.predict_proba(X_valid, num_iteration=clf.best_iteration_)   
#     eval_results += clf.predict_proba(valid_features, num_iteration=clf.best_iteration_)
#     lgb_pred += clf.predict_proba(test_features, num_iteration=clf.best_iteration_)

# eval_results /= 2
# lgb_pred /= 5    
print('\nCross Validation Is Complete')
print("CV Accuracy score: {:<8.5f}".format(accuracy_score(train_level, np.argmax(oof_lgb, axis=1))))
print("CV Precision score: {:<8.5f}".format(precision_score(train_level, np.argmax(oof_lgb, axis=1), average='macro')))
print("CV Recall score: {:<8.5f}".format(recall_score(train_level, np.argmax(oof_lgb, axis=1), average='macro')))
print("CV F1 score: {:<8.5f}".format(f1_score(train_level, np.argmax(oof_lgb, axis=1), average='macro')))




fold num_: 0
Training until validation scores don't improve for 50 rounds
[50]	training's multi_logloss: 0.12812	valid_1's multi_logloss: 0.146385
[100]	training's multi_logloss: 0.0811647	valid_1's multi_logloss: 0.103388
[150]	training's multi_logloss: 0.0601368	valid_1's multi_logloss: 0.0852644
[200]	training's multi_logloss: 0.0485861	valid_1's multi_logloss: 0.0761708
[250]	training's multi_logloss: 0.0407011	valid_1's multi_logloss: 0.0706806
[300]	training's multi_logloss: 0.0345743	valid_1's multi_logloss: 0.0671128
[350]	training's multi_logloss: 0.029644	valid_1's multi_logloss: 0.0646896
[400]	training's multi_logloss: 0.0252796	valid_1's multi_logloss: 0.0629707
[450]	training's multi_logloss: 0.0217325	valid_1's multi_logloss: 0.0618195
[500]	training's multi_logloss: 0.0186908	valid_1's multi_logloss: 0.0611104
[550]	training's multi_logloss: 0.0161452	valid_1's multi_logloss: 0.060773
[600]	training's multi_logloss: 0.0139204	valid_1's multi_logloss: 0.060651
[650]	tra

  _warn_prf(average, modifier, msg_start, len(result))


In [150]:
trn_idx

array([    0,     1,     3, ..., 54244, 54246, 54247])