In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('./data/train.csv')
validation = pd.read_csv('./data/validation_sample.csv')
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/sample_submission.csv', index_col=0)

train.shape, validation.shape, test.shape, submission.shape

  mask |= (ar1 == a)


((472972, 3), (3, 1), (1418916, 2), (1418916, 1))

# 전처리

## 마스킹

- 날짜 및 시간 
    - Time Stamp → `<TS>` 
    <br>
    e.g) `2021-02-08T16:21:00` `2021-01-12T07:22:32Z`
    - `<YEAR>` `<DATE>` `<TIME>` `<DAY>`
    <br>
    e.g) `2020 Oct 30 08:39:31` `Fri Sep 18 13:36:11 2020` `2021-01-12T07:22:32Z`


- IP 주소  
    - `127.0.0.1` → `localhost`
    - IP, IP.PORT 패턴 → `<IP>`
    <br>
    e.g) `211.253.243.71` `211.253.243.71.6000`


- 헥사, 시리얼 넘버, 숫자
    - `0x`로 시작하는 16진수 → `<HEX>`
    - 시리얼 넘버 -> `<SN>`
    - `=`, `:`, `'`, ` ` 다음에 오는 16진수 → `<NUM>`
    - 숫자 → `<NUM>`


- 특수문자 처리
    - `][` → `] [`
    - `,` → ` , `
    - 공백 하나로 변환

In [3]:
TARGET_COLUMN = 'full_log'
PATTERNS = [('\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z?', '<TS>'), 
            # YEAR, MON, DAY, TIME
            ('\d{4}(?= (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))', '<YEAR>'), 
            ('(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}', '<DATE> <TIME>'), 
            ('[A-Za-z]{3}\s+<DATE> <TIME>\s+\d{4}', '<DAY> <DATE> <TIME> <YEAR>'), 
            # IP
            ('127.0.0.1', 'localhost'),
            ('\d+\.\d+\.\d+\.\d+(?:\.\d+)?', '<IP>'), 
            # HEX, NUM
            ('(?<![0-9a-fA-F])0x[0-9a-fA-F]+(?=\W|$)', '<HEX>'), 
            ('(?<=\W)(?=[a-fA-F0-9\-]*[0-9])(?=[a-fA-F0-9\-]*[a-fA-F])[a-fA-F0-9]{3,}(?:-[a-fA-F0-9]{3,})+(?=\W|$)', '<SN>'),
            ('(?<==)[a-fA-F0-9]+(?=\W|$)', '<NUM>'),
            ('(?<=:)[a-fA-F0-9]+(?=\s|$)', '<NUM>'),
            ('(?<=\')[a-fA-F0-9]+(?=\')', '<NUM>'),
            ('(?<= )(?=[a-fA-F0-9]*[0-9])[a-fA-F0-9]{4,}(?=\W|$)', '<NUM>'), 
            ('(?<=[^a-zA-Z0-9])(\d+)(?=[^a-zA-Z0-9]|$)', '<NUM>'),
            # 특수문자
            ('\]\[', '] ['), 
            (',', ' , '), 
            ('\s+', ' ')]

def apply_masking(df):
    for pat, repl in PATTERNS:
        df[TARGET_COLUMN] = df[TARGET_COLUMN].str.replace(pat, repl, regex=True)

`train`, `validation`, `test`에 각각 적용

In [4]:
FILES = {'train': train, 'validation': validation, 'test': test}

for file, df in FILES.items():
    apply_masking(df)

In [5]:
train.head()

Unnamed: 0,id,level,full_log
0,0,0,"<DATE> <TIME> localhost kibana: {""type"":""error..."
1,1,0,"<DATE> <TIME> localhost logstash: [<TS> , <NUM..."
2,2,0,"<DATE> <TIME> localhost kibana: {""type"":""error..."
3,3,0,"<DATE> <TIME> localhost kibana: {""type"":""error..."
4,4,1,type=SYSCALL msg=audit(<NUM>.<NUM>:<NUM>): arc...


## 중복 제거

`full_log`가 동일하지만 `level`이 다른 데이터가 다수 있었습니다. 

따라서 중복되는 데이터 중에서 빈도수가 가장 높은 level의 로그만 남기고 나머지를 제거하였고, 
레벨별 빈도수가 같은 경우에는 level이 낮은 로그만 남기고 나머지를 제거하였습니다.


### 1) 마스킹된 `full_log`를 기준으로 로그가 같은데 위험도 `level`이 다른 경우 추출

1. pivot table로 `full_log`별 `level`의 개수 확인
2. pivot table의 `level` column 중 값이 0 이상인 column의 개수 계산 (→ `count` column 생성)
3. pivot table의 `count` column이 1 보다 큰 로그 데이터 추출

In [6]:
def count_level(df):
    columns = df.columns
    count = 0
    for col in columns:
        count += (df[col] > 0)
    return count

In [7]:
pivot = train.pivot_table(values='id', index='full_log', columns='level', aggfunc='count', fill_value=0)
pivot.head()

level,0,1,2,3,4,5,6
full_log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"--MARK--: %&)Y , UK%AV1Q$*H_U!C1HyPwEtE?ox8@1xQOHm*%YglknvkO-V$8JeugrDIbo/C.<NUM>^Qn2ZE%rhPbzErRort7-2Dz) , mg%K;?y(MQ5yMC6#Nt2^fJyaRrph!83D1@jawDob9%TooFvx/q*FuL1Vz2'+*Y2qhA/@?8NdvwqDIHWv)7V6H/+R*.f^aw1Lmj9Rh@ir6yU=H#.AQsLG1R?d)Vn1Kh50Xa!?nik , CpaJ;#vF-fU%n]BebMAXe_aC'tn'w-Dnrub_Rv)n#@PlrD8TA , L/'EaJ3oADm?[OBE6;o36b6R4A3f'ECA , zDvQmdH*S#3Zl4Ahd$L;#;w871bsZ#N(HcXzFI[Pr#.B388)w;TIBm5_",1,0,0,0,0,0,0
"--MARK--: 0KY#u2STO^[TWE^uuL.+#a3!+;KeBpHtbuyJ.A&Q_ffa1PSJ]=OuJxVV.A*FuI#*3FBj)/<NUM>'ekCsGF3k!dg$56x%<NUM>'(5stFifpWDtxYnq]%BYiRjz3;w?/)Oh[U!S , ^6Wd-'Iq5*<NUM>$h , _Is0RdURexeSl_O[H!KP;;?#q!BB , Jwtk19MpcPp!kh3nkFAtAlZdGh#-EsgnS9kW5Rd9Vet?TM-ejJpQv;QrD(wnVPK^vDR2D0!^CkI_dq82O?A#6o*=llR026#TolYPZ2p*/Wz@qkTkqD(.@BGT6w1d1FSVCz5+C5ehmz?ndyqC9@&BPa[QmY$vrT2&mmF3tEQ%HQL[J&?OS5+kZ8Du0t$[xNDN4!fl-oSt#gU[$@+0Z;0E'm.I)oZrZ]'",1,0,0,0,0,0,0
"--MARK--: 2k06(D+8h?6q;fFD34mn2lmfGI'(vl6K4Sh##aw39pt_&iORKEzG_K! , , I&@AhO[D59PR3eG(7nFic.S#5P)b@niw , uB*_]n!&.fiy.aS*3st[/iLcFDn3&Y*$] , W!FKh&i!Mwxs&zNJUIhyzLxCJ+*]RPMAzC5BKv7s/GM23J**owGveQJeKj1Uq?rHD=l88zP_.wUVrSCG'9w[DW2GeR-u-@1C4 , -DdeRroR8^_$eHMS$ElzEmdZb$c-IqIu2N.]?yJnu(T)aE(4Kc9gP!VR9Y!'o).jM= , gkvOryu_vB)d5Oo4'3ctt?L[<NUM>'El6#fxla_RGK;!t^sV&_OH9$=<NUM>[Cl-[G7X)E]Q!UyvSd-h51@uAj//ZJfm/Y4PZRICu&o*u , %",1,0,0,0,0,0,0
"--MARK--: =?s_w*?!5u]7c/s?Z1(FATAl?o0khglYB]Q0OTwjGzB!nVlVSdiJnnN/VJJ#qi091Xao?iduzHEdkUw&rDtQQc2sV7(6a8$U#j0MC&wG , YRG$rbf+@1c0-n6E^.2Cv&<NUM>=T7mXZ$D]1nf9yl6@QER690Yup*5Zt48Lu'9gc65Ixx]B2'JMqv_qAY0=XEdVmW6NpRRFqy+3S&mKFPru+erp3d]mRJ;v@D*8nfRlrFu!1vU;pNsHl5524*pD)VOkSOf$<NUM>(&)J?E6FERs4KV;'y6cZxVcO6Rf/@+s?kK5LSVHo'D2](^l_zzD9]5o&m!&;Kw^@*.?tBL. , L.K))IKkU*6kD4+DmgYrYG9$WZn)hTf0ozna4wdJ&f5u]te.<NUM>&)vSJdMUuNU55H??J3LNru!T@PZja(b5h.atdRP6T/Kq5A5M)!dIL36c0h-_TPg3*%5d*+(*5H!?EU3_c/wEYGq",1,0,0,0,0,0,0
"--MARK--: Bve5Z-oLF9#)LT;.snZlXJbizTDVmEOjQP86?i5;[k0Ir(w=u^M#G5PZVNtzL2N/aIA16bz;f6^(e962LkZjt_gIx(w]/sAoxENMgi3!!ql?&;yz53$'q#+#Hf_g$W/c9TR)jpB , 4z_zc40reDUAx6xn?i*3nPu[01FZg)oGqKoq%<NUM>;3RKKw9zuFNpsjDqj_35c?VfE^vVE27MQ[<NUM>-PL/'Hl]<NUM>!];yKVNZWJH!i(pv$1AWrT$@[m5]Y=?]b/yRu17FroqeFrTg2!ErRoR-pJW5oqPotFmPyj*D&WS&sR% , YVsbS70HCHFjcynWpmYPW+@YTvO-$]&9s67rxW",1,0,0,0,0,0,0


In [8]:
pivot['count'] = count_level(pivot)
pivot.head()

level,0,1,2,3,4,5,6,count
full_log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"--MARK--: %&)Y , UK%AV1Q$*H_U!C1HyPwEtE?ox8@1xQOHm*%YglknvkO-V$8JeugrDIbo/C.<NUM>^Qn2ZE%rhPbzErRort7-2Dz) , mg%K;?y(MQ5yMC6#Nt2^fJyaRrph!83D1@jawDob9%TooFvx/q*FuL1Vz2'+*Y2qhA/@?8NdvwqDIHWv)7V6H/+R*.f^aw1Lmj9Rh@ir6yU=H#.AQsLG1R?d)Vn1Kh50Xa!?nik , CpaJ;#vF-fU%n]BebMAXe_aC'tn'w-Dnrub_Rv)n#@PlrD8TA , L/'EaJ3oADm?[OBE6;o36b6R4A3f'ECA , zDvQmdH*S#3Zl4Ahd$L;#;w871bsZ#N(HcXzFI[Pr#.B388)w;TIBm5_",1,0,0,0,0,0,0,1
"--MARK--: 0KY#u2STO^[TWE^uuL.+#a3!+;KeBpHtbuyJ.A&Q_ffa1PSJ]=OuJxVV.A*FuI#*3FBj)/<NUM>'ekCsGF3k!dg$56x%<NUM>'(5stFifpWDtxYnq]%BYiRjz3;w?/)Oh[U!S , ^6Wd-'Iq5*<NUM>$h , _Is0RdURexeSl_O[H!KP;;?#q!BB , Jwtk19MpcPp!kh3nkFAtAlZdGh#-EsgnS9kW5Rd9Vet?TM-ejJpQv;QrD(wnVPK^vDR2D0!^CkI_dq82O?A#6o*=llR026#TolYPZ2p*/Wz@qkTkqD(.@BGT6w1d1FSVCz5+C5ehmz?ndyqC9@&BPa[QmY$vrT2&mmF3tEQ%HQL[J&?OS5+kZ8Du0t$[xNDN4!fl-oSt#gU[$@+0Z;0E'm.I)oZrZ]'",1,0,0,0,0,0,0,1
"--MARK--: 2k06(D+8h?6q;fFD34mn2lmfGI'(vl6K4Sh##aw39pt_&iORKEzG_K! , , I&@AhO[D59PR3eG(7nFic.S#5P)b@niw , uB*_]n!&.fiy.aS*3st[/iLcFDn3&Y*$] , W!FKh&i!Mwxs&zNJUIhyzLxCJ+*]RPMAzC5BKv7s/GM23J**owGveQJeKj1Uq?rHD=l88zP_.wUVrSCG'9w[DW2GeR-u-@1C4 , -DdeRroR8^_$eHMS$ElzEmdZb$c-IqIu2N.]?yJnu(T)aE(4Kc9gP!VR9Y!'o).jM= , gkvOryu_vB)d5Oo4'3ctt?L[<NUM>'El6#fxla_RGK;!t^sV&_OH9$=<NUM>[Cl-[G7X)E]Q!UyvSd-h51@uAj//ZJfm/Y4PZRICu&o*u , %",1,0,0,0,0,0,0,1
"--MARK--: =?s_w*?!5u]7c/s?Z1(FATAl?o0khglYB]Q0OTwjGzB!nVlVSdiJnnN/VJJ#qi091Xao?iduzHEdkUw&rDtQQc2sV7(6a8$U#j0MC&wG , YRG$rbf+@1c0-n6E^.2Cv&<NUM>=T7mXZ$D]1nf9yl6@QER690Yup*5Zt48Lu'9gc65Ixx]B2'JMqv_qAY0=XEdVmW6NpRRFqy+3S&mKFPru+erp3d]mRJ;v@D*8nfRlrFu!1vU;pNsHl5524*pD)VOkSOf$<NUM>(&)J?E6FERs4KV;'y6cZxVcO6Rf/@+s?kK5LSVHo'D2](^l_zzD9]5o&m!&;Kw^@*.?tBL. , L.K))IKkU*6kD4+DmgYrYG9$WZn)hTf0ozna4wdJ&f5u]te.<NUM>&)vSJdMUuNU55H??J3LNru!T@PZja(b5h.atdRP6T/Kq5A5M)!dIL36c0h-_TPg3*%5d*+(*5H!?EU3_c/wEYGq",1,0,0,0,0,0,0,1
"--MARK--: Bve5Z-oLF9#)LT;.snZlXJbizTDVmEOjQP86?i5;[k0Ir(w=u^M#G5PZVNtzL2N/aIA16bz;f6^(e962LkZjt_gIx(w]/sAoxENMgi3!!ql?&;yz53$'q#+#Hf_g$W/c9TR)jpB , 4z_zc40reDUAx6xn?i*3nPu[01FZg)oGqKoq%<NUM>;3RKKw9zuFNpsjDqj_35c?VfE^vVE27MQ[<NUM>-PL/'Hl]<NUM>!];yKVNZWJH!i(pv$1AWrT$@[m5]Y=?]b/yRu17FroqeFrTg2!ErRoR-pJW5oqPotFmPyj*D&WS&sR% , YVsbS70HCHFjcynWpmYPW+@YTvO-$]&9s67rxW",1,0,0,0,0,0,0,1


In [9]:
pivot[pivot['count'] > 1]

level,0,1,2,3,4,5,6,count
full_log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
<DATE> <TIME> localhost sudo: apache : TTY=unknown ; PWD=/var/www/html/management ; USER=root ; COMMAND=/bin/curl -XGET localhost:<NUM>/_cat/snapshots/esild_backup,0,4,1,0,0,0,0,2
juniper,4,0,0,1,0,0,0,2
"level : %{rule.level} , log : %{log}",10,8,0,0,0,0,0,2
"level : <NUM> , log : 'commit at' was successful",1,1,0,0,0,0,0,2
"level : <NUM> , log : Aborting , unable to run in the background as a daemon: error-message",1,4,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...
"level : <NUM> , log : task-name: lost interface-hierarchy logical-interface-index for route route-prefix",1,1,0,0,0,0,0,2
"level : <NUM> , log : traceRouteCtlOwnerIndex = test-owner , traceRouteCtlTestName = test-name",2,1,0,0,0,0,0,2
"level : <NUM> , log : type username is invalid",1,1,0,0,0,0,0,2
"level : <NUM> , log : user-type User username logged in MacAddress mac-addresse interface interface-name vlan vlan-name",1,1,0,0,0,0,0,2


### 2) 다수의 `level` 을 갖는 로그마다 하나의 `level` 선택

- 빈도수가 가장 높은 level 선택
- 레벨별 빈도수가 같은 경우에는 낮은 level 선택

→ 선택한 level로 `level` column 생성

In [10]:
dupl = pivot[pivot['count'] > 1].copy()
dupl['level'] = np.argmax(dupl.iloc[:, :-1].values, axis=-1)
dupl

level,0,1,2,3,4,5,6,count,level
full_log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
<DATE> <TIME> localhost sudo: apache : TTY=unknown ; PWD=/var/www/html/management ; USER=root ; COMMAND=/bin/curl -XGET localhost:<NUM>/_cat/snapshots/esild_backup,0,4,1,0,0,0,0,2,1
juniper,4,0,0,1,0,0,0,2,0
"level : %{rule.level} , log : %{log}",10,8,0,0,0,0,0,2,0
"level : <NUM> , log : 'commit at' was successful",1,1,0,0,0,0,0,2,0
"level : <NUM> , log : Aborting , unable to run in the background as a daemon: error-message",1,4,0,0,0,0,0,2,1
...,...,...,...,...,...,...,...,...,...
"level : <NUM> , log : task-name: lost interface-hierarchy logical-interface-index for route route-prefix",1,1,0,0,0,0,0,2,0
"level : <NUM> , log : traceRouteCtlOwnerIndex = test-owner , traceRouteCtlTestName = test-name",2,1,0,0,0,0,0,2,0
"level : <NUM> , log : type username is invalid",1,1,0,0,0,0,0,2,0
"level : <NUM> , log : user-type User username logged in MacAddress mac-addresse interface interface-name vlan vlan-name",1,1,0,0,0,0,0,2,0


### 3) `level` column에서 지정한 level이 아닌 로그 제거

In [11]:
print(train.shape)
train['level'].value_counts()

(472972, 3)


0    334065
1    132517
3      4141
5      2219
2        12
4        10
6         8
Name: level, dtype: int64

In [12]:
for index in dupl.index:
    targets = train[(train['full_log']==index) & (train['level']!=dupl['level'][index])].index
    train.drop(index=targets, inplace=True)
    
print(train.shape)
train['level'].value_counts()

(472550, 3)


0    334020
1    132182
3      4139
5      2180
2        11
4        10
6         8
Name: level, dtype: int64

# 모델링

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score

In [14]:
X_data = train['full_log']
y_data = train['level']

# Vectorize

In [15]:
# 해당 모델이서 TfidfVectorizer가 CountVectorizer보다 더 성능이 우수하다고 판단하여 사용
VOCAB_SIZE = 10000

vectorizer = TfidfVectorizer(analyzer='word', max_features=VOCAB_SIZE) 
X_features = vectorizer.fit_transform(X_data)
X_features

<472550x9980 sparse matrix of type '<class 'numpy.float64'>'
	with 17482703 stored elements in Compressed Sparse Row format>

# Train

## Split

In [16]:
X_train, X_eval, y_train, y_eval = train_test_split(X_features, y_data, test_size=0.08, random_state=100, stratify=y_data)

X_train.shape, y_train.shape, X_eval.shape, y_eval.shape

((434746, 9980), (434746,), (37804, 9980), (37804,))

In [17]:
y_train.value_counts() # train_test_split후 train 데이터의 레벨별 로그 갯수 확인

0    307298
1    121608
3      3808
5      2006
2        10
4         9
6         7
Name: level, dtype: int64

In [18]:
y_eval.value_counts() # test_size에 따른 데이터의 레벨별 로그 갯수 확인

0    26722
1    10574
3      331
5      174
2        1
4        1
6        1
Name: level, dtype: int64

## Extra Trees

RandomForestClassifier보다 훨씬 더 무작위성을 갖는 모델인 ExtraTreesClassifier를 사용

In [19]:
clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)

ExtraTreesClassifier(n_jobs=-1)

In [20]:
proba = clf.predict_proba(X_eval)

In [21]:
pred = np.argmax(proba, axis=-1)
crosstab = pd.crosstab(y_eval, pred, rownames=['real'], colnames=['pred'])

print(f1_score(y_eval, pred, average='macro')) # 대회 평가 산식인 Macro F1 확인
crosstab

0.9983030090558923


pred,0,1,2,3,4,5,6
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,26721,1,0,0,0,0,0
1,21,10553,0,0,0,0,0
2,0,0,1,0,0,0,0
3,3,0,0,328,0,0,0
4,0,0,0,0,1,0,0
5,2,0,0,0,0,172,0
6,0,0,0,0,0,0,1


In [22]:
THRESHOLD = 0.9 # 모든 위험도의 임계값을 0.9로 설정

In [23]:
pred_open = pred.copy()
pred_open[np.where(np.max(proba, axis=1) < THRESHOLD)] = 7
new_crosstab = pd.crosstab(y_eval, pred_open, rownames=['real'], colnames=['pred'])

print(f1_score(y_eval, pred_open, average='macro'))
new_crosstab

0.8724874344319187


pred,0,1,2,3,4,5,6,7
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,26669,0,0,0,0,0,0,53
1,9,10552,0,0,0,0,0,13
2,0,0,1,0,0,0,0,0
3,2,0,0,325,0,0,0,4
4,0,0,0,0,1,0,0,0
5,1,0,0,0,0,171,0,2
6,0,0,0,0,0,0,1,0


In [24]:
proba_all = clf.predict_proba(X_features)

In [25]:
pred_all = np.argmax(proba_all, axis=-1)
crosstab = pd.crosstab(y_data, pred_all, rownames=['real'], colnames=['pred'])

print(f1_score(y_data, pred_all, average='macro'))
crosstab

0.9998634765468999


pred,0,1,2,3,4,5,6
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,334019,1,0,0,0,0,0
1,23,132159,0,0,0,0,0
2,0,0,11,0,0,0,0
3,3,0,0,4136,0,0,0
4,0,0,0,0,10,0,0
5,2,0,0,0,0,2178,0
6,0,0,0,0,0,0,8


In [26]:
pred_open_all = pred_all.copy()
pred_open_all[np.where(np.max(proba_all, axis=1) < THRESHOLD)] = 7
new_crosstab = pd.crosstab(y_data, pred_open_all, rownames=['real'], colnames=['pred'])

print(f1_score(y_data, pred_open_all, average='macro'))
new_crosstab

0.8747991906837058


pred,0,1,2,3,4,5,6,7
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,333965,0,0,0,0,0,0,55
1,10,132158,0,0,0,0,0,14
2,0,0,11,0,0,0,0,0
3,2,0,0,4133,0,0,0,4
4,0,0,0,0,10,0,0,0
5,1,0,0,0,0,2177,0,2
6,0,0,0,0,0,0,8,0


## Validate

In [27]:
# Validation을 통한 성능 검증
X_valid = validation['full_log']
X_valid = vectorizer.transform(X_valid)
valid_proba = clf.predict_proba(X_valid)

In [28]:
results = np.argmax(valid_proba, axis=-1)
results[np.where(np.max(valid_proba, axis=1) < THRESHOLD)] = 7 

print(valid_proba)
results

[[0.   0.99 0.   0.01 0.   0.   0.  ]
 [0.03 0.01 0.   0.19 0.   0.77 0.  ]
 [0.82 0.07 0.01 0.03 0.   0.05 0.02]]


array([1, 7, 7], dtype=int64)

In [29]:
validation['full_log'][0] # 3개의 validation sample data중 첫 번째 data 분류 실패

'type=ANOM_PROMISCUOUS msg=audit(<NUM>.<NUM>:<NUM>): dev=enp2s0 prom=<NUM> old_prom=<NUM> auid=<NUM> uid=<NUM> gid=<NUM> ses=<NUM> type=SYSCALL msg=audit(<NUM>.<NUM>:<NUM>): arch=<NUM> syscall=<NUM> success=yes exit=<NUM> a0=<NUM> a1=<NUM> a2=<NUM> a3=<NUM> items=<NUM> ppid=<NUM> pid=<NUM> auid=<NUM> uid=<NUM> gid=<NUM> euid=<NUM> suid=<NUM> fsuid=<NUM> egid=<NUM> sgid=<NUM> fsgid=<NUM> tty=(none) ses=<NUM> comm="W#<NUM>-enp2s0" exe="/usr/sbin/suricata" subj=system_u:system_r:unconfined_service_t:s0 key=(null) type=PROCTITLE msg=audit(<NUM>.<NUM>:<NUM>): proctitle=<NUM>'

# Predict

In [30]:
X_test = test['full_log']
X_test = vectorizer.transform(X_test)
results_proba = clf.predict_proba(X_test)

In [32]:
results = np.argmax(results_proba, axis=-1)
results[np.where(np.max(results_proba, axis=1) < THRESHOLD)] = 7

## 결과 저장

In [33]:
submission['level'] = results
submission['level'].value_counts().sort_index()

0    1001140
1     396033
2         34
3      12906
4         34
5       6418
6         25
7       2326
Name: level, dtype: int64