In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from pathlib import Path
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
import itertools 


In [2]:
corpus = list()
labels = list()

with open(Path('./output/benign/all_benign.txt'), 'r') as f:
  lines = f.read().split('\n')[:-1]
  # print(lines[0][33:])
for line in lines:
  doc = line[33:]
  corpus.append(doc)
  labels.append(0)

with open(Path('./output/malware/all_malware.txt'), 'r') as f:
  lines = f.read().split('\n')[:-1]
  # print(lines[0][33:])
for line in lines:
  doc = line[33:]
  corpus.append(doc)
  labels.append(1)

In [3]:
y = np.array(labels)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv3 = CountVectorizer(ngram_range=(3,3))
X3 = cv3.fit_transform(corpus).toarray()

In [5]:
# 정규화 안한 데이터셋 test_size 10%
X31_train, X31_test, y31_train, y31_test = train_test_split(X3, y, test_size = 0.1, random_state = 0)

In [6]:
from sklearn.model_selection import train_test_split
# 정규화 하고 test_size 10%
v3=np.array(X3).astype(np.float32)
for i in range(len(corpus)):
    s=sum(X3[i])
    v3[i]=((X3[i]/s)*100).astype(np.float32)
    
X32_train, X32_test, y32_train, y32_test = train_test_split(v3, y, test_size = 0.1, random_state = 0)

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### RadomForestClassifier

In [23]:
from sklearn.ensemble import RandomForestClassifier
classifierrf32 = RandomForestClassifier(n_estimators = 150, criterion = 'entropy', random_state = 0)
classifierrf32.fit(X32_train, y32_train)
yrf32_pred=classifierrf32.predict(X32_test)

cm32 = confusion_matrix(y32_test, yrf32_pred)
print(cm32)
print(metrics.classification_report(y32_test, yrf32_pred, digits=4))

[[150   2]
 [  9 119]]
              precision    recall  f1-score   support

           0     0.9434    0.9868    0.9646       152
           1     0.9835    0.9297    0.9558       128

    accuracy                         0.9607       280
   macro avg     0.9634    0.9583    0.9602       280
weighted avg     0.9617    0.9607    0.9606       280



In [14]:
classifierrf32 = RandomForestClassifier(n_estimators = 150, criterion = 'entropy', random_state = 0)
classifierrf32.fit(X31_train, y31_train)
yrf31_pred=classifierrf32.predict(X31_test)

cm31 = confusion_matrix(y31_test, yrf31_pred)
print(cm31)
print(metrics.classification_report(y31_test, yrf31_pred, digits=4))

[[148   4]
 [  9 119]]
              precision    recall  f1-score   support

           0     0.9427    0.9737    0.9579       152
           1     0.9675    0.9297    0.9482       128

    accuracy                         0.9536       280
   macro avg     0.9551    0.9517    0.9531       280
weighted avg     0.9540    0.9536    0.9535       280



정규화를 했을 때 더 성능이 높게 나옴

### KNeighborsClassifier

In [9]:
from sklearn.metrics import classification_report, confusion_matrix

In [25]:
from sklearn.neighbors import KNeighborsClassifier

classifierknn32 = KNeighborsClassifier(n_neighbors = 2)
classifierknn32.fit(X32_train, y32_train)

yknn32_pred=classifierknn32.predict(X32_test)
cm32 = confusion_matrix(y32_test, yknn32_pred)
print(cm32)
print(classification_report(y32_test, yknn32_pred, digits=4))

[[144   8]
 [  9 119]]
              precision    recall  f1-score   support

           0     0.9412    0.9474    0.9443       152
           1     0.9370    0.9297    0.9333       128

    accuracy                         0.9393       280
   macro avg     0.9391    0.9385    0.9388       280
weighted avg     0.9393    0.9393    0.9393       280



정규화한게 더 성능이 좋음

In [31]:
classifierknn31 = KNeighborsClassifier(n_neighbors = 2)
classifierknn31.fit(X31_train, y31_train)

yknn31_pred=classifierknn31.predict(X31_test)
cm31 = confusion_matrix(y31_test, yknn31_pred)
print(cm31)
print(classification_report(y31_test, yknn31_pred, digits=4))

[[140  12]
 [  9 119]]
              precision    recall  f1-score   support

           0     0.9396    0.9211    0.9302       152
           1     0.9084    0.9297    0.9189       128

    accuracy                         0.9250       280
   macro avg     0.9240    0.9254    0.9246       280
weighted avg     0.9253    0.9250    0.9251       280



### DecisionTreeClassifier

In [40]:
from sklearn.tree import DecisionTreeClassifier

# criterion gini가 entropy
classifierdt32 = DecisionTreeClassifier(criterion = 'gini', random_state = 0)

classifierdt32.fit(X32_train, y32_train)
ydt32_pred=classifierdt32.predict(X32_test)

cm32 = confusion_matrix(y32_test, ydt32_pred)
print(cm32)
print(metrics.classification_report(y32_test, ydt32_pred, digits=4))


[[134  18]
 [ 10 118]]
              precision    recall  f1-score   support

           0     0.9306    0.8816    0.9054       152
           1     0.8676    0.9219    0.8939       128

    accuracy                         0.9000       280
   macro avg     0.8991    0.9017    0.8997       280
weighted avg     0.9018    0.9000    0.9002       280



### 앙상블 알고리즘 적용

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier


  from pandas import MultiIndex, Int64Index


### XGBoost

In [21]:
best_model_XGB = XGBClassifier(eval_metric = 'logloss', 
                              booster = 'gbtree',
                              colsample_bylevel=0.9,
                              colsample_bytree=0.8,
                              silent = True, 
                              max_depth = 6, 
                              min_child_weight = 1, 
                               gamma = 0, 
                               n_estimators=50,
                               nthread=4,
                               objective='binary:logistic',
                               random_state=42,
                               learning_rate=0.0001,
                               )

best_model_XGB.fit(X32_train, y32_train)
XGB_pred=best_model_XGB.predict(X32_test)

cm32 = confusion_matrix(y32_test, XGB_pred)
print(cm32)
print(metrics.classification_report(y32_test, XGB_pred, digits=4))



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[[148   4]
 [ 12 116]]
              precision    recall  f1-score   support

           0     0.9250    0.9737    0.9487       152
           1     0.9667    0.9062    0.9355       128

    accuracy                         0.9429       280
   macro avg     0.9458    0.9400    0.9421       280
weighted avg     0.9440    0.9429    0.9427       280



### LGBM

In [15]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

model_LGBM = LGBMClassifier(boosting_type='gbdt',learning_rate=0.01 ,
                            n_estimators=600, max_bin=255, 
                            num_leaves=24,random_state=42)

model_LGBM.fit(X32_train, y32_train)
LGBM_pred=model_LGBM.predict(X32_test)

cm32 = confusion_matrix(y32_test, LGBM_pred)
print(cm32)
print(metrics.classification_report(y32_test, LGBM_pred, digits=4))

[[146   6]
 [  5 123]]
              precision    recall  f1-score   support

           0     0.9669    0.9605    0.9637       152
           1     0.9535    0.9609    0.9572       128

    accuracy                         0.9607       280
   macro avg     0.9602    0.9607    0.9604       280
weighted avg     0.9608    0.9607    0.9607       280



### ExtraTreesClassifier

In [20]:
model_Extra = ExtraTreesClassifier(max_depth=25, n_estimators=320)


model_Extra.fit(X32_train, y32_train)
Extra_pred=model_Extra.predict(X32_test)

cm32 = confusion_matrix(y32_test, Extra_pred)
print(cm32)
print(metrics.classification_report(y32_test, Extra_pred, digits=4))

[[151   1]
 [  9 119]]
              precision    recall  f1-score   support

           0     0.9437    0.9934    0.9679       152
           1     0.9917    0.9297    0.9597       128

    accuracy                         0.9643       280
   macro avg     0.9677    0.9616    0.9638       280
weighted avg     0.9657    0.9643    0.9642       280



### 앙상블

#### SoftVoting

In [26]:
from sklearn.ensemble import VotingClassifier
softVoting_model = VotingClassifier(estimators=[('RF', classifierrf32), ('KNN', classifierknn32), ('EXTRA', model_Extra), ('LGBM', model_LGBM), ('XGB', best_model_XGB)], voting='soft')
softVoting_model.fit(X32_train, y32_train)

soft_pred = softVoting_model.predict(X32_test)
print(cm32)
print(classification_report(y32_test, soft_pred, digits=4))



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[[144   8]
 [  9 119]]
              precision    recall  f1-score   support

           0     0.9487    0.9737    0.9610       152
           1     0.9677    0.9375    0.9524       128

    accuracy                         0.9571       280
   macro avg     0.9582    0.9556    0.9567       280
weighted avg     0.9574    0.9571    0.9571       280



#### HardVoting

In [29]:
hardVoting_model = VotingClassifier(estimators=[('RF', classifierrf32), ('KNN', classifierknn32), ('EXTRA', model_Extra), ('LGBM', model_LGBM), ('XGB', best_model_XGB)], voting='hard')
hardVoting_model.fit(X32_train, y32_train)

hard_pred = hardVoting_model.predict(X32_test)
print(cm32)
print(classification_report(y32_test, hard_pred, digits=4))



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[[144   8]
 [  9 119]]
              precision    recall  f1-score   support

           0     0.9497    0.9934    0.9711       152
           1     0.9917    0.9375    0.9639       128

    accuracy                         0.9679       280
   macro avg     0.9707    0.9655    0.9675       280
weighted avg     0.9689    0.9679    0.9678       280

