In [1]:
from google.colab import drive

# Colab-only step: attach Google Drive at a fixed mount point so files under
# it can be read with ordinary paths.
GDRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive/


In [2]:
import numpy as np
import pandas as pd

# Read the flow log from the mounted Drive into a DataFrame, then display
# its (rows, columns) shape as the cell output.
DATASET_CSV = '/content/gdrive/MyDrive/CIC-Darknet/darknet_log.csv'
traffic_df = pd.read_csv(DATASET_CSV)
traffic_df.shape

(141481, 80)

In [3]:
# Preview the first five rows to sanity-check column names and value ranges.
traffic_df.head(n=5)

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,target
0,57158,443,6,229,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
1,57159,443,6,407,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
2,57160,443,6,431,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
3,49134,443,6,359,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
4,34697,19305,6,10778451,591,400,64530,6659,131,0,...,20,0,0,0,0,34.901863,14.952612,34.901863,34.901863,0


In [4]:
# Number of rows per integer class code in `target`, most frequent first.
target_counts = traffic_df['target'].value_counts()
target_counts

5     48300
1     32545
8     13284
4      8563
7      8402
2      6932
3      5561
0      4766
10     4541
12     2610
6      2101
14     1465
15     1346
11      582
9       263
13      220
Name: target, dtype: int64

In [5]:
# Count missing values per column once and reuse the result (the original
# scanned the whole frame twice).
na_counts = traffic_df.isna().sum()

# Columns that contain at least one missing value.
is_na_cols = traffic_df.columns[na_counts > 0]

# Display the missing-value count for just those columns; an empty Series
# means the frame has no NaNs.
na_counts[is_na_cols]

Series([], dtype: int64)

### Data Split

In [6]:
from sklearn.model_selection import train_test_split

# Separate the label column from the feature columns.
y_traffic_df = traffic_df['target']
X_traffic_df = traffic_df.drop(columns='target')

# Stratified 60/40 split: 60% for training, 40% held out. The held-out part
# is bound to X_test / y_test here. The seed is fixed so the split is
# repeatable.
split_parts = train_test_split(
    X_traffic_df,
    y_traffic_df,
    test_size=0.4,
    stratify=y_traffic_df,
    random_state=11,
)
X_train, X_test, y_train, y_test = split_parts

# Report the shapes of both partitions.
for _features, _labels in ((X_train, y_train), (X_test, y_test)):
    print(_features.shape, _labels.shape)

(84888, 79) (84888,)
(56593, 79) (56593,)


In [7]:
# Split the current X_test / y_test in half (stratified) into a validation
# part and a final test part.
# NOTE: this rebinds X_test / y_test, so re-running this cell on its own
# would split the already-halved test set again. Re-run the cell that
# creates X_test / y_test first.
holdout_halves = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    stratify=y_test,
    random_state=11,
)
X_vali, X_test, y_vali, y_test = holdout_halves

# Report the shapes of both partitions.
for _features, _labels in ((X_vali, y_vali), (X_test, y_test)):
    print(_features.shape, _labels.shape)

(28296, 79) (28296,)
(28297, 79) (28297,)


In [8]:
# Human-readable class names: every traffic category once per origin,
# all "benign_*" names first, then all "darknet_*" names.
# NOTE(review): position i is presumably the name for integer code i in
# `target` -- confirm against the encoder that produced the column.
_traffic_origins = ('benign', 'darknet')
_traffic_categories = (
    'Audio-Streaming',
    'Browsing',
    'Chat',
    'Email',
    'File-Transfer',
    'P2P',
    'VOIP',
    'Video-Streaming',
)
label_name = [
    f'{origin}_{category}'
    for origin in _traffic_origins
    for category in _traffic_categories
]
len(label_name)

16

### Select From Model

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
import warnings
# Silences every Python warning from this point on (applies to all later
# cells as well, not just this one).
warnings.filterwarnings('ignore')

# Model-based feature selection: fit a LightGBM classifier on the training
# split and keep only features whose importance is at least the mean
# importance. The estimator is seeded with random_state=11, the same seed
# used for the data splits and the final model, so the selection is
# reproducible across runs.
select = SelectFromModel(LGBMClassifier(random_state=11), threshold="mean")
select.fit(X_train, y_train)

# Boolean mask over the input columns: True = feature kept.
select.get_support()

array([ True,  True, False,  True,  True, False,  True,  True, False,
       False, False,  True, False, False, False, False,  True,  True,
       False, False,  True,  True,  True, False, False, False,  True,
        True, False,  True,  True,  True, False, False, False, False,
        True, False,  True,  True, False, False, False,  True, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
       False, False, False, False,  True,  True, False, False, False,
       False, False, False,  True, False,  True,  True])

In [10]:
# Positional indices (into the columns seen by `select`) of the features the
# selector rejected, as a plain list of Python ints.
support_mask = select.get_support()
drop_features = np.flatnonzero(~support_mask).tolist()
print(drop_features)

[2, 5, 8, 9, 10, 12, 13, 14, 15, 18, 19, 23, 24, 25, 28, 32, 33, 34, 35, 37, 40, 41, 42, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 69, 70, 71, 72, 73, 74, 76]


### Selected Feature Test

In [11]:
# Apply the fitted selector to the training and validation features so both
# contain only the retained columns.
X_train_selected, X_valid_selected = (
    select.transform(features) for features in (X_train, X_vali)
)

#### Validation

In [12]:
# Train the classifier on the reduced training features. The fit call is
# left as the cell's final expression so its result is shown as the output.
lgbm_params = {
    'random_state': 11,
    'n_estimators': 500,
    'boost_from_average': False,
}
lgbm_wrapper = LGBMClassifier(**lgbm_params)
lgbm_wrapper.fit(X_train_selected, y_train)

LGBMClassifier(boost_from_average=False, n_estimators=500, random_state=11)

In [13]:
# Predict class codes for the validation features.
lgbm_pred = lgbm_wrapper.predict(X_valid_selected)

# Pin the label order explicitly so that row/column i of the confusion matrix
# and row i of the report always correspond to label_name[i], even when a
# class is missing from the validation labels or from the predictions.
# Without `labels=`, scikit-learn infers the label set from the data and
# `target_names` can silently mis-align or raise.
# NOTE(review): assumes `target` codes are 0..len(label_name)-1 in
# label_name order -- confirm against the encoder that built `target`.
class_ids = list(range(len(label_name)))

print('{0} 정확도:{1:.4f}'.format(lgbm_wrapper.__class__.__name__, accuracy_score(y_vali, lgbm_pred)))
print(confusion_matrix(y_vali, lgbm_pred, labels=class_ids), '\n')
print(classification_report(y_vali, lgbm_pred, labels=class_ids, target_names=label_name), '\n')

LGBMClassifier 정확도:0.9072
[[ 629   31    4    1   19   13    1  227   26    0    1    0    1    0
     0    1]
 [   5 6386    2    0   74    3    0   37    0    0    0    0    1    0
     0    1]
 [   3   16  989  206    8    3   99   17    6    0    1    1    2    0
    35    0]
 [   0    1  392  622    4    0   63    0    2    0    0    2   10    0
    16    0]
 [  12  194   19    5 1416    6    3   51    1    0    0    0    1    0
     2    2]
 [   0    2    0    0    0 9657    0    0    1    0    0    0    0    0
     0    0]
 [   0    0  142   70    1    0  199    0    1    0    0    0    1    0
     6    0]
 [ 237  133    3    1   49    3    0 1215   32    0    1    0    4    0
     1    1]
 [  20    0    1    1    0    0    0   23 2590    0    7    0    6    0
     0    9]
 [   0    0    0    3    0    0    0    0    1   48    0    0    0    0
     0    1]
 [   1    0    0    0    0    0    0    9    8    1  886    0    2    0
     0    1]
 [   0    2    0    3    1    0    0   