In [14]:
import sys 
import pandas as pd
import numpy as np
import time
import seaborn as sns
from tqdm import tqdm

# 评估方法
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, classification_report

# 非平衡数据采样库
# from imblearn.over_sampling import SMOTE, ADASYN
# from sklearn.utils.class_weight import compute_sample_weight

# 数据处理
from sklearn.preprocessing import LabelEncoder, StandardScaler
import lightgbm as lgb

In [15]:
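# shuffle: whether to pool the official training and test sets, shuffle them, and split them again into new training and test sets
# ss: scaler used to standardize the features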
def process_data(tr_data, te_data=None, ss=None, shuffle=False, random_state=None):
    """One-hot encode, label-encode and optionally standardize train/test frames.

    Both frames are stacked first so that they receive the same dummy columns
    and the same ``attack_cat`` -> integer mapping, then split apart again.

    Parameters
    ----------
    tr_data : pandas.DataFrame
        Training rows. The first column and the last two columns are left out
        of the feature set; a column named ``attack_cat`` must be present.
    te_data : pandas.DataFrame, optional
        Test rows with the same columns. When omitted, the second returned
        frame is empty.
    ss : object with ``fit_transform``, optional
        Scaler applied to every encoded column except the last three (the two
        trailing dummy columns and ``cat_code``). ``None`` disables scaling.
    shuffle : bool, default False
        If true, the stacked rows are randomly permuted before being split, so
        the returned frames are a fresh ``len(tr_data)`` / remainder split.
    random_state : int, optional
        Seed for the permutation used when ``shuffle`` is true.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; independent copies whose last column is ``cat_code``.
    """
    split_num = len(tr_data)
    frames = [tr_data] if te_data is None else [tr_data, te_data]
    data_temp = pd.concat(frames, axis=0)

    # Features: everything but the first column and the last two columns.
    data = pd.get_dummies(data_temp.iloc[:, 1:-2])

    # Sorted-unique integer codes: the same mapping LabelEncoder produced for
    # non-missing labels (missing labels would be coded as -1 here).
    codes, _ = pd.factorize(data_temp['attack_cat'], sort=True)
    data['cat_code'] = codes

    if ss is not None:
        # Positional rule kept from the original: skip the last three columns.
        # NOTE(review): the scaler is fit on train and test rows together;
        # confirm that sharing test statistics with training is acceptable.
        scaled_cols = list(data.columns[:-3])
        if scaled_cols:
            # Replace whole columns rather than writing floats into int/bool
            # columns in place, which recent pandas versions deprecate.
            data[scaled_cols] = np.asarray(ss.fit_transform(data[scaled_cols]))

    if shuffle:
        # Previously this branch fell through and returned None.
        order = np.random.RandomState(random_state).permutation(len(data))
        data = data.iloc[order]

    # Copies, so callers can mutate the results without touching a shared parent.
    return data.iloc[:split_num, :].copy(), data.iloc[split_num:, :].copy()
# Load the raw (unprocessed) CSV files.
# NOTE(review): the training frame is read from the file named "testing-set"
# and the test frame from the file named "training-set"; confirm the swap is
# intentional.
DATA_DIR = '/home/jsm/code/python/unsupervisedGAN/data/UNSW-NB15/part'
tr_raw_data = pd.read_csv(f'{DATA_DIR}/UNSW_NB15_testing-set.csv')
te_raw_data = pd.read_csv(f'{DATA_DIR}/UNSW_NB15_training-set.csv')

ss = StandardScaler()
# Run the shared preprocessing (dummy encoding, label codes, standardization).
tr_data, te_data = process_data(tr_raw_data, te_raw_data, ss)

# Drop the columns that are not used as features. A non-inplace drop rebinds
# the names to new frames instead of mutating the objects returned by
# process_data, so no view of a shared parent frame is modified.
UNUSED_COLUMNS = ['state_URN', 'state_no']
tr_data = tr_data.drop(columns=UNUSED_COLUMNS)
te_data = te_data.drop(columns=UNUSED_COLUMNS)
tr_data.head()

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,state_ACC,state_CLO,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,cat_code
0,-0.188346,-0.101342,-0.129612,-0.047849,-0.097232,-0.56865,0.702512,1.500906,-0.38009,-0.269328,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
1,-0.099897,-0.042496,0.173998,-0.04511,0.188966,-0.568623,-1.151363,1.48317,-0.380121,-0.064104,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
2,0.063006,-0.08663,-0.022456,-0.047239,-0.008217,-0.569024,-1.151363,1.48317,-0.380158,-0.247593,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
3,0.0728,-0.057207,-0.058174,-0.04572,-0.093142,-0.569027,-1.151363,1.48317,-0.380152,-0.271458,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
4,-0.133449,-0.071919,-0.111753,-0.046261,-0.096576,-0.568904,0.722026,1.48317,-0.380121,-0.271197,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6


In [17]:
# Materialise each processed frame once, then hand LightGBM every column but
# the last as features and the last column (cat_code) as the label.
train_matrix = tr_data.values
test_matrix = te_data.values
dtrain = lgb.Dataset(data=train_matrix[:, :-1], label=train_matrix[:, -1])
dtest = lgb.Dataset(data=test_matrix[:, :-1], label=test_matrix[:, -1])

In [22]:
# LightGBM parameters for a test run of the GPU build.
params = dict(
    # Task: 10-class classification, evaluated by multiclass error rate.
    objective='multiclass',
    num_class=10,
    metric='multi_error',
    # NOTE(review): ndcg_eval_at looks like a ranking-metric setting and is
    # presumably ignored with metric='multi_error'; confirm before removing.
    ndcg_eval_at=[1, 3, 5, 10],
    # Tree growth and learning.
    max_bin=63,
    num_leaves=255,
    learning_rate=0.1,
    tree_learner='serial',
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=100,
    # Logging.
    verbose=0,
    # Device selection; -1 is passed for both the platform id and device id.
    device='gpu',
    gpu_platform_id=-1,
    gpu_device_id=-1,
)

In [23]:
# Train for up to 500 rounds, recording the metric on both sets every round
# and stopping once the 'eval' score has not improved for 5 rounds.
# NOTE(review): 'eval' is the same test set that is scored afterwards, so the
# chosen best_iteration is tuned on the reporting data; consider a separate
# validation split.
evals_result = {}
valid_sets = [dtrain, dtest]
valid_name = ['train', 'eval']

# The `evals_result=` and `early_stopping_rounds=` keywords were removed from
# lgb.train in LightGBM 4.0; the callbacks below are the equivalent API.
train_callbacks = [
    lgb.early_stopping(stopping_rounds=5),
    lgb.record_evaluation(evals_result),
]
# Per-iteration logging callback; only added when the installed LightGBM
# provides it, so older releases keep their built-in logging.
if hasattr(lgb, 'log_evaluation'):
    train_callbacks.append(lgb.log_evaluation(period=1))

model = lgb.train(
    params,
    dtrain,
    num_boost_round=500,
    valid_sets=valid_sets,
    valid_names=valid_name,
    callbacks=train_callbacks,
)



[1]	train's multi_error: 0.402171	eval's multi_error: 0.285867
Training until validation scores don't improve for 5 rounds
[2]	train's multi_error: 0.274648	eval's multi_error: 0.218676
[3]	train's multi_error: 0.221215	eval's multi_error: 0.194104
[4]	train's multi_error: 0.206421	eval's multi_error: 0.195574
[5]	train's multi_error: 0.199389	eval's multi_error: 0.200177
[6]	train's multi_error: 0.195083	eval's multi_error: 0.204404
[7]	train's multi_error: 0.192528	eval's multi_error: 0.210708
[8]	train's multi_error: 0.190013	eval's multi_error: 0.214206
Early stopping, best iteration is:
[3]	train's multi_error: 0.221215	eval's multi_error: 0.194104


In [24]:
# Score the test rows with the booster truncated at its best iteration.
# model.predict returns one score per class for each row.
pred_proba = model.predict(te_data.values[:, :-1], num_iteration=model.best_iteration)
test_y = te_data.values[:, -1]
# Predicted class = index of the highest score in each row (kept as a Series,
# as before, but without building an intermediate DataFrame).
pred_y = pd.Series(np.argmax(pred_proba, axis=1))

score = accuracy_score(test_y, pred_y)
# Macro averaging weights every class equally. zero_division=0 states
# explicitly that a class with no predicted (or no true) samples contributes 0,
# which is the value the default already used while emitting a warning.
precision = precision_score(test_y, pred_y, average='macro', zero_division=0)
f1score = f1_score(test_y, pred_y, average='macro', zero_division=0)
recall = recall_score(test_y, pred_y, average='macro', zero_division=0)
print(f'accuray: {score},\nprecision:{precision},\nrecall: {recall},\nf1_score: {f1score}.')

accuray: 0.8058956420346888,
precision:0.373244448417571,
recall: 0.36661259886159503,
f1_score: 0.3605701760140335.


  _warn_prf(average, modifier, msg_start, len(result))
