In [1]:
import pandas as pd
import numpy as np
import random
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# data process
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score


In [2]:
def same_seeds(seed):
    # python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Torch
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# 为了结果可复现
Seed = 42
same_seeds(Seed)

In [3]:
 # shuffle 是否将官方给的的测试集和训练集重新打乱，再分成新的的训练集和测试集
 # ss标准化
def process_data(tr_data, te_data=None, ss=None, shuffle=False):
    split_num = len(tr_data)
    data_temp = pd.concat([tr_data, te_data], axis=0)
    data = pd.get_dummies(data_temp.iloc[:, 1:-2])
    data['cat_code'] = LabelEncoder().fit_transform(data_temp.loc[:, 'attack_cat'])
    # data['label'] = data_temp['label']
    # data['attack_cat'] = data_temp['attack_cat']
    if ss != None:
        data.iloc[:,:-3] = ss.fit_transform(data.iloc[:,:-3])
    if shuffle:
        pass
    else:
        return data.iloc[:split_num,:], data.iloc[split_num:, :]

In [4]:
AVAIL_GPUS = min(1, torch.cuda.device_count())
BATCH_SIZE = 256 if AVAIL_GPUS else 64
NUM_WORKERS = int(os.cpu_count() / 2)

In [5]:
# 加载数据未处理的数据
tr_raw_data = pd.read_csv('/home/jsm/code/python/unsupervisedGAN/data/UNSW-NB15/part/UNSW_NB15_testing-set.csv')
te_raw_data = pd.read_csv('/home/jsm/code/python/unsupervisedGAN/data/UNSW-NB15/part/UNSW_NB15_training-set.csv')
ss = StandardScaler()
# 调用数据处理函数
tr_data, te_data = process_data(tr_raw_data, te_raw_data, ss)

# 去掉无用的列
tr_data.drop(['state_URN', 'state_no'], axis=1, inplace=True)
te_data.drop(['state_URN', 'state_no'], axis=1, inplace=True)
tr_data.head()

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,state_ACC,state_CLO,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,cat_code
0,-0.188346,-0.101342,-0.129612,-0.047849,-0.097232,-0.56865,0.702512,1.500906,-0.38009,-0.269328,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
1,-0.099897,-0.042496,0.173998,-0.04511,0.188966,-0.568623,-1.151363,1.48317,-0.380121,-0.064104,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
2,0.063006,-0.08663,-0.022456,-0.047239,-0.008217,-0.569024,-1.151363,1.48317,-0.380158,-0.247593,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
3,0.0728,-0.057207,-0.058174,-0.04572,-0.093142,-0.569027,-1.151363,1.48317,-0.380152,-0.271458,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
4,-0.133449,-0.071919,-0.111753,-0.046261,-0.096576,-0.568904,0.722026,1.48317,-0.380121,-0.271197,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6


In [6]:
tr_data['cat_code'].value_counts()

6    56000
5    40000
3    33393
4    18184
2    12264
7    10491
0     2000
1     1746
8     1133
9      130
Name: cat_code, dtype: int64

In [10]:
# 定义dataset
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.data = torch.from_numpy(X).float()
        if y is not None:
            y = y.astype(np.int64)
            self.label = torch.LongTensor(y)
        else:
            self.label = None
    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx], self.label[idx]
        else:
            return self.data[idx]
    def __len__(self):
        return len(self.data)


In [11]:
# 划分训练集验证集
VAL_RATIO = 0.2

percent = int(tr_data.shape[0] * (1 - VAL_RATIO))
# train_x, train_y, val_x, val_y = train[:percent], train_label[:percent], train[percent:], train_label[percent:]
train, val = tr_data[:percent], tr_data[percent:]
print('Size of training set: {}'.format(train.shape))
print('Size of validation set: {}'.format(val.shape))

Size of training set: (140272, 195)
Size of validation set: (35069, 195)


In [12]:
train_set = MyDataset(train.values[:,:-1], train.values[:,-1])
val_set = MyDataset(val.values[:,:-1], val.values[:,-1])

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True) #only shuffle the training data
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
# 网络参数初始化
def weights_init(m):
    classname = m.__class__.__name__
    # 初始化网络层
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

#check device
def get_device():
  return 'cuda' if torch.cuda.is_available() else 'cpu'

In [24]:
class Classifier(nn.Module):
    def __init__(self, in_dim, num_classes):
        super(Classifier, self).__init__()
        self.inlayer = nn.Sequential(
            nn.Linear(in_dim, 256),
            # nn.Sigmoid(),
            nn.Tanh()
        )
        self.con1d = nn.Sequential(
            nn.Conv1d(1, 64, 9, 1, 4),
            nn.BatchNorm1d(64),
            nn.ReLU(0.2),
            nn.Conv1d(64, 1, 9, 1, 4),
            nn.BatchNorm1d(1),
            nn.ReLU(0.2),
        )
        self.outlayer = nn.Sequential(
            nn.Linear(256, 32),
            nn.LeakyReLU(0.2),
            nn.Linear(32, num_classes),
            nn.Softmax(dim=1)
        )

        self.apply(weights_init)

    def forward(self, x):
        y = self.inlayer(x)
        y = y.unsqueeze(1)
        y = self.con1d(y)
        y = self.outlayer(y)
        y = y.squeeze(1)
        return y

In [25]:
# fix random seed for reproducibility
# # get device 
device = get_device()
print(f'DEVICE: {device}')

# training parameters
num_epoch = 100              # number of training epoch
learning_rate = 0.001       # learning rate

# the path where checkpoint saved
model_path = './sava/model.ckpt'

# create model, define a loss function, and optimizer
in_dim = train.shape[-1] - 1
num_classes = 10
model = Classifier(in_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

DEVICE: cuda


In [28]:
train.head(5)

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,state_ACC,state_CLO,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,cat_code
0,-0.188346,-0.101342,-0.129612,-0.047849,-0.097232,-0.56865,0.702512,1.500906,-0.38009,-0.269328,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
1,-0.099897,-0.042496,0.173998,-0.04511,0.188966,-0.568623,-1.151363,1.48317,-0.380121,-0.064104,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
2,0.063006,-0.08663,-0.022456,-0.047239,-0.008217,-0.569024,-1.151363,1.48317,-0.380158,-0.247593,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
3,0.0728,-0.057207,-0.058174,-0.04572,-0.093142,-0.569027,-1.151363,1.48317,-0.380152,-0.271458,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
4,-0.133449,-0.071919,-0.111753,-0.046261,-0.096576,-0.568904,0.722026,1.48317,-0.380121,-0.271197,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6


In [26]:
best_f1 = 0.0
for epoch in range(num_epoch):
    train_acc = 0.0
    train_loss = 0.0
    train_f1 = 0.0
    val_acc = 0.0
    val_loss = 0.0
    val_f1 = 0.0

    # training
    model.train() # set the model to training mode
    for i, data in enumerate(train_loader):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad() 
        outputs = model(inputs) 
        batch_loss = criterion(outputs, labels)
        _, train_pred = torch.max(outputs, 1) # get the index of the class with the highest probability
        batch_loss.backward() 
        optimizer.step() 
        train_f1 += f1_score(train_pred.cpu().numpy(), labels.cpu().numpy(), average='macro', zero_division=1)
        train_acc += (train_pred.cpu() == labels.cpu()).sum().item()
        train_loss += batch_loss.item()
        # print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} F1: {:3.6f}'.format(
        #     epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader), train_f1/len(train_set)
        # ))
        print('-'*20)
        print(train_pred.cpu().numpy())
        print('-'*20)
        print(labels.cpu().numpy())

#     # validation
#     if len(val_set) > 0:
#         model.eval() # set the model to evaluation mode
#         with torch.no_grad():
#             for i, data in enumerate(val_loader):
#                 inputs, labels = data
#                 inputs, labels = inputs.to(device), labels.to(device)
#                 outputs = model(inputs)
#                 batch_loss = criterion(outputs, labels) 
#                 _, val_pred = torch.max(outputs, 1)

#                 val_f1 += f1_score(val_pred.cpu().numpy(), labels.cpu().numpy(), average='macro', zero_division=1)
#                 val_acc += (val_pred.cpu() == labels.cpu()).sum().item() # get the index of the class with the highest probability
#                 val_loss += batch_loss.item()

#             print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} Train F1: {:3.6f} | Val Acc: {:3.6f} loss: {:3.6f}  Val F1: {:3.6f}'.format(
#                 epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader), train_f1/len(train_loader), val_acc/len(val_set), val_loss/len(val_loader), val_f1/len(val_loader)
#             ))

#             # if the model improves, save a checkpoint at this epoch
#             if val_f1 > best_f1:
#                 best_f1 = val_f1
#                 torch.save(model.state_dict(), model_path)
#                 print('saving model with f1 {:.3f}'.format(best_f1/len(val_set)))
#     else:
#         print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} F1: {:3.6f}'.format(
#             epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader), train_f1/len(train_loader)
#         ))

# # if not validating, save the last epoch
# if len(val_set) == 0:
#     torch.save(model.state_dict(), model_path)
#     print('saving model at last epoch')


--------------------
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
--------------------
[3 6 6 4 6 2 3 6 6 6 2 7 6 3 3 6 4 6 2 6 2 6 3 6 2 4 4 3 3 6 6 3 4 7 2 6 2
 6 6 6 6 6 6 6 6 3 6 2 3 6 3 6 6 6 5 6 4 3 4 3 3 2 7 7 4 3 6 5 3 3 6 4 3 6
 5 5 4 5 6 3 2 2 5 6 3 6 6 6 3 3 6 5 4 2 4 6 5 5 6 6 6 5 6 6 2 4 4 6 1 6 5
 7 7 3 6 3 7 6 6 2 7 5 7 3 6 3 6 6 6 6 8 3 5 6 6 2 6 6 6 3 2 3 6 4 4 6 3 6
 7 2 7 6 3 6 6 3 2 5 6 2 6 6 6 6 6 2 6 5 5 6 2 5 3 5 6 6 2 5 3 6 4 2 4 3 3
 3 3 3 4 6 3 4 6 3 6 7 6 2 6 6 6 4 7 3 6 5 6 4 6 6 6 6 6 3 6 6 

KeyboardInterrupt: 