In [1]:
# pytorch
from torch.utils.tensorboard import SummaryWriter
import torch
from torch import nn
# import pytorch_lightning as pl
from pytorch_lightning import  LightningModule, Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

# Helper libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score

#sys
import os
from collections import OrderedDict
import math
import random

# data process
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
def same_seeds(seed):
    # python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Torch
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# 为了结果可复现
Seed = 42
same_seeds(Seed)

In [3]:
 # shuffle 是否将官方给的的测试集和训练集重新打乱，再分成新的的训练集和测试集
 # ss标准化
def process_data(tr_data, te_data=None, ss=None, shuffle=False):
    split_num = len(tr_data)
    data_temp = pd.concat([tr_data, te_data], axis=0)
    data = pd.get_dummies(data_temp.iloc[:, 1:-2])
    data['cat_code'] = LabelEncoder().fit_transform(data_temp.loc[:, 'attack_cat'])
    # data['label'] = data_temp['label']
    # data['attack_cat'] = data_temp['attack_cat']
    if ss != None:
        data.iloc[:,:-3] = ss.fit_transform(data.iloc[:,:-3])
    if shuffle:
        pass
    else:
        return data.iloc[:split_num,:], data.iloc[split_num:, :]

In [4]:
AVAIL_GPUS = min(1, torch.cuda.device_count())
BATCH_SIZE = 256 if AVAIL_GPUS else 64
NUM_WORKERS = int(os.cpu_count() / 2)

In [5]:
# 加载数据未处理的数据
tr_raw_data = pd.read_csv('/home/jsm/code/python/unsupervisedGAN/data/UNSW-NB15/part/UNSW_NB15_testing-set.csv')
te_raw_data = pd.read_csv('/home/jsm/code/python/unsupervisedGAN/data/UNSW-NB15/part/UNSW_NB15_training-set.csv')
ss = StandardScaler()
# 调用数据处理函数
tr_data, te_data = process_data(tr_raw_data, te_raw_data, ss)
# 挑选'Normal'的列，'cat_code'=6
# tr_data = tr_data.loc[tr_data['cat_code'] == 6]
# tr_data.drop(['cat_code'], axis=1, inplace=True)
# 去掉无用的列
tr_data.drop(['state_URN', 'state_no'], axis=1, inplace=True)
te_data.drop(['state_URN', 'state_no'], axis=1, inplace=True)
tr_data.head()

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,state_ACC,state_CLO,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,cat_code
0,-0.188346,-0.101342,-0.129612,-0.047849,-0.097232,-0.56865,0.702512,1.500906,-0.38009,-0.269328,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
1,-0.099897,-0.042496,0.173998,-0.04511,0.188966,-0.568623,-1.151363,1.48317,-0.380121,-0.064104,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
2,0.063006,-0.08663,-0.022456,-0.047239,-0.008217,-0.569024,-1.151363,1.48317,-0.380158,-0.247593,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
3,0.0728,-0.057207,-0.058174,-0.04572,-0.093142,-0.569027,-1.151363,1.48317,-0.380152,-0.271458,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6
4,-0.133449,-0.071919,-0.111753,-0.046261,-0.096576,-0.568904,0.722026,1.48317,-0.380121,-0.271197,...,-0.00394,-0.00197,-0.291137,-0.006824,1.095103,-0.90798,-0.00197,-0.122882,-0.018058,6


In [6]:
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.data = torch.from_numpy(X).float()
        if y is not None:
            y = y.astype(np.int64)
            self.label = torch.LongTensor(y)
        else:
            self.label = None
    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx], self.label[idx]
        else:
            return self.data[idx]
    def __len__(self):
        return len(self.data)

In [7]:
tr_dataset = MyDataset(tr_data.values[:,:-1], tr_data.values[:,-1])
te_dataset = MyDataset(te_data.values[:,:-1], te_data.values[:,-1])

In [8]:
# 网络参数初始化
def weights_init(m):
    classname = m.__class__.__name__
    # 初始化网络层
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

In [9]:
# 生成器
class Generator(nn.Module):
    """
    Input shape: (N, in_dim)
    Output shape: (N, 1, out_dim)
    """
    def __init__(self, in_dim, out_dim, dim=32):
        super(Generator, self).__init__()
        def dconv_bn_relu(in_dim, out_dim):
            return nn.Sequential(
                nn.ConvTranspose1d(in_dim, out_dim, 5, 2, padding=2, output_padding=1, bias=False),
                nn.BatchNorm1d(out_dim),
                nn.ReLU()
            )
        self.inlayer = nn.Sequential(
            nn.Linear(in_dim, dim*4*4*4, bias=False),
            # tf 默认为0.3， torch 默认为0.01
            nn.BatchNorm1d(dim*4*4*4),
            nn.ReLU()
            # nn.LeakyReLU(negative_slope=0.2)
        )
        self.midlayer = nn.Sequential(
           dconv_bn_relu(dim*4, dim*2),
        #    dconv_bn_relu(dim*2, dim*2),
           dconv_bn_relu(dim*2, dim),
           dconv_bn_relu(dim, 1)
        )
        self.outlayer = nn.Sequential(
            nn.Linear(128, out_dim, bias=False),
            nn.Tanh()
        )
        self.apply(weights_init)

    def forward(self, x):
        y = self.inlayer(x)
        y = y.view(y.size(0), -1, 16)
        y = self.midlayer(y)
        y = y.squeeze(1)
        y = self.outlayer(y)
        y = y.unsqueeze(1)
        return y

In [10]:
# 判别器
class Discriminator(nn.Module):
    """
    Input shape: (N, 1, in_dim)
    Output shape: (N, )
    """
    def __init__(self, in_dim, dlatent_dim=128, in_channel=1, channel=8, num_classes=10):
        super(Discriminator, self).__init__()
        
        def conv_bn_lrelu(in_channel, out_channel):
            return nn.Sequential(
                nn.Conv1d(in_channel, out_channel, 5, 2, 2),
                nn.BatchNorm1d(out_channel),
                nn.LeakyReLU(0.2),
            )

        self.inlayer = nn.Sequential(
            nn.Linear(in_dim, dlatent_dim, bias=False),
            nn.BatchNorm1d(dlatent_dim),
            nn.ReLU(),
        )

        self.ls = nn.Sequential(
            conv_bn_lrelu(in_channel, channel),
            conv_bn_lrelu(channel, channel * 2),
            conv_bn_lrelu(channel * 2, channel * 4),
            conv_bn_lrelu(channel * 4, channel * 8),
            conv_bn_lrelu(channel * 8, channel * 16),
            nn.Conv1d(channel * 16, dlatent_dim, 4),
        )

        # Output layers
        self.adv_layer = nn.Sequential(
            nn.Linear(dlatent_dim, 1),
            nn.Sigmoid()
            )
        self.aux_layer = nn.Sequential(
            nn.Linear(dlatent_dim, num_classes + 1),
            nn.Softmax(dim=1)
        )

        # self.apply(weights_init)
        
    def forward(self, x):
        y = x.squeeze(1)
        y = self.inlayer(y)
        y = y.unsqueeze(1)
        y = self.ls(y)
        y = y.view(y.shape[0], -1)
        validity = self.adv_layer(y)
        label = self.aux_layer(y)
        return validity, label

In [11]:
# Training hyperparameters
batch_size = 512
z_dim = 100
gout_dim = tr_data.shape[-1] - 1
# 将tensor(张量)转化成variable(变量)。之所以需要将tensor转化成variable是因为pytorch中tensor(张量)只能放在CPU上运算，而(variable)变量是可以只用GPU进行加速计算的。
z_sample = Variable(torch.randn(100, z_dim)).cuda()
lr = 1e-4

""" Medium: WGAN, 50 epoch, n_critic=5, clip_value=0.01 """
n_epoch = 50 # 50
n_critic = 5 # 5
clip_value = 0.01
num_classes = 10


workspace_dir = '.'
log_dir = os.path.join(workspace_dir, 'lightning_logs')
ckpt_dir = os.path.join(workspace_dir, 'checkpoints')
os.makedirs(log_dir, exist_ok=True)
os.makedirs(ckpt_dir, exist_ok=True)

# Model
G = Generator(in_dim=z_dim, out_dim=gout_dim)
D = Discriminator(in_dim=gout_dim)
G.train()
D.train()

# Loss
# 一个二分类损失函数。可以是单标签的损失函数也可是多标签的损失函数。
# https://blog.csdn.net/weixin_37724529/article/details/107084970
adversarial_loss = torch.nn.BCELoss()
auxiliary_loss = torch.nn.CrossEntropyLoss()

""" Medium: Use RMSprop for WGAN. """
# Optimizer
opt_D = torch.optim.RMSprop(D.parameters(), lr=lr)
opt_G = torch.optim.RMSprop(G.parameters(), lr=lr)

cuda = True if torch.cuda.is_available() else False
if cuda:
    G.cuda()
    D.cuda()
    adversarial_loss.cuda()
    auxiliary_loss.cuda()

# DataLoader
tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=True, num_workers=8, drop_last=True)
te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=True, num_workers=8, drop_last=True)

In [13]:
accuracy = 0.0
f1score = 0.0
clip_value = 0.01

tb = SummaryWriter(log_dir='lightning_logs')
for epoch in range(n_epoch):
    for i, (imgs, labels) in enumerate(tr_dataloader):
        imgs = imgs.cuda()
        labels = labels.cuda()
        # Adversarial ground truths
        valid = Variable(torch.FloatTensor(batch_size, 1).fill_(1.0), requires_grad=False).cuda()
        fake = Variable(torch.FloatTensor(batch_size, 1).fill_(0.0), requires_grad=False).cuda()
        fake_aux_gt = Variable(torch.LongTensor(batch_size).fill_(num_classes), requires_grad=False).cuda()
        
        # -----------------
        #  Train Generator
        # -----------------

        opt_G.zero_grad()

        z = torch.randn(batch_size, z_dim).cuda()

        # Generate a batch of images
        gen_imgs = G(z)

        # Loss measures generator's ability to fool the discriminator
        validity, _ = D(gen_imgs)
        g_loss = adversarial_loss(validity, valid)

        g_loss.backward()
        opt_G.step()

        # ---------------------
        #  Train Discriminator
        # ---------------------

        opt_D.zero_grad()

        # Loss for real images
        # print(imgs.device)
        real_pred, real_aux = D(imgs)
        # print(real_pred.device, valid.device, real_aux.device, labels.device)
        d_real_loss = (adversarial_loss(real_pred, valid) + auxiliary_loss(real_aux, labels)) / 2

        # Loss for fake images
        # fake_pred, fake_aux = D(gen_imgs)
        fake_pred, fake_aux = D(gen_imgs.detach())
        d_fake_loss = (adversarial_loss(fake_pred, fake) + auxiliary_loss(fake_aux, fake_aux_gt)) / 2

        # Total discriminator loss
        # d_loss = (d_real_loss + d_fake_loss) / 2
        d_loss = -torch.mean(d_real_loss) + torch.mean(d_fake_loss)
        

        # Calculate discriminator accuracy
        pred = np.concatenate([real_aux.data.cpu().numpy(), fake_aux.data.cpu().numpy()], axis=0)
        gt = np.concatenate([labels.data.cpu().numpy(), fake_aux_gt.data.cpu().numpy()], axis=0)
        # d_acc = np.mean(np.argmax(pred, axis=1) == gt)
        y_pred = np.argmax(pred, axis=1)
        accuracy = accuracy_score(gt, y_pred)
        precision = precision_score(gt, y_pred, average='macro', zero_division=1)
        f1score = f1_score(gt, y_pred, average='macro', zero_division=1)
        recall = recall_score(gt, y_pred, average='macro', zero_division=1)
        d_loss.backward()

        opt_D.step()
        for p in D.parameters():
           p.data.clamp_(-clip_value, clip_value)

        tb.add_scalar('f1_score', f1score, (epoch+1)*i)
        tb.add_scalar('accuracy', accuracy, (epoch+1)*i)
        
        print(
            "[Epoch %d/%d] [Batch %d/%d] [D loss: %f, acc: %f, f1: %f] [G loss: %f]"
            % (epoch, n_epoch, i, len(tr_dataloader), d_loss.item(), accuracy, f1score, g_loss.item())
        )

        # batches_done = epoch * len(tr_dataloader) + i
        # if batches_done % sample_interval == 0:
        #     save_image(gen_imgs.data[:25], "images/%d.png" % batches_done, nrow=5, normalize=True)

[Epoch 0/50] [Batch 0/342] [D loss: -0.048119, acc: 0.500000, f1: 0.066667] [G loss: 0.734228]
[Epoch 0/50] [Batch 1/342] [D loss: -0.048123, acc: 0.500000, f1: 0.060606] [G loss: 0.734230]
[Epoch 0/50] [Batch 2/342] [D loss: -0.048125, acc: 0.500000, f1: 0.066667] [G loss: 0.734232]
[Epoch 0/50] [Batch 3/342] [D loss: -0.048129, acc: 0.500000, f1: 0.066667] [G loss: 0.734235]
[Epoch 0/50] [Batch 4/342] [D loss: -0.048133, acc: 0.500000, f1: 0.066667] [G loss: 0.734239]
[Epoch 0/50] [Batch 5/342] [D loss: -0.048136, acc: 0.500000, f1: 0.066667] [G loss: 0.734241]
[Epoch 0/50] [Batch 6/342] [D loss: -0.048142, acc: 0.500000, f1: 0.066667] [G loss: 0.734246]
[Epoch 0/50] [Batch 7/342] [D loss: -0.048146, acc: 0.500000, f1: 0.066667] [G loss: 0.734250]
[Epoch 0/50] [Batch 8/342] [D loss: -0.048154, acc: 0.500000, f1: 0.060606] [G loss: 0.734256]
[Epoch 0/50] [Batch 9/342] [D loss: -0.048160, acc: 0.500000, f1: 0.060606] [G loss: 0.734261]
[Epoch 0/50] [Batch 10/342] [D loss: -0.048166, ac

KeyboardInterrupt: 

In [None]:
D.eval()
f1score = 0.0
accuracy = 0.0
with torch.no_grad():
    for i, (imgs, labels) in enumerate(te_dataloader):
        imgs = imgs.cuda()
        labels = labels.cuda()
        pred_true, pred_labels = D(imgs)
        pred = pred_labels.data.cpu().numpy()
        pred = np.argmax(pred, axis=1)
        real = labels.data.cpu().numpy()
        accuracy += accuracy_score(real, pred)
        # precision = precision_score(gt, y_pred, average='macro', zero_division=1)
        f1score += f1_score(real, pred, average='macro', zero_division=1)
        # recall = recall_score(gt, y_pred, average='macro', zero_division=1)
length = len(te_dataloader)
print(accuracy/length, f1score/length)


In [None]:
pred

In [None]:
pred[:5]
real[:5]