In [1]:
# pytorch
import torch
from torch import nn
# import pytorch_lightning as pl
from pytorch_lightning import  LightningModule, Trainer
from torch.utils.data import Dataset, DataLoader

# Helper libraries
import numpy as np
import pandas as pd

#sys
import os
from collections import OrderedDict
import math
import random

In [2]:
# data process
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
def same_seeds(seed):
    # python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Torch
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# 为了结果可复现
Seed = 42
same_seeds(Seed)

In [4]:
 # shuffle 是否将官方给的的测试集和训练集重新打乱，再分成新的的训练集和测试集
 # ss标准化
def process_data(tr_data, te_data=None, ss=None, shuffle=False):
    split_num = len(tr_data)
    data_temp = pd.concat([tr_data, te_data], axis=0)
    data = pd.get_dummies(data_temp.iloc[:, 1:-2])
    data['cat_code'] = LabelEncoder().fit_transform(data_temp.loc[:, 'attack_cat'])
    # data['label'] = data_temp['label']
    # data['attack_cat'] = data_temp['attack_cat']
    if ss != None:
        data.iloc[:,:-3] = ss.fit_transform(data.iloc[:,:-3])
    if shuffle:
        pass
    else:
        return data.iloc[:split_num,:], data.iloc[split_num:, :]

In [5]:
AVAIL_GPUS = min(1, torch.cuda.device_count())
BATCH_SIZE = 256 if AVAIL_GPUS else 64
NUM_WORKERS = int(os.cpu_count() / 7)

In [6]:
# 加载数据未处理的数据
tr_raw_data = pd.read_csv('/home/jsm/code/python/unsupervisedGAN/data/UNSW-NB15/part/UNSW_NB15_testing-set.csv')
te_raw_data = pd.read_csv('/home/jsm/code/python/unsupervisedGAN/data/UNSW-NB15/part/UNSW_NB15_training-set.csv')
ss = StandardScaler()
# 调用数据处理函数
tr_data, te_data = process_data(tr_raw_data, te_raw_data, ss)
# 挑选'Normal'的列，'cat_code'=6
tr_data = tr_data.loc[tr_data['cat_code'] == 6]
tr_data.drop(['cat_code'], axis=1, inplace=True)
# 去掉无用的列
# tr_data.drop(['state_URN', 'state_no', 'cat_code'], axis=1, inplace=True)
tr_data.head()

# raw_data = pd.read_csv('/home/jsm/code/python_backup/python/IoT-botnet/data/UNSW-NB15 - CSV Files/unsw15_train.csv')
# temp = raw_data.loc[raw_data['attack_cat'] == 'Normal']
# temp_drop = temp.drop(['196', 'attack_cat', 'label'], axis=1, inplace=False)
# temp_sameple = temp_drop.sample(1024*60, random_state=Seed)
# tr_data = temp_sameple

In [7]:
class MyDataset(Dataset):
    def __init__(
        self,
        # batch_size,
        # num_workers,
        data
    ):
        # 在数据1维处增加1个维度 example: (batch_size, 196) --> (batch_size, 1, 196)
        # self.batch_size = batch_size
        # self.num_workers = num_workers
        self.data = data.unsqueeze(1)
    
    def __getitem__(self, index):
        return self.data[index]
    
    def __len__(self):
        return len(self.data)

dataset = MyDataset(torch.from_numpy(tr_data.values).float())

In [8]:
# 网络参数初始化
def weights_init(m):
    classname = m.__class__.__name__
    # 初始化网络层
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

In [9]:
# 生成器
class Generator(nn.Module):
    """
    Input shape: (N, in_dim)
    Output shape: (N, 1, out_dim)
    """
    def __init__(self, in_dim, out_dim, dim=32):
        super(Generator, self).__init__()
        def dconv_bn_relu(in_dim, out_dim):
            return nn.Sequential(
                nn.ConvTranspose1d(in_dim, out_dim, 5, 2, padding=2, output_padding=1, bias=False),
                nn.BatchNorm1d(out_dim),
                nn.ReLU()
            )
        self.inlayer = nn.Sequential(
            nn.Linear(in_dim, dim*4*4*4, bias=False),
            # tf 默认为0.3， torch 默认为0.01
            nn.BatchNorm1d(dim*4*4*4),
            nn.ReLU()
            # nn.LeakyReLU(negative_slope=0.2)
        )
        self.midlayer = nn.Sequential(
           dconv_bn_relu(dim*4, dim*2),
        #    dconv_bn_relu(dim*2, dim*2),
           dconv_bn_relu(dim*2, dim),
           dconv_bn_relu(dim, 1)
        )
        self.outlayer = nn.Sequential(
            nn.Linear(128, out_dim, bias=False),
            nn.Tanh()
        )
        self.apply(weights_init)

    def forward(self, x):
        y = self.inlayer(x)
        y = y.view(y.size(0), -1, 16)
        y = self.midlayer(y)
        y = y.squeeze(1)
        y = self.outlayer(y)
        y = y.unsqueeze(1)
        return y

In [10]:
# 判别器
class Discriminator(nn.Module):
    """
    Input shape: (N, 1, in_dim)
    Output shape: (N, )
    """
    def __init__(self, in_dim, dim=256, in_channel=1, channel=8):
        super(Discriminator, self).__init__()
        
        def conv_bn_lrelu(in_channel, out_channel):
            return nn.Sequential(
                nn.Conv1d(in_channel, out_channel, 5, 2, 2),
                nn.BatchNorm1d(out_channel),
                nn.LeakyReLU(0.2),
            )

        self.inlayer = nn.Sequential(
            nn.Linear(in_dim, dim, bias=False),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
        )

        self.ls = nn.Sequential(
            nn.Conv1d(in_channel, channel, 5, 2, 2),
            nn.LeakyReLU(0.2),
            conv_bn_lrelu(channel, channel * 2),
            conv_bn_lrelu(channel * 2, channel * 4),
            conv_bn_lrelu(channel * 4, channel * 8),
            conv_bn_lrelu(channel * 8, channel * 16),
            conv_bn_lrelu(channel * 16, channel * 32),
            nn.Conv1d(channel * 32, 1, 4),
        )
    
    def forward(self, x):
        y = x.squeeze(1)
        y = self.inlayer(y)
        y = y.unsqueeze(1)
        y = self.ls(y)
        y = y.view(-1)
        return y           

In [11]:
class WGAN(LightningModule):
    def __init__(
        self,
        in_dim: int = 16,
        out_dim: int = 128,
        in_channels: int = 1,
        lr: float = 1e-4,
        n_critic: int = 5,
        clip_value: float = 0.01,
        batch_size: int = BATCH_SIZE,
        **kwargs
    ):
        super().__init__()
        self.save_hyperparameters()

        # networks
        self.generator = Generator(in_dim=self.hparams.in_dim, out_dim=self.hparams.out_dim)
        self.discriminator = Discriminator(in_dim = self.hparams.out_dim)
        # self.validation_z = torch.randn(10, self.hparams.in_channels, self.hparams.in_dim)

    def forward(self, z):
        return self.generator(z)

    def adversarial_loss(self, y_hat, y):
        return -torch.mean(self.discriminator(y)) + torch.mean(self.discriminator(y_hat))
    
    def training_step(self, batch, batch_idx, optimizer_idx):

        data = batch
        z = torch.randn(self.hparams.batch_size, self.hparams.in_dim)
        z = z.type_as(data)
        # train generator
        if optimizer_idx == 0 and (batch_idx % self.hparams.n_critic == 0 and batch_idx != 0):
            # print('batch_idx {}, optimizer_idx{}'.format(batch_idx, optimizer_idx))
            # generate data
            self.generated_data = self(z)

            # generator of WGAN loss
            g_loss = -torch.mean(self.discriminator(self(z)))
            self.logger.experiment.add_scalar("g_loss", g_loss.detach(), self.current_epoch)
            tqdm_dict = {"g_loss": g_loss}
            output = OrderedDict({"loss": g_loss, "progress_bar": tqdm_dict, "log": tqdm_dict})
            return output

        # train discriminator
        if optimizer_idx == 1:
            
            # print('batch_idx {}, optimizer_idx{}'.format(batch_idx, optimizer_idx))
            # discriminator of WGAN loss
            d_loss = -torch.mean(self.discriminator(data)) + torch.mean(self.discriminator(self(z)))
            self.logger.experiment.add_scalar("d_loss", d_loss.detach(), self.current_epoch)
            # Clip weights of discriminator
            for p in self.discriminator.parameters():
                p.data.clamp_(-self.hparams.clip_value, self.hparams.clip_value)

            tqdm_dict = {"d_loss": d_loss}
            output = OrderedDict({"loss": d_loss, "progress_bar": tqdm_dict, "log": tqdm_dict})
            return output

    def configure_optimizers(self):
        lr = self.hparams.lr

        opt_g = torch.optim.RMSprop(self.generator.parameters(), lr=lr)
        opt_d = torch.optim.RMSprop(self.discriminator.parameters(), lr=lr)
        
        return [opt_g, opt_d], []

In [12]:
tr_dataloder = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
# in_dim = tr_data.shape[-1]
in_dim = 16
out_dim = tr_data.shape[-1]
wgan = WGAN(in_dim, out_dim)

In [13]:
trainer = Trainer(
    gpus = AVAIL_GPUS,
    max_epochs=100,
    progress_bar_refresh_rate = 20
)
trainer.fit(wgan, tr_dataloder)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type          | Params
------------------------------------------------
0 | generator     | Generator     | 113 K 
1 | discriminator | Discriminator | 271 K 
------------------------------------------------
384 K     Trainable params
0         Non-trainable params
384 K     Total params
1.540     Total estimated model params size (MB)


Epoch 0:   0%|          | 0/240 [00:00<?, ?it/s] 

  rank_zero_deprecation(


Epoch 99: 100%|██████████| 240/240 [00:03<00:00, 74.66it/s, loss=-0.00305, v_num=3]
