In [None]:
# data processing
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# metrics
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score

# network
import torch
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything
# from torchmetrics import F1
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, random_split

import gc

# Request one GPU when CUDA reports at least one device, otherwise zero.
AVAIL_GPUS = 1 if torch.cuda.device_count() >= 1 else 0
# Default mini-batch size (used as the default `batch_size` argument below).
BATCH_SIZE = 128
# Default number of DataLoader worker processes.
NUM_WORKERS = 8

In [None]:
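# shuffle: whether to pool the official training and test sets, reshuffle them,
#          and split them again into a new training set and test set
# ss: scaler used to standardize the features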
def process_data(tr_data, te_data, ss=None, shuffle=False):
    """One-hot encode, label-encode and optionally standardize two raw splits.

    Both frames are concatenated first so that the one-hot columns and the
    label codes are consistent across them, then split apart again.

    Args:
        tr_data: Raw training DataFrame. Its first column and its last two
            columns are excluded from the features, and it must contain an
            ``attack_cat`` column.
        te_data: Raw test DataFrame with the same columns as ``tr_data``.
        ss: Optional scaler exposing ``fit_transform`` (for example a
            ``StandardScaler``). When given, every feature column is scaled;
            the scaler is fitted on the concatenation of both frames.
            ``None`` skips scaling.
        shuffle: When true, the rows of the combined frame are shuffled before
            being re-split, so the two outputs are a fresh random partition
            with the original split sizes.

    Returns:
        A ``(train, test)`` tuple of DataFrames. Each holds the feature
        columns followed by the integer ``cat_code`` label as its last column;
        ``train`` has ``len(tr_data)`` rows and ``test`` has the remainder.
    """
    split_num = len(tr_data)
    data_temp = pd.concat([tr_data, te_data], axis=0)

    # Drop the first column and the two trailing columns, then one-hot encode
    # whatever categorical feature columns remain.
    data = pd.get_dummies(data_temp.iloc[:, 1:-2])
    data['cat_code'] = LabelEncoder().fit_transform(data_temp.loc[:, 'attack_cat'])

    if ss is not None:
        # Scale every column except the trailing `cat_code` label. Whole-column
        # assignment (instead of iloc) lets int/bool columns become float
        # without dtype-incompatibility problems.
        feature_cols = list(data.columns[:-1])
        data[feature_cols] = ss.fit_transform(data[feature_cols].astype(float))

    if shuffle:
        # Shuffle all rows so the re-split below yields a new random partition.
        data = data.sample(frac=1)

    return data.iloc[:split_num, :], data.iloc[split_num:, :]

# Scaler handed to process_data as its `ss` argument.
ss = StandardScaler()

# NOTE(review): the file named "testing-set" is loaded as the training split
# and the file named "training-set" as the test split — presumably intentional;
# confirm against the dataset's documentation.
tr_raw_data, te_raw_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/jsm/code/python/unsupervisedGAN/data/UNSW-NB15/part/UNSW_NB15_testing-set.csv',
        '/home/jsm/code/python/unsupervisedGAN/data/UNSW-NB15/part/UNSW_NB15_training-set.csv',
    )
)

# Encode (and optionally scale) both splits together, then separate them again.
tr_data, te_data = process_data(tr_raw_data, te_raw_data, ss=ss)

In [None]:
class MyDataset(Dataset):
    """In-memory dataset of feature rows with optional integer class labels.

    Args:
        X: Array-like of numeric features whose first axis indexes samples.
            Stored as a ``float32`` tensor.
        y: Array-like of class labels aligned with ``X``, or ``None`` for
            unlabeled data. Stored as an ``int64`` tensor.
    """

    def __init__(self, X, y):
        # np.asarray also converts object-dtype arrays (e.g. DataFrame.values
        # of a frame mixing bool and float columns), which torch.from_numpy
        # would otherwise reject.
        self.data = torch.from_numpy(np.asarray(X, dtype=np.float32))
        if y is not None:
            # The `np.int` alias was removed from NumPy; use an explicit
            # 64-bit integer type, which is what cross-entropy targets need.
            self.label = torch.from_numpy(np.asarray(y).astype(np.int64))
        else:
            self.label = None

    def __getitem__(self, idx):
        """Return ``(features, label)`` when labels exist, else ``features``."""
        if self.label is not None:
            return self.data[idx], self.label[idx]
        return self.data[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

In [None]:
class MyDataModule(LightningDataModule):
    """LightningDataModule serving train/validation/test loaders from two arrays.

    Both arrays are sliced as ``[:, :-1]`` (features) and ``[:, -1]`` (label),
    i.e. the class label must be the last column.

    Args:
        tr_data: Training array; split into train/validation subsets in
            ``setup``.
        te_data: Test array; served unchanged by ``test_dataloader``.
        val_num: Fraction of the training rows held out for validation,
            in ``[0, 1)``.
        batch_size: Batch size used by every DataLoader.
        num_workers: Worker processes used by every DataLoader.

    Raises:
        ValueError: If ``val_num`` is outside ``[0, 1)``.
    """

    def __init__(
        self,
        tr_data,
        te_data,
        val_num: float = 0.1,
        batch_size: int = BATCH_SIZE,
        num_workers: int = NUM_WORKERS,
    ):
        super().__init__()
        if not 0.0 <= val_num < 1.0:
            raise ValueError(f"val_num must be a fraction in [0, 1), got {val_num!r}")
        self.val_num = val_num
        self.batch_size = batch_size
        self.num_workers = num_workers

        # Datasets are built eagerly here (not in prepare_data) because the
        # arrays are already in memory.
        self.tr_data = MyDataset(tr_data[:, :-1], tr_data[:, -1])
        self.test_set = MyDataset(te_data[:, :-1], te_data[:, -1])
        # Filled in by setup(); kept as None until then so setup is idempotent.
        self.train_set = None
        self.val_set = None

    def setup(self, stage=None):
        """Split the training dataset into train and validation subsets.

        The split is performed once; later calls reuse it so the validation
        rows stay fixed for the lifetime of this object.
        """
        if stage in (None, "fit", "validate") and self.train_set is None:
            total_num = len(self.tr_data)
            val_count = int(total_num * self.val_num)
            self.train_set, self.val_set = random_split(
                self.tr_data, [total_num - val_count, val_count]
            )
            # self.tr_data is intentionally kept: both subsets hold a reference
            # to it, so deleting the attribute would free no memory and would
            # only make a repeated setup() call fail.

    def train_dataloader(self):
        """Return the training loader; batches are reshuffled every epoch."""
        return DataLoader(
            self.train_set,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=True,
        )

    def val_dataloader(self):
        """Return the validation loader (fixed order)."""
        return DataLoader(
            self.val_set,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def test_dataloader(self):
        """Return the test loader (fixed order)."""
        return DataLoader(
            self.test_set,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )


In [None]:
class MyLSTM(LightningModule):
    """LSTM classifier for flat feature vectors.

    Each input row is treated as a sequence of length one, passed through a
    (possibly stacked) LSTM, and the LSTM output is mapped to ``num_class``
    raw logits by a dropout + linear head.

    Args:
        num_class: Number of target classes (size of the output layer).
        input_dim: Number of features per sample.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied before the linear head.
        lr: Adam learning rate.
        b1: Adam beta1.
        b2: Adam beta2.
        batch_size: Stored as a hyperparameter only; not used by the network.
        **kwargs: Accepted and stored as hyperparameters; otherwise unused.
    """

    def __init__(
        self,
        num_class,
        input_dim,
        hidden_dim,
        num_layers,
        dropout: float = 0.5,
        lr: float = 0.001,
        b1: float = 0.9,
        b2: float = 0.999,
        batch_size: int = BATCH_SIZE,
        **kwargs
    ):
        super().__init__()
        self.save_hyperparameters()

        # networks
        # batch_first=True because forward() feeds (batch, seq=1, features).
        # Without it the LSTM would read the whole mini-batch as one long
        # sequence and carry hidden state from one sample into the next.
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        # The head emits raw logits: F.cross_entropy applies log-softmax itself,
        # so no Sigmoid/Softmax layer belongs here.
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_class),
        )

    def forward(self, inputs):
        """Return class logits of shape ``(batch, num_class)``.

        Args:
            inputs: Float tensor of shape ``(batch, input_dim)``.
        """
        # (batch, input_dim) -> (batch, 1, input_dim): one time step per sample.
        inputs = torch.unsqueeze(inputs, 1)
        # Passing None lets the LSTM start from zero hidden/cell states.
        x, _ = self.lstm(inputs, None)
        # Keep the output of the last (only) time step.
        x = x[:, -1, :]
        return self.classifier(x)

    def evaluation(self, y_pred, y_true):
        """Compute accuracy and macro-averaged precision/recall/F1.

        Args:
            y_pred: Predicted class indices.
            y_true: Ground-truth class indices.

        Returns:
            Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``.
        """
        # TODO: distinguish which stage (e.g. validation vs. test) the model is
        # in when this is called.
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='macro'),
            'recall': recall_score(y_true, y_pred, average='macro'),
            'f1_score': f1_score(y_true, y_pred, average='macro'),
        }

    def _eval_step(self, batch, with_loss):
        """Shared validation/test logic: predict, score and log one batch."""
        x, y = batch
        y_hat = self(x)
        y_true = y.detach().cpu().numpy()
        y_pred = torch.argmax(y_hat, dim=1).cpu().numpy()
        # Keyword arguments guard against swapping predictions and targets,
        # which would silently exchange precision and recall.
        metrics = self.evaluation(y_pred=y_pred, y_true=y_true)
        if with_loss:
            metrics['loss'] = F.cross_entropy(y_hat, y)
        self.log_dict(metrics, prog_bar=True, on_epoch=True)
        return metrics

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss for one training batch."""
        x, y = batch
        y_hat = self(x)
        return F.cross_entropy(y_hat, y)

    def validation_step(self, batch, batch_idx):
        """Log and return the batch metrics plus the cross-entropy ``loss``."""
        return self._eval_step(batch, with_loss=True)

    def test_step(self, batch, batch_idx):
        """Log and return the batch metrics (no loss) for one test batch."""
        return self._eval_step(batch, with_loss=False)

    def configure_optimizers(self):
        """Return an Adam optimizer built from the stored hyperparameters."""
        return torch.optim.Adam(
            self.parameters(),
            lr=self.hparams.lr,
            betas=(self.hparams.b1, self.hparams.b2),
        )

In [None]:
# Seed the RNGs before any random split or weight initialisation happens.
seed_everything(42)

# The data module expects plain arrays whose last column is the label.
dm = MyDataModule(tr_data=tr_data.values, te_data=te_data.values)

# NOTE(review): input_dim must equal the number of feature columns produced by
# process_data, and num_class the number of distinct attack_cat values —
# both are hard-coded here; confirm against the data.
model = MyLSTM(num_class=10, input_dim=196, hidden_dim=128, num_layers=2)

trainer = Trainer(
    gpus=AVAIL_GPUS,
    max_epochs=5,
    progress_bar_refresh_rate=20,
)
trainer.fit(model, dm)
trainer.test(model, dm)