In [114]:
# make simple pytorch neural network
import torch
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.optim import Adam
#import resample
from sklearn.utils import resample
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report
from tqdm import tqdm, trange
import numpy as np
import torchmetrics 
import enlighten
import pickle


In [43]:
#load data
data = np.load('data/processed/concatenated_features.npy')

SEED = 1706
LEARNING_RATE = 0.001
EPOCHS = 50
BATCH_SIZE = 64

In [44]:
# split into training and test set
x_train, x_test, y_train, y_test = train_test_split(data[:, :-1], data[:, -1], test_size=0.2, random_state=SEED, shuffle=True)

# upsample minority class
x_minority = x_train[y_train == 0]
y_minority = y_train[y_train == 0]
x_majority = x_train[y_train == 1]
y_majority = y_train[y_train == 1]

x_majority_subsample, y_majority_subsample = resample(x_majority, y_majority, replace=False, n_samples=x_minority.shape[0], random_state=SEED)

x_train_balanced = np.concatenate((x_majority_subsample, x_minority))
y_train_balanced = np.concatenate((y_majority_subsample, y_minority))


In [45]:
print(torch.__version__)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')


if torch.cuda.is_available():
    !nvidia-smi

1.10.1+cu113
Using cuda device
Fri Dec 24 01:52:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   27C    P0    56W / 400W |   3760MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:0F:00.0 Off |                    0 |
| N/A   20C    P0    50W / 400W |      3MiB / 40536MiB | 

In [46]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = TrainData(torch.FloatTensor(x_train_balanced), 
                       torch.FloatTensor(y_train_balanced))
## test data    
class TestData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = TestData(torch.FloatTensor(x_test))

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=1)

In [47]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        
        self.linear_1 = nn.Linear(5595, 10024)
        self.linear_2 = nn.Linear(10024, 1024)
        self.linear_3 = nn.Linear(1024, 256)
        self.output_layer = nn.Linear(256, 1)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)
        self.batch_norm1 = nn.BatchNorm1d(10024)
        self.batch_norm2 = nn.BatchNorm1d(1024)
        self.batch_norm3 = nn.BatchNorm1d(256)

    def forward(self, x):
        x = self.linear_1(x)
        x = self.relu(x)
        x = self.batch_norm1(x)
        x = self.dropout(x)
        x = self.linear_2(x)
        x = self.relu(x)
        x = self.batch_norm2(x)
        x = self.dropout(x)
        x = self.linear_3(x)
        x = self.relu(x)
        x = self.batch_norm3(x)
        x = self.dropout(x)
        x = self.output_layer(x)
        
        return x
    

In [48]:
model = NeuralNetwork()
model.to(device)
print(model)

NeuralNetwork(
  (linear_1): Linear(in_features=5595, out_features=10024, bias=True)
  (linear_2): Linear(in_features=10024, out_features=1024, bias=True)
  (linear_3): Linear(in_features=1024, out_features=256, bias=True)
  (output_layer): Linear(in_features=256, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (batch_norm1): BatchNorm1d(10024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [49]:
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

In [55]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [66]:
manager = enlighten.get_manager()
epoch_tick = manager.counter(total=EPOCHS, desc='Epochs', unit='epochs')
batch_tick = manager.counter(total=int(x_train_balanced.__len__()/BATCH_SIZE), desc='batch', unit='batches')

_acc = torchmetrics.Accuracy().to(device)
_f1 = torchmetrics.F1().to(device)
_prcu = torchmetrics.PrecisionRecallCurve().to(device)
_av = torchmetrics.AveragePrecision().to(device)
_pre = torchmetrics.Precision().to(device)
_recall = torchmetrics.Recall().to(device)

model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_dataloader:

        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        
        y_pred = model(X_batch)
        
        acc2 = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = _acc(y_pred, y_batch.int().unsqueeze(1))
        f1 = _f1(y_pred, y_batch.int().unsqueeze(1))
        prcu = _prcu(y_pred, y_batch.int().unsqueeze(1))
        av = _av(y_pred, y_batch.int().unsqueeze(1))
        pre = _pre(y_pred, y_batch.int().unsqueeze(1))
        recall = _recall(y_pred, y_batch.int().unsqueeze(1))
        
        
        val_loss = criterion
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc2.item()
        batch_tick.update()
    epoch_tick.update()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_dataloader):.5f} | Acc: {_acc.compute()*100:.3f} | Acc_2: {epoch_acc/len(train_dataloader):.3f}')
    
manager.stop()


print(_acc.compute(), _f1.compute(), _prcu.compute(), _av.compute(), _pre.compute(), _recall.compute())

Epoch 001: | Loss: 0.02862 | Acc: 98.989 | Acc_2: 98.851
Epoch 002: | Loss: 0.02195 | Acc: 99.086 | Acc_2: 99.098
Epoch 003: | Loss: 0.03080 | Acc: 98.986 | Acc_2: 98.644
Epoch 004: | Loss: 0.03707 | Acc: 98.896 | Acc_2: 98.577
Epoch 005: | Loss: 0.02809 | Acc: 98.911 | Acc_2: 98.943
Epoch 006: | Loss: 0.03273 | Acc: 98.915 | Acc_2: 98.686
Epoch 007: | Loss: 0.04067 | Acc: 98.876 | Acc_2: 98.443
Epoch 008: | Loss: 0.03319 | Acc: 98.872 | Acc_2: 98.732
Epoch 009: | Loss: 0.03063 | Acc: 98.866 | Acc_2: 98.722
Epoch 010: | Loss: 0.02228 | Acc: 98.903 | Acc_2: 99.108
Epoch 011: | Loss: 0.02074 | Acc: 98.938 | Acc_2: 99.201
Epoch 012: | Loss: 0.02542 | Acc: 98.960 | Acc_2: 99.026
Epoch 013: | Loss: 0.02665 | Acc: 98.965 | Acc_2: 98.985
Epoch 014: | Loss: 0.02956 | Acc: 98.980 | Acc_2: 98.974
Epoch 015: | Loss: 0.02643 | Acc: 98.985 | Acc_2: 98.974
Epoch 016: | Loss: 0.02895 | Acc: 98.985 | Acc_2: 98.881
Epoch 017: | Loss: 0.02393 | Acc: 98.998 | Acc_2: 99.000
Epoch 018: | Loss: 0.02799 | Ac

In [73]:
print(f"Accuracy: {_acc.compute()*100:.3f} | F1: {_f1.compute()*100:.3f}  | Average Precision: {_av.compute()*100:.3f} | Precision: {_pre.compute()*100:.3f} | Recall: {_recall.compute()*100:.3f}")

Accuracy: 99.121 | F1: 99.119  | Average Precision: 99.958 | Precision: 99.377 | Recall: 98.862


In [85]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_dataloader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

        
        y_pred_list = [a.squeeze() for a in y_pred_list]

In [87]:
confusion_matrix(y_test, y_pred_list)

array([[1510,   30],
       [ 455, 7660]])

In [88]:
print(classification_report(y_test, y_pred_list))

              precision    recall  f1-score   support

         0.0       0.77      0.98      0.86      1540
         1.0       1.00      0.94      0.97      8115

    accuracy                           0.95      9655
   macro avg       0.88      0.96      0.92      9655
weighted avg       0.96      0.95      0.95      9655



## Shuffle a column at a time

In [139]:
# same_characteristic
same_characteristic = []
for j in range(373):
    same_characteristic.append([i*373+j for i in range(15)])
    
def iter_list(l):
    for i in l:
        yield i


In [None]:
EPOCHS_shuffle = 15
BATCH_SIZE = 256

manager = enlighten.get_manager()
row_tick = manager.counter(total=373, desc='Shuffle', unit='Shuffles')

group_dict = {}


# copy x_train_balanced
   
for m, i in enumerate(iter_list(same_characteristic)):
    x_train_shuffle = x_train_balanced.copy()
    for j in i:
        np.random.shuffle(x_train_shuffle[:, i])

    for layer in model.children():
        if hasattr(layer, 'reset_parameters'):
            layer.reset_parameters()
    
    epoch_tick = manager.counter(total=EPOCHS_shuffle, desc='Epoch', unit='Epochs', leave=False)
    
    model.train()
    for e in range(1, EPOCHS_shuffle+1):
        epoch_loss = 0
        epoch_acc = 0
        batch_tick = manager.counter(total=len(train_dataloader), desc='Batch', unit='Batch', leave=False)
        for X_batch, y_batch in train_dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()

            y_pred = model(X_batch)

            loss = criterion(y_pred, y_batch.unsqueeze(1))
            acc = binary_acc(y_pred, y_batch.unsqueeze(1))

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            batch_tick.update()
        batch_tick.close()
        epoch_tick.update()
    epoch_tick.close()
    row_tick.update()
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_dataloader):.5f} | Acc: {epoch_acc/len(train_dataloader):.3f}')
    
    y_pred_list = []
    model.eval()
    with torch.no_grad():
        for X_batch in test_dataloader:
            X_batch = X_batch.to(device)
            y_test_pred = model(X_batch)
            y_test_pred = torch.sigmoid(y_test_pred)
            y_pred_tag = torch.round(y_test_pred)
            y_pred_list.append(y_pred_tag)
        
        y_pred_list = torch.concat(y_pred_list)
            
        acc = torchmetrics.functional.accuracy(y_pred_list.cpu(), torch.Tensor(y_test).int().cpu())
        f1 = torchmetrics.functional.f1(y_pred_list.cpu(), torch.Tensor(y_test).int().cpu())
        pre = torchmetrics.functional.precision(y_pred_list.cpu(), torch.Tensor(y_test).int().cpu())
        recall = torchmetrics.functional.recall(y_pred_list.cpu(), torch.Tensor(y_test).int().cpu())
        prcu = torchmetrics.functional.precision_recall_curve(y_pred_list.cpu(), torch.Tensor(y_test).int().cpu(), num_classes=1)
        
        single_dict = {f'id_{m}': [{"Accuracy": float(acc), "F1": float(f1), "Precision": float(pre), "Recall": float(recall), "Precision-Recall Curve": prcu}]}
        print(single_dict)
        
        group_dict.update(single_dict)
        pickle.dump(group_dict, open(f"models/{m}_shuffle_results.pkl", "wb"))
        
manager.stop()


Epoch 015: | Loss: 0.11266 | Acc: 95.531
{'id_0': [{'Accuracy': 0.9386846423149109, 'F1': 0.9627673029899597, 'Precision': 0.9831727743148804, 'Recall': 0.943191647529602, 'Precision-Recall Curve': (tensor([0.8405, 0.9832, 1.0000]), tensor([1.0000, 0.9432, 0.0000]), tensor([0., 1.]))}]}
Epoch 015: | Loss: 0.11192 | Acc: 95.531
{'id_1': [{'Accuracy': 0.9336095452308655, 'F1': 0.9592421054840088, 'Precision': 0.9909353852272034, 'Recall': 0.9295132756233215, 'Precision-Recall Curve': (tensor([0.8405, 0.9909, 1.0000]), tensor([1.0000, 0.9295, 0.0000]), tensor([0., 1.]))}]}
Epoch 015: | Loss: 0.10932 | Acc: 95.959
{'id_2': [{'Accuracy': 0.9287415742874146, 'F1': 0.9562229514122009, 'Precision': 0.9885541200637817, 'Recall': 0.9259396195411682, 'Precision-Recall Curve': (tensor([0.8405, 0.9886, 1.0000]), tensor([1.0000, 0.9259, 0.0000]), tensor([0., 1.]))}]}
Epoch 015: | Loss: 0.12515 | Acc: 94.980
{'id_3': [{'Accuracy': 0.9260486960411072, 'F1': 0.954591691493988, 'Precision': 0.9863319993

In [108]:
torch.Tensor(y_test).int()

tensor([0, 1, 1,  ..., 1, 1, 0], dtype=torch.int32)

In [128]:
import pandas as pd

df = pd.DataFrame.from_dict(single_dict[list(single_dict.keys())[0]])
df.index =list(single_dict.keys())

In [129]:
df

Unnamed: 0,Accuracy,F1,Precision,Recall,Precision-Recall Curve
id_2,tensor(0.9243),tensor(0.9534),tensor(0.9886),tensor(0.9205),"([tensor(0.8405), tensor(0.9886), tensor(1.)],..."


In [135]:
float(acc)

91.0