In [1]:
import numpy as np
import pandas as pd
import torch
import random

from tqdm import tqdm
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
batch_size = 8
learning_rate = 1e-3
n_epochs = 50

random_seed = int(random.random() * 100)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
df = pd.read_csv('phishing.csv').to_numpy()
x = df[:, :-1]
y = df[:, -1]

x = StandardScaler().fit_transform(x)
y = LabelEncoder().fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=random_seed)

x_train = torch.from_numpy(x_train.astype(np.float32)).to(device)
x_test = torch.from_numpy(x_test.astype(np.float32)).to(device)
y_train = torch.from_numpy(y_train.astype(np.int64)).to(device)
y_test = torch.from_numpy(y_test.astype(np.int64))

train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)

In [4]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()        
        self.flatten = nn.Flatten()
        self.network = nn.Sequential(
            nn.Linear(x_train.shape[1], 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 8),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(8, 2),
            nn.Softmax(dim=1)
        )
        
    def forward(self, x):
        x = self.flatten(x)
        pred = self.network(x)
        
        return pred

In [5]:
def train_loop(n_epochs, model, loss_fn, optimizer, scheduler):
    model.train()
    for epoch in tqdm(range(n_epochs)):
        for xb, yb in train_dl:
            optimizer.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            optimizer.step()
            scheduler.step()
        
        if epoch % (n_epochs / 10) == 0:
            print('loss:', loss.item())
            print('lr:', scheduler.get_last_lr()[0])

In [6]:
model = Net().to(device)

optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = lr_scheduler.OneCycleLR(optimizer=optimizer, max_lr=learning_rate, epochs=n_epochs, steps_per_epoch=len(train_dl))
#scheduler = lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=1)
loss_fn = nn.CrossEntropyLoss()

train_loop(n_epochs, model, loss_fn, optimizer, scheduler)

  2%|▏         | 1/50 [00:01<01:07,  1.38s/it]

loss: 0.7207350730895996
lr: 5.049041165040391e-05


 12%|█▏        | 6/50 [00:08<00:58,  1.34s/it]

loss: 0.48167991638183594
lr: 0.0003717064240994401


 22%|██▏       | 11/50 [00:15<00:55,  1.43s/it]

loss: 0.31326204538345337
lr: 0.000841232226789564


 32%|███▏      | 16/50 [00:21<00:43,  1.29s/it]

loss: 0.558046817779541
lr: 0.0009979835159889025


 42%|████▏     | 21/50 [00:27<00:35,  1.22s/it]

loss: 0.31337976455688477
lr: 0.0009292038658554173


 52%|█████▏    | 26/50 [00:33<00:29,  1.24s/it]

loss: 0.3136102557182312
lr: 0.0007754155223738199


 62%|██████▏   | 31/50 [00:40<00:24,  1.31s/it]

loss: 0.3132617771625519
lr: 0.0005670781530608991


 72%|███████▏  | 36/50 [00:47<00:18,  1.35s/it]

loss: 0.3528635501861572
lr: 0.00034545552899923536


 82%|████████▏ | 41/50 [00:53<00:11,  1.30s/it]

loss: 0.3140758275985718
lr: 0.0001544427289073549


 92%|█████████▏| 46/50 [01:00<00:05,  1.28s/it]

loss: 0.3132617473602295
lr: 3.187218046171348e-05


100%|██████████| 50/50 [01:05<00:00,  1.32s/it]


In [7]:
model.eval()
with torch.no_grad():
    y_pred = model(x_test).argmax(dim=1).cpu()
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, average='weighted'))
    print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, average='weighted'))
    print('F1-measure: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))
    print('Confusion Matrix:')
    print(confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy: 0.96
Precision: 0.960
Recall: 0.960
F1-measure: 0.960
Confusion Matrix:
[[ 943   39]
 [  49 1180]]
