# Applying TabTranformers to OS fingerprinting task using nmap dataset

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import gc

from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

import absl.logging
import warnings
import logging

import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer
from torchmetrics import AUROC
from tqdm import tqdm
import torch.optim as optim

###############################

logging.captureWarnings(True)
warnings.filterwarnings('ignore')
absl.logging.set_verbosity(absl.logging.ERROR)

seed = 2024
np.random.seed(seed)

df = pd.read_csv("../dataset/dataset_no_encoded_4397.csv")

df.pop('Class.vendor_0')
df.pop('Class.OSgen_0')
df.pop('Class.device_0')
df.reset_index(drop=True, inplace=True)

OutVar = list(df.columns)[0]

df.drop_duplicates(keep=False, inplace=True)

# df.replace(['BSD', 'iOS', 'macOS', 'Solaris', 'Android'], 'Other', inplace=True)
df = df[~df.isin(['BSD', 'iOS', 'macOS', 'Solaris', 'Android']).any(axis=1)]
df.reset_index(drop=True, inplace=True)

###############################

LABEL = OutVar

NUMERIC_FEATURES = df.select_dtypes(include=['int64']).columns.tolist()
CATEGORICAL_FEATURES = df.select_dtypes(include=['object']).columns.tolist()
CATEGORICAL_FEATURES.remove(LABEL)

FEATURES = list(NUMERIC_FEATURES) + list(CATEGORICAL_FEATURES)

train_data, test_data = train_test_split(df, stratify=df[LABEL], test_size=0.20, random_state=seed)

imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
numeric_pipe = Pipeline([
    ('impute', imputer),
    ('scale', scaler),
])
numeric_pipe.fit(train_data[NUMERIC_FEATURES])
train_data[NUMERIC_FEATURES] = numeric_pipe.transform(train_data[NUMERIC_FEATURES])
test_data[NUMERIC_FEATURES] = numeric_pipe.transform(test_data[NUMERIC_FEATURES])


ordinal_encoder = OrdinalEncoder()
categorical_pipe = Pipeline([
    ('ordinalencoder', ordinal_encoder),
])
categorical_pipe.fit(df[CATEGORICAL_FEATURES])
train_data[CATEGORICAL_FEATURES] = categorical_pipe.transform(train_data[CATEGORICAL_FEATURES])
test_data[CATEGORICAL_FEATURES] = categorical_pipe.transform(test_data[CATEGORICAL_FEATURES])


label_encoder = LabelEncoder()
label_pipe = Pipeline([
    ('labelencoder', ordinal_encoder),
])
label_pipe.fit(df[LABEL].values.reshape(-1, 1))
train_data[LABEL] = label_pipe.transform(train_data[LABEL].values.reshape(-1, 1))
test_data[LABEL] = label_pipe.transform(test_data[LABEL].values.reshape(-1, 1))


train_tensor_X_cat = torch.tensor(train_data[CATEGORICAL_FEATURES].values).int()
train_tensor_X_num = torch.tensor(train_data[NUMERIC_FEATURES].values).float()
train_tensor_Y = torch.tensor(train_data[LABEL].values).view(-1, 1).float()

test_tensor_X_cat = torch.tensor(test_data[CATEGORICAL_FEATURES].values).int()
test_tensor_X_num = torch.tensor(test_data[NUMERIC_FEATURES].values).float()
test_tensor_Y = torch.tensor(test_data[LABEL].values).view(-1, 1).float()

In [1]:
import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer

cont_mean_std = torch.randn(10, 2)

model = TabTransformer(
    categories = (10, 5, 6, 5, 8),      # tuple containing the number of unique values within each category
    num_continuous = 10,                # number of continuous values
    dim = 32,                           # dimension, paper set at 32
    dim_out = 1,                        # binary prediction, but could be anything
    depth = 6,                          # depth, paper recommended 6
    heads = 8,                          # heads, paper recommends 8
    attn_dropout = 0.1,                 # post-attention dropout
    ff_dropout = 0.1,                   # feed forward dropout
    mlp_hidden_mults = (4, 2),          # relative multiples of each hidden dimension of the last mlp to logits
    mlp_act = nn.ReLU(),                # activation for final mlp, defaults to relu, but could be anything else (selu etc)
    continuous_mean_std = cont_mean_std # (optional) - normalize the continuous values before layer norm
)

x_categ = torch.randint(0, 5, (1, 5))     # category values, from 0 - max number of categories, in the order as passed into the constructor above
x_cont = torch.randn(1, 10)               # assume continuous values are already normalized individually

pred = model(x_categ, x_cont) # (1, 1)

In [4]:
import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer

# Random categorical and numerical features
x_categ = torch.randint(0, 5, (1, 5))
x_cont = torch.randn(1, 10)

# Random predictions
target = torch.randn(1, 1)

# Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

for epoch in range(10):
    optimizer.zero_grad()
    pred = model(x_categ, x_cont)
    loss = criterion(pred, target)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}: Loss = {loss.item()}")

# Updated prediction after training
updated_pred = model(x_categ, x_cont)
print("Updated Prediction:", updated_pred)


Epoch 1: Loss = 0.47005534172058105
Epoch 2: Loss = 0.16299572587013245
Epoch 3: Loss = 0.0054003456607460976
Epoch 4: Loss = 0.17402833700180054
Epoch 5: Loss = 0.013747833669185638
Epoch 6: Loss = 0.06410405039787292
Epoch 7: Loss = 0.07166294753551483
Epoch 8: Loss = 0.007313206326216459
Epoch 9: Loss = 0.014909959398210049
Epoch 10: Loss = 0.03474622592329979
Updated Prediction: tensor([[0.4847]], grad_fn=<AddmmBackward0>)
