# Applying TabTransformer to the OS fingerprinting task using the nmap dataset

### Importing Python dependencies

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Seed NumPy's global RNG; the same value is reused later as `random_state`
# for the train/test split.
seed = 2024
np.random.seed(seed)

### Read dataset from disk

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../dataset/dataset_no_encoded_4397.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Frequency of each value in the vendor class column.
df["Class.vendor_0"].value_counts()

In [None]:
# Frequency of each value in the OS family class column.
df["Class.OSfamily_0"].value_counts()

In [None]:
# Frequency of each value in the OS generation class column.
df["Class.OSgen_0"].value_counts()

In [None]:
# Number of rows for every (OS family, OS generation) combination present in df.
pair_counts = (
    df.groupby(['Class.OSfamily_0', 'Class.OSgen_0'])
    .size()
    .rename('Count')
    .reset_index()
)
print(pair_counts)

In [None]:
# Frequency of each value in the device class column.
df["Class.device_0"].value_counts()

In [None]:
# Number of rows for every (OS family, OS generation, device) combination present in df.
pair_counts = (
    df.groupby(['Class.OSfamily_0', 'Class.OSgen_0', "Class.device_0"])
    .size()
    .rename('Count')
    .reset_index()
)
print(pair_counts)

In [None]:
# Number of rows for every (OS family, device) combination present in df.
pair_counts = (
    df.groupby(['Class.OSfamily_0', "Class.device_0"])
    .size()
    .rename('Count')
    .reset_index()
)
print(pair_counts)

In [None]:
# Drop the vendor, OS-generation and device class columns in place.
df.drop(columns=['Class.vendor_0', 'Class.OSgen_0', 'Class.device_0'], inplace=True)

df.reset_index(drop=True, inplace=True)

# header = names of columns
print(df.columns.tolist())

In [None]:
# Number of feature columns: every column except the single output column.
print("Nº features=", df.shape[1] - 1)

In [None]:
# The output (target) column is taken to be the first column of df.
OutVar = df.columns[0]
print("Output=", OutVar)

### Checking data

In [None]:
def DataCheckings(df):
    """Print a quick sanity report for ``df``.

    Reports the number of rows and columns, the distinct dtypes, summary
    statistics, the column names, the object-typed (categorical) columns and
    how many columns / rows contain missing values.

    Args:
        df: pandas DataFrame to inspect. It is not modified.

    Returns:
        None. Everything is written to stdout.
    """
    # Check the number of data points in the data set
    print("\nData points =", len(df))

    # Check the number of columns in the data set
    print("\nColumns (output + features)=", len(df.columns))

    # Check the data types
    print("\nData types =", df.dtypes.unique())

    # Dataset statistics. describe() only builds the summary table; inside a
    # function its value is not displayed, so it has to be printed explicitly.
    print('\n')
    print(df.describe())

    # print names of columns
    print('Column Names:\n', df.columns)

    # see if there are categorical data
    print("\nCategorical features:", df.select_dtypes(include=['O']).columns.tolist())

    # Check NA values
    # Check any number of columns with NaN
    print("\nColumns with NaN: ", df.isnull().any().sum(), ' / ', len(df.columns))

    # Check any number of data points with NaN
    print("\nNo of data points with NaN:", df.isnull().any(axis=1).sum(), ' / ', len(df))

In [None]:
# Print the sanity report for the current frame.
DataCheckings(df)

In [None]:
# Row/column count before de-duplication, for comparison with the shape printed afterwards.
print('Shape before removing duplicates=', df.shape)

In [None]:
# remove duplicates!
# NOTE(review): keep=False discards every row that has a duplicate, including
# its first occurrence. If the intent is to keep one copy of each duplicated
# row, the default keep='first' is needed — confirm which is wanted.
df.drop_duplicates(keep=False, inplace=True)

In [None]:
# Row/column count after de-duplication.
print('Shape after removing duplicates=', df.shape)

### Remove near zero variance features

In [None]:
# from sklearn.utils import class_weight
# from sklearn.feature_selection import VarianceThreshold

In [None]:
# def getDataFromDataFrame(df, OutVar):
#     # get X, Y data and column names from df
#     print('\n-> Get X & Y data, Features list')
#     print('Shape', df.shape)
    
#     # select X and Y
#     ds_y = df[OutVar]
#     ds_X = df.drop(OutVar,axis = 1)
#     Xdata = ds_X.values # get values of features
#     Ydata = ds_y.values # get output values

#     print('Shape X data:', Xdata.shape)
#     print('Shape Y data:', Ydata.shape)
    
#     # return data for X and Y, feature names as list
#     print('Done!')
#     return (Xdata, Ydata, list(ds_X.columns))

# def Remove0VarCols(df, OutVar):
#     Xdata, Ydata, Features = getDataFromDataFrame(df,OutVar=OutVar)# out var = Class 
#     print('\n-> Remove zero variance features')
#     # print('Initial features:', Features)
#     selector= VarianceThreshold()
#     Xdata = selector.fit_transform(Xdata)
#     # Selected features
#     SelFeatures = []
#     for i in selector.get_support(indices=True):
#         SelFeatures.append(Features[i])
#     print('Removed features:',list(set(Features) - set(SelFeatures)))
    
#     # create the resulted dataframe
#     df = pd.DataFrame(Xdata,columns=SelFeatures)
#     df[OutVar] = Ydata # add class column
#     # print('Final columns:', list(df.columns))
#     print('Done!')
#     return df

In [None]:
# df = Remove0VarCols(df, OutVar)

In [None]:
# # print dimension AFTER removing features
# print("Dataset dimension AFTER removing near zero variance features=",df.shape)

In [None]:
# df.columns

### Verify the class balance

In [None]:
# Class distribution of the output column before merging classes.
df[OutVar].value_counts()

In [None]:
# Merge the BSD, iOS, macOS, Solaris and Android labels into a single 'Other'
# class. The replacement is restricted to the output column so that identical
# strings occurring in any feature column are left untouched.
df[OutVar] = df[OutVar].replace(['BSD', 'iOS', 'macOS', 'Solaris', 'Android'], 'Other')
df.reset_index(drop=True, inplace=True)

In [None]:
# Class distribution of the output column after merging classes into 'Other'.
df[OutVar].value_counts()

### TabTransformers

#### Libraries Import

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import gc

from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

%matplotlib inline

In [None]:
import absl.logging
import warnings
import logging

import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer

# Redirect warnings to the logging system, ignore all Python warnings and
# limit absl logging to errors. This hides every warning emitted by later cells.
logging.captureWarnings(True)
warnings.filterwarnings('ignore')
absl.logging.set_verbosity(absl.logging.ERROR)

#### Preprocessing

In [None]:
# Target column and the two feature groups fed to the model.
LABEL = OutVar

# NOTE(review): only int64 columns are treated as numeric; columns of any other
# numeric dtype (e.g. float64) end up in neither list — confirm this is intended.
NUMERIC_FEATURES = list(df.select_dtypes(include=['int64']).columns)

# Object-typed columns are the categorical features, minus the target itself.
CATEGORICAL_FEATURES = list(df.select_dtypes(include=['object']).columns)
CATEGORICAL_FEATURES.remove(LABEL)

FEATURES = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]

In [None]:
# Sizes of the numeric, categorical and combined feature lists.
print(len(NUMERIC_FEATURES), len(CATEGORICAL_FEATURES), len(FEATURES))

In [None]:
# 80/20 split, stratified on the label and seeded with the same value used for NumPy.
train_data, test_data = train_test_split(
    df,
    test_size=0.20,
    stratify=df[LABEL],
    random_state=seed,
)

In [None]:
# Shapes of the two splits.
train_data.shape, test_data.shape

#### Numeric Cleaning

In [None]:
# Numeric preprocessing: median imputation followed by standardisation.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
numeric_pipe = Pipeline(steps=[('impute', imputer), ('scale', scaler)])

# Statistics are learned from the training split only.
numeric_pipe.fit(train_data[NUMERIC_FEATURES])

In [None]:
# Apply the fitted numeric pipeline to both splits, overwriting the raw values.
for frame in (train_data, test_data):
    frame[NUMERIC_FEATURES] = numeric_pipe.transform(frame[NUMERIC_FEATURES])

#### Categorical Encoding

In [None]:
# Categorical preprocessing: map every category to an integer code.
ordinal_encoder = OrdinalEncoder()
categorical_pipe = Pipeline(steps=[('ordinalencoder', ordinal_encoder)])

# Fitted on the full frame (train and test rows), not on the training split alone.
categorical_pipe.fit(df[CATEGORICAL_FEATURES])

In [None]:
# Replace the categorical values of both splits by their ordinal codes.
for frame in (train_data, test_data):
    frame[CATEGORICAL_FEATURES] = categorical_pipe.transform(frame[CATEGORICAL_FEATURES])

#### Label Encoding

In [None]:
# Kept so that the name stays defined; it is not part of the pipeline below.
label_encoder = LabelEncoder()

# Dedicated encoder instance for the target column. Reusing `ordinal_encoder`
# here would re-fit the very object wrapped by `categorical_pipe`, replacing the
# categories it learned from the feature columns with the label classes and
# making `categorical_pipe` unusable for any later transform.
label_ordinal_encoder = OrdinalEncoder()

label_pipe = Pipeline([
    ('labelencoder', label_ordinal_encoder),
])

# OrdinalEncoder expects 2-D input, hence the reshape to a single column.
label_pipe.fit(df[LABEL].values.reshape(-1, 1))

In [None]:
# Replace the label strings of both splits by their encoded class codes.
for frame in (train_data, test_data):
    frame[LABEL] = label_pipe.transform(frame[LABEL].values.reshape(-1, 1))

#### To Tensors

In [None]:
def _frame_to_tensors(frame):
    """Return (categorical long, numeric float, label long) tensors for ``frame``."""
    cat_part = torch.tensor(frame[CATEGORICAL_FEATURES].values).long()
    num_part = torch.tensor(frame[NUMERIC_FEATURES].values).float()
    target = torch.tensor(frame[LABEL].values).long()
    return cat_part, num_part, target


train_tensor_X_cat, train_tensor_X_num, train_tensor_Y = _frame_to_tensors(train_data)
test_tensor_X_cat, test_tensor_X_num, test_tensor_Y = _frame_to_tensors(test_data)

In [None]:
# Shapes of a slice of up to 100 training rows, one entry per tensor.
print(train_tensor_X_cat[0:100,:].shape, train_tensor_X_num[0:100,:].shape, train_tensor_Y[0:100].shape)

#### Hyperparameter Tuning

In [None]:
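# NOTE: disabled. This tuning code is written against a Keras-style TabTransformer
# (compile/fit, tf.keras losses and metrics, build_categorical_prep), not the
# tab_transformer_pytorch model trained below; it must be ported before re-enabling.
# def objective(trial):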
#     embedding_dim = trial.suggest_categorical('embedding_dim',[8, 16, 32, 64])
#     depth = trial.suggest_int('depth',1,6,1)
#     heads = trial.suggest_int('heads',2,8,1)
#     attn_dropout = trial.suggest_float("attn_dropout", 0.05, 0.5)
#     ff_dropout = trial.suggest_float("ff_dropout", 0.05, 0.5)
#     mlp_hidden_factor1 = trial.suggest_int("mlp_hidden_factor1", 1, 3, 0.5)
#     mlp_hidden_factor2 = trial.suggest_int("mlp_hidden_factor2", 1, 3, 0.5)
#     use_column_embedding = trial.suggest_categorical('use_column_embedding', [True, False])
    
#     category_prep_layers = build_categorical_prep(train_data, CATEGORICAL_FEATURES)
    
#     tabtransformer = TabTransformer(
#         numerical_features = NUMERIC_FEATURES,
#         categorical_features = CATEGORICAL_FEATURES,
#         categorical_lookup=category_prep_layers,
#         numerical_discretisers=None, # simply passing the numeric features
#         embedding_dim=embedding_dim,
#         out_dim=1,
#         out_activation='sigmoid',
#         depth=depth,
#         heads=heads,
#         attn_dropout=attn_dropout,
#         ff_dropout=ff_dropout,
#         mlp_hidden_factors=[mlp_hidden_factor1, mlp_hidden_factor2],
#         use_column_embedding=use_column_embedding,
#     )
    
#     LEARNING_RATE = 0.001
#     WEIGHT_DECAY = 0.0001
#     NUM_EPOCHS = 1000

#     optimizer = AdamW(
#             learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
#         )

#     tabtransformer.compile(
#         optimizer = optimizer,
#         loss = tf.keras.losses.BinaryCrossentropy(),
#         metrics= [tf.keras.metrics.AUC(name="AUC", curve='ROC')],
#     )
    
#     early = EarlyStopping(monitor="val_loss", mode="min", patience=20, restore_best_weights=True)
#     callback_list = [early]

#     history = tabtransformer.fit(
#         train_dataset, 
#         epochs=NUM_EPOCHS, 
#         validation_data=test_dataset,
#         callbacks=callback_list,
#         verbose=0
#     )
    
#     val_preds = tabtransformer.predict(test_dataset)
#     roc = roc_auc_score(test_dataset[LABEL], val_preds.ravel())
    
#     gc.collect()
    
#     return roc

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)

#### Training

In [None]:
import torch
from torchmetrics import AUROC
import torch.nn as nn
import torch.optim as optim

# Full-batch training of a TabTransformer classifier on the encoded tensors.
# The label tensor holds integer class codes, so the model emits one logit per
# class and is trained with cross-entropy; validation quality is reported as
# multiclass AUROC on the test split.

LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
NUM_EPOCHS = 1000

# Seed torch too, so weight initialisation and dropout are repeatable.
torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per-feature (mean, std) of the continuous inputs, shape (num_continuous, 2).
# The std is clamped away from zero because the model divides by it and a
# constant column would otherwise produce NaNs.
cont_mean_std = torch.stack(
    [
        train_tensor_X_num.mean(dim=0),
        train_tensor_X_num.std(dim=0).clamp_min(1e-6),
    ],
    dim=1,
)

# Embedding table size per categorical feature. Codes are 0-based, so the table
# must hold max_code + 1 entries; the maximum is taken over BOTH splits because
# counting the unique values of one split can be smaller than the largest code
# and would make the embedding lookup go out of range.
all_cat_codes = torch.cat([train_tensor_X_cat, test_tensor_X_cat], dim=0)
cat_feature_counts = tuple(int(code) + 1 for code in all_cat_codes.max(dim=0).values)

# Number of target classes, derived from the encoded labels of both splits.
num_classes = int(torch.cat([train_tensor_Y, test_tensor_Y]).max()) + 1

tabtransformer = TabTransformer(
    categories=cat_feature_counts,
    num_continuous=train_tensor_X_num.shape[1],
    dim=32,
    dim_out=num_classes,  # one logit per class
    depth=6,
    heads=8,
    attn_dropout=0.1,
    ff_dropout=0.1,
    mlp_hidden_mults=(4, 2),
    mlp_act=nn.ReLU(),
    continuous_mean_std=cont_mean_std
)

optimizer = optim.Adam(tabtransformer.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# CrossEntropyLoss takes (N, C) logits and (N,) integer targets, which matches
# the label tensor; BCEWithLogitsLoss would need float targets shaped like the output.
loss_fn = nn.CrossEntropyLoss()
metrics = AUROC(task='multiclass', num_classes=num_classes)

# The model, the metric and every input tensor must live on the same device.
tabtransformer.to(device)
metrics = metrics.to(device)
train_X_cat = train_tensor_X_cat.to(device)
train_X_num = train_tensor_X_num.to(device)
train_Y = train_tensor_Y.to(device)
test_X_cat = test_tensor_X_cat.to(device)
test_X_num = test_tensor_X_num.to(device)
test_Y = test_tensor_Y.to(device)

for epoch in range(NUM_EPOCHS):
    tabtransformer.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = tabtransformer(train_X_cat, train_X_num)
    loss = loss_fn(outputs, train_Y)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    # Evaluation
    tabtransformer.eval()
    with torch.no_grad():
        val_outputs = tabtransformer(test_X_cat, test_X_num)
        val_loss = loss_fn(val_outputs, test_Y)
        val_auc = metrics(val_outputs, test_Y)
    # Drop the accumulated predictions so the metric state does not grow with
    # every epoch; the per-call value above is already the epoch's AUROC.
    metrics.reset()

    # Print progress
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, Val AUC: {val_auc.item():.4f}")
