## Columns to be removed from training/validation

# Load Tensorflow and check GPU availability

In [5]:
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F


import sys


from tensorflow.python.client import device_lib

for device in device_lib.list_local_devices():
    print(device.physical_device_desc)


    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



device: 0, name: NVIDIA GeForce RTX 3050 Ti Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


2025-01-27 12:57:26.365491: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-27 12:57:26.371535: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-27 12:57:26.374535: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

# Load input datasets

In [11]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pandas import DataFrame
from pandas.core.dtypes import common as com
from pyarrow import Table


def union_tables(tables: [pa.Table]) -> pa.Table:
    union_table = tables[0]
    for table in tables[1:]:
        right_not_in_union = union_table.join(right_table=table, keys='domain_name', join_type='right anti',
                                              coalesce_keys=True, use_threads=True)
        union_table = pa.concat_tables([union_table, right_not_in_union])
    return union_table

# #############################################################
# EDIT this to specify benign / malicious datasets to use     #
# #############################################################
benign_dataset_filenames = [
    'parkets/benign_2312.parquet', 
]
malicious_dataset_filenames = [
    'parkets/phishing_2406_strict.parquet'
]
# #############################################################
# EDIT this for to set appropriate labels (malware, dga, ...) #
# #############################################################
benign_label = "benign"
malicious_label = "phishing"
# #############################################################

# Unify malicious datasets and benign datasets
schema = (pq.read_table(malicious_dataset_filenames[0])).schema # Use the schema from the first malicious filename
benign_tables = [pq.read_table(filename).cast(schema) for filename in benign_dataset_filenames]
malicious_tables = [pq.read_table(filename).cast(schema) for filename in malicious_dataset_filenames]
malicious = union_tables(malicious_tables)
benign = union_tables(benign_tables)

# Convert pyarrow tables to pandas dataframes
df_benign = benign.to_pandas()
df_malicious = malicious.to_pandas()

# Set appropriate labels
df_benign["label"] = benign_label
df_malicious["label"] = malicious_label
class_map = {benign_label: 0, malicious_label: 1}



# ===================
# AUTO BALANCING !!!
# Subsample benign to match the size of malicious
#df_benign = df_benign.sample(n=len(df_malicious))
# ===================

# Concatentate benign and malicious
df = pd.concat([df_benign, df_malicious])


def cast_timestamp(df: DataFrame):
    """
    Cast timestamp fields to seconds since epoch.
    """
    for col in df.columns:
        if com.is_timedelta64_dtype(df[col]):
            df[col] = df[col].dt.total_seconds()  # This converts timedelta to float (seconds)
        elif com.is_datetime64_any_dtype(df[col]):
            df[col] = df[col].astype(np.int64) // 10**9  # Converts datetime64 to Unix timestamp (seconds)

    return df

df = cast_timestamp(df)

# Handle NaNs
df.fillna(-1, inplace=True)


# SUBSAMPLE1 (OPTIONAL)
subsample = 0.15 # 1.0 means no subsample
if subsample < 1.0:
    df = df.sample(frac=subsample)

# Drop the domain name column
df.drop("domain_name", axis=1, inplace=True)

    
labels = df['label'].apply(lambda x: class_map[x]) # y vector
features = df.drop('label', axis=1).copy() # X matrix


print(f"Total features after augmentation: {features.shape[1]}")

print(f"Total samples: {len(df)}")
print(f"Benign count: {len(df_benign)}")
print(f"Malicious count: {len(df_malicious)}")



df

Total features after augmentation: 176
Total samples: 93993
Benign count: 462192
Malicious count: 164425


Unnamed: 0,label,dns_has_dnskey,dns_A_count,dns_AAAA_count,dns_MX_count,dns_NS_count,dns_TXT_count,dns_SOA_count,dns_CNAME_count,dns_zone_level,...,rdap_ip_v4_count,rdap_ip_v6_count,rdap_ip_shortest_v4_prefix_len,rdap_ip_longest_v4_prefix_len,rdap_ip_shortest_v6_prefix_len,rdap_ip_longest_v6_prefix_len,rdap_ip_avg_admin_name_len,rdap_ip_avg_admin_name_entropy,rdap_ip_avg_admin_email_len,rdap_ip_avg_admin_email_entropy
187397,benign,0.0,2,0,5,4,10,1,0,0,...,11,9,15.0,23.0,28.0,47.0,13.600000,0.249552,19.650000,0.129238
245299,benign,0.0,1,0,2,2,1,1,0,0,...,1,0,16.0,16.0,0.0,0.0,25.000000,0.148139,15.000000,0.209305
100862,benign,0.0,1,0,0,2,0,1,0,0,...,17,2,13.0,16.0,22.0,22.0,7.105263,0.363092,23.947368,0.140859
308069,benign,1.0,1,1,2,3,11,1,0,0,...,6,6,23.0,23.0,48.0,48.0,14.416667,0.247656,0.000000,0.000000
105588,benign,0.0,2,2,0,0,0,0,0,0,...,2,2,12.0,13.0,32.0,32.0,5.000000,0.464386,18.000000,0.198468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68622,phishing,0.0,2,0,0,0,0,0,0,0,...,2,0,12.0,12.0,0.0,0.0,5.000000,0.464386,18.000000,0.198468
233533,benign,0.0,1,0,0,3,0,1,0,0,...,4,2,21.0,22.0,29.0,29.0,15.000000,0.251570,0.000000,0.000000
236270,benign,0.0,2,2,0,0,0,0,0,0,...,2,2,12.0,13.0,32.0,32.0,5.000000,0.464386,18.000000,0.198468
202771,benign,0.0,0,0,0,0,0,0,1,0,...,3,3,12.0,13.0,32.0,32.0,5.000000,0.464386,18.000000,0.198468


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib
 
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
df = pd.DataFrame(scaled_data, columns=features.columns)

# Save the scaler
joblib.dump(scaler, "scalers/phishing_deepnn_scaler.joblib")

pd.set_option('display.max_columns', None)

In [8]:
from pycaret.utils import version
version()

from pycaret.classification import *
clf1 = setup(df, target = 'label', session_id=53, log_experiment=False, experiment_name='feta3', index=False, use_gpu = True)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Ti Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Ti Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.50000

Unnamed: 0,Description,Value
0,Session id,53
1,Target,label
2,Target type,Binary
3,Target mapping,"benign: 0, phishing: 1"
4,Original data shape,"(34061, 177)"
5,Transformed data shape,"(34061, 177)"
6,Transformed train set shape,"(23842, 177)"
7,Transformed test set shape,"(10219, 177)"
8,Numeric features,173
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Ti Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Ti Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.50000

In [9]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9509,0.9631,0.9509,0.951,0.9509,0.9019,0.9019,1.39
xgboost,Extreme Gradient Boosting,0.9426,0.9617,0.9426,0.9426,0.9426,0.8852,0.8852,1.43
gbc,Gradient Boosting Classifier,0.942,0.9629,0.942,0.942,0.9419,0.8839,0.8839,18.47
rf,Random Forest Classifier,0.9284,0.952,0.9284,0.9285,0.9284,0.8568,0.8569,1.306
ada,Ada Boost Classifier,0.9272,0.9586,0.9272,0.9273,0.9272,0.8545,0.8545,3.812
et,Extra Trees Classifier,0.9263,0.9064,0.9263,0.9264,0.9263,0.8526,0.8527,1.012
ridge,Ridge Classifier,0.9136,0.9529,0.9136,0.9137,0.9136,0.8272,0.8273,0.254
lda,Linear Discriminant Analysis,0.9135,0.9528,0.9135,0.9136,0.9135,0.8269,0.827,0.7
knn,K Neighbors Classifier,0.889,0.9299,0.889,0.8891,0.889,0.778,0.7781,0.413
dt,Decision Tree Classifier,0.8853,0.8951,0.8853,0.8856,0.8853,0.7706,0.7708,1.766


In [None]:
ada = create_model('ada')

In [13]:
light_model = create_model('lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9589,0.9682,0.9589,0.9589,0.9589,0.9178,0.9178
1,0.948,0.9653,0.948,0.948,0.948,0.896,0.896
2,0.9451,0.9633,0.9451,0.9451,0.9451,0.8901,0.8901
3,0.9497,0.963,0.9497,0.9497,0.9497,0.8993,0.8993
4,0.9589,0.9681,0.9589,0.959,0.9589,0.9178,0.9179
5,0.9488,0.9583,0.9488,0.949,0.9488,0.8976,0.8978
6,0.9409,0.9495,0.9409,0.9409,0.9409,0.8817,0.8817
7,0.9509,0.9615,0.9509,0.9509,0.9509,0.9018,0.9019
8,0.9543,0.9671,0.9543,0.9543,0.9543,0.9086,0.9086
9,0.9539,0.9645,0.9539,0.9539,0.9539,0.9077,0.9077


In [None]:
xboost_model = create_model('xgboost')

In [None]:
gbc = create_model('gbc')

# TUNE MODELS

In [None]:
tune_model(gbc)
tune_model(light_model)
tune_model(xboost_model)

# Combine models

## Blended models 

In [None]:
blender = blend_models(estimator_list = [gbc, light_model, xboost_model], method = 'soft')

## Boosted models

In [None]:
stacker = stack_models(estimator_list = [gbc,light_model,xboost_model], meta_model=ada)

## Results

In [None]:
plot_model(stacker)
plot_model(stacker, plot = 'confusion_matrix')
plot_model(stacker, plot = 'boundary')
interpret_model(stacker)
interpret_model(blender)