## Columns to be removed from training/validation

# Load Tensorflow and check GPU availability

In [1]:
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F


import sys


from tensorflow.python.client import device_lib

for device in device_lib.list_local_devices():
    print(device.physical_device_desc)


    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


2025-01-26 21:44:47.238805: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-26 21:44:47.238841: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-26 21:44:47.242305: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-26 21:44:47.290141: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.





2025-01-26 21:44:54.038308: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


# Load input datasets

In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pandas import DataFrame
from pandas.core.dtypes import common as com
from pyarrow import Table


def union_tables(tables: [pa.Table]) -> pa.Table:
    union_table = tables[0]
    for table in tables[1:]:
        right_not_in_union = union_table.join(right_table=table, keys='domain_name', join_type='right anti',
                                              coalesce_keys=True, use_threads=True)
        union_table = pa.concat_tables([union_table, right_not_in_union])
    return union_table

# #############################################################
# EDIT this to specify benign / malicious datasets to use     #
# #############################################################
benign_dataset_filenames = [
    'parkets/benign_2312.parquet', 
]
malicious_dataset_filenames = [
    'parkets/phishing_2406_strict.parquet'
]
# #############################################################
# EDIT this for to set appropriate labels (malware, dga, ...) #
# #############################################################
benign_label = "benign"
malicious_label = "phishing"
# #############################################################

# Unify malicious datasets and benign datasets
schema = (pq.read_table(malicious_dataset_filenames[0])).schema # Use the schema from the first malicious filename
benign_tables = [pq.read_table(filename).cast(schema) for filename in benign_dataset_filenames]
malicious_tables = [pq.read_table(filename).cast(schema) for filename in malicious_dataset_filenames]
malicious = union_tables(malicious_tables)
benign = union_tables(benign_tables)

# Convert pyarrow tables to pandas dataframes
df_benign = benign.to_pandas()
df_malicious = malicious.to_pandas()

# Set appropriate labels
df_benign["label"] = benign_label
df_malicious["label"] = malicious_label
class_map = {benign_label: 0, malicious_label: 1}



# ===================
# AUTO BALANCING !!!
# Subsample benign to match the size of malicious
# df_benign = df_benign.sample(n=len(df_malicious))
# ===================

# Concatentate benign and malicious
df = pd.concat([df_benign, df_malicious])


def cast_timestamp(df: DataFrame):
    """
    Cast timestamp fields to seconds since epoch.
    """
    for col in df.columns:
        if com.is_timedelta64_dtype(df[col]):
            df[col] = df[col].dt.total_seconds()  # This converts timedelta to float (seconds)
        elif com.is_datetime64_any_dtype(df[col]):
            df[col] = df[col].astype(np.int64) // 10**9  # Converts datetime64 to Unix timestamp (seconds)

    return df

df = cast_timestamp(df)

# Handle NaNs
df.fillna(-1, inplace=True)


# SUBSAMPLE1 (OPTIONAL)
subsample = 1.00 # 1.0 means no subsample
if subsample < 1.0:
    df = df.sample(frac=subsample)

# Drop the domain name column
df.drop("domain_name", axis=1, inplace=True)

    
labels = df['label'].apply(lambda x: class_map[x]) # y vector
features = df.drop('label', axis=1).copy() # X matrix


print(f"Total features after augmentation: {features.shape[1]}")

print(f"Total samples: {len(df)}")
print(f"Benign count: {len(df_benign)}")
print(f"Malicious count: {len(df_malicious)}")



df

Total features after augmentation: 176
Total samples: 31331
Benign count: 462192
Malicious count: 164425


Unnamed: 0,label,dns_has_dnskey,dns_A_count,dns_AAAA_count,dns_MX_count,dns_NS_count,dns_TXT_count,dns_SOA_count,dns_CNAME_count,dns_zone_level,...,rdap_ip_v4_count,rdap_ip_v6_count,rdap_ip_shortest_v4_prefix_len,rdap_ip_longest_v4_prefix_len,rdap_ip_shortest_v6_prefix_len,rdap_ip_longest_v6_prefix_len,rdap_ip_avg_admin_name_len,rdap_ip_avg_admin_name_entropy,rdap_ip_avg_admin_email_len,rdap_ip_avg_admin_email_entropy
423257,benign,0.0,1,0,0,8,3,1,0,0,...,9,8,23.0,24.0,48.0,48.0,17.705882,0.234568,2.882353,0.039906
215559,benign,0.0,2,0,1,2,4,1,0,0,...,8,6,12.0,21.0,32.0,32.0,10.666667,0.341040,13.200000,0.145233
146167,benign,1.0,0,0,0,0,0,0,1,0,...,1,0,23.0,23.0,0.0,0.0,14.000000,0.251546,13.000000,0.260981
61297,phishing,0.0,1,1,0,0,1,0,0,1,...,1,1,23.0,23.0,36.0,36.0,5.000000,0.464386,15.000000,0.216015
140509,benign,0.0,0,0,0,0,0,0,1,1,...,4,8,14.0,14.0,28.0,28.0,13.000000,0.249146,23.000000,0.149881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146282,benign,0.0,1,0,0,0,0,0,0,0,...,1,0,24.0,24.0,0.0,0.0,12.000000,0.284858,0.000000,0.000000
215691,benign,0.0,2,2,0,2,0,1,0,0,...,8,8,12.0,20.0,32.0,32.0,10.375000,0.386327,15.750000,0.173369
142786,phishing,0.0,1,0,1,2,0,1,0,0,...,1,0,24.0,24.0,0.0,0.0,31.000000,0.131972,22.000000,0.168084
263485,benign,0.0,1,0,1,2,1,1,0,0,...,1,0,12.0,12.0,0.0,0.0,13.000000,0.249146,23.000000,0.149881


# Data preprocessing

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib
 
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)
features = pd.DataFrame(scaled_data, columns=features.columns)

# Save the scaler
joblib.dump(scaler, "scalers/phishing_deepnn_scaler.joblib")

pd.set_option('display.max_columns', None)


# Train-test split

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
  features,
  labels,
  test_size=0.2,
  random_state=42,
  shuffle=True, 
  stratify=labels
)

# print number of features
print(X_train.shape[1])

26


In [8]:
!pip install --upgrade pycaret mlflow


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
from pycaret.utils import version
version()

from pycaret.classification import *
clf1 = setup(df, target = 'label', session_id=51, log_experiment=False, experiment_name='feta1', index=False)

Unnamed: 0,Description,Value
0,Session id,51
1,Target,label
2,Target type,Binary
3,Target mapping,"benign: 0, phishing: 1"
4,Original data shape,"(32123, 177)"
5,Transformed data shape,"(32123, 177)"
6,Transformed train set shape,"(22486, 177)"
7,Transformed test set shape,"(9637, 177)"
8,Numeric features,173
9,Preprocess,True


In [11]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9616,0.9651,0.9616,0.9614,0.9614,0.9004,0.9006,107.894
xgboost,Extreme Gradient Boosting,0.9556,0.9651,0.9556,0.9555,0.9555,0.8852,0.8853,1.934
gbc,Gradient Boosting Classifier,0.9533,0.9649,0.9533,0.9531,0.953,0.8786,0.8789,2.579
rf,Random Forest Classifier,0.9456,0.9581,0.9456,0.9452,0.9452,0.8583,0.8587,0.776
et,Extra Trees Classifier,0.9436,0.9146,0.9436,0.9431,0.9431,0.8525,0.853,0.987
ada,Ada Boost Classifier,0.9396,0.959,0.9396,0.939,0.939,0.8417,0.8424,0.667
lda,Linear Discriminant Analysis,0.9253,0.9537,0.9253,0.9247,0.9236,0.7996,0.803,0.444
ridge,Ridge Classifier,0.9224,0.9539,0.9224,0.9222,0.9203,0.7904,0.7951,0.119
knn,K Neighbors Classifier,0.916,0.927,0.916,0.9155,0.9157,0.7823,0.7825,0.323
dt,Decision Tree Classifier,0.9106,0.8963,0.9106,0.9101,0.9102,0.7684,0.7687,0.308
