# Set up the import path so the project sources in `src` can be imported

In [None]:
# -*- coding: utf-8 -*-
import sys
import os

# Make the project's `src` folder importable from this notebook.
# The relative path is resolved against the current working directory.
# The membership check keeps `sys.path` free of duplicate entries when the
# cell is executed more than once.
src_path = os.path.abspath('../src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Load input datasets

In [None]:
from core.loader import Loader

# CONFIGURATION

# Label strings handed to the loader for the two classes.
benign_label = "benign"
malicious_label = "phishing"

# Numeric class id assigned to each label string.
class_map = {benign_label: 0, malicious_label: 1}

# Parquet inputs, relative to the notebook's working directory.
benign_dataset_filenames = [
    '../parkets/benign/benign_2312_anonymized_HTML.parquet',
    '../parkets/benign/umbrella_benign_FINISHED_HTML.parquet',
]
# NOTE(review): the file name says 'malware' while the label above is
# 'phishing' -- confirm this pairing is intended.
malicious_dataset_filenames = [
    '../parkets/malware_2406_strict_HTML.parquet',
]

# Build the loader and read everything into a single DataFrame `df`.
# NOTE(review): subsample=1.0 presumably keeps the full dataset -- confirm
# against core.loader.Loader.
loader = Loader(
    benign_dataset_filenames,
    malicious_dataset_filenames,
    benign_label=benign_label,
    malicious_label=malicious_label,
    subsample=1.0,
)
df = loader.load()

# Generate basic subsets

In [None]:
import pandas as pd

# Column-name prefixes that identify the individual feature groups.
prefixes = ["dns_", "tls_", "html_", "geo_", "rdap_", "lex_", "ip_"]

# prefix -> view of `df` restricted to that group's columns plus 'label'.
subset_dfs = {}

# The label mask does not depend on the prefix, so compute it once.
is_label_column = df.columns.isin(['label'])

for prefix in prefixes:
    in_group = df.columns.str.startswith(prefix)
    subset_df = df.loc[:, in_group | is_label_column]
    subset_dfs[prefix] = subset_df

    # The reported column count includes the 'label' column.
    print(f"Subset '{prefix}' contains {subset_df.shape[1]} features and {subset_df.shape[0]} samples.")

# From basic subsets, generate aggregations

In [None]:
import pandas as pd
# NOTE: `subset_dfs` is deliberately NOT re-initialised here. The aggregated
# groups are added next to the single-prefix subsets built in the previous
# cell, so the model comparison covers both and per-prefix keys such as
# 'lex_' (used by the SHAP cell) remain available.

# Prefix groups to aggregate; each entry extends the previous one.
aggregates = [
    ["lex_"],                                                   # 1. Stage
    ["lex_", "dns_", "ip_"],
    ["lex_", "dns_", "ip_", "geo_"],                            # 2. Stage
    ["lex_", "dns_", "ip_", "tls_", "geo_"],
    ["lex_", "dns_", "ip_", "tls_", "geo_", "rdap_"],
    ["lex_", "dns_", "ip_", "tls_", "geo_", "rdap_", "html_"]   # 3. Stage
]


# Process each aggregation group
for group in aggregates:
    # Regex alternation anchored at the start of the name, e.g. '^lex_|^dns_'
    pattern = '|'.join(f'^{prefix}' for prefix in group)

    # Keep columns that start with one of the group's prefixes, plus 'label'
    subset_df = df.loc[:, df.columns.str.contains(pattern) | (df.columns == 'label')]

    # Ensure an 'index' column (any letter case) is not included
    subset_df = subset_df.loc[:, ~subset_df.columns.str.contains('^index$', case=False)]

    # Renumber the rows 0..n-1. Rebinding instead of `inplace=True` avoids
    # mutating an object that was derived from `df` by slicing.
    subset_df = subset_df.reset_index(drop=True)

    # Key format: the prefixes joined with '+', followed by '_agg'
    key = '+'.join(group) + "_agg"

    # Store the aggregated DataFrame in the dictionary
    subset_dfs[key] = subset_df

# Report the size of every stored subset (the column count includes 'label')
for key, subset_df in subset_dfs.items():
    print(f"Subset '{key}' contains {subset_df.shape[1]} features and {subset_df.shape[0]} samples.")


# PyCaret
Run the PyCaret model comparison: for every subset and aggregation, try classification with all 12 models.

In [None]:
from pycaret.classification import *
import pandas as pd

# Per-subset outputs of the comparison, keyed by the subset name:
#   top3_models   -> list of the best estimators returned by compare_models
#   results_grids -> score grid pulled right after the comparison
top3_models = {}
results_grids = {}

# Iterate through each subset and train models
for prefix, subset_df in subset_dfs.items():
    print(f"\n🔍 Running model comparison for '{prefix}' features...")

    # PyCaret setup: 'label' is the target, 80 % of the rows go to training,
    # and a fixed session_id is passed so runs can be repeated.
    clf = setup(
        subset_df,
        target='label',
        session_id=54,
        log_experiment=False,
        experiment_name=f'exp_{prefix}',
        use_gpu=True,
        train_size=0.8,
        index=False,
    )

    # Compare models, ranked by F1, and keep the three best
    top_models = compare_models(sort='F1', n_select=3)

    # Later cells index this value (top3_models[key][0]) and the loop below
    # iterates it, so guarantee a list even if a single estimator comes back.
    if not isinstance(top_models, list):
        top_models = [top_models]
    top3_models[prefix] = top_models

    # pull() returns the score grid of the most recent PyCaret call
    results_grid = pull()
    results_grids[prefix] = results_grid

    # Print information about the top 3 models for the subset
    print(f"✅ Top 3 models for '{prefix}' features:")
    for model in top_models:
        print(model)

# Tag every grid with its subset name. The column is added to the stored
# grids themselves, so it is also present when they are exported later.
labelled_grids = []
for prefix, grid in results_grids.items():
    grid['Subset'] = prefix
    labelled_grids.append(grid)

# Concatenate once (instead of growing a frame inside the loop) and renumber
# the rows; fall back to an empty frame when there is nothing to combine.
if labelled_grids:
    all_results = pd.concat(labelled_grids, axis=0, ignore_index=True)
else:
    all_results = pd.DataFrame()

# Display the consolidated results DataFrame
print("📊 Consolidated Results across all feature subsets:")
display(all_results)


# Save the result grids to the `grids` folder

In [None]:

import os
# Create the output folder when it is missing. `exist_ok=True` replaces the
# separate existence check and cannot fail if the folder appears in between.
os.makedirs('grids', exist_ok=True)

# One CSV per subset; the file name is the malicious label immediately
# followed by the subset key.
for prefix, grid in results_grids.items():
    grid.to_csv(f'grids/{malicious_label}{prefix}.csv')
    

## Save / Load results
This code can be used to load the results of previous runs, since recomputing them can take a long time. You can specify the folder to use as a cache; normally the `tmp` folder is used.


In [None]:
import pickle


# Path of the pickle file that the save/load cells below write to and read from.
BACKUP_FILE = '../src/tmp/phishing_agregate_to_good.pickle'

def save_to_pickle(data, filename):
    """Serialize *data* into the file *filename* using pickle.

    Missing parent directories of *filename* are created first, so the call
    also works when the cache folder does not exist yet.

    Args:
        data: Any picklable object.
        filename: Destination path; an existing file is overwritten.
    """
    import os  # local import keeps this cell runnable on its own

    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty when *filename* has no directory component
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as f:
        pickle.dump(data, f)
    print(f"Data saved to {filename}.")
    

def load_from_pickle(filename):
    """Return the object stored in the pickle file *filename*.

    Only use this on files from a trusted source: unpickling can execute
    arbitrary code.

    Args:
        filename: Path of an existing pickle file.

    Returns:
        The unpickled object.
    """
    with open(filename, 'rb') as source:
        restored = pickle.load(source)
    print(f"Data loaded from {filename}.")
    return restored


In [None]:
# Bundle the score grids, the selected models and the input subsets so a
# later session can restore them from the backup file.
data_to_save = dict(
    results_grids=results_grids,
    top3_models=top3_models,
    subset_dfs=subset_dfs,
)

save_to_pickle(data_to_save, BACKUP_FILE)

In [None]:
# Restore the three objects written by the save cell from the backup file.
loaded_data = load_from_pickle(BACKUP_FILE)
results_grids, top3_models, subset_dfs = (
    loaded_data['results_grids'],
    loaded_data['top3_models'],
    loaded_data['subset_dfs'],
)

### Visualize top models and overall results 

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Per-subset summary collected from the score grids (all in iteration order)
best_f1_scores = []   # highest F1 of each subset
model_names = []      # name of the best-scoring model of each subset
subsets = []          # subset keys, used as bar labels in plot 1
top_models = {}       # subset key -> (Model, F1) rows of its three best models

for prefix, grid in results_grids.items():
    # Best F1 score
    best_f1 = grid['F1'].max()
    best_f1_scores.append(best_f1)

    # Top 3 models
    top_3_models = grid.nlargest(3, 'F1')[['Model', 'F1']]
    top_models[prefix] = top_3_models
    model_names.append(top_3_models.iloc[0]['Model'])

    subsets.append(prefix)

# Remove '_html' from the labels. This runs once after the loop so that every
# label, including the last one appended, is cleaned.
subsets = [s.replace('_html', '') for s in subsets]


# Initialize figure
plt.figure(figsize=(18, 10))

# Figure-level title
plt.suptitle('Srovnání klasifikace podle skupin příznaků', fontsize=16)

# Plot 1: Best F1 Scores by Feature Subset
plt.subplot(1, 3, 1)
bars = plt.barh(subsets, best_f1_scores, color='skyblue')
plt.xlabel('F1')
plt.title('Nejlepší F1 skóre podle skupin příznaků')
plt.gca().invert_yaxis()
# Annotate exact F1 scores at the end of each bar
for bar, score in zip(bars, best_f1_scores):
    plt.text(bar.get_width(), bar.get_y() + bar.get_height()/2, f'{score:.4f}', va='center')

# Plot 2: Top Three Models for each Feature Subset
plt.subplot(1, 3, 2)
for prefix, top_3 in top_models.items():
    for rank, (_, row) in enumerate(top_3.iterrows(), start=1):
        bar = plt.barh(f'{prefix} {rank}', row['F1'], color='lightgreen')[0]
        # Take the vertical position from the bar itself, so the annotation
        # stays aligned even when a subset has fewer than three models.
        plt.text(row['F1'], bar.get_y() + bar.get_height()/2,
                 f'{row["Model"]} ({row["F1"]:.4f})', va='center')
plt.title('Tři nejlepší modely pro každou skupinu příznaků')
plt.xlabel('F1')
plt.gca().invert_yaxis()

# Plot 3: Ranking of Models Across All Feature Sets (mean F1 per model)
model_rankings = pd.concat([grid[['Model', 'F1']] for grid in results_grids.values()])
mean_f1_by_model = model_rankings.groupby('Model')['F1'].mean().sort_values(ascending=True)
plt.subplot(1, 3, 3)
bars = plt.barh(mean_f1_by_model.index, mean_f1_by_model, color='salmon')
plt.title('Průměrné F1 skóre podle modelů')
plt.xlabel('F1')
# Annotate exact average F1 scores
for bar, score in zip(bars, mean_f1_by_model):
    plt.text(bar.get_width(), bar.get_y() + bar.get_height()/2, f'{score:.4f}', va='center')

plt.tight_layout()
plt.show()



## Generate SHAP plots for specific models and subsets

In [None]:
import shap
import matplotlib.pyplot as plt

# Subset keys to explain. Each key must exist in both `top3_models` and
# `subset_dfs`.
prefixes = ["lex_"]

for prefix in prefixes:
    # Fail with an actionable message instead of a bare KeyError when the
    # requested subset was never trained or loaded.
    if prefix not in top3_models or prefix not in subset_dfs:
        raise KeyError(
            f"No results for subset '{prefix}'. "
            f"Available subsets: {sorted(top3_models)}"
        )

    # Explain the best-ranked model of the subset.
    # NOTE(review): TreeExplainer is meant for tree-based models -- confirm
    # that the top model of every listed subset is one.
    explainer = shap.TreeExplainer(top3_models[prefix][0])

    # Features only: drop the target column before computing SHAP values.
    # NOTE(review): the model was fitted through PyCaret's setup(); confirm
    # that these raw columns match what the estimator expects.
    X = subset_dfs[prefix].drop('label', axis=1)
    shap_values = explainer.shap_values(X)

    # Summary plot (beeswarm)
    print("Using prefix: ", prefix)
    shap.summary_plot(shap_values, X)

