In [1]:
# Cell 1
# !pip install scikit-learn==1.2.2 pandas numpy psutil ipywidgets ipyvuetify bqplot seaborn scipy==1.11.4 matplotlib dill joblib imblearn "ray[tune]>=2.7,<2.10" scikit-optimize hyperopt tune-sklearn category_encoders optuna tensorflow

# Cell 2
# import os
# import pandas as pd

# # Print the current working directory
# print("Current working directory:", os.getcwd())

# # List all files in the current directory
# print("Files in the current directory:", os.listdir())

# # Read the CSV file
# outlier_models = pd.read_csv("outlier_results.csv")
# cluster_models = pd.read_csv("cluster_results.csv")

# Cell 3
import ipywidgets as widgets
from IPython.display import display

# Suggested features to inspect for each known attack/incident, together with a
# reference link describing it.
# NOTE: these are suggestions based on the author's own research/opinion;
# please do your own research before relying on them.
#
# Structure: {attack name: {"features": [feature labels], "link": reference URL}}.
# The attack names populate the "Attacks:" dropdown built in create_tabs, whose
# observer prints the selected entry's "features" and "link".
# NOTE(review): the feature labels here are human-readable ("Gas Earned"), unlike
# the snake_case names used in selected_dict; presumably display-only — confirm.
attack_details = {
    "Discouragement Attack": {
        "features": [
            "Block Number", "Timestamp", "Slot", "Validator Pubkey", 
            "Validator Pool", "Validator Name", "Gas Earned", 
            "Base Fee per Gas", "Gas Limit", "Gas Used", "N Transactions", 
            "MEVBoost Value", "Reverted"
        ],
        "link": "https://github.com/ethereum/research/blob/master/papers/discouragement/discouragement.pdf"
    },
    "Time-Bandit Attack": {
        "features": [
            "Timestamp", "Slot", "Validator Pubkey", "Proposer Pubkey", 
            "Gas Earned", "MEVBoost Value", "Reverted"
        ],
        "link": "https://arxiv.org/pdf/2110.10086"
    },
    "Single Slot Finality Issues": {
        "features": [
            "Block Number", "Timestamp", "Slot", "Validator Pubkey", 
            "Proposer Pubkey", "Gas Earned", "Reverted"
        ],
        "link": "https://notes.ethereum.org/@vbuterin/single_slot_finality#Bad-news-hybrid-consensus-mechanisms-actually-have-many-unavoidable-proble"
    },
    "Hidden-Chain Attacks": {
        "features": [
            "Timestamp", "Slot", "Validator Pubkey", "Proposer Pubkey", 
            "Gas Earned", "MEVBoost Value", "Reverted"
        ],
        "link": "https://arxiv.org/pdf/2209.03255"
    },
    "Balancing Attack on Gasper": {
        "features": [
            "Timestamp", "Slot", "Validator Pubkey", "Proposer Pubkey", 
            "Gas Earned", "Reverted"
        ],
        "link": "https://ethresear.ch/t/a-balancing-attack-on-gasper-the-current-candidate-for-eth2s-beacon-chain/8079?u=benjaminion"
    },
    "Avalanche Attack": {
        "features": [
            "Block Number", "Timestamp", "Slot", "Validator Pubkey", 
            "Proposer Pubkey", "Gas Earned", "Reverted"
        ],
        "link": "https://arxiv.org/pdf/2203.01315"
    },
    "Reorg Attack": {
        "features": [
            "Block Number", "Timestamp", "Slot", "Validator Pubkey", 
            "Proposer Pubkey", "Gas Earned", "Reverted"
        ],
        "link": "https://arxiv.org/pdf/2009.04987"
    },
    "Unrealized Justification Reorgs": {
        "features": [
            "Block Number", "Timestamp", "Slot", "Validator Pubkey", 
            "Proposer Pubkey", "Gas Earned", "Reverted"
        ],
        "link": "https://notes.ethereum.org/@adiasg/unrealized-justification"
    },
    "Decoy Flip-Flop Attack": {
        "features": [
            "Validator Pubkey", "Validator Name", "Timestamp", 
            "Slot", "Reverted"
        ],
        "link": "https://ethresear.ch/t/decoy-flip-flop-attack-on-lmd-ghost/6001"
    },
    "MEV-Boost Relay Incident": {
        "features": [
            "Timestamp", "Slot", "Validator Pubkey", "Proposer Pubkey", 
            "Gas Earned", "MEVBoost Value", "Reverted"
        ],
        "link": "https://collective.flashbots.net/t/post-mortem-april-3rd-2023-mev-boost-relay-incident-and-related-timing-issue/1540"
    },
    "Rogue Key Attacks": {
        "features": [
            "Validator Pubkey", "Timestamp", "Slot", "Reverted", 
            "Gas Earned"
        ],
        "link": "https://hackmd.io/@benjaminion/bls12-381#Rogue-key-attacks"
    },
    "Bouncing Attack on FFG": {
        "features": [
            "Validator Pubkey", "Proposer Pubkey", "Timestamp", 
            "Slot", "Reverted"
        ],
        "link": "https://ethresear.ch/t/analysis-of-bouncing-attack-on-ffg/6113?u=benjaminion"
    },
    "Balancing Attack LMD Edition": {
        "features": [
            "Validator Pubkey", "Validator Name", "Timestamp", 
            "Slot", "Reverted"
        ],
        "link": "https://ethresear.ch/t/balancing-attack-lmd-edition/11853?u=benjaminion"
    },
    "Proposer Boost": {
        "features": [
            "Timestamp", "Slot", "Validator Pubkey", "Proposer Pubkey", 
            "Gas Earned", "MEVBoost Value", "Reverted"
        ],
        "link": "https://github.com/ethereum/consensus-specs/pull/496#issuecomment-457546253"
    }
}

# Cell 4
import joblib
import numpy as np
import pandas as pd
import os
import time
from concurrent.futures import ThreadPoolExecutor
import threading
import ray
from sklearn.experimental import enable_halving_search_cv
from sklearn.pipeline import Pipeline, clone
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import (
    StandardScaler, 
    OneHotEncoder, 
    MinMaxScaler, 
    OrdinalEncoder, 
    LabelEncoder, 
    FunctionTransformer, 
    PowerTransformer, 
    QuantileTransformer, 
    RobustScaler, 
    MaxAbsScaler, 
    Normalizer
)
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import (
    PCA, 
    SparsePCA, 
    TruncatedSVD, 
    FastICA
)
from sklearn.ensemble import (
    IsolationForest, 
    RandomForestClassifier, 
    AdaBoostClassifier, 
    GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import (
    train_test_split, 
    KFold, 
    LeaveOneOut, 
    HalvingGridSearchCV, 
    HalvingRandomSearchCV
)
from sklearn.metrics import accuracy_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.feature_selection import (
    RFE, 
    SelectKBest, 
    VarianceThreshold, 
    f_classif, 
    f_regression
)
from custom_transformers import *
from category_encoders import TargetEncoder
from tune_sklearn import TuneSearchCV, TuneGridSearchCV
from skopt.space import Real, Categorical, Integer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.base import BaseSampler
from scipy import sparse

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from ray.tune.search.hyperopt import HyperOptSearch

# Short alias for sklearn's dtype-based column selector factory.
selector = make_column_selector

# Numeric branch: fill missing values with 0, then apply StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value=0, strategy='constant')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the literal 'none', then one-hot
# encode into a dense array; categories not seen during fit are ignored.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='none', strategy='constant')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route columns by dtype. Columns matched by neither selector are passed
# through untouched (remainder='passthrough').
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, selector(dtype_include=np.number)),
        ('cat', categorical_transformer, selector(dtype_include='object')),
    ],
    remainder='passthrough',
)

# Wraps the ColumnTransformer so it can be fitted/applied as a single pipeline.
preprocessing_pipeline_un = Pipeline([
    ('preprocessor', preprocessor),
])

# Chains the transformers imported from custom_transformers, in this order.
pipeline_cleaning2 = Pipeline([
    ('drop_columns', DropColumns()),
    ('remove_duplicates', RemoveDuplicates()),
    ('calculate_gas', CalculateEarnedGas()),
    ('feature_engineering', FeatureEngineering()),
])


# Cell 5
import itertools
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets, HBox, VBox, HTML
from IPython.display import display, clear_output

import dill
import os
from minio import Minio
import tempfile
from minio.error import S3Error
from sklearn.metrics import make_scorer, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error, mean_absolute_error, r2_score, confusion_matrix

# Feature-group name -> dataset column names that belong to the group.
selected_dict = {
    'mev': ['relay', 'builder', 'mevboost_value'],
    'amount': ['hash', 'n_transactions', 'transaction_frequency', 'time_span'],
    'general': ['block_number', 'validator_pool', 'validator_name', 'slot'],
    'monetary': [
        'burnt',
        'max_fee_per_gas',
        'max_priority_fee_per_gas',
        'base_fee_per_gas',
        'gas_limit',
        'gas_used',
        'gas_earned',
    ],
    'compliant': ['ofac_compliant', 'reverted'],
}

# Group names in definition order; used to build the per-tab checkbox lists.
feature_groups = list(selected_dict)

# S3 / MinIO connection parameters. Bucket, prefix and credentials are left
# blank in this file and must be supplied before the client can reach a bucket.
param_s3_server = "scruffy.lab.uvalight.net:9000"
param_s3_bucket = ""
param_s3_user_prefix = ""
param_s3_access_key = ""
param_s3_secret_key = ""

# Shared client used by every MinIO helper below (TLS enabled).
minio_client = Minio(
    param_s3_server,
    access_key=param_s3_access_key,
    secret_key=param_s3_secret_key,
    secure=True,
)

def list_objects_from_minio(bucket_name, prefix):
    """List every object stored under ``prefix`` in ``bucket_name``.

    Args:
        bucket_name: Name of the bucket to query.
        prefix: Object-name prefix; the listing is recursive below it.

    Returns:
        A list of the objects yielded by ``minio_client.list_objects``, or an
        empty list when the listing fails with ``S3Error`` (the error is
        printed, not raised).
    """
    try:
        # list_objects() hands back a lazy iterator: the request, and therefore
        # any S3Error, only happens while iterating. Materialize it here so the
        # error is handled by this function instead of escaping from the
        # caller's for-loop.
        return list(minio_client.list_objects(bucket_name, prefix=prefix, recursive=True))
    except S3Error as e:
        print(f"Error listing objects from MinIO: {e}")
        return []

def parse_filename(filename):
    """Split a model file name of the form ``<model_type>_<groups>.pkl``.

    Args:
        filename: Base name of the stored object.

    Returns:
        ``(model_type, groups_str)`` where ``model_type`` is the text before
        the first underscore and ``groups_str`` is everything after it with the
        trailing ``.pkl`` removed. ``(None, None)`` when the name does not end
        in ``.pkl`` or cannot be split (the latter case also prints the error).
    """
    suffix = ".pkl"
    try:
        if not filename.endswith(suffix):
            return None, None
        model_type, groups_str = filename.split("_", 1)
        # Strip only the trailing extension: str.replace() would also delete
        # any ".pkl" that happens to occur inside the group string.
        return model_type, groups_str[:-len(suffix)]
    except Exception as e:
        print(f"Error parsing filename {filename}: {e}")
        return None, None

def get_best_models_from_minio(selected):
    """Find the stored models whose feature-group set equals ``selected``.

    Args:
        selected: Iterable of feature-group names chosen by the user.

    Returns:
        Dict mapping each matching group string (as it appears in the object
        name) to the list of model types stored for it. Empty when nothing
        matches or the listing fails.
    """
    import ast  # local import: only needed to parse the group strings

    listing_prefix = f"{param_s3_user_prefix}/vl-openlab/icos-naavre-demo/"
    selected_set = set(selected)

    # Group string -> every model type stored for that group combination.
    models_by_groups = {}
    for obj in list_objects_from_minio(param_s3_bucket, listing_prefix):
        model_type, groups = parse_filename(os.path.basename(obj.object_name))
        if groups:
            models_by_groups.setdefault(groups, []).append(model_type)

    matching = {}
    for groups, model_types in models_by_groups.items():
        # NOTE(review): this used to be eval(groups). The string comes from an
        # object name in the bucket, so eval() would execute whatever expression
        # a stored file name contains. literal_eval only accepts Python
        # literals; this assumes group strings are set literals such as
        # "{'mev', 'general', 'amount'}" — confirm against the code that
        # writes these files.
        try:
            parsed_groups = ast.literal_eval(groups)
        except (ValueError, SyntaxError, TypeError):
            # Not a literal (unrelated or malformed file name): cannot match,
            # and must not abort the lookup for the remaining files.
            continue
        if selected_set == parsed_groups:
            matching[groups] = model_types
    return matching


def interactive_pca_tsne_plot(df, output_pca_tsne, get_selected_features):
    """Build a button plus output area that plots PCA and t-SNE views of ``df``.

    Args:
        df: DataFrame holding the candidate feature columns.
        output_pca_tsne: ``widgets.Output`` that receives messages and plots.
        get_selected_features: Zero-argument callable returning
            ``(selected_features, selected_groups)``.

    Returns:
        A ``VBox`` containing the plot button and ``output_pca_tsne``.
    """
    def show_message(text, wait=True):
        # Replace the current content of the output area with one bold line.
        # Must be called from inside the `with output_pca_tsne` context.
        clear_output(wait=wait)
        display(widgets.HTML(value="<b>{}</b>".format(text)))

    def plot_pca_tsne(_):
        with output_pca_tsne:
            show_message("Loading plots, please wait...", wait=False)

            selected_features, groups = get_selected_features()
            if len(groups) < 3:
                show_message("Please select at least three feature groups.")
                return

            available_features = [name for name in selected_features if name in df.columns]
            missing_features = [name for name in selected_features if name not in df.columns]
            if len(available_features) < 2:
                show_message(
                    "Please select at least two valid features. Missing features: {}".format(
                        ", ".join(missing_features)
                    )
                )
                return

            feature_frame = df[available_features]
            if feature_frame.empty or feature_frame.isnull().all().all():
                show_message("No valid data points available for the selected features. Choose more features.")
                return

            # Cap the work at 1000 rows; the fixed seed keeps the plot
            # reproducible, matching the seeded t-SNE below.
            sample_size = min(1000, len(feature_frame))
            sampled = feature_frame.sample(n=sample_size, random_state=42)

            # get_dummies leaves NaN in numeric columns and StandardScaler
            # passes NaN through, but PCA/t-SNE reject it. Impute 0, the same
            # constant the notebook's numerical_transformer uses.
            encoded = pd.get_dummies(sampled).fillna(0)
            standardized_data = StandardScaler().fit_transform(encoded)

            if standardized_data.shape[0] < 2 or standardized_data.shape[1] < 2:
                show_message("Insufficient data points or features for PCA and t-SNE analysis.", wait=False)
                return

            try:
                pca = PCA(n_components=2)
                pca_result = pca.fit_transform(standardized_data)

                # Rank encoded columns by mean absolute loading on both components.
                pca_loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=encoded.columns)
                pca_loadings['Importance'] = pca_loadings.abs().mean(axis=1)
                pca_loadings = pca_loadings.sort_values(by='Importance', ascending=False)

                # t-SNE requires perplexity < number of samples.
                perplexity = min(30, len(encoded) - 1)
                tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
                tsne_result = tsne.fit_transform(standardized_data)
            except Exception as e:
                show_message("Error in PCA or t-SNE computation. Not enough data. Please select more or different features.")
                # Surface the real cause instead of discarding it.
                print(f"{type(e).__name__}: {e}")
                return

            clear_output(wait=True)
            fig, (ax_pca, ax_tsne) = plt.subplots(1, 2, figsize=(12, 6))
            ax_pca.scatter(pca_result[:, 0], pca_result[:, 1])
            ax_pca.set(title='PCA Plot', xlabel='PCA1', ylabel='PCA2')
            ax_tsne.scatter(tsne_result[:, 0], tsne_result[:, 1])
            ax_tsne.set(title='t-SNE Plot', xlabel='tSNE1', ylabel='tSNE2')
            fig.tight_layout()
            plt.show()

            display(pca_loadings.head(5))

    plot_button = widgets.Button(description="Plot PCA and t-SNE")
    plot_button.on_click(plot_pca_tsne)

    return VBox([plot_button, output_pca_tsne])

def create_tabs(supervised_models, unsupervised_models, df):
    group_checkboxes_tab1 = [widgets.Checkbox(value=False, description=group) for group in feature_groups]
    group_checkboxes_tab2 = [widgets.Checkbox(value=False, description=group) for group in feature_groups]

    group_vbox_tab1 = VBox(group_checkboxes_tab1)
    group_vbox_tab2 = VBox(group_checkboxes_tab2)

    def get_selected_features_tab1():
        """Return ``(features, groups)`` for the groups ticked on tab 1."""
        selected_groups = []
        selected_features = []
        for checkbox in group_checkboxes_tab1:
            if checkbox.value:
                selected_groups.append(checkbox.description)
                # Flatten: every column of a ticked group is selected.
                selected_features.extend(selected_dict[checkbox.description])
        return selected_features, selected_groups

    def get_selected_features_tab2():
        """Return ``(features, groups)`` for the groups ticked on tab 2."""
        selected_groups = []
        selected_features = []
        for checkbox in group_checkboxes_tab2:
            if checkbox.value:
                selected_groups.append(checkbox.description)
                # Flatten: every column of a ticked group is selected.
                selected_features.extend(selected_dict[checkbox.description])
        return selected_features, selected_groups

    output_pca_tsne_tab1 = widgets.Output()
    output_pca_tsne_tab2 = widgets.Output()

    interactive_plot_widget_tab1 = interactive_pca_tsne_plot(df, output_pca_tsne_tab1, lambda: get_selected_features_tab1())
    interactive_plot_widget_tab2 = interactive_pca_tsne_plot(df, output_pca_tsne_tab2, lambda: get_selected_features_tab2())

    dropdown_attack = widgets.Dropdown(
        options=list(attack_details.keys()),
        description='Attacks:'
    )

    output_attack = widgets.Output()

    def update_attack_output(change):
        """Show the features and reference link of the newly selected attack.

        ``change`` is the traitlets change dict; its ``'new'`` entry is the
        attack name picked in the dropdown.
        """
        with output_attack:
            output_attack.clear_output()
            attack = change['new']
            details = attack_details[attack]
            report = [f"Attack: {attack}", "Associated Features:"]
            report.extend(f"- {feature}" for feature in details["features"])
            report.append(f"\nLink to Article: {details['link']}")
            print("\n".join(report))

    dropdown_attack.observe(update_attack_output, names='value')

    type_toggle_tab1 = widgets.ToggleButtons(
        options=['Supervised', 'Unsupervised'],
        description='Type',
        disabled=False,
        button_style='',
    )

    type_toggle_tab2 = widgets.ToggleButtons(
        options=['Supervised', 'Unsupervised'],
        description='Type',
        disabled=False,
        button_style='',
    )
    output_supervised = widgets.Output()
    def on_toggle_change(change):
        """Reset the result area of the active tab when a Type toggle changes.

        The observer is registered without a ``names`` filter, so events other
        than a change of ``value`` are ignored here.
        NOTE(review): ``tabs`` is not defined in the visible part of
        create_tabs; presumably the Tab widget created further down — confirm.
        """
        if change['type'] != 'change' or change['name'] != 'value':
            return

        # A falsy selected_index (0 or None) is treated as "tab 1 is active".
        if not tabs.selected_index:
            with output_tab1:
                clear_output()
            return

        with output_tab2:
            clear_output()
        with output_supervised:
            clear_output()
            # Show the results table that matches the tab-2 toggle.
            display(supervised_models if type_toggle_tab2.value == "Supervised" else unsupervised_models)
            

    type_toggle_tab1.observe(on_toggle_change)

    type_toggle_tab2.observe(on_toggle_change)
    
    confirm_button_tab1 = widgets.Button(
        description='Review Selection',
        disabled=False,
        button_style='',
        tooltip='Click to review selection',
        icon='check'
    )

    confirm_button_tab2 = widgets.Button(
        description='Review Selection',
        disabled=False,
        button_style='',
        tooltip='Click to review selection',
        icon='check'
    )

    output_tab1 = widgets.Output()
    output_tab2 = widgets.Output()

    model_dropdown = widgets.Dropdown(
        options=list(unsupervised_models['Model']),
        description='Select Model',
        disabled=False,
    )

    def on_model_dropdown_change(change):
        """Clear the tab-2 result area whenever a different model is picked.

        The observer is registered without a ``names`` filter, so only a
        change of ``value`` is acted on.
        """
        if change['type'] == 'change' and change['name'] == 'value':
            with output_tab2:
                clear_output()


    model_dropdown.observe(on_model_dropdown_change)
    def load_model(groups_set, analysis_type, model=None):
        """Download and deserialize the stored model for the chosen groups.

        Args:
            groups_set: Selected feature-group names; at least three are required.
            analysis_type: ``'Supervised'`` selects the entry stored as
                ``HalvingRandomSearchCV``; anything else selects the first
                entry that has no such model.
            model: Optional model-type name overriding the one picked
                automatically (used for the file name only).

        Returns:
            The deserialized model, or ``None`` when fewer than three groups
            are selected, no stored model matches, or loading fails. A message
            is printed in each of these cases.
        """
        supervised_type = 'HalvingRandomSearchCV'

        if len(groups_set) < 3:
            print("No model can be found based on your choice, select at least three feature groups")
            return None

        result_dict = get_best_models_from_minio(groups_set)
        if analysis_type == 'Supervised':
            candidates = [key for key, value in result_dict.items() if supervised_type in value]
        else:
            candidates = [key for key, value in result_dict.items() if supervised_type not in value]

        # Indexing [0] on an empty candidate list used to raise IndexError,
        # which made the "no best model" message below unreachable.
        if not candidates:
            print("No best model found for the selected groups.")
            return None

        groups = candidates[0]
        model_type = supervised_type if analysis_type == 'Supervised' else result_dict[groups][0]
        if not model_type:
            print("No best model found for the selected groups.")
            return None
        if model:
            model_type = model

        feature = f"{model_type}_{groups}.pkl"
        object_name = f"{param_s3_user_prefix}/vl-openlab/icos-naavre-demo/{feature}"

        try:
            # Download into a throw-away directory so the local copy is removed
            # again (the previous delete=False temp file was never cleaned up).
            with tempfile.TemporaryDirectory() as download_dir:
                local_path = os.path.join(download_dir, "model.pkl")
                minio_client.fget_object(param_s3_bucket, object_name, local_path)
                # NOTE(review): dill.load executes code embedded in the file;
                # only load objects from a bucket you trust.
                with open(local_path, 'rb') as file:
                    deserialized_model = dill.load(file)

            print("Model loaded and ready to use.")
            display(deserialized_model)
            return deserialized_model
        except Exception as e:
            print(f"Error loading serialized model from MinIO: {e}")
            return None

    def on_button_click_tab1(b):
        """Load the model matching the tab-1 selection and display its results.

        Supervised models, and unsupervised models that expose ``named_steps``
        (pipelines), are scored against ``df['ofac_compliant']``. Any other
        model is treated as a clusterer and visualized in PCA space.
        """
        def report_predictions(model):
            # Score `model` on the full frame against the ofac_compliant labels.
            # This code was previously duplicated verbatim in two branches.
            X = df.drop(columns="ofac_compliant")
            X['mevboost_value'] = X['mevboost_value'].fillna(0)
            X['relay'] = X['relay'].fillna('none')
            X['builder'] = X['builder'].fillna('none')
            y = df["ofac_compliant"]
            y_pred = model.predict(X)

            print("Classification Report:\n", classification_report(y, y_pred))
            print("Accuracy:", accuracy_score(y, y_pred))
            print("Precision:", precision_score(y, y_pred))
            print("Recall:", recall_score(y, y_pred))
            print("F1 Score:", f1_score(y, y_pred))

            conf_matrix = confusion_matrix(y, y_pred)
            print("Confusion Matrix:\n", conf_matrix)

            # NOTE(review): `sns` is not imported in the visible part of this
            # notebook; presumably provided elsewhere (e.g. the star import
            # from custom_transformers) — confirm, otherwise this raises NameError.
            plt.figure(figsize=(10, 7))
            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
            plt.title('Confusion Matrix')
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.show()

        def plot_clusters(model):
            # Cluster a sample of at most 1000 rows and show it in PCA space.
            sample_size = min(1000, len(df))
            df_sample = df.sample(sample_size)
            X_new = preprocessing_pipeline_un.fit_transform(df_sample)

            if hasattr(model, 'fit_predict'):
                clusters = model.fit_predict(X_new)
            else:
                clusters = model.predict(X_new)

            # Rebuild the output column names from the fitted preprocessor.
            # NOTE(review): assumes "ofac_compliant" is the only passthrough
            # column — confirm against the dataset's dtypes.
            num_steps = preprocessor.named_transformers_['num'].named_steps
            cat_steps = preprocessor.named_transformers_['cat'].named_steps
            numerical_columns = num_steps['imputer'].feature_names_in_
            categorical_columns = cat_steps['onehot'].get_feature_names_out(cat_steps['imputer'].feature_names_in_)
            all_columns = list(numerical_columns) + list(categorical_columns) + ["ofac_compliant"]

            pca = PCA(n_components=2)
            X_pca = pca.fit_transform(X_new)

            # Built before plotting so a column-count mismatch still fails
            # before any figure is drawn.
            pca_loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=all_columns)
            pca_loadings['Importance'] = pca_loadings.abs().mean(axis=1)
            pca_loadings = pca_loadings.sort_values(by='Importance', ascending=False)

            unique, counts = np.unique(clusters, return_counts=True)

            plt.figure(figsize=(10, 6))
            for cluster in unique:
                cluster_points = X_pca[clusters == cluster]
                plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster}')

            # The smallest cluster is highlighted as the anomaly group.
            anomaly_cluster = unique[np.argmin(counts)]
            anomalies = X_pca[clusters == anomaly_cluster]

            plt.scatter(anomalies[:, 0], anomalies[:, 1], color='red', label='Anomalies', edgecolors='k')
            plt.xlabel('PCA Component 1')
            plt.ylabel('PCA Component 2')
            plt.title('Clusters and Anomalies')
            plt.legend()
            plt.show()

            display(pca_loadings.head(5))

        with output_tab1:
            clear_output()
            selected_features, selected_groups = get_selected_features_tab1()
            analysis_type = type_toggle_tab1.value
            deserialized_model = load_model(selected_groups, analysis_type)

            # Compare against None explicitly: estimator truthiness depends on
            # __len__, which some estimators define.
            if deserialized_model is None:
                print("No features selected")
                return

            if analysis_type == "Supervised" or hasattr(deserialized_model, 'named_steps'):
                report_predictions(deserialized_model)
            else:
                plot_clusters(deserialized_model)

            print(f'Selected Features: {selected_features}')
            print(f'Analysis Type: {analysis_type}')

    def on_button_click_tab2(b):
        """Load the model chosen on tab 2 and display its results.

        Supervised models are scored against ``df['ofac_compliant']``.
        Unsupervised models (taken from the model dropdown) cluster a sample
        of at most 1000 rows, which is then plotted in PCA space.
        """
        with output_tab2:
            clear_output()
            selected_features, selected_groups = get_selected_features_tab2()
            selected_model = model_dropdown.value
            analysis_type = type_toggle_tab2.value
            if analysis_type == "Unsupervised":
                deserialized_model = load_model(selected_groups, analysis_type, model=selected_model)
            else:
                deserialized_model = load_model(selected_groups, analysis_type)

            # Compare against None explicitly: estimator truthiness depends on
            # __len__, which some estimators define.
            if deserialized_model is None:
                print("No features selected")
                return

            if analysis_type == "Supervised":
                X = df.drop(columns="ofac_compliant")
                X['mevboost_value'] = X['mevboost_value'].fillna(0)
                X['relay'] = X['relay'].fillna('none')
                X['builder'] = X['builder'].fillna('none')
                y = df["ofac_compliant"]

                y_pred = deserialized_model.predict(X)

                print("Classification Report:\n", classification_report(y, y_pred))
                print("Accuracy:", accuracy_score(y, y_pred))
                print("Precision:", precision_score(y, y_pred))
                print("Recall:", recall_score(y, y_pred))
                print("F1 Score:", f1_score(y, y_pred))
                print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
            else:
                # Use a separate name for the sample: assigning to `df` here
                # made it a local of this closure, so every read of `df`
                # (including the supervised branch) raised UnboundLocalError.
                sample_size = min(1000, len(df))
                df_sample = df.sample(sample_size)
                X_new = preprocessing_pipeline_un.fit_transform(df_sample)

                if hasattr(deserialized_model, 'fit_predict'):
                    clusters = deserialized_model.fit_predict(X_new)
                else:
                    clusters = deserialized_model.predict(X_new)

                # Project the same rows, in the same order, that were clustered.
                # The previous code re-sampled (shuffled) a frame that also
                # contained the Cluster label, so the boolean masks below no
                # longer lined up with the PCA rows.
                pca = PCA(n_components=2)
                X_pca = pca.fit_transform(X_new)

                unique, counts = np.unique(clusters, return_counts=True)

                plt.figure(figsize=(10, 6))
                for cluster in unique:
                    cluster_points = X_pca[clusters == cluster]
                    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster}')

                # The smallest cluster is highlighted as the anomaly group.
                anomaly_cluster = unique[np.argmin(counts)]
                anomalies = X_pca[clusters == anomaly_cluster]

                plt.scatter(anomalies[:, 0], anomalies[:, 1], color='red', label='Anomalies', edgecolors='k')
                plt.xlabel('PCA Component 1')
                plt.ylabel('PCA Component 2')
                plt.title('Clusters and Anomalies')
                plt.legend()
                plt.show()

            print(f'Selected Features: {selected_features}')
            print(f'Analysis Type: {analysis_type}')
            print(f'Selected Model: {selected_model}')
    confirm_button_tab1.on_click(on_button_click_tab1)
    confirm_button_tab2.on_click(on_button_click_tab2)

    html_content1 = HTML(value="<h3>Section 1: Suggestions for features relevant to an Attack</h3>")
    html_content2 = HTML(value="<h3>Section 2: Choose the features for your model</h3>")
    html_content3 = HTML(value="<h3>Section 3: Get insight into feature importance with PCA and t-SNE plots</h3>")
    html_content4 = HTML(value="<h3>Section 4: Choose between supervised and unsupervised and the best model will be loaded based on your feature choice</h3>")

    html_content5 = HTML(value="<h3>Section 5: Use the selected model to detect anomalies</h3>")
    html_content6 = HTML(value="<h3>Section 4: Click on supervised or unsupervised to see performance of each model</h3>")
    html_content7 = HTML(value="<h3>Section 5: Choose the model you think is most applicable. Currently only limited to Unsupervised models with all of the features selected</h3>")
    html_content8 = HTML(value="<h3>Section 6: Use the selected model to detect anomalies</h3>")
    html_content = HTML(value="""
<div style="font-family: Arial, sans-serif; margin: 20px;">

    <p><strong>MEV:</strong> relay, builder, mevboost_value | 
       <strong>Amount:</strong> hash, n_transactions, transaction_frequency, time_span | 
       <strong>General:</strong> block_number, validator_pool, validator_name, slot | 
       <strong>Monetary:</strong> burnt, max_fee_per_gas, max_priority_fee_per_gas, base_fee_per_gas, gas_limit, gas_used, gas_earned | 
       <strong>Compliant:</strong> ofac_compliant, reverted
    </p>
</div>
""")

    tab1 = widgets.VBox([html_content1,dropdown_attack,output_attack, html_content2, html_content, group_vbox_tab1,html_content3, interactive_plot_widget_tab1, html_content4, type_toggle_tab1, html_content5,  confirm_button_tab1, output_tab1])
    tab2 = widgets.VBox([html_content1,dropdown_attack,output_attack, html_content2, html_content,group_vbox_tab2,html_content3, interactive_plot_widget_tab2, html_content6, type_toggle_tab2, output_supervised, html_content7, model_dropdown, html_content8, output_tab2, confirm_button_tab2])

    tabs = widgets.Tab(children=[tab1, tab2])
    tabs.set_title(0, 'Select Parameters')
    tabs.set_title(1, 'Select Model by Accuracy')

    def on_tab_change(change):
        """Wipe the output areas of both tabs when the selected tab changes."""
        # Only react to changes of the tab index itself.
        if change['name'] != 'selected_index':
            return
        for tab_output in (output_tab1, output_tab2):
            with tab_output:
                clear_output()

    tabs.observe(on_tab_change, names='selected_index')

    comparison_output = widgets.Output()

    def compare_results():
        """Print a side-by-side summary of what the models of both tabs detect.

        All output goes into ``comparison_output``. First the number of
        entries of ``df`` whose block number or transaction hash also occurs
        in the sanctions CSV is reported. Then, for each tab, the model is
        loaded through ``load_model`` and, when the tab is set to
        "Unsupervised", the number of samples it labels ``-1`` is printed.
        """

        def report_tab(tab_number, model, analysis_type, X):
            # Single code path for both tabs so their reports stay identical.
            if not model:
                print(f"No model selected in Tab{tab_number}")
                return
            if analysis_type != "Unsupervised":
                print(f"Tab {tab_number} is set to Supervised. Inspect the confusion matrix.")
                return
            # Estimators that offer fit_predict are refitted on X; the others only predict.
            if hasattr(model, 'fit_predict'):
                labels = model.fit_predict(X)
            else:
                labels = model.predict(X)
            # Only the label -1 is counted as an anomaly.
            anomalies = np.where(labels == -1)[0]
            print(f"The last selected model in Tab {tab_number} detected {len(anomalies)} indications/anomalies of illicit behavior")

        with comparison_output:
            clear_output()
            analysis_type_tab1 = type_toggle_tab1.value
            _, selected_groups_tab1 = get_selected_features_tab1()

            analysis_type_tab2 = type_toggle_tab2.value
            _, selected_groups_tab2 = get_selected_features_tab2()
            selected_model_tab2 = model_dropdown.value

            # NOTE(review): absolute, machine-specific path; the comparison fails where this file is absent.
            sanctioned = pd.read_csv("/home/marvin/informatica/scriptie/sanctioned.csv")
            common_blocks = set(df['block_number']).intersection(set(sanctioned['Block Number']))
            common_hash = set(df['hash']).intersection(set(sanctioned['Transaction Hash']))
            print(f"In the dataset there are: {len(common_blocks) + len(common_hash)} fraudulent activities according to the OFAC Sanctions list")

            deserialized_model_tab1 = load_model(selected_groups_tab1, analysis_type_tab1)
            # The same transformed matrix is fed to the models of both tabs.
            X_new = preprocessing_pipeline_un.fit_transform(df)
            report_tab(1, deserialized_model_tab1, analysis_type_tab1, X_new)

            deserialized_model_tab2 = load_model(selected_groups_tab2, analysis_type_tab2, model=selected_model_tab2)
            report_tab(2, deserialized_model_tab2, analysis_type_tab2, X_new)

    compare_button = widgets.Button(
        description='Compare Results',
        disabled=False,
        button_style='',
        tooltip='Click to compare results',
        icon='check'
    )

    compare_button.on_click(lambda b: compare_results())

    return tabs, compare_button, comparison_output

import pandas as pd
import re

# Results table of the supervised model search; its 'Best Params' and 'Best Score' columns are used below.
# NOTE(review): absolute, machine-specific path — this cell fails wherever the file is not at this location.
supervised_models = pd.read_csv("/home/marvin/informatica/scriptie/all_results (1).csv")

def extract_model(params):
    """Pull the estimator expression out of a "Best Params" string.

    Searches for ``model:`` followed by a ``Name(...)`` token, e.g.
    ``"model: SVC(C=1.0)"`` yields ``"SVC(C=1.0)"``. The match ends at the
    first closing parenthesis.

    Args:
        params: The parameter description to search; normally a string.

    Returns:
        The matched ``Name(...)`` text, or ``None`` when there is no match or
        ``params`` is not a string.
    """
    # Empty CSV cells reach ``Series.apply`` as float NaN, which ``re.search`` rejects with TypeError.
    if not isinstance(params, str):
        return None
    match = re.search(r'model:\s*(\w+\(.*?\))', params)
    return match.group(1) if match else None

# Reduce the supervised results to the extracted estimator expression and its score.
supervised_models = supervised_models.assign(
    Model=supervised_models['Best Params'].apply(extract_model)
).loc[:, ['Model', 'Best Score']]

# The unsupervised table stacks the clustering results on top of the outlier-detection results.
outlier_models = pd.read_csv("/home/marvin/informatica/scriptie/outlier_results (1).csv")
cluster_models = pd.read_csv("/home/marvin/informatica/scriptie/cluster_results (1).csv")
unsupervised_models = pd.concat(
    [cluster_models, outlier_models],
    ignore_index=True,
).loc[:, ["Model", "Best Score"]]

# Cell 6
import pandas as pd
import ipywidgets as widgets
from bqplot import *
from bqplot.interacts import BrushSelector, FastIntervalSelector
from ipywidgets import VBox, Output, HTML
from IPython.display import display, clear_output
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore, norm, f_oneway
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def create_plotting_widget(blocks):
    """Build and display the interactive exploration UI for ``blocks``.

    A dropdown picks a categorical column and every distinct value of that
    column becomes a button. Clicking a button filters ``blocks`` to that
    value, samples at most 500 rows and renders a correlation heatmap, a
    brushable bqplot scatter and the model tabs returned by ``create_tabs``.
    Brushing a rectangle on the scatter draws two matplotlib plots of the
    selected rows in which points more than two rolling standard deviations
    away from the rolling mean (window of 20 rows) are marked as outliers.

    Args:
        blocks: DataFrame to explore. It needs at least one numeric column
            and one object/category column other than ``relay``, ``builder``
            and ``mevboost_value``.

    Raises:
        ValueError: If ``blocks`` has no numeric or no usable categorical column.
    """
    categorical_columns = blocks.select_dtypes(include=['object', 'category']).columns.tolist()
    categorical_columns = [item for item in categorical_columns if item not in ['relay','builder','mevboost_value']]
    numeric_columns = blocks.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_columns or not categorical_columns:
        # Fail with a readable message instead of an IndexError from the defaults below.
        raise ValueError("create_plotting_widget needs at least one numeric and one categorical column")
    unique_values_dict = {col: sorted(blocks[col].dropna().unique()) for col in categorical_columns}
    categorical_columns_list = list(unique_values_dict.keys())

    output_grid = widgets.Output()
    output_info = widgets.Output()

    # Preferred defaults; fall back to whatever columns exist when they are absent.
    preset_x = 'block_number'
    preset_y = 'gas_earned'
    cat = 'validator_name'
    x_axis_value = preset_x if preset_x in numeric_columns else numeric_columns[0]
    if preset_y in numeric_columns:
        y_axis_value = preset_y
    else:
        # With a single numeric column, plot it against itself.
        y_axis_value = numeric_columns[1] if len(numeric_columns) > 1 else numeric_columns[0]
    cat_value = cat if cat in categorical_columns_list else categorical_columns_list[0]

    x_axis_dropdown = widgets.Dropdown(
        options=numeric_columns,
        value=x_axis_value,
        description='X-axis:'
    )

    y_axis_dropdown = widgets.Dropdown(
        options=numeric_columns,
        value=y_axis_value,
        description='Y-axis:'
    )

    feature_dropdown_x = widgets.Dropdown(
        options=numeric_columns,
        value=x_axis_value,
        description='X of selected interval plot:'
    )

    feature_dropdown = widgets.Dropdown(
        options=numeric_columns,
        value=y_axis_value,
        description='Y of selected interval plot:'
    )

    dropdown = widgets.Dropdown(
        options=categorical_columns_list,
        value=cat_value,
        description='Analyze:'
    )

    # State shared between the handlers below (rebound through ``nonlocal``).
    current_filtered_blocks = blocks
    last_clicked_button = None

    def create_figure(filtered_blocks):
        """Render heatmap, brushable scatter, interval plots and model tabs for ``filtered_blocks``."""
        x_col = x_axis_dropdown.value
        y_col = y_axis_dropdown.value

        x_data = filtered_blocks[x_col]
        y_data = filtered_blocks[y_col]

        x_scale = LinearScale()
        y_scale = LinearScale()

        scatter = Scatter(
            x=x_data,
            y=y_data,
            scales={'x': x_scale, 'y': y_scale},
            colors=["blue"],
            selected_style={"opacity": "1"},
            unselected_style={"opacity": "0.2"}
        )

        brush_selector = BrushSelector(x_scale=x_scale, y_scale=y_scale, marks=[scatter])

        output_plot = Output()

        def on_selected(change):
            """Plot the brushed rows and highlight rolling-window outliers."""
            selected = change['new']
            # Nothing to analyse without both corners of the brush rectangle.
            if selected is None or len(selected) < 2:
                return
            min_x, min_y = selected[0]
            max_x, max_y = selected[1]
            in_box = (x_data >= min_x) & (x_data <= max_x) & (y_data >= min_y) & (y_data <= max_y)
            selected_data = filtered_blocks[in_box]

            if selected_data.empty:
                with output_plot:
                    clear_output()
                    print("No data points inside the selected interval")
                return

            feature_col = feature_dropdown.value
            feature_x = feature_dropdown_x.value
            selected_data = selected_data.sort_values(by=feature_x)

            feature_x_data = selected_data[feature_x]
            feature_y_data = selected_data[feature_col]

            # A point is an outlier when it lies more than 2 rolling std from the rolling mean.
            window_size = 20
            rolling_mean = feature_y_data.rolling(window=window_size, min_periods=1).mean()
            rolling_std = feature_y_data.rolling(window=window_size, min_periods=1).std()

            outliers = (feature_y_data > rolling_mean + 2 * rolling_std) | (feature_y_data < rolling_mean - 2 * rolling_std)

            with output_plot:
                clear_output()

                print(f"Analysing features: {feature_x}, {feature_col}")
                mpl_fig, axes = plt.subplots(1, 2, figsize=(16, 8))

                # Left panel: all selected points with the outliers drawn in red on top.
                axes[0].plot(feature_x_data.values, feature_y_data.values, 'o', label='Selected Data')
                axes[0].plot(feature_x_data[outliers], feature_y_data[outliers], 'ro', label='Outliers')
                axes[0].set_xlabel(feature_x)
                axes[0].set_ylabel(feature_col)
                axes[0].set_title(f"Selected Data Plot ({feature_x} vs {feature_col})")
                axes[0].legend()

                # Right panel: density estimate with the points coloured by the outlier flag.
                points = axes[1].scatter(feature_x_data.values, feature_y_data.values, c=outliers, cmap='coolwarm', alpha=0.6)
                sns.kdeplot(data=selected_data, x=feature_x, y=feature_col, cmap='Blues', fill=True, bw_adjust=0.5, thresh=0.05, ax=axes[1])

                mpl_fig.colorbar(points, ax=axes[1], label='Anomaly')

                axes[1].set_title('Data Heatmap with Anomalies Highlighted')
                axes[1].set_xlabel(feature_x)
                axes[1].set_ylabel(feature_col)

                plt.tight_layout()
                plt.show()

        brush_selector.observe(on_selected, names='selected')

        x_ax = Axis(label=x_col, scale=x_scale)
        y_ax = Axis(label=y_col, scale=y_scale, orientation='vertical')

        fig = Figure(marks=[scatter], axes=[x_ax, y_ax], interaction=brush_selector)

        matrix = Output()

        with matrix:
            num = filtered_blocks.select_dtypes(include=[np.number]).columns.tolist()
            corr_matrix = filtered_blocks[num].corr()

            plt.figure(figsize=(12, 6))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
            plt.title('Correlation Matrix')
            plt.show()
            print(f"Analysing features: {x_col}, {y_col}")
        help_label = HTML(
                '<div style="color: black; font-size: 16px; margin:20px 0px 0px 50px">\
                  Choose features to analyse the selected interval with time series analysis</div>'
        )
        help_label2 = HTML(
        '<div style="color: black; font-size: 16px; margin:20px 0px 0px 50px">\
          Select an interval with your mouse to generate a detailed plot with statistical anomaly detection</div>'
        )

        # The model tabs always receive the full frame, not the filtered sample.
        tabs, compare_button, comparison_output = create_tabs(supervised_models, unsupervised_models, blocks)

        display(VBox([matrix, x_axis_dropdown, y_axis_dropdown,help_label2, fig, help_label, feature_dropdown_x, feature_dropdown, output_plot, tabs, compare_button, comparison_output]))

    def on_button_click(b):
        """Filter ``blocks`` to the clicked value and redraw the figure on a sample of it."""
        nonlocal current_filtered_blocks, last_clicked_button
        with output_info:
            clear_output()
            last_clicked_button = b.description
            print(f"Clicked the button: {b.description}, {dropdown.value}")
            feature = dropdown.value
            if feature == 'day':
                # Button captions are strings; 'day' is compared against a date, so parse the caption back.
                mask = blocks['day'] == pd.to_datetime(b.description).date()
            else:
                mask = blocks[feature] == b.description
            filtered_blocks = blocks[mask]
            print(filtered_blocks.shape)
            if not filtered_blocks.empty:
                # Cap the number of plotted rows at 500.
                current_filtered_blocks = filtered_blocks.sample(min(500, len(filtered_blocks)))
                create_figure(current_filtered_blocks)

    def create_button_grid(feature):
        """Show one button per distinct value of ``feature``, seven per row."""
        with output_grid:
            clear_output()
            unique_values = unique_values_dict[feature]
            n_columns = 7
            # Keep at least one row so a column without values still yields a valid (empty) grid.
            n_rows = max(1, (len(unique_values) + n_columns - 1) // n_columns)
            grid = widgets.GridspecLayout(n_rows=n_rows, n_columns=n_columns, width='100%', grid_gap='10px 5px')

            for idx, value in enumerate(unique_values):
                row = idx // n_columns
                col = idx % n_columns
                button = widgets.Button(description=str(value))
                button.on_click(on_button_click)
                grid[row, col] = button
            display(grid)

    def on_dropdown_change(change):
        """Rebuild the button grid for the newly chosen categorical column."""
        create_button_grid(change.new)

    def on_axis_dropdown_change(change):
        """Redraw the current selection with the new axis columns."""
        with output_info:
            clear_output()
            if last_clicked_button:
                print(f"Clicked the button: {last_clicked_button}")
            create_figure(current_filtered_blocks)

    dropdown.observe(on_dropdown_change, names='value')
    x_axis_dropdown.observe(on_axis_dropdown_change, names='value')
    y_axis_dropdown.observe(on_axis_dropdown_change, names='value')

    display(VBox([dropdown, output_grid,output_info]))
    create_button_grid(dropdown.value)

# Cell 7
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from io import BytesIO
import time
import psutil
import ipywidgets as widgets
from ipyvuetify.extra import FileInput
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Upload widget and the output area its handler prints into.
# NOTE(review): multiple=True presumably lets the user pick several files at once — confirm against ipyvuetify.
fi = FileInput(multiple=True) 
out = widgets.Output()

# Module-level buckets that preprocess_data() resets and refills on every upload.
dfs_to_merge = []
dfs_to_concat = []
# preprocess_data() assigns a local variable of the same name, so this frame is not updated by it.
final_df = pd.DataFrame()

# Cleaning / feature pipeline that preprocess_data() applies to the merged frame via fit_transform.
# The step classes are not defined in this cell; they must already exist in the kernel.
preprocessing_pipeline = Pipeline(steps=[
    ('drop_columns', DropColumns()),
    ('remove_duplicates', RemoveDuplicates()),
    ('date_conversion', DateConversion()),
    ('calculate_gas', CalculateEarnedGas()),
    ('feature_engineering', FeatureEngineering()),
])

def preprocess_data(file_data):
    """Turn uploaded files into one DataFrame and open the plotting widget on it.

    Files are routed by name: CSVs whose name contains ``blocks`` and files
    ending in ``.gzip`` (read as parquet) are inner-joined on
    ``block_number``; every other CSV is concatenated row-wise. When at least
    two files were joined, the result (further joined with the concatenated
    CSVs, if any) is run through ``preprocessing_pipeline`` and written to
    ``new2.csv``. Otherwise the concatenated CSVs are used as they are.

    Args:
        file_data: Iterable of ``(raw_bytes, file_name)`` pairs.

    Returns:
        None. Returns early, without opening the widget, when there is no
        data to show.
    """
    global dfs_to_merge, dfs_to_concat

    dfs_to_merge = []
    dfs_to_concat = []

    for data, name in file_data:
        print(f"Processing file: {name}")
        if 'blocks' in name and name.endswith('.csv'):
            dfs_to_merge.append(pd.read_csv(BytesIO(data)))
        elif name.endswith('.gzip'):
            dfs_to_merge.append(pd.read_parquet(BytesIO(data)))
        elif name.endswith('.csv'):
            dfs_to_concat.append(pd.read_csv(BytesIO(data)))
        else:
            print(f"Unsupported file type: {name}")

    print(f"Dataframes to merge: {len(dfs_to_merge)}, Dataframes to concatenate: {len(dfs_to_concat)}")
    print("Data is preprocessing...")
    display(pipeline_cleaning2)
    display(preprocessor)

    if len(dfs_to_merge) > 1:
        merged_df = dfs_to_merge[0]
        for other_df in dfs_to_merge[1:]:
            merged_df = merged_df.merge(other_df, on='block_number', how='inner')

        if dfs_to_concat:
            concatenated_df = pd.concat(dfs_to_concat, ignore_index=True)
            final_df = merged_df.merge(concatenated_df, on='block_number', how='inner')
        else:
            final_df = merged_df

        if final_df.empty:
            print("Please update blocks.csv or parquet.csv")
            print("Merged dataframe is empty. No common 'block_number' to merge on.")
            # Stop here: there is no data to explore, so the widget must not be opened.
            return

        final_df = preprocessing_pipeline.fit_transform(final_df)
        final_df.to_csv('new2.csv', index=False)

        print(final_df.shape)
    else:
        # NOTE(review): a single mergeable file is ignored on this path — confirm that is intended.
        print("There are no files to merge")

        if not dfs_to_concat:
            print("Dataframe is empty")
            return
        # Concatenated CSVs are shown as uploaded; the pipeline is not applied to them.
        final_df = pd.concat(dfs_to_concat, ignore_index=True)

    create_plotting_widget(final_df)
    
def handle_file_upload(change):
    """Process the uploaded files and report the resources the run consumed.

    Reads every file currently held by the ``fi`` widget, hands the
    ``(bytes, name)`` pairs to ``preprocess_data`` and prints the elapsed
    time, the CPU utilisation over that period and the change in used
    memory. Everything is rendered inside the ``out`` widget.

    Args:
        change: Change notification passed by ``fi.observe``; not inspected.
    """
    with out:
        out.clear_output()
        start_total = time.perf_counter()
        # Prime psutil's counter: with interval=None each call reports utilisation
        # since the previous call, and the very first reading is meaningless.
        psutil.cpu_percent(interval=None)
        mem_start_total = psutil.virtual_memory().used / (1024 ** 2)

        files = fi.get_files()
        file_data = [(file['file_obj'].read(), file['name']) for file in files]
        preprocess_data(file_data)

        time_taken = time.perf_counter() - start_total
        # Utilisation since the priming call above, i.e. over the processing window.
        cpu_usage = psutil.cpu_percent(interval=None)
        # Difference in used memory (in MB) as reported by psutil.virtual_memory().
        mem_usage = psutil.virtual_memory().used / (1024 ** 2) - mem_start_total

        print(f"Total - Time: {time_taken:.2f} seconds, CPU Usage: {cpu_usage:.2f}%, Memory Usage: {mem_usage:.2f} MB")

# Run the upload handler whenever the widget's `file_info` trait changes,
# then show the upload widget together with the area the handler prints into.
fi.observe(handle_file_upload, names='file_info')
display(fi, out)


2024-06-24 19:53:02.038415: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


FileInput(events=['upload'])

Output()