In [13]:
import os
import csv
import glob
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, f1_score
from sklearn.utils import resample
from datetime import datetime
from xgboost import XGBClassifier

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    precision_recall_curve,
    roc_curve,
    auc
)

# Load Foundational Address Features

In [14]:
# Folder holding the foundational per-address feature CSVs.
# Built with os.path.join so the notebook also runs outside Windows.
directory = os.path.join("data", "20241104_layer0_sybil_features")

# os.listdir returns entries in arbitrary order. Sort them so the row order of
# the concatenated frame, and therefore which duplicate `addr` row survives
# drop_duplicates(keep='first') below, is the same on every run.
csv_files = sorted(file for file in os.listdir(directory) if file.endswith('.csv'))

# Read each CSV (a header row is expected) and lower-case its column names
dfs = []
for file in csv_files:
    file_path = os.path.join(directory, file)
    df = pd.read_csv(file_path)
    df.columns = map(str.lower, df.columns)
    dfs.append(df)

# Concatenate all dataframes into one
df = pd.concat(dfs, ignore_index=True)

# Data cleaning
df = df.drop(columns=['rank'], errors='ignore')
df['possible_triangles'] = df['in_degree'] * df['out_degree']
df = df.drop(columns=['in_degree', 'out_degree'])  # Duplicated in "node metrics"

# Remove the zero address.
# NOTE(review): this comparison is case-sensitive and `addr` is never
# lower-cased in this cell - confirm the source files store lower-case hex.
df = df[df['addr'] != '0x0000000000000000000000000000000000000000']

# Drop rows in which every feature (every column except `addr`) is missing,
# then keep `addr` as the last column.
feature_columns = [column for column in df.columns if column != 'addr']
df = df.dropna(how='all', subset=feature_columns)
df = df[feature_columns + ['addr']]

# One row per address; the first occurrence wins
df = df.drop_duplicates(subset=['addr'], keep='first')

# Load Labels from Multiple Sources

In [15]:
# Folder holding the labeled-address lists (portable path instead of a
# Windows-only backslash literal).
directory = os.path.join("data", "20241214_labeled_addresses")

csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]

labeled_addresses = set()

# Each file is read as plain text, one stripped line per set entry.
# NOTE(review): if these files carry a header row, the header text ends up in
# the set as well - confirm the files are header-less.
for file in csv_files:
    file_path = os.path.join(directory, file)
    with open(file_path, 'r') as f:
        # Update in place rather than rebuilding the whole set for every file
        labeled_addresses.update(line.strip() for line in f)

# Blank lines strip to '', which is not an address
labeled_addresses.discard('')
        

In [16]:
# Directory of CEX label CSVs. The default is the original author's local
# path; set the CEX_LABEL_DIR environment variable to run the notebook on
# another machine without editing this cell.
label_dir = os.environ.get(
    "CEX_LABEL_DIR",
    r"C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241013_hildobby_cex_evms",
)

cex_addresses = set()

# Collect the distinct values of the `address` column from every CSV file
for filename in sorted(os.listdir(label_dir)):
    if filename.endswith('.csv'):
        file_path = os.path.join(label_dir, filename)
        df_labels = pd.read_csv(file_path)
        # dropna: an empty `address` cell would otherwise put NaN into the set,
        # where it could match missing values in later membership tests.
        cex_addresses.update(df_labels['address'].dropna().unique())

# Load Features for Gas Provision Network

In [17]:
# Folder holding the gas-provision edge lists (portable path)
csv_directory = os.path.join("data", "20241114_gas_provision")

# Maps activated_address -> gas_provider
to_from_mapping = {}

# Sorted so that, when an activated address appears in more than one file,
# the provider that wins (the last one written) is the same on every run.
for csv_file in sorted(glob.glob(os.path.join(csv_directory, "*.csv"))):
    # newline="" is what the csv module documentation asks for on file objects
    with open(csv_file, mode="r", newline="") as file:
        reader = csv.DictReader(file)
        for row in reader:
            to_node = row.get("activated_address")
            from_node = row.get("gas_provider")
            if to_node and from_node:  # Ensure neither key nor value is empty
                to_from_mapping[to_node] = from_node

# Provider of each address; NaN where the address has no known provider
gas_provider = df["addr"].map(to_from_mapping)
has_provider = gas_provider.notna()

# A missing provider is never a member of any set, so every flag is gated on
# has_provider instead of relying on truthiness (NaN is truthy in Python).
df["provider_is_labeled"] = has_provider & gas_provider.isin(labeled_addresses)
df["provider_is_cex"] = has_provider & gas_provider.isin(cex_addresses)
val = set(df["addr"].values)
df["provider_is_interactor"] = has_provider & gas_provider.isin(val)
df["provider_is_null"] = ~has_provider

In [18]:
df_gp = None
xs = []
# Portable pattern, sorted because glob returns matches in arbitrary order
files = sorted(glob.glob(os.path.join("data", "20241117_tree_features", "*.csv")))

# Read each CSV (a header row is expected) and lower-case its column names
for file in files:
    x = pd.read_csv(file)
    x.columns = map(str.lower, x.columns)
    xs.append(x)

df_gp = pd.concat(xs, ignore_index=True)
# Column is incorrect in the file, correctly computed here.
df_gp = df_gp.drop(columns='provider_is_labeled')

# Left join keeps every address in df; addresses without tree features get 0.
# NOTE: this cell is not idempotent - running it twice merges the same columns
# again and pandas renames the overlap with _x/_y suffixes.
df = df.merge(df_gp, on='addr', how='left')
df = df.fillna(0).copy()

In [19]:
df_2 = None
xs = []
# Portable pattern, sorted because glob returns matches in arbitrary order
files = sorted(glob.glob(os.path.join("data", "20250208_cex_dex_indegree", "*.csv")))

# Read each CSV (a header row is expected) and lower-case its column names
for file in files:
    x = pd.read_csv(file)
    x.columns = map(str.lower, x.columns)
    xs.append(x)

df_2 = pd.concat(xs, ignore_index=True)

# NOTE(review): this is an inner join (the pandas default, now spelled out),
# so every address without a matching `to_address` row is dropped from df.
# The tree-feature merge uses how='left' followed by fillna(0) - confirm that
# dropping unmatched addresses is intended here.
df = df.merge(df_2, left_on='addr', right_on='to_address', how='inner')
df.columns = map(str.lower, df.columns)
df = df.fillna(0).copy()

# Load Sybil Labels

In [25]:
df_gp = None
# Portable path to the sybil address list
file = os.path.join("data", "20240915_final_sybil_list", "fcfs_list.csv")

address_df = pd.read_csv(file)  # A header row with an `address` column is expected
address_df.columns = map(str.lower, address_df.columns)
address_df['address'] = address_df['address'].str.lower()

# Lower-cased sybil addresses; missing values are excluded so that NaN can
# never count as a sybil address in later membership tests.
address_set = set(address_df['address'].dropna())

# Add Chain Length as a Feature and Finalize Master DataFrame

In [26]:
def get_chain_length(addr, to_from_mapping, interactor_set, labeled_addresses_set):
    """
    Walk the provider chain upward from `addr` and measure it.

    The chain is followed through `to_from_mapping[child] = provider`. The
    walk stops when:
      * the current address has no provider in `to_from_mapping`,
      * the provider is in `labeled_addresses_set` (that hop is counted),
      * the provider equals the current address (self-loop, hop not counted),
      * the provider was already visited (cycle, closing hop not counted).

    The cycle check is what guarantees termination: without it a loop such as
    a -> b -> a that contains no labeled address would never end.

    Parameters
    ----------
    addr : hashable
        Address the walk starts from.
    to_from_mapping : dict
        Maps an address to its provider.
    interactor_set : set
        Addresses counted as interactors when visited as the current node.
    labeled_addresses_set : set
        Providers that terminate the chain.

    Returns
    -------
    tuple of (int, int)
        `(length, interactors_in_chain)`: the number of hops followed, and the
        number of visited addresses (including `addr`, excluding a terminating
        labeled provider) that are in `interactor_set`.
    """
    length = 0
    interactors_in_chain = 0
    current = addr
    visited = {addr}

    while True:
        if current in interactor_set:
            interactors_in_chain += 1

        # If there's no known provider, we can't go further
        if current not in to_from_mapping:
            return length, interactors_in_chain

        provider = to_from_mapping[current]
        if current == provider:
            print("UNEXPECTED: provider is activated address", current)
            return length, interactors_in_chain

        # If the provider is labeled, we've reached the chain's start.
        # Checked before the cycle test so that every input that terminated
        # before this guard existed still returns the same result.
        if provider in labeled_addresses_set:
            return length + 1, interactors_in_chain

        # A provider we have already walked through means the mapping loops
        if provider in visited:
            print("UNEXPECTED: cycle in provider chain at", provider)
            return length, interactors_in_chain

        # Continue up the chain
        length += 1
        visited.add(provider)
        current = provider

# Alias for the labeled-address set under the name get_chain_length's parameter uses
labeled_addresses_set = labeled_addresses

# Addresses that appear as a provider for at least one other address
provider_addresses = set(to_from_mapping.values())

# Master frame: a copy of the merged features plus the sybil label and the provider flag
master_df = df.copy()
master_df['sybil'] = master_df['addr'].isin(address_set)
master_df['is_provider'] = master_df['addr'].isin(provider_addresses)

interactor_set = set(master_df["addr"])

# One (chain length, interactor count) pair per row, in row order
chain_stats = [
    get_chain_length(address, to_from_mapping, interactor_set, labeled_addresses_set)
    for address in master_df["addr"]
]
chain_lengths = [stats[0] for stats in chain_stats]
interactor_counts = [stats[1] for stats in chain_stats]

master_df["chain_length"] = chain_lengths
master_df["interactors_in_chain"] = interactor_counts

# Snapshot of the finished master frame
master_df_orig = master_df.copy()


UNEXPECTED: provider is activated address 0x35d61a0648b972b3eb2686e36dcabaeff92b155b


# Function Definitions for Model Training

In [27]:
def split_and_balance_data(df, label_column='sybil', test_size=0.3, val_size=0.3, random_state=42):
    """
    Split `df` into train / validation / test sets and balance the training set.

    The test set is split off first (`test_size` of all rows). The validation
    set is then taken from what is left (`val_size` of the remaining rows).
    Both splits are stratified on the label. Only the training part is
    balanced: rows whose label equals 1 are resampled with replacement until
    they match the number of rows whose label equals 0. Validation and test
    sets are returned untouched.

    Returns
    -------
    tuple
        (X_train_balanced, y_train_balanced, X_val, y_val, X_test, y_test)
    """
    features = df.drop(columns=[label_column])
    labels = df[label_column]

    # First split: hold out the test set
    X_remaining, X_test, y_remaining, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    # Second split: carve the validation set out of the remainder
    X_train, X_val, y_train, y_val = train_test_split(
        X_remaining,
        y_remaining,
        test_size=val_size,
        random_state=random_state,
        stratify=y_remaining,
    )

    # Put features and label back together so whole rows can be resampled
    train_frame = pd.concat([X_train, y_train], axis=1)
    label_values = train_frame[label_column]
    negatives = train_frame[label_values == 0]
    positives = train_frame[label_values == 1]

    # Draw label-1 rows with replacement up to the label-0 count
    positives_upsampled = resample(
        positives,
        replace=True,
        n_samples=len(negatives),
        random_state=random_state,
    )

    balanced = pd.concat([negatives, positives_upsampled])

    return (
        balanced.drop(columns=[label_column]),
        balanced[label_column],
        X_val,
        y_val,
        X_test,
        y_test,
    )


def _best_f1_threshold(precision, recall, thresholds):
    """Return the entry of `thresholds` with the highest F1 score.

    `precision` and `recall` are the arrays returned by
    `precision_recall_curve`, which are one element longer than `thresholds`
    (the final point has no threshold). Points where precision + recall is 0
    get F1 = 0 instead of NaN; a NaN would otherwise be picked by `np.argmax`.
    """
    precision = np.asarray(precision, dtype=float)
    recall = np.asarray(recall, dtype=float)
    denominator = precision + recall
    f1_scores = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator > 0,
    )
    # Only the first len(thresholds) points have a matching threshold
    return thresholds[np.argmax(f1_scores[:len(thresholds)])]


def train_and_evaluate_model(X_train, y_train, X_val, y_val, selected_features, params):
    """
    Fit an XGBClassifier on `selected_features` and report validation metrics.

    Prints the confusion matrix and classification report on the validation
    set at a 0.5 cut-off and at the F1-optimal threshold, shows the
    precision-recall and ROC curves, and prints and plots the 20 most
    important features (importance type 'weight').

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels.
    selected_features : list of column names used for fitting and prediction.
    params : dict of keyword arguments passed to `XGBClassifier`.

    Returns
    -------
    tuple
        `(model, best_threshold)`: the fitted classifier and the validation
        threshold that maximises F1.
    """
    start_time = time.time()

    model = XGBClassifier(**params)
    model.fit(X_train[selected_features], y_train, verbose=False)

    # Probability of the positive class on the validation set
    y_probs_val = model.predict_proba(X_val[selected_features])[:, 1]
    y_pred = (y_probs_val > 0.5).astype(int)

    print("Confusion Matrix (Validation Set):")
    print(confusion_matrix(y_val, y_pred))

    print("\nClassification Report (Validation Set):")
    print(classification_report(y_val, y_pred, digits=3))

    precision, recall, thresholds = precision_recall_curve(y_val, y_probs_val)
    best_threshold = _best_f1_threshold(precision, recall, thresholds)
    print(f"Optimal threshold (Validation Set): {best_threshold}")

    y_pred_optimal = (y_probs_val >= best_threshold).astype(int)

    print("\nConfusion Matrix (Optimal Threshold, Validation Set):")
    print(confusion_matrix(y_val, y_pred_optimal))

    print("\nClassification Report (Optimal Threshold, Validation Set):")
    print(classification_report(y_val, y_pred_optimal, digits=3))

    print("Time elapsed:", time.time() - start_time)

    # ROC curve
    fpr, tpr, _ = roc_curve(y_val, y_probs_val)
    roc_auc = auc(fpr, tpr)

    # Precision-Recall and ROC side-by-side
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    axes[0].plot(recall, precision, marker='.')
    axes[0].set_title('Precision-Recall Curve')
    axes[0].set_xlabel('Recall')
    axes[0].set_ylabel('Precision')
    axes[0].grid(True)

    axes[1].plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}", color='darkorange')
    axes[1].plot([0, 1], [0, 1], linestyle='--', color='gray')
    axes[1].set_title('ROC Curve')
    axes[1].set_xlabel('False Positive Rate')
    axes[1].set_ylabel('True Positive Rate')
    axes[1].legend(loc='lower right')
    axes[1].grid(True)

    plt.tight_layout()
    plt.show()

    # Feature importances (Top 20)
    booster = model.get_booster()
    feature_importances = booster.get_score(importance_type='weight')
    sorted_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)[:20]

    features = [item[0] for item in sorted_importances]
    importances = [item[1] for item in sorted_importances]

    importance_df = pd.DataFrame({
        "Feature": features,
        "Importance": importances
    })

    print("\nTop 20 Feature Importances:")
    print(importance_df.to_string(index=False))

    plt.figure(figsize=(10, 8))
    plt.barh(features, importances, color='skyblue')
    plt.xlabel("Feature Importance (Weight)", fontsize=12)
    plt.title("Top 20 Feature Importances from XGBoost", fontsize=14)
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    plt.show()

    return model, best_threshold

# Define Model Parameters and Features

In [28]:
# Keyword arguments handed to XGBClassifier
params = dict(
    objective='binary:logistic',
    random_state=42,
    eval_metric='logloss',
    n_estimators=2000,
)

# Columns used for fitting and prediction. The order is kept exactly as it
# is, because it defines the column order of the frames given to the model.
selected_features = [
    "min_tx_value_out",
    "gini_coefficient",
    "cex_in_count",
    "leaf_gas_distribution_entropy",
    "star_like_ratio",
    "provider_is_star_like_attack",
    "leaf_gas_distribution_skewness",
    "interactors_in_chain",
    "provider_is_labeled",
    "provider_is_interactor",
    "l0_to_eth_avg_native_drop_usd",
    "l0_to_eth_max_native_drop_usd",
    "balance_factor",
    "l0_tx_time_span",
    "latest_l0_tx_time",
    "time_span_in",
    "provider_total_gas_provision_amount",
    "l0_avg_stargate_swap",
    "avg_depth",
    "breadth_factor",
    "indegree_per_block_in",
    "gas_distribution_skewness",
    "tree_size",
    "l0_min_stargate_swap",
    "provider_max_gas_provision_amount",
    "total_gas",
    "num_transactions_in",
    "branching_factor",
    "l0_to_eth_tx_time_span",
    "n_l0_to_eth_source_contracts",
    "n_l0_to_eth_projects",
    "provider_is_null",
    "max_depth",
    "leaf_provision_proportion",
    "n_l0_to_eth_project_per_source_chain",
    "n_l0_to_eth_txs",
    "earliest_l0_tx_time",
    "n_l0_projects",
    "n_l0_to_eth_dest_contracts",
    "provider_fan_out",
    "l0_to_eth_min_stargate_swap",
    "n_l0_source_chains",
    "longest_chain_ratio",
    "n_eth_interactions",
    "provider_avg_gas_provision_amount",
    "gas_distribution_entropy",
    "sparsity",
    "max_tx_value_out",
    "n_l0_source_contracts",
    "gas_provision_block_number",
    "min_tx_value_in",
    "tx_value_per_block_out",
    "chain_length",
    "provider_min_gas_provision_amount",
    "n_l0_to_eth_source_chains",
    "depth",
    "l0_to_eth_max_stargate_swap",
    "breadth_to_depth_ratio",
    "leaf_to_internal_ratio",
    "earliest_tx_block_in",
    "n_l0_project_per_source_chain",
    "l0_to_eth_avg_stargate_swap",
    "is_provider",
]

# Final Training

In [None]:
# Train on a copy of the master frame
master_df_subset = master_df.copy()

# Stratified train / validation / test split; only the training part is balanced
split_result = split_and_balance_data(
    master_df_subset,
    label_column='sybil',
    test_size=0.3,
    val_size=0.3,
    random_state=42,
)
X_train_balanced, y_train_balanced, X_val, y_val, X_test, y_test = split_result

# Train and evaluate the model
model, best_threshold = train_and_evaluate_model(
    X_train=X_train_balanced,
    y_train=y_train_balanced,
    X_val=X_val,
    y_val=y_val,
    selected_features=selected_features,
    params=params,
)
