In [1]:
import os
import pandas as pd

# Define the paths to the directories
dir_funding_tx = r'C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241114_gas_provision\\'

# Function to load all CSV files from a directory into a single dataframe
def load_all_csvs(directory):
    all_dfs = []
    for file in os.listdir(directory):
        if file.endswith('.csv'):
            file_path = os.path.join(directory, file)
            df = pd.read_csv(file_path)
            # Lowercase all column names
            df.columns = df.columns.str.lower()
            all_dfs.append(df)
    if all_dfs:
        return pd.concat(all_dfs, ignore_index=True)  # Concatenate all dataframes
    return None

# Load all CSVs from each directory
df_funding_tx = load_all_csvs(dir_funding_tx)

df_funding_tx.rename(columns={'block_number': 'gas_provision_block_number'})

# Check if the dataframes are not empty and print their head
if df_funding_tx is not None:
    print("First few rows of funding transactions:")
    print(df_funding_tx.head())
else:
    print("No funding transactions found.")

First few rows of funding transactions:
                            activated_address  \
0  0x0000000000000000000000000000000000000000   
1  0x0000000000000068f116a894984e2db1123eb395   
2  0x00000000000000adc04c56bf30ac9d3c0aaf14dc   
3  0x00000000000001ad428e4906ae43d8f9852d0dd6   
4  0x00000000000007736e2f9aa5630b8c812e1f3fc9   

                                 gas_provider gas_provision_amount  \
0                                         NaN  5000000000000000000   
1  0xf0e16c071e2cd421974dcb76d9af4dedb578e059     1000000000000000   
2  0xfba662e1a8e91a350702cf3b87d0c2d2fb4ba57f     2000000000000000   
3  0x4d41232b0d963afb52cd0354da5819b259f133bd       50000000000000   
4  0xe4edb277e41dc89ab076a1f049f4a3efa700bce8  2651340000000001371   

  first_gas_provision_time  block_number  \
0  2015-07-30 20:40:19 UTC          5305   
1  2024-03-23 01:57:23 UTC      19494173   
2  2023-04-27 20:40:35 UTC      17139947   
3  2023-02-22 00:38:47 UTC      16680486   
4  2023-11-18 04:01:23 U

In [2]:
df_funding_tx = df_funding_tx[df_funding_tx['activated_address'] != '0x0000000000000000000000000000000000000000']
df_funding_tx.dtypes
# Ensure 'gas_provision_amount' is a numeric field
df_funding_tx['gas_provision_amount'] = pd.to_numeric(df_funding_tx['gas_provision_amount'], errors='coerce')

# Convert from gwei to ETH
df_funding_tx['gas_provision_amount'] = df_funding_tx['gas_provision_amount'] / 1e9 / 1e9  # OR simply / 1e18
df_funding_tx[df_funding_tx.activated_address=='0x0485f72ffeaec67e634ac0b702806051e686aac3']

Unnamed: 0,activated_address,gas_provider,gas_provision_amount,first_gas_provision_time,block_number,tx_hash
11359,0x0485f72ffeaec67e634ac0b702806051e686aac3,0xbf94f0ac752c739f623c463b5210a7fb2cbb420b,0.11,2023-09-07 06:27:11 UTC,18082761,0xdc360101811bf6ca45c506bc15886b88d0a65402e6ed...


In [3]:
shape_before = df_funding_tx.shape
print("Shape before:", shape_before)

# Remove rows where 'gas_provider' is null
df_funding_tx = df_funding_tx[df_funding_tx['gas_provider'].notnull()]

# Display shape after removing null values
shape_after = df_funding_tx.shape
print("Shape after:", shape_after)



Shape before: (604863, 6)
Shape after: (604661, 6)


In [4]:
import pandas as pd

# Step 1: Check for duplicated (activated_address, gas_provider) pairs
duplicate_pairs = df_funding_tx[df_funding_tx.duplicated(subset=['activated_address', 'gas_provider'], keep=False)]

# Step 2: Count of duplicated pairs for summary
num_duplicates = duplicate_pairs.shape[0]

# Step 3: Display duplicate pairs (if any)
print(duplicate_pairs)

# Step 1: Group by 'activated_address' and count unique 'gas_provider' for each address
provider_count = df_funding_tx.groupby('activated_address')['gas_provider'].nunique()

# Step 2: Filter addresses that have more than one unique 'gas_provider'
addresses_with_multiple_providers = provider_count[provider_count > 1]

# Step 3: Display the result
print(addresses_with_multiple_providers)

# Step 1: Deduplicate based on 'activated_address' and 'gas_provider'
deduplicated_df = df_funding_tx.drop_duplicates(subset=['activated_address', 'gas_provider'], keep='first')

# Step 2: Display the deduplicated DataFrame
print(deduplicated_df)

df_funding_tx = deduplicated_df

import gc

# Force garbage collection
gc.collect()

print(df_funding_tx)


Empty DataFrame
Columns: [activated_address, gas_provider, gas_provision_amount, first_gas_provision_time, block_number, tx_hash]
Index: []
Series([], Name: gas_provider, dtype: int64)
                                 activated_address  \
1       0x0000000000000068f116a894984e2db1123eb395   
2       0x00000000000000adc04c56bf30ac9d3c0aaf14dc   
3       0x00000000000001ad428e4906ae43d8f9852d0dd6   
4       0x00000000000007736e2f9aa5630b8c812e1f3fc9   
5       0x000000000000ad05ccc4f10045630fb830b95127   
...                                            ...   
604859  0xfffff6c029985fae0a82725ea77e011c47c9a65d   
604860  0xfffff9b1c2c387b1e3d19af292d91d913374f42b   
604861  0xfffffacc41e00f96f6af4af0154ad18749c9d5ea   
604862  0xffffffd7cc631a49dc9dc7a14ef83875360e2fee   
604863  0xfffffff18945595afc492397824d1319aa2d12d2   

                                      gas_provider  gas_provision_amount  \
1       0xf0e16c071e2cd421974dcb76d9af4dedb578e059              0.001000   
2       0xfba6

In [5]:
import pandas as pd
import numpy as np
import os

# Load address labels to exclude certain addresses from analysis
label_dir = r"C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241011_l0_address_labels"

# Initialize a set to hold addresses to exclude
labeled_addresses = set()

# Load each CSV file in the specified directory
for filename in os.listdir(label_dir):
    if filename.endswith('.csv'):
        file_path = os.path.join(label_dir, filename)
        # Load the CSV file
        df_labels = pd.read_csv(file_path)
        labeled_addresses.update(df_labels['ADDR'].unique())

label_dir = r"C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241013_hildobby_cex_evms"

# Load each CSV file in the specified directory
for filename in os.listdir(label_dir):
    if filename.endswith('.csv'):
        file_path = os.path.join(label_dir, filename)
        # Load the CSV file
        df_labels = pd.read_csv(file_path)
        labeled_addresses.update(df_labels['address'].unique())  
        
        
# ARE WE GETTING CONTRACTS AS GAS PROVIDERS?  WHY?

label_dir = r"C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241114_bigquery_contract_list"

# Load each CSV file in the specified directory
for filename in os.listdir(label_dir):
    if filename.endswith('.csv'):
        file_path = os.path.join(label_dir, filename)
        # Load the CSV file
        df_labels = pd.read_csv(file_path)
        labeled_addresses.update(df_labels['address'].unique())    
        
        
label_dir = r"C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241112_dune_contract_list"

# Load each CSV file in the specified directory
for filename in os.listdir(label_dir):
    if filename.endswith('.csv'):
        file_path = os.path.join(label_dir, filename)
        # Load the CSV file
        df_labels = pd.read_csv(file_path)
        labeled_addresses.update(df_labels['address'].unique()) 
        
        
#C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241112_dawsbot_labels

label_dir = r"C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241112_dawsbot_labels"

# Load each CSV file in the specified directory
for filename in os.listdir(label_dir):
    if filename.endswith('.csv'):
        file_path = os.path.join(label_dir, filename)
        # Load the CSV file
        df_labels = pd.read_csv(file_path)
        labeled_addresses.update(df_labels['address'].unique())     

#C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241112_brianleect_labels
label_dir = r"C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241112_brianleect_labels"

# Load each CSV file in the specified directory
for filename in os.listdir(label_dir):
    if filename.endswith('.csv'):
        file_path = os.path.join(label_dir, filename)
        # Load the CSV file
        df_labels = pd.read_csv(file_path)
        labeled_addresses.update(df_labels['Address'].unique())     




labeled_addresses.add('0x00000000000007736e2f9aa5630b8c812e1f3fc9')
labeled_addresses.add('0xe93685f3bba03016f02bd1828badd6195988d950')
labeled_addresses.add('0xd64791e747188b0e5061fc65b56bf20fee2e3321')
labeled_addresses.add('0x000f422887ea7d370ff31173fd3b46c8f66a5b1c')
labeled_addresses.add('0x912fd21d7a69678227fe6d08c64222db41477ba0')
labeled_addresses.add('0x230a1ac45690b9ae1176389434610b9526d2f21b')
labeled_addresses.add('0x151b381058f91cf871e7ea1ee83c45326f61e96d')
labeled_addresses.add('0x3b794929566e3ba0f25e4263e1987828b5c87161')
labeled_addresses.add('0x912fd21d7a69678227fe6d08c64222db41477ba0')
labeled_addresses.add('0x91962711a4d2e4a830b366ce7276d99001e8564b')
labeled_addresses.add('0x0385b3f162a0e001b60ecb84d3cb06199d78f666')
labeled_addresses.add('0x4fb5df81b644e3bd5ad0ba07dce2b67559c764e0')
labeled_addresses.add('0x2a038e100f8b85df21e4d44121bdbfe0c288a869')
labeled_addresses.add('0xcc9557f04633d82fb6a1741dcec96986cd8689ae')
labeled_addresses.add('0x963737c550e70ffe4d59464542a28604edb2ef9a')
labeled_addresses.add('0x2db1d8cdf1abe8c70b531a790cdf2ff38aecf652')
labeled_addresses.add('0x5e809a85aa182a9921edd10a4163745bb3e36284')
labeled_addresses.add('0xa88902d6e93922893ee77234ed1c3ba4bec90224')
labeled_addresses.add('0x1439eda7f9a911b9120e9a0dafb60eae317f7685')
labeled_addresses.add('0x1eaca1277bcdfa83e60658d8938b3d63cd3e63c1')
labeled_addresses.add('0x158353a7601c7ba9bfdea6c77e571557267cd03b')
labeled_addresses.add('0xffff3dcb664c3f69b049d121fba7b7d7273961ef')
labeled_addresses.add('0x0000000000000068f116a894984e2db1123eb395')
labeled_addresses.add('0x469503159ddf6bfd0a9ec8eba8e97a84fd3eae5b')

labeled_addresses.add('0x000005a271a610964bb42658c7ff50fee2aa055a')

labeled_addresses.add('0x3b0BC51Ab9De1e5B7B6E34E5b960285805C41736'.lower())
labeled_addresses.add('0x6B8fAC654D072d8A799F03626db7E4f4679B0E1d'.lower()) #Kraken

#0x120A270bbC009644e35F0bB6ab13f95b8199c4ad

#Shapeshift
labeled_addresses.add('0x120A270bbC009644e35F0bB6ab13f95b8199c4ad'.lower())
labeled_addresses.add('0x9e6316f44BaEeeE5d41A1070516cc5fA47BAF227'.lower())
labeled_addresses.add('0x70faa28A6B8d6829a4b1E649d26eC9a2a39ba413'.lower())
labeled_addresses.add('0x563b377A956c80d77A7c613a9343699Ad6123911'.lower())
labeled_addresses.add('0xD3273EBa07248020bf98A8B560ec1576a612102F'.lower())
labeled_addresses.add('0x3b0BC51Ab9De1e5B7B6E34E5b960285805C41736'.lower())
labeled_addresses.add('0xeed16856D551569D134530ee3967Ec79995E2051'.lower())
labeled_addresses.add('0x70217E7De3A68187905269462506f81cF344bbad'.lower())
labeled_addresses.add('0x9BcB0733C56B1D8F0c7c4310949E00485cAe4E9d'.lower())

labeled_addresses.add('0xee77aa3Fd23BbeBaf94386dD44b548e9a785ea4b'.lower())
labeled_addresses.add('0x2f155ddeFC29c414C94b801B91F55B257231825E'.lower())
labeled_addresses.add('0x3AEf01dB231c3C9fF844f7E611c63b8c36bc6A02'.lower())
labeled_addresses.add('0xdf69de4a2a58866afeBb7713e3dd10C2153fF27C'.lower())


labeled_addresses.add('0xe65A88f487F5d26469Cfd37ce7Ef763D6d9BE454'.lower())
labeled_addresses.add('0xF08BDf21373A09aB7eDD7769A402D3a22826D317'.lower())
labeled_addresses.add('0xDa1E5D4Cc9873963f788562354b55A772253b92f'.lower())
labeled_addresses.add('0xdf69de4a2a58866afeBb7713e3dd10C2153fF27C'.lower())

labeled_addresses.add('0x8d12A197cB00D4747a1fe03395095ce2A5CC6819'.lower())

labeled_addresses.add('0x84A518A35C7c361C64301b8f209C2F0edB6F608d'.lower())

#0x84A518A35C7c361C64301b8f209C2F0edB6F608d
#0x8d12A197cB00D4747a1fe03395095ce2A5CC6819

# This is a contract 0x0000000000a8fb09af944ab3baf7a9b3e1ab29d8 listed as gas provider for 0xe206e3dca498258f1b7eec1c640b5aee7bb88fd0 which is in turn a miner.
# NEED TO UNDERSTAND THIS MINER and THE CONTRACT ACTIVATION TRANSACTION -does this happen more often?
labeled_addresses.add('0x0000000000a8fb09af944ab3baf7a9b3e1ab29d8')

# Display the number of excluded addresses
print(f"Number of labeled addresses: {len(labeled_addresses)}")

#11393891

Number of labeled addresses: 9054105


In [6]:
#C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241028_layerzero_interactors

import os
import pandas as pd

# Define the paths to the directories
dir_l0_int = r'C:\Users\scott\Documents\20240903 Octan\20240903 Jupyter\inputs\20241028_layerzero_interactors\\'

# Function to load all CSV files from a directory into a single dataframe
def load_all_csvs(directory):
    all_dfs = []
    for file in os.listdir(directory):
        if file.endswith('.csv'):
            file_path = os.path.join(directory, file)
            df = pd.read_csv(file_path)
            # Lowercase all column names
            df.columns = df.columns.str.lower()
            all_dfs.append(df)
    if all_dfs:
        return pd.concat(all_dfs, ignore_index=True)  # Concatenate all dataframes
    return None

# Load all CSVs from each directory
df_interactors = load_all_csvs(dir_l0_int)

    
print ("before merge shape:" , df_interactors.shape)
df_interactors = df_interactors.merge(df_funding_tx, left_on='addr', right_on='activated_address', how='left')
df_interactors.drop(columns=['addr'])
print ("after merge shape:", df_interactors.shape)
print(df_interactors.head())

before merge shape: (434794, 1)
after merge shape: (434794, 7)
                                         addr  \
0  0x0000000000000000000000000000000000000000   
1  0x000000000087781798788d5374ab5ec82044bb88   
2  0x0000000000ac58e588070969dce9c6e87e8e7173   
3  0x0000000000ad737a527c2757136ae83bb40b925e   
4  0x0000000000b6e2b38398f748d1135d948306eca4   

                            activated_address  \
0                                         NaN   
1  0x000000000087781798788d5374ab5ec82044bb88   
2  0x0000000000ac58e588070969dce9c6e87e8e7173   
3  0x0000000000ad737a527c2757136ae83bb40b925e   
4  0x0000000000b6e2b38398f748d1135d948306eca4   

                                 gas_provider  gas_provision_amount  \
0                                         NaN                   NaN   
1  0x0f2ba821a55eb44d0391fdaf6f34ce17bc58d49e              0.013000   
2  0xfa476a250c8bdb5bf3e9cbb8b432e4c9ab6f4097              0.098193   
3  0x4976a4a02f38326660d17bf34b431dc6e2eb2327              1.19

In [7]:
import time

start = time.time()


# Initialize a dictionary to store the feature data for activated addresses
activated_address_features = {
    'addr': [],  # Renamed to 'addr' as per request
    #'provision_block_number': [],  # Changed to block_number
    #'provision_amount': [],  # Keep the provision amount for activated addresses
    'provider_fan_out': [],  # Fan-out feature from gas providers
    'provider_total_gas_provision_amount': [],  # Total gas provision amount sent by provider
    'provider_avg_gas_provision_amount': [],  # Average amount sent to activated_address by provider
    'provider_max_gas_provision_amount': [],  # Max amount sent in a single tx by provider
    'provider_min_gas_provision_amount': [],  # Min amount sent in a single tx by provider
    'provider_is_star_like_attack': [],  # Binary indicator if activated by a star-like gas provider
    'provider_is_labeled': []  # Binary indicator if activated by a star-like gas provider
}

# Group by gas_provider (potential central node in star-like attack)
#grouped_providers = df_funding_tx.groupby('gas_provider')
grouped_providers = df_interactors.groupby('gas_provider')

# Track which activated addresses are part of star-like attacks
star_like_addresses = set()

# Process gas providers
for gas_provider, group in grouped_providers:
    activated_addresses_count = group['activated_address'].nunique()  # Count distinct activated addresses
    total_amount = group['gas_provision_amount'].sum()  # Total gas provision amount
    avg_amount = total_amount / activated_addresses_count if activated_addresses_count > 0 else 0  # Average provision amount
    max_amount = group['gas_provision_amount'].max()  # Max provision amount
    min_amount = group['gas_provision_amount'].min()  # Min provision amount

    # Determine if it is a star-like pattern (customizable): Example is fan-out > 5
    is_star_like = 1 if (activated_addresses_count > 1) and (gas_provider not in labeled_addresses) else 0
    
    # Mark activated addresses in star-like attack
    if is_star_like:
        star_like_addresses.update(group['activated_address'].tolist())

    # For each activated address related to this gas provider, append features
    for activated_address in group['activated_address']:
        # Exclude addresses from the star-like attack analysis
        activated_address_features['addr'].append(activated_address)
        activated_address_features['provider_fan_out'].append(activated_addresses_count)  # Fan-out from the gas provider
        activated_address_features['provider_total_gas_provision_amount'].append(total_amount)  # Total amount sent by the gas provider
        activated_address_features['provider_avg_gas_provision_amount'].append(avg_amount)  # Average amount sent
        activated_address_features['provider_max_gas_provision_amount'].append(max_amount)  # Max amount sent
        activated_address_features['provider_min_gas_provision_amount'].append(min_amount)  # Min amount sent
        activated_address_features['provider_is_labeled'] = int(gas_provider in labeled_addresses)
        
        # Check if this activated address is part of a star-like attack
        
        activated_in_star_like = 1 if activated_address in star_like_addresses else 0
        activated_address_features['provider_is_star_like_attack'].append(activated_in_star_like)

# Convert activated address feature data into a DataFrame
df_activated_address_features = pd.DataFrame(activated_address_features)


print('elapsed ', time.time() - start)

elapsed  39.034775733947754


In [13]:
from scipy.stats import skew
from statistics import variance
import numpy as np
import pandas as pd
import time
import csv

import gc

import logging
from datetime import datetime

import random

print ('*** testing', random.random())

# Generate a timestamp in a readable format
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_filename = f"{timestamp}_app.log"

# Set up logging configuration with the prefixed filename
logging.basicConfig(
    filename=log_filename,       # Log file name with timestamp prefix
    level=logging.DEBUG,          # Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)

# Create a logger object
logging.captureWarnings(True)
logger = logging.getLogger()

start = time.time()

parent_mapping = {}
child_mapping = {}

# Define the function to calculate entropy
def calculate_entropy(gas_amounts):
    """Calculates the entropy of a list of gas amounts."""
    if not gas_amounts:
        return 0
    probabilities = [x / sum(gas_amounts) for x in gas_amounts]
    return -sum(p * np.log2(p) for p in probabilities if p > 0)

# Define the function to calculate skewness
def calculate_skewness(data):
    """Calculates the skewness of a dataset."""
    return skew(data) if len(data) > 1 else 0

# Function to find the root by traversing parent_mapping
def find_root(address, parent_mapping):
    leaf = address
    """Find the topmost parent of a given address using parent_mapping."""
    level = 0
    while address in parent_mapping:
        address = parent_mapping[address]
        level += 1
    logger.info(f"found root: {address} for address {leaf} at depth {level}")    
    return address, level

cache_hits = 0
calls = 0

def find_tree_features(root, child_mapping, computed_metrics):
    global calls, cache_hits
    calls += 1
    if root in computed_metrics:
        # Return cached metrics
        cache_hits += 1
        logger.info(f"********* cache hit:  {root}\t{cache_hits / calls}")
        return computed_metrics[root]
    
    # Initialize for DFS
    stack = [(root, 0)]
    tree_nodes = []
    total_gas = 0
    max_depth = 0
    min_leaf_depth = float('inf')
    total_children = 0
    leaf_gas = 0
    leaf_gas_amounts = []
    gas_provisions = []
    node_depths = []
    node_breadths = []
    internal_nodes_count = 0
    leaf_nodes_count = 0
    depth_sum = 0
    weighted_gas_sum = 0
    
    star_like_nodes = 0  # For star-like subtree ratio
    
    while stack:
        current_addr, depth = stack.pop()
        if current_addr in [node for node, _ in tree_nodes]:
            continue
        
        tree_nodes.append((current_addr, depth))
        node_depths.append(depth)
        max_depth = max(max_depth, depth)
        depth_sum += depth
        while len(node_breadths) <= depth:
            node_breadths.append(0)
        node_breadths[depth] += 1

        gas_amount = df_funding_tx.loc[df_funding_tx['activated_address'] == current_addr, 'gas_provision_amount'].sum()
        if ((gas_amount > 0) and (root != current_addr)):
            total_gas += gas_amount
            #logger.info(f'nonzero gas amount:  current_addr:{current_addr}\troot:{root}\tgas_amount:{gas_amount}\tstack_size:{len(stack)}')
            gas_provisions.append(gas_amount)

        weighted_gas_sum += gas_amount * depth
            
            
        if current_addr not in child_mapping:
            leaf_nodes_count += 1
            leaf_gas += gas_amount
            leaf_gas_amounts.append(gas_amount)
            min_leaf_depth = min(min_leaf_depth, depth)
        else:
            internal_nodes_count += 1
            num_children =  len(child_mapping[current_addr])
            total_children += num_children
            if num_children > 1:
                star_like_nodes += 1
            for child in child_mapping[current_addr]:
                stack.append((child, depth + 1))
    
    # Calculate metrics
    branching_factor = total_children / len(tree_nodes) if tree_nodes else 0
    balance_factor = max_depth - min_leaf_depth if min_leaf_depth != float('inf') else 0
    leaf_provision_proportion = leaf_gas / total_gas if total_gas > 0 else 0
    avg_depth = sum(node_depths) / len(node_depths) if node_depths else 0
    depth_variance = variance(node_depths) if len(node_depths) > 1 else 0
    leaf_to_internal_ratio = leaf_nodes_count / internal_nodes_count if internal_nodes_count > 0 else 0
    avg_leaf_gas = sum(leaf_gas_amounts) / len(leaf_gas_amounts) if leaf_gas_amounts else 0
    breadth_factor = sum(node_breadths) / len(node_breadths) if node_breadths else 0
    breadth_to_depth_ratio = breadth_factor / max_depth if max_depth > 0 else 0
    
    tree_size = len(tree_nodes)

    longest_chain_ratio = max_depth / tree_size if tree_size > 0 else 0
    star_like_ratio = star_like_nodes / tree_size if tree_size > 0 else 0
    depth_weighted_avg_gas = weighted_gas_sum / depth_sum if depth_sum > 0 else 0
    sparsity = len([node for node, _ in tree_nodes if df_funding_tx.loc[df_funding_tx['activated_address'] == node, 'gas_provision_amount'].sum() > 0]) / tree_size if tree_size > 0 else 0

    
    if len(gas_provisions) == 0 or np.sum(gas_provisions) == 0:
        print(root, gas_provisions)
        return
    
    gini_coefficient = (len(gas_provisions) - 1) / len(gas_provisions) * (1 - sum(sorted(gas_provisions) / np.sum(gas_provisions)))
    
    # Entropy and Skewness calculations
    gas_distribution_entropy = calculate_entropy(gas_provisions)
    gas_distribution_skewness = calculate_skewness(gas_provisions)
    leaf_gas_distribution_entropy = calculate_entropy(leaf_gas_amounts)
    leaf_gas_distribution_skewness = calculate_skewness(leaf_gas_amounts)

    subtree_metrics = {
        'tree_nodes': tree_nodes,
        'total_gas': total_gas,
        'max_depth': max_depth,
        'branching_factor': branching_factor,
        'balance_factor': balance_factor,
        'leaf_provision_proportion': leaf_provision_proportion,
        'avg_depth': avg_depth,
        'depth_variance': depth_variance,
        'leaf_to_internal_ratio': leaf_to_internal_ratio,
        'avg_leaf_gas': avg_leaf_gas,
        'breadth_factor': breadth_factor,
        'breadth_to_depth_ratio': breadth_to_depth_ratio,
        'gini_coefficient': gini_coefficient,
        'gas_distribution_entropy': gas_distribution_entropy,
        'gas_distribution_skewness': gas_distribution_skewness,
        'leaf_gas_distribution_entropy': leaf_gas_distribution_entropy,
        'leaf_gas_distribution_skewness': leaf_gas_distribution_skewness,
        'sparsity':sparsity,
        #'depth':
        'longest_chain_ratio':longest_chain_ratio,
        'star_like_ratio':star_like_ratio,
        'depth_weighted_avg_gas':depth_weighted_avg_gas
    }

    for node, _ in tree_nodes:
        computed_metrics[node] = subtree_metrics

    return subtree_metrics  # Return the dictionary


for idx, row in df_funding_tx.iterrows():
    if (row['gas_provider'] in labeled_addresses) or (row['activated_address'] in labeled_addresses):
        continue
    if (row['activated_address'] == row['gas_provider']):
        print (row['activated_address'], 'same as provider')
        a = row['activated_address']
        logger.warning (f'skipping activated address {a} same as provider')
        continue
    parent_mapping[row['activated_address']] = row['gas_provider']
    if row['gas_provider'] not in child_mapping:
        child_mapping[row['gas_provider']] = []
    child_mapping[row['gas_provider']].append(row['activated_address'])
    
print('elapsed', time.time()-start)

# Initialize computed metrics cache
computed_metrics = {}

count = 0

print(df_activated_address_features.shape)
            
total_records = df_interactors.shape[0]

for idx, row in df_activated_address_features.iterrows():
    addr = row['addr']
    if (count < 10):
        
        print ('first 10 address:', idx, addr)
    
    elif (random.random() < 0.001):

        print ('sampled address:', idx, addr)

    count +=1
        
    if addr in parent_mapping:
        root, depth = find_root(addr, parent_mapping)  
        #logger.info(f"root: {root}\ttraced from:{addr}")
        subtree_metrics = find_tree_features(root, child_mapping, computed_metrics)

        # Unpack the metrics from the dictionary
        tree_size = len(subtree_metrics['tree_nodes'])
        df_activated_address_features.at[idx, 'tree_size'] = tree_size
        df_activated_address_features.at[idx, 'total_gas'] = subtree_metrics['total_gas']
        df_activated_address_features.at[idx, 'max_depth'] = subtree_metrics['max_depth']
        df_activated_address_features.at[idx, 'branching_factor'] = subtree_metrics['branching_factor']
        df_activated_address_features.at[idx, 'balance_factor'] = subtree_metrics['balance_factor']
        df_activated_address_features.at[idx, 'leaf_provision_proportion'] = subtree_metrics['leaf_provision_proportion']
        df_activated_address_features.at[idx, 'avg_depth'] = subtree_metrics['avg_depth']
        df_activated_address_features.at[idx, 'depth_variance'] = subtree_metrics['depth_variance']
        df_activated_address_features.at[idx, 'leaf_to_internal_ratio'] = subtree_metrics['leaf_to_internal_ratio']
        df_activated_address_features.at[idx, 'avg_leaf_gas'] = subtree_metrics['avg_leaf_gas']
        df_activated_address_features.at[idx, 'breadth_factor'] = subtree_metrics['breadth_factor']
        df_activated_address_features.at[idx, 'breadth_to_depth_ratio'] = subtree_metrics['breadth_to_depth_ratio']
        df_activated_address_features.at[idx, 'gini_coefficient'] = subtree_metrics['gini_coefficient']
        df_activated_address_features.at[idx, 'gas_distribution_entropy'] = subtree_metrics['gas_distribution_entropy']
        df_activated_address_features.at[idx, 'gas_distribution_skewness'] = subtree_metrics['gas_distribution_skewness']
        df_activated_address_features.at[idx, 'leaf_gas_distribution_entropy'] = subtree_metrics['leaf_gas_distribution_entropy']
        df_activated_address_features.at[idx, 'leaf_gas_distribution_skewness'] = subtree_metrics['leaf_gas_distribution_skewness']
        df_activated_address_features.at[idx, 'sparsity'] = subtree_metrics['sparsity']
        df_activated_address_features.at[idx, 'depth'] = depth
        df_activated_address_features.at[idx, 'longest_chain_ratio'] = subtree_metrics['longest_chain_ratio']
        df_activated_address_features.at[idx, 'star_like_ratio'] = subtree_metrics['star_like_ratio']
        df_activated_address_features.at[idx, 'depth_weighted_avg_gas'] = subtree_metrics['depth_weighted_avg_gas']        
        logger.info(f"{addr}\t{idx}\t{tree_size}\t{time.time()-start}\t{(time.time()-start)/idx * total_records/60 if idx != 0 else -1}")
    else:
        logger.info(f"{addr}\t{idx}\t{time.time()-start}\tno parent, skipping")
        
    # Open the file in append mode ('a'), creating it if it doesn't exist
    with open(filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        # Write the data row
        elapsed = time.time()-start
        writer.writerow([idx, elapsed, elapsed / idx * total_records/60 if idx != 0 else -1, elapsed / idx * total_records/60/60 if idx != 0 else -1])
            
print(time.time() - start)

# Fill any NaN values with 0 to avoid missing data
df_activated_address_features.fillna(0, inplace=True)

# Display the first few rows of the activated address features DataFrame
print(df_activated_address_features.head())


*** testing 0.8822133577842849
0x35d61a0648b972b3eb2686e36dcabaeff92b155b same as provider
elapsed 42.49613070487976
(434111, 8)
first 10 address: 0 0x7dae7524ab9c1c55998624322561ff81e4af53ea
first 10 address: 1 0xdd616b67d4a056e72b0bc650205635033b565965
first 10 address: 2 0xe180cb5ce2ec5c546eb63cda1633df40e8a0c945
first 10 address: 3 0x00bdd9ad789c648187b66651776bcd487610cbc9
first 10 address: 4 0x280cee30c8e0c81eddc3089011c8988001ef0a05
first 10 address: 5 0x29e17e24c51c84b24cae6b7f70ae1c7f9d00109a
first 10 address: 6 0x2bc967083adc9bdd32058182fb7b44b3a73546f3
first 10 address: 7 0x50871600c0910b25602cfb4cc66991ac8cf08c7d
first 10 address: 8 0x924fe7a52eaab9865b03582523d3d61ce7f83b73
first 10 address: 9 0x93dd9cafd349365a16f0f8380a4f639a2c4dbd85
sampled address: 1233 0x22a8f7a9209b25653545b01ba483e092380a0c36
sampled address: 2555 0x44a73012c53ef355b520865e42fd4f85ca035b37
sampled address: 3894 0x1fb008e12f98bf303097ee87012bf66d00b51fbb
sampled address: 4676 0x14df63191d6da0d3d8c3ac

sampled address: 117285 0xbbe77fbab3407b07a8780a554165ba89fcbecc5a
sampled address: 118748 0xd37215765ab59af4991f5bb773486a2c5e5c0e81
sampled address: 119955 0xe6f3414ee0885c43d308c5b0f2c6be0ce2ab114d
sampled address: 120594 0xf0e144b7ea4ff9c6c87625575d36db5faecb0eb8
sampled address: 120693 0xf21bbb7064884e713e5747a7fcd68cb6fc6704c1
sampled address: 120837 0xf48be1ed29d40287cbdb46f794a5926d900ffb90
sampled address: 121659 0x8f38e3f34ae87508f321c02ec106228095e4e870
sampled address: 121853 0x7effb3deee6ccff1d57da7af6157d92623c50711
sampled address: 121990 0x35fc4f5aea033ac20f270e9776b56efb45ec9ad8
sampled address: 122074 0x82cac318c83e257f6976c9d915ea0cd4033b52bc
sampled address: 122126 0xfadfde950063e803a8941ef22166e8acf6488e15
sampled address: 124829 0x6501ec038670804aefdd7913e89e515ced4bf865
sampled address: 124885 0x6967f2c9461c7c686c2a5d3498af52874a76a3a9
sampled address: 125001 0x73ffe58a1f64aa55c522f3285d4744c0edc3bd10
sampled address: 125689 0xad8175cc42ec0c5c725ea4fc924accaa9e05

sampled address: 215774 0x13403f97aad84609b9be84bf977e1c55d4e8852f
sampled address: 216229 0x2b6553ab2f02aade2bb5256f5fed3ed0aae755db
sampled address: 220559 0x3c542b77ae58f6a4c21d33bacbe1f00097ffbedc
sampled address: 221704 0x28454dd0943b0d1ab097ad7d2258ca3d71ee916c
sampled address: 221871 0xa0668617f02214067e59be33cc268d821b3b5348
sampled address: 222702 0x18581ca5feb996037ce54e675b795c61dff9bb02
sampled address: 222812 0x10a3864c4d2349aefc13f3ad82425cf6eb28d767
sampled address: 223375 0x5f2ca56b31dff74868159de9281471ac7664db6a
sampled address: 224857 0x0613f51ef30657c08ba530904248da0bb39d190d
sampled address: 225912 0x171a6dbd27c29e95446ec58530858a124b51472e
sampled address: 227392 0x2e80dbc1be59abdeb91e9536e7ea9e8760cf1413
sampled address: 227830 0x356229b69023fd010f83d92bd35502f63d6de3b9
sampled address: 227970 0x37b40207bdb80063535dc6bc9ca795c57718822b
sampled address: 229283 0x4b78dbba362d0e9cd1c160179b8703d399cbe935
sampled address: 229752 0x535041fcc3a5a072dcf0e8d978c18bd6dc9c

sampled address: 342184 0xf6c0a56c758c4754f7fc5e1532a9b358ef9cbd19
sampled address: 342328 0x45c56e138881fd3ff46359ba1826d5fc6fccaedc
sampled address: 342993 0x53ca6971fda53db07facb981eb5fce0f7025bb44
sampled address: 344591 0xccf6c29d87eb2c0bafede74f5df35f84541f4549
sampled address: 344654 0x4d3d104348b8c11967d25695baa8ea00c768734f
sampled address: 344977 0x4bfdc91bc5e7e18e806ef347fd192ee4978311da
sampled address: 345251 0x5544d4b8020fc9c62fa6a7bdb36f840b87c914ca
sampled address: 345554 0xaa46b1bdac2c1d83e1a1aa5e1afdc9d9b107d6c1
sampled address: 346292 0x0e0099d1f0cae04dc2de50e0b983bf61ea6ea210
sampled address: 348065 0xead6172e7b9b98e9095550d63c411d8a7e3e1301
sampled address: 348107 0xf14b03af68ad0862cc6816d587811f40468e3846
sampled address: 348899 0xb0ce34fbf95fa89fd616ff70fedfcc79b5abb76c
sampled address: 349781 0x7d3c21d9498249d7066b0c4eee7409534d051bc9
sampled address: 352589 0x145521c7256ff3e7d9fd3130cce83578570692c7
sampled address: 352733 0x23d5a404a641a0e83e88bdc832778d75ddc7

In [14]:
total_records

434794

In [15]:
# Function to check if a list of depths is a chain
def is_chain(depths):
    # Check if the depths form a sequence of consecutive integers
    return sorted(depths) == list(range(min(depths), max(depths) + 1))

# Dictionary to store chain lengths for each node
chain_lengths = {}

# Loop through each node and its data in computed_metrics
for node, data in computed_metrics.items():
    # Extract the list of tuples (first element of the data)
    tree_nodes = data['tree_nodes']
    
    # Extract the depths (second element of each tuple)
    depths = [depth for _, depth in tree_nodes]
    
    # Check if the depths form a chain
    if is_chain(depths):
        # If it's a chain, the chain length is the length of the list of depths
        chain_length = len(depths)
    else:
        # If it's not a chain, set chain length to zero
        chain_length = 0
    
    # Store the chain length for the current node
    chain_lengths[node] = chain_length

# Display the chain lengths for each node
len(chain_lengths)


311535

In [16]:
# Step 1: Compute chain lengths as before (already done in previous steps)
def is_valid_chain(depths):
    # Check if the depths form a sequence of consecutive integers of length >= 3
    return len(depths) >= 3 and sorted(depths) == list(range(min(depths), max(depths) + 1))

chain_lengths = {}

for node, data in computed_metrics.items():
    tree_nodes = data['tree_nodes']
    depths = [depth for _, depth in tree_nodes]
    
    if is_valid_chain(depths):
        chain_length = len(depths)
    else:
        chain_length = 0
    
    chain_lengths[node] = chain_length

# Step 2: Create chain_length column by mapping the addr column to the chain_lengths dictionary
df_activated_address_features['chain_length'] = df_activated_address_features['addr'].map(chain_lengths)

df_activated_address_features.fillna(0)

# Display the updated DataFrame
df_activated_address_features.head()



Unnamed: 0,addr,provider_fan_out,provider_total_gas_provision_amount,provider_avg_gas_provision_amount,provider_max_gas_provision_amount,provider_min_gas_provision_amount,provider_is_star_like_attack,provider_is_labeled,tree_size,total_gas,...,gas_distribution_entropy,gas_distribution_skewness,leaf_gas_distribution_entropy,leaf_gas_distribution_skewness,sparsity,depth,longest_chain_ratio,star_like_ratio,depth_weighted_avg_gas,chain_length
0,0x7dae7524ab9c1c55998624322561ff81e4af53ea,1,0.011,0.011,0.011,0.011,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,0xdd616b67d4a056e72b0bc650205635033b565965,2,0.000473,0.000236,0.000463,1e-05,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,0xe180cb5ce2ec5c546eb63cda1633df40e8a0c945,2,0.000473,0.000236,0.000463,1e-05,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,0x00bdd9ad789c648187b66651776bcd487610cbc9,14,0.260068,0.018576,0.03,0.0055,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,0x280cee30c8e0c81eddc3089011c8988001ef0a05,14,0.260068,0.018576,0.03,0.0055,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [17]:
df_full_feature_set = df_activated_address_features.merge(df_funding_tx[['activated_address', 'gas_provision_amount', 'block_number']], left_on='addr', right_on='activated_address', how='left')
df_full_feature_set.drop(columns=['activated_address'], inplace=True)
df_full_feature_set.rename(columns={'block_number': 'gas_provision_block_number'}, inplace=True)
df_full_feature_set

Unnamed: 0,addr,provider_fan_out,provider_total_gas_provision_amount,provider_avg_gas_provision_amount,provider_max_gas_provision_amount,provider_min_gas_provision_amount,provider_is_star_like_attack,provider_is_labeled,tree_size,total_gas,...,leaf_gas_distribution_entropy,leaf_gas_distribution_skewness,sparsity,depth,longest_chain_ratio,star_like_ratio,depth_weighted_avg_gas,chain_length,gas_provision_amount,gas_provision_block_number
0,0x7dae7524ab9c1c55998624322561ff81e4af53ea,1,0.011000,0.011000,0.011000,0.011000,0,0,0.0,0.000000,...,0.000000,0.000000e+00,0.000000,0.0,0.000000,0.00,0.000000,,0.011000,18858186
1,0xdd616b67d4a056e72b0bc650205635033b565965,2,0.000473,0.000236,0.000463,0.000010,0,0,0.0,0.000000,...,0.000000,0.000000e+00,0.000000,0.0,0.000000,0.00,0.000000,,0.000463,17052218
2,0xe180cb5ce2ec5c546eb63cda1633df40e8a0c945,2,0.000473,0.000236,0.000463,0.000010,0,0,0.0,0.000000,...,0.000000,0.000000e+00,0.000000,0.0,0.000000,0.00,0.000000,,0.000010,17006141
3,0x00bdd9ad789c648187b66651776bcd487610cbc9,14,0.260068,0.018576,0.030000,0.005500,0,0,0.0,0.000000,...,0.000000,0.000000e+00,0.000000,0.0,0.000000,0.00,0.000000,,0.020000,18957914
4,0x280cee30c8e0c81eddc3089011c8988001ef0a05,14,0.260068,0.018576,0.030000,0.005500,0,0,0.0,0.000000,...,0.000000,0.000000e+00,0.000000,0.0,0.000000,0.00,0.000000,,0.015000,19508339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434106,0x58b7b7c27b9b181d351a0ab613926735abeed88b,1,4.000000,4.000000,4.000000,4.000000,0,0,5.0,4.205000,...,-0.000000,0.000000e+00,1.000000,4.0,0.800000,0.00,1.637500,5.0,4.000000,13788278
434107,0x5dd64dc64f0012c2970c2770f6152468c5ba1412,1,0.016831,0.016831,0.016831,0.016831,0,0,2.0,0.016831,...,-0.000000,0.000000e+00,1.000000,1.0,0.500000,0.00,0.016831,0.0,0.016831,17413448
434108,0xdb65979031a4011a1fcd7079bd6f298161586139,1,0.040854,0.040854,0.040854,0.040854,0,0,4.0,0.308519,...,0.682014,2.895964e-16,1.000000,1.0,0.500000,0.25,0.123380,0.0,0.040854,16941740
434109,0x953e85f76993440748a5ba34702a8237bed30fab,1,0.030000,0.030000,0.030000,0.030000,0,0,12.0,898.083960,...,-0.000000,0.000000e+00,0.916667,11.0,0.916667,0.00,73.122034,12.0,0.030000,4712017


In [18]:
df_full_feature_set.to_csv('20241117_graph_and_tree_features.csv', index=False)

In [19]:
df_activated_address_features.columns

Index(['addr', 'provider_fan_out', 'provider_total_gas_provision_amount',
       'provider_avg_gas_provision_amount',
       'provider_max_gas_provision_amount',
       'provider_min_gas_provision_amount', 'provider_is_star_like_attack',
       'provider_is_labeled', 'tree_size', 'total_gas', 'max_depth',
       'branching_factor', 'balance_factor', 'leaf_provision_proportion',
       'avg_depth', 'depth_variance', 'leaf_to_internal_ratio', 'avg_leaf_gas',
       'breadth_factor', 'breadth_to_depth_ratio', 'gini_coefficient',
       'gas_distribution_entropy', 'gas_distribution_skewness',
       'leaf_gas_distribution_entropy', 'leaf_gas_distribution_skewness',
       'sparsity', 'depth', 'longest_chain_ratio', 'star_like_ratio',
       'depth_weighted_avg_gas', 'chain_length'],
      dtype='object')

In [20]:
df_funding_tx[df_funding_tx.activated_address=='0x35d61a0648b972b3eb2686e36dcabaeff92b155b']

Unnamed: 0,activated_address,gas_provider,gas_provision_amount,first_gas_provision_time,block_number,tx_hash
127160,0x35d61a0648b972b3eb2686e36dcabaeff92b155b,0x35d61a0648b972b3eb2686e36dcabaeff92b155b,0.01,2023-04-15 20:08:23 UTC,17054655,0x125e8a5366eedcd0f6d314350025575492402f8fee84...
