# Training a simple logistic regression model to predict if an address is a bridge a cex or something else

- We use a list of voters to the citizen round and take any voter that have seeded more than one voting address
- From that list of addresses we want to know which one of them are a bridge/cex or a user
- We use a query to Flipside API to get some basic data about the kind of transactions made by an address
- We manually flagged some addresses as cex or bridge and use that data to train a logistic regression model
- We use the model to predict the kind of address on the remaining data that as not been manually flagged
- We verify some addresses from the unsee data and see if the model is working

The goal is to have a model that can predict the kind of address and then use that data to filter the addresses that were seeded by a bridge or a cex from the list of potential sybil. This methods is expected to flag automatically 500 sybils from 20000 voters.

This is because if an address fund several voting addresses and that address has not been seeded from a bridge or a cex, then it is very likely that these addresses belongs to the same entity.


In [507]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

from sbdata.FlipsideApi import FlipsideApi

In [508]:
# Set path to data folder
current_dir = Path(os.getcwd())
DATA_DIR = os.path.join(current_dir.parent.parent, 'data-regen-rangers')
ODC_DATA_DIR = os.path.join(current_dir.parent.parent.parent, 'data')

# set up api key for flipside
api_key = os.environ['FLIPSIDE_API_KEY2']
flipside_api = FlipsideApi(api_key, timeout_minutes=60, max_address=1000)

## Load Seed Data

In [509]:
df_seed_wallet= pd.read_csv(os.path.join(DATA_DIR, 'seed_wallet_citizen.csv'))
df_seed_wallet.head(2)

Unnamed: 0,EOA,from_address,to_address
0,0x000000006f457c0f8f560333d9c2877287d92a92,0xacd03d601e5bb1b275bb94076ff46ed9d753435a,0x000000006f457c0f8f560333d9c2877287d92a92
1,0x000128fa45d79dc9af8016da242781f12c363fd5,0xe4edb277e41dc89ab076a1f049f4a3efa700bce8,0x000128fa45d79dc9af8016da242781f12c363fd5


In [510]:
df_seeder_count = df_seed_wallet.groupby('from_address').count().sort_values(by='to_address', ascending=False).reset_index().drop(columns=['to_address']).rename(columns={'from_address': 'seeder', 'EOA': 'count_seed'})

In [511]:
df_seeder_count

Unnamed: 0,seeder,count_seed
0,0x80c67432656d59144ceff962e8faf8926599bcf8,3148
1,0xacd03d601e5bb1b275bb94076ff46ed9d753435a,2980
2,0x2d2cc0eb095e43204e0c087e07dbf95909650939,1321
3,0xe4edb277e41dc89ab076a1f049f4a3efa700bce8,800
4,0xf89d7b9c864f589bbf53a82105107622b35eaa40,677
...,...,...
3155,0x568fd3434f2be3edd1454de76b4a7b2fe5d8d717,1
3156,0x569f1ec2149d4927da420637e6007021c7a8a606,1
3157,0x56aaf01b53c80fefd7f97e3610207773b4a855e2,1
3158,0x56d0a8c9519a6524eec4eecf0f9c2dc0af817f9f,1


## Load Labeled Data

In [512]:
df_labels = pd.read_csv(os.path.join(ODC_DATA_DIR, 'address_labels_citizen.csv'), usecols=['address', 'tag', 'sub_type'])

In [513]:
df_labels.head(2)

Unnamed: 0,address,tag,sub_type
0,0x4a1939dc1de524ff01980a911f0f4d65ce7a27ba,airdrop_master,
1,0xeec428c18ff39c66162e39e79ed6eb8c790e43d7,cex_or_bridge,?


In [514]:
df_labels['target'] = df_labels['tag'].apply(lambda x: 1 if x in ['cex', 'bridge', 'cex_or_bridge'] else 0)

In [515]:
df_labels['target'].value_counts()

target
0    42
1    19
Name: count, dtype: int64

The class is a little inbalanced but that should be fine as the differences between these type of addresses are quite big.

## Retrieve features from flipside

In [516]:
sql_template = """
WITH
  transactions AS (
    SELECT
      BLOCK_TIMESTAMP,
      TX_FEE,
      FROM_ADDRESS AS EOA,
      TO_ADDRESS AS COUNTERPARTY,
      ETH_VALUE,
      1 as BOOLEAN_OUT
    FROM
      optimism.core.fact_transactions
    WHERE
      FROM_ADDRESS IN (%s)
    UNION ALL
    SELECT
      BLOCK_TIMESTAMP,
      TX_FEE,
      TO_ADDRESS AS EOA,
      TO_ADDRESS AS COUNTERPARTY,
      ETH_VALUE,
      0 as BOOLEAN_OUT
    FROM
      optimism.core.fact_transactions
    WHERE
      TO_ADDRESS IN (%s)
  )
SELECT
  EOA,
  COUNT(*) as n_tx,
  COUNT(DISTINCT(COUNTERPARTY)) as n_counterparty,
  SUM(ETH_VALUE) as eth_volume,
  SUM(BOOLEAN_OUT) as n_tx_out,
  n_tx - n_tx_out as n_tx_in,
  n_tx_out - n_tx_in as n_tx_diff_out_in,
  TIMESTAMPDIFF(MINUTE, MIN(BLOCK_TIMESTAMP), CURRENT_TIMESTAMP()) as age,
  TIMESTAMPDIFF(MINUTE, MIN(BLOCK_TIMESTAMP), MAX(BLOCK_TIMESTAMP)) as time_alive,
  CASE WHEN age = 0 THEN 0 ELSE n_tx / age END as tx_min,
  CASE WHEN time_alive = 0 THEN 0 ELSE n_tx / time_alive END as tx_min_alive,
  AVG(TX_FEE) as avg_tx_fee,
  STDDEV(TX_FEE) as std_tx_fee
FROM
  transactions
GROUP BY
  EOA;
"""

In [517]:
unique_seeder = df_seeder_count.seeder.values

In [518]:
def extract_data_flipside(flipside_api, array_address, sql_template):

    q, r = divmod(len(array_address), flipside_api.MAX_ADDRESS)
    if r != 0:
        q += 1
    list_df = []
    for i in range(q):
        start_index = i * flipside_api.MAX_ADDRESS
        end_index = (i + 1) * flipside_api.MAX_ADDRESS
        print(
            f"Extracting for address: {start_index} - {end_index}")
        array_address_slice = array_address[start_index:end_index]
        str_address_slice = flipside_api.get_string_address(array_address_slice)
        sql = sql_template % (str_address_slice, str_address_slice)
        df = flipside_api.execute_query(sql=sql)
        list_df.append(df)
    df = pd.concat(list_df)
    return df

In [519]:
df_features = extract_data_flipside(flipside_api, unique_seeder, sql_template)

Extracting for address: 0 - 1000


Extracting for address: 1000 - 2000
Extracting for address: 2000 - 3000
Extracting for address: 3000 - 4000


In [520]:
df_features.drop('__row_index', axis=1, inplace=True)

In [521]:
df_features['ratio_tx_counterparties'] = df_features['n_tx'] / df_features['n_counterparty']

In [522]:
df_features.head(2)

Unnamed: 0,eoa,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties
0,0x0e29304fee49a7f3bfb4bd0c0c3d574597dc2bdf,336,137,0.734106,332,4,328,853560,845304,0.000394,0.000397,0.000199,0.000313,2.452555
1,0x456325f2ac7067234dd71e01bebe032b0255e039,15847,520,1280.686215,14763,1084,13679,494026,490028,0.032077,0.032339,0.000333,0.000438,30.475


In [523]:
df_features.to_csv(os.path.join(ODC_DATA_DIR, 'features_citizen_seeder.csv'), index=False)

In [524]:
df_merge_feature_target = df_features.merge(df_labels, left_on='eoa', right_on='address', how='left').drop(columns=['address', 'tag', 'sub_type'])

In [525]:
print(df_labels.shape)
print(df_features.shape)
print(df_merge_feature_target.shape)

(61, 4)
(3160, 14)
(3160, 15)


In [526]:
df_merge_feature_target.target.isna().sum()

3099

In [527]:
df_merge_feature_target.set_index('eoa', inplace=True)


In [528]:
df_merge_feature_target.drop_duplicates(inplace=True)

In [529]:
df_test = df_merge_feature_target[df_merge_feature_target.target.isna()].drop(columns=['target'])
df_train = df_merge_feature_target[~df_merge_feature_target.target.isna()]
df_test.fillna(0, inplace=True)
df_train.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.fillna(0, inplace=True)


In [530]:
df_train.head(2)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0xb612884850f6f2dd04fb792e5ad4ff5b67ffeca6,363,161,0.728152,349,14,335,501571,497418,0.000724,0.00073,0.000124,0.000152,2.254658,0.0
0x2d2cc0eb095e43204e0c087e07dbf95909650939,448372,218237,397333.290502,278136,170236,107900,613317,613269,0.731061,0.731118,0.000108,9.9e-05,2.054519,1.0


### We dont train with n_tx < 15 because they are not an exchange or a bridge in these cases

In [531]:
df_train = df_train[df_train['n_tx'] > 50]

## Train a model

In [532]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df_train.drop(columns=['target']), df_train['target'], test_size=0.2, random_state=42)


# Define the scalers and associated names
scalers = [
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('RobustScaler', RobustScaler())
]

# Create the pipeline with normalization and logistic regression
pipe = Pipeline([
    ('scaler', None),  # Placeholder for scaler
    ('logistic_regression', LogisticRegression())
])

# Define the parameter grid for the pipeline
param_grid = {
    'scaler': [scaler for _, scaler in scalers],
    'logistic_regression__C': [0.01, 0.1, 1, 10],
    'logistic_regression__penalty': ['l2'],
    'logistic_regression__max_iter': [500, 1000, 2000],
    'logistic_regression__tol': [1e-4, 1e-3, 1e-2],
}

# Perform the grid search
lr_param_search = GridSearchCV(pipe, param_grid, cv=5)
lr_param_search.fit(x_train, y_train)

# Print the best parameters and score
print("Best parameters found in parameter search:", lr_param_search.best_params_)
print("Best accuracy found in parameter search:", lr_param_search.best_score_)

x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full

Best parameters found in parameter search: {'logistic_regression__C': 1, 'logistic_regression__max_iter': 500, 'logistic_regression__penalty': 'l2', 'logistic_regression__tol': 0.0001, 'scaler': RobustScaler()}
Best accuracy found in parameter search: 0.975


Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0xaab4a0ae72cdb9f4e312bb2d50d911300e2d7230,104,44,16.61703,97,7,90,481272,471204,0.000216,0.000221,0.000113,6e-05,2.363636,0.0,0.0
0x9ef21be1c270aa1c3c3d750f458442397fbffcb6,35013,11866,105351.6,14351,20662,-6311,744914,744861,0.047003,0.047006,0.000169,0.000156,2.950699,1.0,1.0
0x17f52a50227ea7aa6042eaae44c84a9a9d59d092,7293,6252,26.90207,7287,6,7281,204354,195490,0.035688,0.037306,0.0002,0.000111,1.166507,0.0,0.0
0xf704d714ec68a378dfe0c24825932b9dd38d1ccc,564,138,22.79158,511,53,458,582680,579792,0.000968,0.000973,0.000106,0.000139,4.086957,0.0,0.0
0xdfdc2927de08ce14c10af8417018f9586c348af5,223,77,4.770902,219,4,215,326903,321335,0.000682,0.000694,0.00012,0.000391,2.896104,0.0,0.0
0x0a88bc5c32b684d467b43c06d9e0899efeaf59df,5991,2336,2507.107,2481,3510,-1029,78640,22512,0.076183,0.266125,0.000165,5.5e-05,2.56464,1.0,1.0
0xba07c4b25a06d1b08d9d0c28ca6d9df2695a359d,51,38,0.3701197,48,3,45,181278,161322,0.000281,0.000316,0.000139,0.000101,1.342105,0.0,0.0
0x77ef7c18a27d3886dc1e4f6e67a7bb1e5b336d92,562,174,2.173956,539,23,516,398602,393409,0.00141,0.001429,0.000147,0.000212,3.229885,0.0,0.0
0x2fc617e933a52713247ce25730f6695920b3befe,95693,16333,8751.339,25475,70218,-44743,855039,854991,0.111917,0.111923,0.000146,0.000178,5.858875,1.0,1.0
0x6135575733f35f782d286507ee7798756d774c52,113,42,16.4948,107,6,101,476871,469426,0.000237,0.000241,9.4e-05,4.6e-05,2.690476,0.0,0.0


In [533]:
x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0xaab4a0ae72cdb9f4e312bb2d50d911300e2d7230,104,44,16.61703,97,7,90,481272,471204,0.000216,0.000221,0.000113,6e-05,2.363636,0.0,0.0
0x9ef21be1c270aa1c3c3d750f458442397fbffcb6,35013,11866,105351.6,14351,20662,-6311,744914,744861,0.047003,0.047006,0.000169,0.000156,2.950699,1.0,1.0
0x17f52a50227ea7aa6042eaae44c84a9a9d59d092,7293,6252,26.90207,7287,6,7281,204354,195490,0.035688,0.037306,0.0002,0.000111,1.166507,0.0,0.0
0xf704d714ec68a378dfe0c24825932b9dd38d1ccc,564,138,22.79158,511,53,458,582680,579792,0.000968,0.000973,0.000106,0.000139,4.086957,0.0,0.0
0xdfdc2927de08ce14c10af8417018f9586c348af5,223,77,4.770902,219,4,215,326903,321335,0.000682,0.000694,0.00012,0.000391,2.896104,0.0,0.0
0x0a88bc5c32b684d467b43c06d9e0899efeaf59df,5991,2336,2507.107,2481,3510,-1029,78640,22512,0.076183,0.266125,0.000165,5.5e-05,2.56464,1.0,1.0
0xba07c4b25a06d1b08d9d0c28ca6d9df2695a359d,51,38,0.3701197,48,3,45,181278,161322,0.000281,0.000316,0.000139,0.000101,1.342105,0.0,0.0
0x77ef7c18a27d3886dc1e4f6e67a7bb1e5b336d92,562,174,2.173956,539,23,516,398602,393409,0.00141,0.001429,0.000147,0.000212,3.229885,0.0,0.0
0x2fc617e933a52713247ce25730f6695920b3befe,95693,16333,8751.339,25475,70218,-44743,855039,854991,0.111917,0.111923,0.000146,0.000178,5.858875,1.0,1.0
0x6135575733f35f782d286507ee7798756d774c52,113,42,16.4948,107,6,101,476871,469426,0.000237,0.000241,9.4e-05,4.6e-05,2.690476,0.0,0.0


In [534]:
best_model = lr_param_search.best_estimator_
joblib.dump(best_model, 'optimism_cex_dex_logistic.joblib')

['optimism_cex_dex_logistic.joblib']

In [535]:
import joblib

# Load the saved model
best_model = joblib.load('optimism_cex_dex_logistic.joblib')

df_pred_test = df_test.copy()
df_pred_test['prediction'] = 0

# Make predictions using the loaded model and normalized data
df_pred_test.loc[df_test['n_tx'] > 50, 'prediction'] = best_model.predict(df_test[df_test['n_tx'] > 50])


In [536]:
df_pred_test.prediction.sum()

9

In [537]:
df_pred_test.sort_values(by='prediction', ascending=False).head(42)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0x88a09a05b60e5649e6f20b45d305d80d44431d42,870,75,4.041001,852,18,834,501970,498254,0.001733,0.001746,0.000106,9.7e-05,11.6,1
0x291c3063a40e1594a94ae8f5f84a5359936619f7,1988,118,61.052502,1943,45,1898,740409,739288,0.002685,0.002689,0.000173,0.000235,16.847458,1
0xc66fa5e6b44d1f1825ca9fef587f37de9f93ea4a,977,47,1.565209,957,20,937,268938,111767,0.003633,0.008741,0.000133,0.00011,20.787234,1
0xc8373edfad6d5c5f600b6b2507f78431c5271ff5,127546,59482,13292.062672,98391,29155,69236,624093,624044,0.20437,0.204386,9.9e-05,9.8e-05,2.144279,1
0xda43c54ce5083885f561e05fd6220b7096be246c,1646,155,18.462506,1596,50,1546,329458,329352,0.004996,0.004998,0.000123,0.000133,10.619355,1
0xd4319e11779920530d35e4c2bc852fcb360b9100,2298,124,90.55797,2117,181,1936,863576,843773,0.002661,0.002723,0.000145,0.000119,18.532258,1
0xe615aa5cd0cf5b46ef50cecbfab49a2a0e7dc51f,55,4,1.821819,54,1,53,669584,115453,8.2e-05,0.000476,0.000239,3.1e-05,13.75,1
0xf491d040110384dbcf7f241ffe2a546513fd873d,125939,60276,12878.015397,96282,29657,66625,624099,624040,0.201793,0.201812,9.8e-05,9.7e-05,2.089372,1
0x456325f2ac7067234dd71e01bebe032b0255e039,15847,520,1280.686215,14763,1084,13679,494026,490028,0.032077,0.032339,0.000333,0.000438,30.475,1
0x70422dd062f9ece1b1d24c4932e39f07e7857411,366,112,0.78347,359,7,352,473219,472271,0.000773,0.000775,0.000122,0.000118,3.267857,0


In [538]:
best_model

In [539]:
# Retrieve the feature importance (coefficients)
feature_importance = best_model.named_steps['logistic_regression'].coef_[0]

# Get the corresponding feature names
feature_names = x_train.columns.tolist()

# Create a dictionary mapping feature names to their importance
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Print the feature importance
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")

n_tx: 0.2484917714683742
n_counterparty: 0.09314466652781138
eth_volume: 0.8882670325426646
n_tx_out: 0.2079253187387922
n_tx_in: 0.494667154560622
n_tx_diff_out_in: 0.4257983638433375
age: -0.3074818300089279
time_alive: -0.21718476810475584
tx_min: 0.32177358052480703
tx_min_alive: 0.6408652900218925
avg_tx_fee: -0.2920516735257613
std_tx_fee: -0.37847755532631705
ratio_tx_counterparties: 0.5082641921962278


In [554]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline

col_to_remove = ['n_tx_out', 'n_tx_in', 'n_tx_diff_out_in', 'time_alive', 'tx_min']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df_train.drop(columns=col_to_remove).drop(columns=['target']), df_train.drop(columns=col_to_remove)['target'], test_size=0.2, random_state=42)

# Define the scalers and associated names
scalers = [
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('RobustScaler', RobustScaler())
]

# Create the pipeline with normalization and logistic regression
pipe = Pipeline([
    ('scaler', None),  # Placeholder for scaler
    ('logistic_regression', LogisticRegression())
])

# Define the parameter grid for the pipeline
param_grid = {
    'scaler': [scaler for _, scaler in scalers],
    'logistic_regression__C': [0.01, 0.1, 1, 10],
    'logistic_regression__penalty': ['l2'],
    'logistic_regression__max_iter': [500, 1000, 2000],
    'logistic_regression__tol': [1e-4, 1e-3, 1e-2],
}

# Perform the grid search
# lr_param_search = GridSearchCV(pipe, param_grid, cv=5)
lr_param_search = GridSearchCV(pipe, param_grid, cv=5, scoring='roc_auc')
lr_param_search.fit(x_train, y_train)

# Print the best parameters and score
print("Best parameters found in parameter search:", lr_param_search.best_params_)
print("Best accuracy found in parameter search:", lr_param_search.best_score_)

x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full

Best parameters found in parameter search: {'logistic_regression__C': 0.01, 'logistic_regression__max_iter': 500, 'logistic_regression__penalty': 'l2', 'logistic_regression__tol': 0.0001, 'scaler': RobustScaler()}
Best accuracy found in parameter search: 1.0


Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,age,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0xaab4a0ae72cdb9f4e312bb2d50d911300e2d7230,104,44,16.61703,481272,0.000221,0.000113,6e-05,2.363636,0.0,0.0
0x9ef21be1c270aa1c3c3d750f458442397fbffcb6,35013,11866,105351.6,744914,0.047006,0.000169,0.000156,2.950699,1.0,1.0
0x17f52a50227ea7aa6042eaae44c84a9a9d59d092,7293,6252,26.90207,204354,0.037306,0.0002,0.000111,1.166507,0.0,0.0
0xf704d714ec68a378dfe0c24825932b9dd38d1ccc,564,138,22.79158,582680,0.000973,0.000106,0.000139,4.086957,0.0,0.0
0xdfdc2927de08ce14c10af8417018f9586c348af5,223,77,4.770902,326903,0.000694,0.00012,0.000391,2.896104,0.0,0.0
0x0a88bc5c32b684d467b43c06d9e0899efeaf59df,5991,2336,2507.107,78640,0.266125,0.000165,5.5e-05,2.56464,1.0,0.0
0xba07c4b25a06d1b08d9d0c28ca6d9df2695a359d,51,38,0.3701197,181278,0.000316,0.000139,0.000101,1.342105,0.0,0.0
0x77ef7c18a27d3886dc1e4f6e67a7bb1e5b336d92,562,174,2.173956,398602,0.001429,0.000147,0.000212,3.229885,0.0,0.0
0x2fc617e933a52713247ce25730f6695920b3befe,95693,16333,8751.339,855039,0.111923,0.000146,0.000178,5.858875,1.0,1.0
0x6135575733f35f782d286507ee7798756d774c52,113,42,16.4948,476871,0.000241,9.4e-05,4.6e-05,2.690476,0.0,0.0


In [555]:
x_train_full['wrong'] = x_train_full['target'] != x_train_full['prediction']
x_train_full.sort_values(by=['wrong', 'target'], ascending=False)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,age,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target,prediction,wrong
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0x0a88bc5c32b684d467b43c06d9e0899efeaf59df,5991,2336,2507.107,78640,0.266125,0.000165,5.5e-05,2.56464,1.0,0.0,True
0x0a1ce4496471867fac0ad71b785e5258993c9b33,2528,855,1771.647,218604,0.011569,0.000129,9.3e-05,2.956725,1.0,0.0,True
0xa0fca8fa8e9c6aa77305f94be0e03908d0a42900,19023,1424,319.5932,564883,0.03368,9e-05,0.000104,13.358848,1.0,0.0,True
0x43c5b1c2be8ef194a509cf93eb1ab3dbd07b97ed,5972,1292,14772.12,562031,0.010627,0.000111,0.000109,4.622291,1.0,0.0,True
0x5bdf85216ec1e38d6458c870992a69e38e03f7ef,58758,15489,14653.22,645093,0.091092,0.000119,9.9e-05,3.793531,1.0,0.0,True
0x0d0707963952f2fba59dd06f2b425ace40b492fe,29239,10902,9291.701,631359,0.046319,0.000109,0.000107,2.681985,1.0,0.0,True
0x9ef21be1c270aa1c3c3d750f458442397fbffcb6,35013,11866,105351.6,744914,0.047006,0.000169,0.000156,2.950699,1.0,1.0,False
0x2fc617e933a52713247ce25730f6695920b3befe,95693,16333,8751.339,855039,0.111923,0.000146,0.000178,5.858875,1.0,1.0,False
0xe4edb277e41dc89ab076a1f049f4a3efa700bce8,218684,84244,72876.52,161344,1.355801,0.000121,0.000242,2.595841,1.0,1.0,False
0xebb8ea128bbdff9a1780a4902a9380022371d466,67490,18138,23351.0,659602,0.102327,0.000151,0.000111,3.720917,1.0,1.0,False


In [562]:
df_test.isna().sum()
# fill na with 0
df_test.fillna(0, inplace=True)

In [564]:
df_test['prediction'] = 0
df_test.loc[df_test['n_tx'] > 100, 'prediction'] = lr_param_search.predict(df_test.loc[df_test['n_tx'] > 100, x_train.columns])
df_merge_test_count = df_test.sort_values(by=['prediction'], ascending=False).reset_index().merge(df_seeder_count, left_on='eoa', right_on='seeder', how='left').drop(columns=['seeder'])

In [558]:
best_model = lr_param_search.best_estimator_
# Retrieve the feature importance (coefficients)
feature_importance = best_model.named_steps['logistic_regression'].coef_[0]

# Get the corresponding feature names
feature_names = x_train.columns.tolist()

# Create a dictionary mapping feature names to their importance
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Print the feature importance
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")

n_tx: 0.11107227634363538
n_counterparty: 0.09138682016314213
eth_volume: 0.1076540519191452
age: 0.004240107768319287
tx_min_alive: 0.12761088173071625
avg_tx_fee: -0.026756918380032066
std_tx_fee: -0.017341098982699234
ratio_tx_counterparties: 0.08636483826546786


In [559]:
df_test['prediction'].sum()

3

In [568]:
df_test['proba'] = 0
df_test.loc[df_test['n_tx'] > 100, 'proba'] = lr_param_search.predict_proba(df_test.loc[df_test['n_tx'] > 100, x_train.columns])[:, 1]

In [570]:
df_test.sort_values(['ratio_tx_counterparties'], ascending=False).head(20)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,prediction,proba
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0x456325f2ac7067234dd71e01bebe032b0255e039,15847,520,1280.686215,14763,1084,13679,494026,490028,0.032077,0.032339,0.000333,0.000438,30.475,1,0.638507
0xc66fa5e6b44d1f1825ca9fef587f37de9f93ea4a,977,47,1.565209,957,20,937,268938,111767,0.003633,0.008741,0.000133,0.00011,20.787234,0,0.480714
0xd4319e11779920530d35e4c2bc852fcb360b9100,2298,124,90.55797,2117,181,1936,863576,843773,0.002661,0.002723,0.000145,0.000119,18.532258,0,0.443225
0x291c3063a40e1594a94ae8f5f84a5359936619f7,1988,118,61.052502,1943,45,1898,740409,739288,0.002685,0.002689,0.000173,0.000235,16.847458,0,0.408321
0x9841484a4a6c0b61c4eea71376d76453fd05ec9c,3158,217,17.102895,3124,34,3090,846209,846028,0.003732,0.003733,0.001295,0.004564,14.552995,0,0.150581
0xe615aa5cd0cf5b46ef50cecbfab49a2a0e7dc51f,55,4,1.821819,54,1,53,669584,115453,8.2e-05,0.000476,0.000239,3.1e-05,13.75,0,0.0
0x88a09a05b60e5649e6f20b45d305d80d44431d42,870,75,4.041001,852,18,834,501970,498254,0.001733,0.001746,0.000106,9.7e-05,11.6,0,0.339934
0x00000023f6b4ed7185e7b8928072a8bfec660ff3,275,24,1.49,273,2,271,536574,529770,0.000513,0.000519,0.000358,0.001444,11.458333,0,0.266279
0xda43c54ce5083885f561e05fd6220b7096be246c,1646,155,18.462506,1596,50,1546,329458,329352,0.004996,0.004998,0.000123,0.000133,10.619355,0,0.325925
0x082a71624ba33a47665f43a2c948dfa00685ebf2,81,8,1.126272,77,4,73,577601,440279,0.00014,0.000184,0.000806,0.000648,10.125,0,0.0


In [547]:
df_merge_test_count.set_index('eoa', inplace=True)

In [548]:
for i in range(1, 4):
    print(f'count_seed == {i}')
    print(df_merge_test_count[df_merge_test_count['count_seed']==i].prediction.value_counts())
print(f'count_seed > {i}')
print(df_merge_test_count[df_merge_test_count['count_seed']>i].prediction.value_counts())


count_seed == 1
prediction
0    2805
1       4
Name: count, dtype: int64
count_seed == 2
prediction
0    189
1      2
Name: count, dtype: int64
count_seed == 3
prediction
0    64
Name: count, dtype: int64
count_seed > 3
prediction
0    33
1     2
Name: count, dtype: int64


In [549]:
df_merge_test_count[df_merge_test_count['count_seed']>=i].count_seed.sum()

426

In [550]:
df_merge_test_count.index 

Index(['0xc8373edfad6d5c5f600b6b2507f78431c5271ff5',
       '0x88a09a05b60e5649e6f20b45d305d80d44431d42',
       '0xc66fa5e6b44d1f1825ca9fef587f37de9f93ea4a',
       '0x456325f2ac7067234dd71e01bebe032b0255e039',
       '0x291c3063a40e1594a94ae8f5f84a5359936619f7',
       '0xf491d040110384dbcf7f241ffe2a546513fd873d',
       '0xd4319e11779920530d35e4c2bc852fcb360b9100',
       '0xda43c54ce5083885f561e05fd6220b7096be246c',
       '0x6e63b7a630ed14251fb8461d280de999d0f812f1',
       '0x324c7200396b01f2bdc8f349f9d57e3d16df40dc',
       ...
       '0x27e0ecc732454e58c711c0757739bdfb97140ae3',
       '0xe06944e55f5c24d97b9e9b6e3c54793352a2e644',
       '0x3e5d7f4245fb0b5fccd9c4aaa506b52de2289e6b',
       '0x26ef90b47635e2950d2d3f45d05fd59faf49445e',
       '0xdae9939d73c300f69ba44901be4e86594d1383aa',
       '0x20775833b258c6a9c59dc84e693a22736f73616a',
       '0xd0933fb50223e3128bd0375b35c15bf4c383f200',
       '0xbc7a5a3ddfc709390320c21378b92d8ab2ad9d0f',
       '0xfe59bd86d2191b48b11df3b5f

In [551]:
df_merge_test_count[df_merge_test_count['count_seed']>=i].sort_values(by=['n_tx'], ascending=False).head(10)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,prediction,count_seed
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0xc8373edfad6d5c5f600b6b2507f78431c5271ff5,127546,59482,13292.062672,98391,29155,69236,624093,624044,0.20437,0.204386,9.9e-05,9.8e-05,2.144279,1,6
0xf491d040110384dbcf7f241ffe2a546513fd873d,125939,60276,12878.015397,96282,29657,66625,624099,624040,0.201793,0.201812,9.8e-05,9.7e-05,2.089372,1,11
0x5b965a9cdcdb710d50967b0ac32e6341b72063a2,6270,2392,156.306392,5207,1063,4144,350071,350002,0.017911,0.017914,0.00012,9.8e-05,2.621237,0,9
0xa3f45e619ce3aae2fa5f8244439a66b203b78bcc,5851,1017,5229.977479,2968,2883,85,659562,659464,0.008871,0.008872,0.000135,9.7e-05,5.753196,0,6
0xcdd37ada79f589c15bd4f8fd2083dc88e34a2af2,5726,1826,788.835479,3659,2067,1592,700133,700003,0.008178,0.00818,0.000127,0.000152,3.135816,0,14
0x74e1d68ff9b267e48126a9d2289c8598e295fdac,4067,3188,343.076033,3979,88,3891,126870,126818,0.032056,0.03207,0.00011,7.1e-05,1.275721,0,51
0xc6d7cba263bc5afb0ecc97820d8c6c6c9c92b0c2,2873,1818,13.25573,2459,414,2045,582794,555939,0.00493,0.005168,0.000107,6.9e-05,1.580308,0,3
0x63a43a24de2ce6b1bfed393f6fad714b172a977e,2514,423,51.080567,2309,205,2104,600377,594871,0.004187,0.004226,0.000113,0.00023,5.943262,0,4
0xe48fe6012f97b6a13c0ce5cef314caf66e972deb,2200,351,41.000705,2160,40,2120,861219,857766,0.002555,0.002565,0.000164,0.000214,6.267806,0,3
0x39a80b830a4b77a56ef952df33caaba70f27fd5d,2086,732,1161.364228,1544,542,1002,370131,370083,0.005636,0.005637,0.000133,0.00012,2.849727,0,5


In [552]:
df_test['prediction'].sum()

8

In [553]:
x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full
df_test

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0x0e29304fee49a7f3bfb4bd0c0c3d574597dc2bdf,336,137,0.734106,332,4,328,853560,845304,0.000394,0.000397,0.000199,0.000313,2.452555,0
0x456325f2ac7067234dd71e01bebe032b0255e039,15847,520,1280.686215,14763,1084,13679,494026,490028,0.032077,0.032339,0.000333,0.000438,30.475000,1
0x84edd96e9f7e7c5a9c3e609cbd3c0d85f01e3742,160,52,0.367771,149,11,138,400296,383138,0.000400,0.000418,0.000101,0.000111,3.076923,0
0x862f3b8176867080cbd80e5fa1849975fac033a3,93,35,16.577286,86,7,79,329631,309081,0.000282,0.000301,0.000094,0.000113,2.657143,0
0x8cdd0b35e7d24d4733c34782efaa0c14ff3737b4,92,45,5.540840,83,9,74,328403,319729,0.000280,0.000288,0.000139,0.000097,2.044444,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0x58a833bc5841fc8dbfba70788cec0fa7cc89c22b,104,36,0.353454,99,5,94,142276,131725,0.000731,0.000790,0.000238,0.000948,2.888889,0
0x5411ff60efc87d815c575878c0281df005155579,4,4,0.001908,3,1,2,168642,24,0.000024,0.166667,0.000138,0.000118,1.000000,0
0x53c575344e760d316ac7683db6badddb5986cbfa,124,47,16.338348,117,7,110,298515,290924,0.000415,0.000426,0.000107,0.000118,2.638298,0
0x584abc5b9434ec986529f77dba9ab7f58b415c8c,100,39,0.658704,95,5,90,518279,508405,0.000193,0.000197,0.000141,0.000123,2.564103,0
