# Training a simple logistic regression model to predict if an address is a bridge a cex or something else

- We use a list of voters to the citizen round and take any voter that have seeded more than one voting address
- From that list of addresses we want to know which one of them are a bridge/cex or a user
- We use a query to Flipside API to get some basic data about the kind of transactions made by an address
- We manually flagged some addresses as cex or bridge and use that data to train a logistic regression model
- We use the model to predict the kind of address on the remaining data that as not been manually flagged
- We verify some addresses from the unsee data and see if the model is working

The goal is to have a model that can predict the kind of address and then use that data to filter the addresses that were seeded by a bridge or a cex from the list of potential sybil. This methods is expected to flag automatically 500 sybils from 20000 voters.

This is because if an address fund several voting addresses and that address has not been seeded from a bridge or a cex, then it is very likely that these addresses belongs to the same entity.


In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

from sbdata.FlipsideApi import FlipsideApi

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score

In [3]:
# Set path to data folder
current_dir = Path(os.getcwd())
DATA_DIR = os.path.join(current_dir.parent.parent, 'data-regen-rangers')
ODC_DATA_DIR = os.path.join(current_dir.parent.parent.parent, 'data')

# set up api key for flipside
api_key = os.environ['FLIPSIDE_API_KEY']
flipside_api = FlipsideApi(api_key, timeout_minutes=10, max_age_minutes=60, max_address=1000)

## Load Seed Data

In [4]:
df_seed_wallet= pd.read_csv(os.path.join(DATA_DIR, 'seed_wallet_citizen.csv'))
df_seed_wallet.head(2)

Unnamed: 0,EOA,from_address,to_address
0,0x000000006f457c0f8f560333d9c2877287d92a92,0xacd03d601e5bb1b275bb94076ff46ed9d753435a,0x000000006f457c0f8f560333d9c2877287d92a92
1,0x000128fa45d79dc9af8016da242781f12c363fd5,0xe4edb277e41dc89ab076a1f049f4a3efa700bce8,0x000128fa45d79dc9af8016da242781f12c363fd5


In [5]:
df_seeder_count = df_seed_wallet.groupby('from_address').count().sort_values(by='to_address', ascending=False).reset_index().drop(columns=['to_address']).rename(columns={'from_address': 'seeder', 'EOA': 'count_seed'})

In [6]:
df_seeder_count

Unnamed: 0,seeder,count_seed
0,0x80c67432656d59144ceff962e8faf8926599bcf8,3148
1,0xacd03d601e5bb1b275bb94076ff46ed9d753435a,2980
2,0x2d2cc0eb095e43204e0c087e07dbf95909650939,1321
3,0xe4edb277e41dc89ab076a1f049f4a3efa700bce8,800
4,0xf89d7b9c864f589bbf53a82105107622b35eaa40,677
...,...,...
3155,0x568fd3434f2be3edd1454de76b4a7b2fe5d8d717,1
3156,0x569f1ec2149d4927da420637e6007021c7a8a606,1
3157,0x56aaf01b53c80fefd7f97e3610207773b4a855e2,1
3158,0x56d0a8c9519a6524eec4eecf0f9c2dc0af817f9f,1


## Load Labeled Data

In [7]:
df_labels = pd.read_csv(os.path.join(ODC_DATA_DIR, 'address_labels_citizen.csv'), usecols=['address', 'tag', 'sub_type'])
df_labels_test = pd.read_csv(os.path.join(ODC_DATA_DIR, 'address_labels_citizen_test.csv'), usecols=['address', 'tag', 'sub_type'])
df_labels = pd.concat([df_labels, df_labels_test])
df_labels.shape

(105, 3)

In [8]:
df_labels.head(2)

Unnamed: 0,address,tag,sub_type
0,0x4a1939dc1de524ff01980a911f0f4d65ce7a27ba,airdrop_master,
1,0xeec428c18ff39c66162e39e79ed6eb8c790e43d7,cex_or_bridge,?


In [9]:
df_labels['target'] = df_labels['tag'].apply(lambda x: 1 if x in ['cex', 'bridge', 'cex_or_bridge'] else 0)

In [10]:
df_labels['target'].value_counts()

target
0    73
1    32
Name: count, dtype: int64

The class is a little inbalanced but that should be fine as the differences between these type of addresses are quite big.

## Retrieve features from flipside

In [11]:
sql_template = """
WITH
  transactions AS (
    SELECT
      BLOCK_TIMESTAMP,
      TX_FEE,
      FROM_ADDRESS AS EOA,
      TO_ADDRESS AS COUNTERPARTY,
      ETH_VALUE,
      1 as BOOLEAN_OUT
    FROM
      optimism.core.fact_transactions
    WHERE
      FROM_ADDRESS IN (%s)
    UNION ALL
    SELECT
      BLOCK_TIMESTAMP,
      TX_FEE,
      TO_ADDRESS AS EOA,
      TO_ADDRESS AS COUNTERPARTY,
      ETH_VALUE,
      0 as BOOLEAN_OUT
    FROM
      optimism.core.fact_transactions
    WHERE
      TO_ADDRESS IN (%s)
  )
SELECT
  EOA,
  COUNT(*) as n_tx,
  COUNT(DISTINCT(COUNTERPARTY)) as n_counterparty,
  SUM(ETH_VALUE) as eth_volume,
  SUM(BOOLEAN_OUT) as n_tx_out,
  n_tx - n_tx_out as n_tx_in,
  n_tx_out - n_tx_in as n_tx_diff_out_in,
  TIMESTAMPDIFF(MINUTE, MIN(BLOCK_TIMESTAMP), CURRENT_TIMESTAMP()) as age,
  TIMESTAMPDIFF(MINUTE, MIN(BLOCK_TIMESTAMP), MAX(BLOCK_TIMESTAMP)) as time_alive,
  CASE WHEN age = 0 THEN 0 ELSE n_tx / age END as tx_min,
  CASE WHEN time_alive = 0 THEN 0 ELSE n_tx / time_alive END as tx_min_alive,
  AVG(TX_FEE) as avg_tx_fee,
  STDDEV(TX_FEE) as std_tx_fee
FROM
  transactions
GROUP BY
  EOA;
"""

In [12]:
unique_seeder = df_seeder_count.seeder.values

In [13]:
def extract_data_flipside(flipside_api, array_address, sql_template):

    q, r = divmod(len(array_address), flipside_api.MAX_ADDRESS)
    if r != 0:
        q += 1
    list_df = []
    for i in range(q):
        start_index = i * flipside_api.MAX_ADDRESS
        end_index = (i + 1) * flipside_api.MAX_ADDRESS
        print(
            f"Extracting for address: {start_index} - {end_index}")
        array_address_slice = array_address[start_index:end_index]
        str_address_slice = flipside_api.get_string_address(array_address_slice)
        sql = sql_template % (str_address_slice, str_address_slice)
        df = flipside_api.execute_query(sql=sql)
        list_df.append(df)
    df = pd.concat(list_df)
    return df

In [14]:
df_features = extract_data_flipside(flipside_api, unique_seeder, sql_template)

Extracting for address: 0 - 1000


Extracting for address: 1000 - 2000
Extracting for address: 2000 - 3000
Extracting for address: 3000 - 4000


In [15]:
df_features.drop('__row_index', axis=1, inplace=True)

In [16]:
df_features['ratio_tx_counterparties'] = df_features['n_tx'] / df_features['n_counterparty']

In [17]:
df_features.head(2)

Unnamed: 0,eoa,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties
0,0x21543a89d39730bc239623fd9a41bf379dc3f6f1,107,33,5.179127,102,5,97,865483,827401,0.000124,0.000129,0.000311,0.000629,3.242424
1,0xad617514a6576f1964f2b491ed518730c7d0b954,124,47,16.198447,116,8,108,351834,347425,0.000352,0.000357,0.000102,7.2e-05,2.638298


In [18]:
df_features.to_csv(os.path.join(ODC_DATA_DIR, 'features_citizen_seeder.csv'), index=False)

In [19]:
df_merge_feature_target = df_features.merge(df_labels, left_on='eoa', right_on='address', how='left').drop(columns=['address', 'tag', 'sub_type'])

In [20]:
print(df_labels.shape)
print(df_features.shape)
print(df_merge_feature_target.shape)

(105, 4)
(3160, 14)
(3161, 15)


In [21]:
df_merge_feature_target.target.isna().sum()

3059

In [22]:
df_merge_feature_target.set_index('eoa', inplace=True)


In [23]:
df_merge_feature_target.drop_duplicates(inplace=True)

In [26]:
df_test = df_merge_feature_target[df_merge_feature_target.target.isna()].drop(columns=['target'])
df_train = df_merge_feature_target[~df_merge_feature_target.target.isna()]
df_test.fillna(0, inplace=True)
df_train.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.fillna(0, inplace=True)


In [28]:
df_train.to_csv('df_merge_feature_target_full.csv', index=False)

In [None]:
df_train.head(2)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0x88a09a05b60e5649e6f20b45d305d80d44431d42,870,75,4.041001,852,18,834,502294,498254,0.001732,0.001746,0.000106,9.7e-05,11.6,0.0
0xb612884850f6f2dd04fb792e5ad4ff5b67ffeca6,363,161,0.728152,349,14,335,501895,497418,0.000723,0.00073,0.000124,0.000152,2.254658,0.0


### We dont train with n_tx < 15 because they are not an exchange or a bridge in these cases

In [None]:
df_train = df_train[df_train['n_tx'] > 50]

## Train a model

In [None]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df_train.drop(columns=['target']), df_train['target'], test_size=0.2, random_state=42)


# Define the scalers and associated names
scalers = [
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('RobustScaler', RobustScaler())
]

# Create the pipeline with normalization and logistic regression
pipe = Pipeline([
    ('scaler', None),  # Placeholder for scaler
    ('logistic_regression', LogisticRegression())
])

# Define the parameter grid for the pipeline
param_grid = {
    'scaler': [scaler for _, scaler in scalers],
    'logistic_regression__C': [0.01, 0.1, 1, 10],
    'logistic_regression__penalty': ['l2'],
    'logistic_regression__max_iter': [500, 1000, 2000],
    'logistic_regression__tol': [1e-4, 1e-3, 1e-2],
}

# Perform the grid search
lr_param_search = GridSearchCV(pipe, param_grid, cv=5)
lr_param_search.fit(x_train, y_train)

# Print the best parameters and score
print("Best parameters found in parameter search:", lr_param_search.best_params_)
print("Best accuracy found in parameter search:", lr_param_search.best_score_)

x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
y_pred = lr_param_search.predict(x_train)

x_train_full['prediction'] = y_pred
# Calculate precision and recall
precision = precision_score(y_train, y_pred)
recall = recall_score(y_train, y_pred)

# Print precision and recall
print("Precision:", precision)
print("Recall:", recall)

x_train_full

Best parameters found in parameter search: {'logistic_regression__C': 0.01, 'logistic_regression__max_iter': 500, 'logistic_regression__penalty': 'l2', 'logistic_regression__tol': 0.0001, 'scaler': RobustScaler()}
Best accuracy found in parameter search: 0.8666666666666666
Precision: 1.0
Recall: 0.6


Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0x62cc4edfe738701297f06ce979de18229b69b49a,641,163,11.880565,586,55,531,369284,369069,0.001736,0.001737,0.000145,0.000153,3.932515,0.0,0.0
0xdfdc2927de08ce14c10af8417018f9586c348af5,223,77,4.770902,219,4,215,327227,321335,0.000681,0.000694,0.00012,0.000391,2.896104,0.0,0.0
0xc6d7cba263bc5afb0ecc97820d8c6c6c9c92b0c2,2873,1818,13.25573,2459,414,2045,583118,555939,0.004927,0.005168,0.000107,6.9e-05,1.580308,0.0,0.0
0xeec428c18ff39c66162e39e79ed6eb8c790e43d7,1714,1246,49.270997,1714,0,1714,871433,537860,0.001967,0.003187,0.000581,0.000379,1.375602,1.0,0.0
0x698da8663df8c6ad2b8d59dc957dd59e3e9e644d,773,97,45.334905,708,65,643,869093,868026,0.000889,0.000891,0.000186,0.000263,7.969072,0.0,0.0
0xa3d02f257c10c93bbb96505a058ac6f177d45195,149,42,4.213914,130,19,111,837231,803848,0.000178,0.000185,0.000307,0.000288,3.547619,0.0,0.0
0x47a916d0b694c95bc0c09953c1a6e22c8867be32,89,53,0.030389,52,37,15,275261,137322,0.000323,0.000648,6.6e-05,1.6e-05,1.679245,0.0,0.0
0x81e2d092418767e0cf69e03bbacfd50bd82024aa,175,59,159.76317,168,7,161,401965,401524,0.000435,0.000436,9.9e-05,0.000127,2.966102,0.0,0.0
0x99095b01f51c73b143b487669488e220fc16ea44,172,97,0.137372,159,13,146,162752,160562,0.001057,0.001071,0.000155,0.000135,1.773196,0.0,0.0
0xebb8ea128bbdff9a1780a4902a9380022371d466,67507,18138,23351.003321,52794,14713,38081,659926,659902,0.102295,0.102299,0.000151,0.000111,3.721855,1.0,1.0


In [None]:
x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0x62cc4edfe738701297f06ce979de18229b69b49a,641,163,11.880565,586,55,531,369284,369069,0.001736,0.001737,0.000145,0.000153,3.932515,0.0,0.0
0xdfdc2927de08ce14c10af8417018f9586c348af5,223,77,4.770902,219,4,215,327227,321335,0.000681,0.000694,0.00012,0.000391,2.896104,0.0,0.0
0xc6d7cba263bc5afb0ecc97820d8c6c6c9c92b0c2,2873,1818,13.25573,2459,414,2045,583118,555939,0.004927,0.005168,0.000107,6.9e-05,1.580308,0.0,0.0
0xeec428c18ff39c66162e39e79ed6eb8c790e43d7,1714,1246,49.270997,1714,0,1714,871433,537860,0.001967,0.003187,0.000581,0.000379,1.375602,1.0,0.0
0x698da8663df8c6ad2b8d59dc957dd59e3e9e644d,773,97,45.334905,708,65,643,869093,868026,0.000889,0.000891,0.000186,0.000263,7.969072,0.0,0.0
0xa3d02f257c10c93bbb96505a058ac6f177d45195,149,42,4.213914,130,19,111,837231,803848,0.000178,0.000185,0.000307,0.000288,3.547619,0.0,0.0
0x47a916d0b694c95bc0c09953c1a6e22c8867be32,89,53,0.030389,52,37,15,275261,137322,0.000323,0.000648,6.6e-05,1.6e-05,1.679245,0.0,0.0
0x81e2d092418767e0cf69e03bbacfd50bd82024aa,175,59,159.76317,168,7,161,401965,401524,0.000435,0.000436,9.9e-05,0.000127,2.966102,0.0,0.0
0x99095b01f51c73b143b487669488e220fc16ea44,172,97,0.137372,159,13,146,162752,160562,0.001057,0.001071,0.000155,0.000135,1.773196,0.0,0.0
0xebb8ea128bbdff9a1780a4902a9380022371d466,67507,18138,23351.003321,52794,14713,38081,659926,659902,0.102295,0.102299,0.000151,0.000111,3.721855,1.0,1.0


In [None]:
x_train_full['wrong'] = x_train_full['target'] != x_train_full['prediction']
x_train_full.sort_values(by=['wrong', 'target'], ascending=False).head(20)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target,prediction,wrong
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0xeec428c18ff39c66162e39e79ed6eb8c790e43d7,1714,1246,49.270997,1714,0,1714,871433,537860,0.001967,0.003187,0.000581,0.000379,1.375602,1.0,0.0,True
0x0a88bc5c32b684d467b43c06d9e0899efeaf59df,5991,2336,2507.106783,2481,3510,-1029,78964,22512,0.07587,0.266125,0.000165,5.5e-05,2.56464,1.0,0.0,True
0x0a1ce4496471867fac0ad71b785e5258993c9b33,2534,855,1771.978811,1267,1267,0,218928,218869,0.011575,0.011578,0.000129,9.3e-05,2.963743,1.0,0.0,True
0x74e1d68ff9b267e48126a9d2289c8598e295fdac,4085,3202,345.008433,3997,88,3909,127194,127165,0.032116,0.032124,0.000109,7.1e-05,1.275765,1.0,0.0,True
0x766182bfa8b8790d61c4d7e7912c1c3a6f42cef6,720,102,17.719717,662,58,604,176449,175188,0.00408,0.00411,0.000129,0.000143,7.058824,1.0,0.0,True
0xa3f45e619ce3aae2fa5f8244439a66b203b78bcc,5865,1017,5231.338388,2968,2897,71,659886,659704,0.008888,0.00889,0.000134,9.7e-05,5.766962,1.0,0.0,True
0x43c5b1c2be8ef194a509cf93eb1ab3dbd07b97ed,5977,1292,14772.150138,5944,33,5911,562381,562311,0.010628,0.010629,0.00011,0.000109,4.626161,1.0,0.0,True
0x456325f2ac7067234dd71e01bebe032b0255e039,15847,520,1280.686215,14763,1084,13679,494350,490028,0.032056,0.032339,0.000333,0.000438,30.475,1.0,0.0,True
0xebb8ea128bbdff9a1780a4902a9380022371d466,67507,18138,23351.003321,52794,14713,38081,659926,659902,0.102295,0.102299,0.000151,0.000111,3.721855,1.0,1.0,False
0x0d0707963952f2fba59dd06f2b425ace40b492fe,29245,10905,9292.85422,25209,4036,21173,631683,631621,0.046297,0.046302,0.000109,0.000107,2.681797,1.0,1.0,False


In [None]:
best_model = lr_param_search.best_estimator_
joblib.dump(best_model, 'optimism_cex_dex_logistic.joblib')

['optimism_cex_dex_logistic.joblib']

In [None]:
import joblib

# Load the saved model
best_model = joblib.load('optimism_cex_dex_logistic.joblib')

df_pred_test = df_test.copy()
df_pred_test['prediction'] = 0

# Make predictions using the loaded model and normalized data
df_pred_test.loc[df_test['n_tx'] > 50, 'prediction'] = best_model.predict(df_test[df_test['n_tx'] > 50])


In [None]:
df_pred_test.prediction.sum()

0

In [None]:
df_pred_test.sort_values(by='prediction', ascending=False).head(42)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0xb9726225b711f5ffe1eb3e117e9caaa0f78dcd37,173,55,0.035903,173,0,173,817706,799326,0.000212,0.000216,0.000133,0.000162,3.145455,0
0x34bdb29e7489955b0cdc1ac079dac53f5db91113,36,20,2.107439,33,3,30,472212,457073,7.6e-05,7.9e-05,0.000105,8.3e-05,1.8,0
0x082cb0b7651589baed0de3836c2a29a6033771cb,12,6,0.792907,9,3,6,863001,279942,1.4e-05,4.3e-05,0.000625,0.00029,2.0,0
0x0449daef586ea2434f3e4aaa6e031e0e7550e37d,117,46,16.229731,111,6,105,334309,312414,0.00035,0.000375,0.000106,0.00012,2.543478,0
0x61df21c9a21e2825c414c1c4eb75db7d806a9c47,72,29,2.185356,70,2,68,865130,857718,8.3e-05,8.4e-05,0.000334,0.000393,2.482759,0
0x0ccae05a604ec3ce059a73f42d5d78c5dafa4059,112,47,15.229659,103,9,94,491129,482293,0.000228,0.000232,0.000102,8.2e-05,2.382979,0
0x17710b9e26a497849072d2234d4ff21399b580f1,217,64,158.026829,197,20,177,547587,525699,0.000396,0.000413,9.7e-05,9.7e-05,3.390625,0
0x48215430399ccb4ab09f74ce5e9df8be56445ff0,101,44,15.970712,95,6,89,302968,292552,0.000333,0.000345,0.00011,0.000123,2.295455,0
0x0e3e7ae37eb2960d0e3f0e4bf8b56b79a731c446,9,4,0.064411,8,1,7,422531,39,2.1e-05,0.230769,5.6e-05,5.3e-05,2.25,0
0x03f1814197b4326b16d6484dbdda0124a107ae04,110,44,15.961658,102,8,94,475868,465432,0.000231,0.000236,0.000104,9.1e-05,2.5,0


In [None]:
best_model

In [None]:
# Retrieve the feature importance (coefficients)
feature_importance = best_model.named_steps['logistic_regression'].coef_[0]

# Get the corresponding feature names
feature_names = x_train.columns.tolist()

# Create a dictionary mapping feature names to their importance
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Print the feature importance
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")

n_tx: 0.05840731860117349
n_counterparty: 0.03725380056742567
eth_volume: 0.13638809836106747
n_tx_out: 0.06334710537165023
n_tx_in: 0.0726406172279659
n_tx_diff_out_in: 0.06199878360907511
age: -0.017389522408313977
time_alive: -0.014518254392220984
tx_min: 0.05421480829383147
tx_min_alive: 0.07797500944635054
avg_tx_fee: 0.0599229287028512
std_tx_fee: 0.0474701063066714
ratio_tx_counterparties: 0.0372333288283329


In [None]:
a

NameError: name 'a' is not defined

In [None]:

col_to_remove = ['n_tx_out', 'n_tx_in', 'n_tx_diff_out_in', 'time_alive', 'tx_min']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df_train.drop(columns=col_to_remove).drop(columns=['target']), df_train.drop(columns=col_to_remove)['target'], test_size=0.2, random_state=42)

# Define the scalers and associated names
scalers = [
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('RobustScaler', RobustScaler())
]

# Create the pipeline with normalization and logistic regression
pipe = Pipeline([
    ('scaler', None),  # Placeholder for scaler
    ('logistic_regression', LogisticRegression())
])

# Define the parameter grid for the pipeline
param_grid = {
    'scaler': [scaler for _, scaler in scalers],
    'logistic_regression__C': [0.01, 0.1, 1, 10],
    'logistic_regression__penalty': ['l2'],
    'logistic_regression__max_iter': [500, 1000, 2000],
    'logistic_regression__tol': [1e-4, 1e-3, 1e-2],
}

# Perform the grid search
# lr_param_search = GridSearchCV(pipe, param_grid, cv=5)
lr_param_search = GridSearchCV(pipe, param_grid, cv=5, scoring='roc_auc')
lr_param_search.fit(x_train, y_train)

# Print the best parameters and score
print("Best parameters found in parameter search:", lr_param_search.best_params_)
print("Best accuracy found in parameter search:", lr_param_search.best_score_)


x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
y_pred = lr_param_search.predict(x_train)

x_train_full['prediction'] = y_pred
# Calculate precision and recall
precision = precision_score(y_train, y_pred)
recall = recall_score(y_train, y_pred)

# Print precision and recall
print("Precision:", precision)
print("Recall:", recall)

x_train_full

Best parameters found in parameter search: {'logistic_regression__C': 0.01, 'logistic_regression__max_iter': 500, 'logistic_regression__penalty': 'l2', 'logistic_regression__tol': 0.0001, 'scaler': RobustScaler()}
Best accuracy found in parameter search: 0.9366666666666668
Precision: 1.0
Recall: 0.6153846153846154


Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,age,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0xa0fca8fa8e9c6aa77305f94be0e03908d0a42900,19036,1432,319.913911,565145,0.033689,0.000090,0.000104,13.293296,1.0,0.0
0x291c3063a40e1594a94ae8f5f84a5359936619f7,1990,118,61.542502,740671,0.002687,0.000173,0.000235,16.864407,0.0,0.0
0x80c67432656d59144ceff962e8faf8926599bcf8,613868,175098,176276.043382,729322,0.841773,0.000102,0.000107,3.505854,1.0,1.0
0x30620715bcf265773b2973eff4293cee0bb1b774,478,181,1.130679,853880,0.000563,0.000204,0.000274,2.640884,0.0,0.0
0x5bdf85216ec1e38d6458c870992a69e38e03f7ef,58912,15551,14715.217580,645355,0.091296,0.000119,0.000099,3.788309,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
0x4a1939dc1de524ff01980a911f0f4d65ce7a27ba,172,141,0.399488,220001,0.000806,0.000131,0.000126,1.219858,0.0,0.0
0x729411caaca9e1118c3e11ef0575612f7a9a4448,202,59,1.242182,867450,0.000236,0.000123,0.000103,3.423729,0.0,0.0
0xd5a87b7f19716390725916cc5f3651019af2b212,254,69,0.961003,452732,0.000572,0.000054,0.000075,3.681159,0.0,0.0
0x77ef7c18a27d3886dc1e4f6e67a7bb1e5b336d92,562,174,2.173956,398864,0.001429,0.000147,0.000212,3.229885,0.0,0.0


In [None]:
x_train_full['wrong'] = x_train_full['target'] != x_train_full['prediction']
x_train_full.sort_values(by=['wrong', 'target'], ascending=False).head(20)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,age,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,target,prediction,wrong
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0xa0fca8fa8e9c6aa77305f94be0e03908d0a42900,19036,1432,319.9139,565145,0.033689,9e-05,0.000104,13.293296,1.0,0.0,True
0x456325f2ac7067234dd71e01bebe032b0255e039,15847,520,1280.686,494288,0.032339,0.000333,0.000438,30.475,1.0,0.0,True
0x39a80b830a4b77a56ef952df33caaba70f27fd5d,2090,733,1161.443,370393,0.005644,0.000133,0.00012,2.851296,1.0,0.0,True
0xa3f45e619ce3aae2fa5f8244439a66b203b78bcc,5865,1017,5231.338,659824,0.00889,0.000134,9.7e-05,5.766962,1.0,0.0,True
0x74e1d68ff9b267e48126a9d2289c8598e295fdac,4082,3200,344.8183,127132,0.032133,0.000109,7.1e-05,1.275625,1.0,0.0,True
0xeec428c18ff39c66162e39e79ed6eb8c790e43d7,1714,1246,49.271,871371,0.003187,0.000581,0.000379,1.375602,1.0,0.0,True
0x0a1ce4496471867fac0ad71b785e5258993c9b33,2531,855,1771.823,218866,0.011568,0.000129,9.3e-05,2.960234,1.0,0.0,True
0xda43c54ce5083885f561e05fd6220b7096be246c,1648,155,18.46251,329720,0.005001,0.000122,0.000133,10.632258,1.0,0.0,True
0x0a88bc5c32b684d467b43c06d9e0899efeaf59df,5991,2336,2507.107,78902,0.266125,0.000165,5.5e-05,2.56464,1.0,0.0,True
0xcdd37ada79f589c15bd4f8fd2083dc88e34a2af2,5743,1827,788.9473,700395,0.008201,0.000127,0.000152,3.143404,1.0,0.0,True


In [None]:
df_test.isna().sum()
# fill na with 0
df_test.fillna(0, inplace=True)

In [None]:
df_test['prediction'] = 0
df_test.loc[df_test['n_tx'] > 100, 'prediction'] = lr_param_search.predict(df_test.loc[df_test['n_tx'] > 100, x_train.columns])
df_merge_test_count = df_test.sort_values(by=['prediction'], ascending=False).reset_index().merge(df_seeder_count, left_on='eoa', right_on='seeder', how='left').drop(columns=['seeder'])

In [None]:
best_model = lr_param_search.best_estimator_
# Retrieve the feature importance (coefficients)
feature_importance = best_model.named_steps['logistic_regression'].coef_[0]

# Get the corresponding feature names
feature_names = x_train.columns.tolist()

# Create a dictionary mapping feature names to their importance
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Print the feature importance
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")

n_tx: 0.10366616483812932
n_counterparty: 0.10046298586661558
eth_volume: 0.15077754011377775
age: -0.02850023439161045
tx_min_alive: 0.09512152916503286
avg_tx_fee: 0.04042131051228823
std_tx_fee: 0.05319017368439948
ratio_tx_counterparties: 0.0856055920435824


In [None]:
df_test['prediction'].sum()

0

In [None]:
df_test['proba'] = 0
df_test.loc[df_test['n_tx'] > 100, 'proba'] = lr_param_search.predict_proba(df_test.loc[df_test['n_tx'] > 100, x_train.columns])[:, 1]

In [None]:
df_test.sort_values(['ratio_tx_counterparties'], ascending=False).head(20)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,prediction,proba
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0x00000023f6b4ed7185e7b8928072a8bfec660ff3,275,24,1.49,273,2,271,536836,529770,0.000512,0.000519,0.000358,0.001444,11.458333,0,0.327679
0xab7a354fcdb5a642ef2d2b45b4641f9432fcdb83,41,5,0.018865,40,1,39,61298,24548,0.000669,0.00167,0.000266,0.000194,8.2,0,0.0
0x3110ce980ade1e178263c2af19e9ae2787a34d9e,719,96,18.460295,707,12,695,831305,831159,0.000865,0.000865,0.000298,0.000563,7.489583,0,0.23352
0x80eee448cff5fa38947ed56e8f4a157eca4d9b78,600,82,4.444074,595,5,590,642303,636472,0.000934,0.000943,0.00018,0.000168,7.317073,0,0.197967
0xf756f70302647eed11d728cf674dd64f8783ee41,95,13,0.728402,94,1,93,789271,221882,0.00012,0.000428,0.000549,0.000122,7.307692,0,0.0
0x888364bec43f88ac951e5fb7a0b8dd0f33aa4bc3,189,26,2.2601,188,1,187,821080,409500,0.00023,0.000462,0.000154,0.000926,7.269231,0,0.23726
0x212647c56ba10ee429a838bc567dfb03a8d054ba,270,38,23.157363,228,42,186,557546,556643,0.000484,0.000485,0.000183,0.000189,7.105263,0,0.198578
0x3114097a1e9855c3da9db6fb568696792e027bcb,782,111,612.634792,734,48,686,806049,788374,0.00097,0.000992,0.000249,0.000345,7.045045,0,0.225226
0x2239ff85f256b0ad09502e6b757ad4c0a747a899,968,138,92.423002,956,12,944,859909,853160,0.001126,0.001135,0.000248,0.000332,7.014493,0,0.213792
0x9f9b2787ffa97cb65c65b8d28991065927f725ea,56,8,1.274844,55,1,54,570646,539042,9.8e-05,0.000104,0.000446,0.000548,7.0,0,0.0


In [None]:
df_merge_test_count.set_index('eoa', inplace=True)

In [None]:
for i in range(1, 4):
    print(f'count_seed == {i}')
    print(df_merge_test_count[df_merge_test_count['count_seed']==i].prediction.value_counts())
print(f'count_seed > {i}')
print(df_merge_test_count[df_merge_test_count['count_seed']>i].prediction.value_counts())


count_seed == 1
prediction
0    2785
Name: count, dtype: int64
count_seed == 2
prediction
0    186
Name: count, dtype: int64
count_seed == 3
prediction
0    62
Name: count, dtype: int64
count_seed > 3
prediction
0    27
Name: count, dtype: int64


In [None]:
df_merge_test_count[df_merge_test_count['count_seed']>=i].count_seed.sum()

314

In [None]:
df_merge_test_count[df_merge_test_count['count_seed'] > 1].count_seed.sum()

686

In [None]:
df_merge_train_count = df_train.sort_values(by=['target'], ascending=False).reset_index().merge(df_seeder_count, left_on='eoa', right_on='seeder', how='left').drop(columns=['seeder'])

In [None]:
df_merge_train_count[np.logical_and(df_merge_train_count['target'] == 0, df_merge_train_count['count_seed'] > 2)]['count_seed'].sum()

284

In [None]:
df_merge_test_count.index 

Index(['0x8adfc6e6e6da6ed9af7f094b3f3115ed72048395',
       '0x0439b6e6a64744ef17bbb24192c52755ba5ece48',
       '0x191db34cb2cc24d2480eb127a1fa0ff46fe0d6ae',
       '0x136c89d19ab4e816665a5eea9c32cfa6e0b446c0',
       '0x46d5a574af440681c30f291b704156712bf94086',
       '0x0c07222756ae5b30bd7d04c05dd0f0bdcbfe390f',
       '0x4b919fb49b82908675c14e62462923472d45cc63',
       '0x641cefed261b81add34c3d007475fc032877833e',
       '0x4f2ffdfa34587a35bf9405af1e836c392112bb50',
       '0x7d5b8c378f293e49cb5d477c277f46830ef8f70d',
       ...
       '0xdb0639d1f03485f84109fefecaff41a60247d589',
       '0xfd981695d81a01d9b4574a3b42f4b5987d2178dc',
       '0xc30da72a42be8e59963087db61b3c9763a326c48',
       '0x266a78d3c6ee02ac304fef8171d46d145d939b48',
       '0xf2ab4a4656ac7384586272630c2c9adcbcadf4fc',
       '0xfd3d2cbd82ab6a53def1d506d48362530170e7ff',
       '0x2475929077329c5ee6a74edaec3a90dd86af8931',
       '0xe023cb5fe496ceafb42f3402247b4c27e31844b8',
       '0x31d5720bf8d90c35ef25b2b49

In [None]:
x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full
df_test

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,prediction,proba
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0x8adfc6e6e6da6ed9af7f094b3f3115ed72048395,1013,269,14.721275,1008,5,1003,820752,814948,0.001234,0.001243,0.000140,0.000196,3.765799,0,0.183321
0xa4066809936d11b04250b2a4108ec387159c86aa,652,156,36.642957,618,34,584,306658,300014,0.002126,0.002173,0.000166,0.000181,4.179487,0,0.192521
0x9a3cda8c21c9c3bb2560868e382a840bdf3d4001,114,48,17.951394,106,8,98,289468,279003,0.000394,0.000409,0.000131,0.000132,2.375000,0,0.176222
0x9ba288381ef2cbd816529ee1282fa7d1746bc129,106,44,16.338011,100,6,94,477084,469343,0.000222,0.000226,0.000100,0.000069,2.409091,0,0.167552
0x6902440e4e62ef77b12afd93ea539819e02ab0c2,176,61,0.885088,171,5,166,469427,423258,0.000375,0.000416,0.000104,0.000184,2.885246,0,0.175982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0x544717775459733581f5fe004f964d0e680b0c5c,3,3,0.063391,2,1,1,76236,27104,0.000039,0.000111,0.000415,0.000366,1.000000,0,0.000000
0x52b43ed88bf4ca35d7004a5d15dafd57d6e8722a,14,11,0.050151,10,4,6,137198,115360,0.000102,0.000121,0.000181,0.000137,1.272727,0,0.000000
0x5309d1eb434af2803a65ff35582f2af53aefd6fa,10,7,0.258342,8,2,6,84559,84476,0.000118,0.000118,0.000206,0.000175,1.428571,0,0.000000
0x55017c3173e4530a6bedf49f6f989ad1aaea64e6,50,24,16.091300,43,7,36,136765,129510,0.000366,0.000386,0.000146,0.000059,2.083333,0,0.000000
