# Training a simple logistic regression model to predict if an address is a bridge a cex or something else

- We use a list of voters to the citizen round and take any voter that have seeded more than one voting address
- From that list of addresses we want to know which one of them are a bridge/cex or a user
- We use a query to Flipside API to get some basic data about the kind of transactions made by an address
- We manually flagged some addresses as cex or bridge and use that data to train a logistic regression model
- We use the model to predict the kind of address on the remaining data that as not been manually flagged
- We verify some addresses from the unsee data and see if the model is working

The goal is to have a model that can predict the kind of address and then use that data to filter the addresses that were seeded by a bridge or a cex from the list of potential sybil. This methods is expected to flag automatically 500 sybils from 20000 voters.

This is because if an address fund several voting addresses and that address has not been seeded from a bridge or a cex, then it is very likely that these addresses belongs to the same entity.


In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

from sbdata.FlipsideApi import FlipsideApi

In [2]:
# Set path to data folder
current_dir = Path(os.getcwd())
DATA_DIR = os.path.join(current_dir.parent.parent, 'data-regen-rangers')
ODC_DATA_DIR = os.path.join(current_dir.parent.parent.parent, 'data')

# set up api key for flipside
api_key = os.environ['FLIPSIDE_API_KEY2']
flipside_api = FlipsideApi(api_key, timeout_minutes=10, max_address=1000)

## Load Seed Data

In [3]:
df_seed_wallet= pd.read_csv(os.path.join(DATA_DIR, 'seed_wallet_citizen.csv'))
df_seed_wallet.head(2)

Unnamed: 0,EOA,from_address,to_address
0,0x000000006f457c0f8f560333d9c2877287d92a92,0xacd03d601e5bb1b275bb94076ff46ed9d753435a,0x000000006f457c0f8f560333d9c2877287d92a92
1,0x000128fa45d79dc9af8016da242781f12c363fd5,0xe4edb277e41dc89ab076a1f049f4a3efa700bce8,0x000128fa45d79dc9af8016da242781f12c363fd5


In [4]:
df_seeder_count = df_seed_wallet.groupby('from_address').count().sort_values(by='to_address', ascending=False).reset_index().drop(columns=['to_address']).rename(columns={'from_address': 'seeder', 'EOA': 'count_seed'})

In [5]:
df_seeder_count

Unnamed: 0,seeder,count_seed
0,0x80c67432656d59144ceff962e8faf8926599bcf8,3148
1,0xacd03d601e5bb1b275bb94076ff46ed9d753435a,2980
2,0x2d2cc0eb095e43204e0c087e07dbf95909650939,1321
3,0xe4edb277e41dc89ab076a1f049f4a3efa700bce8,800
4,0xf89d7b9c864f589bbf53a82105107622b35eaa40,677
...,...,...
3155,0x568fd3434f2be3edd1454de76b4a7b2fe5d8d717,1
3156,0x569f1ec2149d4927da420637e6007021c7a8a606,1
3157,0x56aaf01b53c80fefd7f97e3610207773b4a855e2,1
3158,0x56d0a8c9519a6524eec4eecf0f9c2dc0af817f9f,1


## Load Labeled Data

In [6]:
df_labels = pd.read_csv(os.path.join(ODC_DATA_DIR, 'address_labels_citizen.csv'), usecols=['address', 'tag', 'sub_type'])

In [7]:
df_labels.head(2)

Unnamed: 0,address,tag,sub_type
0,0x4a1939dc1de524ff01980a911f0f4d65ce7a27ba,airdrop_master,
1,0xeec428c18ff39c66162e39e79ed6eb8c790e43d7,cex_or_bridge,?


In [8]:
df_labels['target'] = df_labels['tag'].apply(lambda x: 1 if x in ['cex', 'bridge', 'cex_or_bridge'] else 0)
df_labels['address'] = df_labels['address'].str.lower()

In [9]:
df_labels['target'].value_counts()

target
0    36
1    16
Name: count, dtype: int64

The class is a little inbalanced but that should be fine as the differences between these type of addresses are quite big.

## Retrieve features from flipside

In [10]:
sql_template = """
WITH
  transactions AS (
    SELECT
      BLOCK_TIMESTAMP,
      TX_FEE,
      FROM_ADDRESS AS EOA,
      1 as BOOLEAN_OUT
    FROM
      optimism.core.fact_transactions
    WHERE
      FROM_ADDRESS IN (%s)
    UNION ALL
    SELECT
      BLOCK_TIMESTAMP,
      TX_FEE,
      TO_ADDRESS AS EOA,
      0 as BOOLEAN_OUT
    FROM
      optimism.core.fact_transactions
    WHERE
      TO_ADDRESS IN (%s)
  )
SELECT
  EOA,
  COUNT(*) as n_tx,
  SUM(BOOLEAN_OUT) as n_tx_out,
  n_tx - n_tx_out as n_tx_in,
  n_tx_out - n_tx_in as n_tx_diff_out_in,
  TIMESTAMPDIFF(MINUTE, MIN(BLOCK_TIMESTAMP), CURRENT_TIMESTAMP()) as age,
  TIMESTAMPDIFF(MINUTE, MIN(BLOCK_TIMESTAMP), MAX(BLOCK_TIMESTAMP)) as time_alive,
  CASE WHEN age = 0 THEN 0 ELSE n_tx / age END as tx_min,
  CASE WHEN time_alive = 0 THEN 0 ELSE n_tx / time_alive END as tx_min_alive,
  AVG(TX_FEE) as avg_tx_fee,
  STDDEV(TX_FEE) as std_tx_fee
FROM
  transactions
GROUP BY
  EOA;
"""

In [11]:
unique_seeder = df_seeder_count.seeder.values

In [12]:
def extract_data_flipside(flipside_api, array_address, sql_template):

    q, r = divmod(len(array_address), flipside_api.MAX_ADDRESS)
    if r != 0:
        q += 1
    list_df = []
    for i in range(q):
        start_index = i * flipside_api.MAX_ADDRESS
        end_index = (i + 1) * flipside_api.MAX_ADDRESS
        print(
            f"Extracting for address: {start_index} - {end_index}")
        array_address_slice = array_address[start_index:end_index]
        str_address_slice = flipside_api.get_string_address(array_address_slice)
        sql = sql_template % (str_address_slice, str_address_slice)
        df = flipside_api.execute_query(sql=sql)
        list_df.append(df)
    df = pd.concat(list_df)
    return df

In [13]:
df_features = extract_data_flipside(flipside_api, unique_seeder, sql_template)

Extracting for address: 0 - 1000


Extracting for address: 1000 - 2000
Extracting for address: 2000 - 3000
Extracting for address: 3000 - 4000


In [14]:
df_features.drop('__row_index', axis=1, inplace=True)

In [15]:
df_features.shape[0] == df_seeder_count.shape[0]

True

In [16]:
df_features.head(2)

Unnamed: 0,eoa,n_tx,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee
0,0x9d77056156459335001147c870d10748182c6a86,278,258,20,238,364562,342838,0.000763,0.000811,0.000109,9.2e-05
1,0xf7d36a57cbe154fd3d9134cac44ab49019180e12,627,621,6,615,802066,800741,0.000782,0.000783,0.000116,0.000117


In [17]:
df_features['eoa'] = df_features['eoa'].str.lower()

In [18]:
df_features.to_csv(os.path.join(ODC_DATA_DIR, 'features_citizen_seeder.csv'), index=False)

In [19]:
df_merge_feature_target = df_features.merge(df_labels, left_on='eoa', right_on='address', how='left').drop(columns=['address', 'tag', 'sub_type'])
df_merge_feature_target.drop_duplicates(inplace=True)

In [20]:
print(df_labels.shape)
print(df_features.shape)
print(df_merge_feature_target.shape)

(52, 4)
(3160, 11)
(3160, 12)


In [21]:
df_merge_feature_target.target.isna().sum()

3109

In [22]:
df_merge_feature_target.set_index('eoa', inplace=True)

In [23]:
df_test = df_merge_feature_target[df_merge_feature_target.target.isna()].drop(columns=['target'])
df_train = df_merge_feature_target[~df_merge_feature_target.target.isna()]

In [24]:
df_merge_feature_target.fillna(0, inplace=True)

In [25]:
df_train.head(2)

Unnamed: 0_level_0,n_tx,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,target
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0xca9245a982387e964a8ee0048797beae5fda23ca,243,242,1,241,692733,675980,0.000351,0.000359,0.000161,0.000241,0.0
0xdffd151fdd0f900e8439bdf89ed2869b703c7ec0,145,143,2,141,843917,838837,0.000172,0.000173,0.000378,0.000399,0.0


## Train a model

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df_train.drop(columns=['target']), df_train['target'], test_size=0.2, random_state=42)

# Define the scalers and associated names
scalers = [
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('RobustScaler', RobustScaler())
]

# Create the pipeline with normalization and logistic regression
pipe = Pipeline([
    ('scaler', None),  # Placeholder for scaler
    ('logistic_regression', LogisticRegression())
])

# Define the parameter grid for the pipeline
param_grid = {
    'scaler': [scaler for _, scaler in scalers],
    'logistic_regression__C': [0.01, 0.1, 1, 10],
    'logistic_regression__penalty': ['l2'],
    'logistic_regression__max_iter': [500, 1000, 2000],
    'logistic_regression__tol': [1e-4, 1e-3, 1e-2],
}

# Perform the grid search
lr_param_search = GridSearchCV(pipe, param_grid, cv=5)
lr_param_search.fit(x_train, y_train)

# Print the best parameters and score
print("Best parameters found in parameter search:", lr_param_search.best_params_)
print("Best accuracy found in parameter search:", lr_param_search.best_score_)

x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full

Best parameters found in parameter search: {'logistic_regression__C': 1, 'logistic_regression__max_iter': 500, 'logistic_regression__penalty': 'l2', 'logistic_regression__tol': 0.0001, 'scaler': RobustScaler()}
Best accuracy found in parameter search: 0.925


Unnamed: 0_level_0,n_tx,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,target,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0xdf90c9b995a3b10a5b8570a47101e6c6a29eb945,127913,90490,37423,53067,675705,675692,0.189303,0.189307,0.00019,0.000645,1.0,1.0
0x7f7b0907b6483d3baf35f0d60fa4a35d8b1f970b,37,35,2,33,792573,655145,4.7e-05,5.6e-05,0.000812,0.000617,0.0,0.0
0x94a36d8bd3470cfc447eab754decd419c6a676b6,12,11,1,10,326962,496,3.7e-05,0.024194,5.2e-05,2.3e-05,0.0,0.0
0xba07c4b25a06d1b08d9d0c28ca6d9df2695a359d,51,48,3,45,177035,161322,0.000288,0.000316,0.000139,0.000101,0.0,0.0
0xa0fca8fa8e9c6aa77305f94be0e03908d0a42900,18884,18192,692,17500,560640,560606,0.033683,0.033685,9e-05,0.000104,1.0,1.0
0x947484a2bd9308ef2a9c234adef767efa569ac7e,530,519,11,508,842649,839866,0.000629,0.000631,0.000181,0.000276,0.0,0.0
0x4a1939dc1de524ff01980a911f0f4d65ce7a27ba,172,148,24,124,215496,213527,0.000798,0.000806,0.000131,0.000126,0.0,0.0
0xebb8ea128bbdff9a1780a4902a9380022371d466,67230,52616,14614,38002,655359,655341,0.102585,0.102588,0.000151,0.000111,1.0,1.0
0x23b03d430544809e76a3e21a17074d6158e3fa40,2,2,0,2,76565,1,2.6e-05,2.0,0.000163,1.4e-05,0.0,0.0
0xf89d7b9c864f589bbf53a82105107622b35eaa40,183944,167194,16750,150444,593325,593319,0.310022,0.310025,0.000137,0.000116,1.0,1.0


In [27]:
x_train.merge(y_train, left_index=True, right_index=True).shape

(40, 11)

In [28]:
x_train.reset_index().drop_duplicates().shape

(40, 11)

In [29]:
x_train.shape

(40, 10)

In [30]:
lr_param_search.predict(x_train).size

40

In [31]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print(x_train_full.shape)

(40, 10)
(11, 10)
(40,)
(11,)
(40, 12)


In [32]:
x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full

Unnamed: 0_level_0,n_tx,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,target,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0xdf90c9b995a3b10a5b8570a47101e6c6a29eb945,127913,90490,37423,53067,675705,675692,0.189303,0.189307,0.00019,0.000645,1.0,1.0
0x7f7b0907b6483d3baf35f0d60fa4a35d8b1f970b,37,35,2,33,792573,655145,4.7e-05,5.6e-05,0.000812,0.000617,0.0,0.0
0x94a36d8bd3470cfc447eab754decd419c6a676b6,12,11,1,10,326962,496,3.7e-05,0.024194,5.2e-05,2.3e-05,0.0,0.0
0xba07c4b25a06d1b08d9d0c28ca6d9df2695a359d,51,48,3,45,177035,161322,0.000288,0.000316,0.000139,0.000101,0.0,0.0
0xa0fca8fa8e9c6aa77305f94be0e03908d0a42900,18884,18192,692,17500,560640,560606,0.033683,0.033685,9e-05,0.000104,1.0,1.0
0x947484a2bd9308ef2a9c234adef767efa569ac7e,530,519,11,508,842649,839866,0.000629,0.000631,0.000181,0.000276,0.0,0.0
0x4a1939dc1de524ff01980a911f0f4d65ce7a27ba,172,148,24,124,215496,213527,0.000798,0.000806,0.000131,0.000126,0.0,0.0
0xebb8ea128bbdff9a1780a4902a9380022371d466,67230,52616,14614,38002,655359,655341,0.102585,0.102588,0.000151,0.000111,1.0,1.0
0x23b03d430544809e76a3e21a17074d6158e3fa40,2,2,0,2,76565,1,2.6e-05,2.0,0.000163,1.4e-05,0.0,0.0
0xf89d7b9c864f589bbf53a82105107622b35eaa40,183944,167194,16750,150444,593325,593319,0.310022,0.310025,0.000137,0.000116,1.0,1.0


In [33]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df_train.drop(columns=['n_tx_out', 'n_tx_in']).drop(columns=['target']), df_train.drop(columns=['n_tx_out', 'n_tx_in'])['target'], test_size=0.2, random_state=42)

# Define the scalers and associated names
scalers = [
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('RobustScaler', RobustScaler())
]

# Create the pipeline with normalization and logistic regression
pipe = Pipeline([
    ('scaler', None),  # Placeholder for scaler
    ('logistic_regression', LogisticRegression())
])

# Define the parameter grid for the pipeline
param_grid = {
    'scaler': [scaler for _, scaler in scalers],
    'logistic_regression__C': [0.01, 0.1, 1, 10],
    'logistic_regression__penalty': ['l2'],
    'logistic_regression__max_iter': [500, 1000, 2000],
    'logistic_regression__tol': [1e-4, 1e-3, 1e-2],
}

# Perform the grid search
lr_param_search = GridSearchCV(pipe, param_grid, cv=5)
lr_param_search.fit(x_train, y_train)

# Print the best parameters and score
print("Best parameters found in parameter search:", lr_param_search.best_params_)
print("Best accuracy found in parameter search:", lr_param_search.best_score_)

x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full

Best parameters found in parameter search: {'logistic_regression__C': 10, 'logistic_regression__max_iter': 500, 'logistic_regression__penalty': 'l2', 'logistic_regression__tol': 0.0001, 'scaler': RobustScaler()}
Best accuracy found in parameter search: 0.9


Unnamed: 0_level_0,n_tx,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,target,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0xdf90c9b995a3b10a5b8570a47101e6c6a29eb945,127913,53067,675705,675692,0.189303,0.189307,0.00019,0.000645,1.0,1.0
0x7f7b0907b6483d3baf35f0d60fa4a35d8b1f970b,37,33,792573,655145,4.7e-05,5.6e-05,0.000812,0.000617,0.0,0.0
0x94a36d8bd3470cfc447eab754decd419c6a676b6,12,10,326962,496,3.7e-05,0.024194,5.2e-05,2.3e-05,0.0,0.0
0xba07c4b25a06d1b08d9d0c28ca6d9df2695a359d,51,45,177035,161322,0.000288,0.000316,0.000139,0.000101,0.0,0.0
0xa0fca8fa8e9c6aa77305f94be0e03908d0a42900,18884,17500,560640,560606,0.033683,0.033685,9e-05,0.000104,1.0,1.0
0x947484a2bd9308ef2a9c234adef767efa569ac7e,530,508,842649,839866,0.000629,0.000631,0.000181,0.000276,0.0,0.0
0x4a1939dc1de524ff01980a911f0f4d65ce7a27ba,172,124,215496,213527,0.000798,0.000806,0.000131,0.000126,0.0,0.0
0xebb8ea128bbdff9a1780a4902a9380022371d466,67230,38002,655359,655341,0.102585,0.102588,0.000151,0.000111,1.0,1.0
0x23b03d430544809e76a3e21a17074d6158e3fa40,2,2,76565,1,2.6e-05,2.0,0.000163,1.4e-05,0.0,0.0
0xf89d7b9c864f589bbf53a82105107622b35eaa40,183944,150444,593325,593319,0.310022,0.310025,0.000137,0.000116,1.0,1.0


In [34]:
x_train_full['wrong'] = x_train_full['target'] != x_train_full['prediction']
x_train_full.sort_values(by=['wrong', 'target'], ascending=False)

Unnamed: 0_level_0,n_tx,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,target,prediction,wrong
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0xdf90c9b995a3b10a5b8570a47101e6c6a29eb945,127913,53067,675705,675692,0.189303,0.189307,0.00019,0.000645,1.0,1.0,False
0xa0fca8fa8e9c6aa77305f94be0e03908d0a42900,18884,17500,560640,560606,0.033683,0.033685,9e-05,0.000104,1.0,1.0,False
0xebb8ea128bbdff9a1780a4902a9380022371d466,67230,38002,655359,655341,0.102585,0.102588,0.000151,0.000111,1.0,1.0,False
0xf89d7b9c864f589bbf53a82105107622b35eaa40,183944,150444,593325,593319,0.310022,0.310025,0.000137,0.000116,1.0,1.0,False
0x92bd687953da50855aee2df0cff282cc2d5f226b,11821,11685,570023,569998,0.020738,0.020739,0.000106,0.000106,1.0,1.0,False
0xee73323912a4e3772b74ed0ca1595a152b0ef282,39596,960,74378,74372,0.532362,0.532405,8.1e-05,6.5e-05,1.0,1.0,False
0x5bdf85216ec1e38d6458c870992a69e38e03f7ef,56700,38308,640850,640841,0.088476,0.088477,0.000119,0.0001,1.0,1.0,False
0x80c67432656d59144ceff962e8faf8926599bcf8,607867,31657,724817,724810,0.838649,0.838657,0.000103,0.000107,1.0,1.0,False
0xeec428c18ff39c66162e39e79ed6eb8c790e43d7,1714,1714,866866,537860,0.001977,0.003187,0.000581,0.000379,1.0,1.0,False
0xe4edb277e41dc89ab076a1f049f4a3efa700bce8,214501,15133,157101,157092,1.36537,1.365448,0.000123,0.000244,1.0,1.0,False


In [35]:
df_test.isna().sum()
# fill na with 0
df_test.fillna(0, inplace=True)

In [36]:
pred = lr_param_search.predict(df_test.loc[:, x_train.columns])
df_test['prediction'] = pred
df_merge_test_count = df_test.sort_values(by=['prediction'], ascending=False).reset_index().merge(df_seeder_count, left_on='eoa', right_on='seeder', how='left').drop(columns=['seeder'])

In [37]:
df_merge_test_count.set_index('eoa', inplace=True)

In [38]:
for i in range(1, 4):
    print(f'count_seed == {i}')
    print(df_merge_test_count[df_merge_test_count['count_seed']==i].prediction.value_counts())
print(f'count_seed > {i}')
print(df_merge_test_count[df_merge_test_count['count_seed']>i].prediction.value_counts())


count_seed == 1
prediction
0.0    2773
1.0      43
Name: count, dtype: int64
count_seed == 2
prediction
0.0    191
1.0      1
Name: count, dtype: int64
count_seed == 3
prediction
0.0    64
1.0     1
Name: count, dtype: int64
count_seed > 3
prediction
0.0    34
1.0     2
Name: count, dtype: int64


In [39]:
df_merge_test_count[df_merge_test_count['count_seed']>=i].count_seed.sum()

436

In [40]:
df_merge_test_count.index 

Index(['0x57ee43c466b70f819d2f7c2e4760301ee230952c',
       '0xe0be1002e8557539f2ae97f7acc5c656640c187d',
       '0x5fdf4f2e3713cd81af9af7292feb21f907995608',
       '0x1beb5a36f40f68eccd53899c94902e9905135456',
       '0x2256ff702461e7c92303c0d132a06f4b4e5a5241',
       '0x2969a014cf50399415dba05fcde0794645f6c0a3',
       '0x406e3cf9fd4b9f079641bc9ed59e8529aa854731',
       '0xd4319e11779920530d35e4c2bc852fcb360b9100',
       '0xf491d040110384dbcf7f241ffe2a546513fd873d',
       '0xf756f70302647eed11d728cf674dd64f8783ee41',
       ...
       '0xf9e1d1e9f22c96752356adfd377231528c7e851e',
       '0xefedaf9c07e6eb56bb8f82f30018e4461b1c5f4c',
       '0xe431f7736784d033f7c981efa591e13affdb0408',
       '0x3110ce980ade1e178263c2af19e9ae2787a34d9e',
       '0xbf15ea6981354d7b60bc889d80ab671d6fce374e',
       '0xcdaebd017704dc2a8a8a25c3bf6f005aa18d716d',
       '0xcf7975b9fabc5793c2ca79ac138b35af63474c6f',
       '0xd0bd81e24d4f97e39b497cb45ac9f10b450b2c9e',
       '0xdff27408e664e4d856c4fe34d

In [41]:
df_merge_test_count[df_merge_test_count['count_seed']>=i].sort_values(by=['n_tx'], ascending=False).head(10)

Unnamed: 0_level_0,n_tx,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,prediction,count_seed
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0xc8373edfad6d5c5f600b6b2507f78431c5271ff5,121157,94254,26903,67351,619850,619844,0.195462,0.195464,0.000102,9.9e-05,1.0,6
0xf491d040110384dbcf7f241ffe2a546513fd873d,120489,92478,28011,64467,619856,619850,0.194382,0.194384,0.000101,9.9e-05,1.0,11
0x5b965a9cdcdb710d50967b0ac32e6341b72063a2,6145,5093,1052,4041,345828,345804,0.017769,0.01777,0.000122,9.9e-05,0.0,9
0xa3f45e619ce3aae2fa5f8244439a66b203b78bcc,5772,2968,2804,164,655319,655144,0.008808,0.00881,0.000135,9.7e-05,0.0,6
0xcdd37ada79f589c15bd4f8fd2083dc88e34a2af2,5675,3621,2054,1567,695890,695794,0.008155,0.008156,0.000128,0.000152,0.0,14
0x74e1d68ff9b267e48126a9d2289c8598e295fdac,3918,3833,85,3748,122627,122579,0.031951,0.031963,0.000113,7e-05,0.0,51
0xc6d7cba263bc5afb0ecc97820d8c6c6c9c92b0c2,2873,2459,414,2045,578551,555939,0.004966,0.005168,0.000107,6.9e-05,0.0,3
0x63a43a24de2ce6b1bfed393f6fad714b172a977e,2514,2309,205,2104,596134,594871,0.004217,0.004226,0.000113,0.00023,0.0,4
0x0a1ce4496471867fac0ad71b785e5258993c9b33,2476,1255,1221,34,214361,214264,0.011551,0.011556,0.000131,9.3e-05,0.0,7
0xe48fe6012f97b6a13c0ce5cef314caf66e972deb,2199,2159,40,2119,856976,854756,0.002566,0.002573,0.000164,0.000214,0.0,3


In [42]:
df_test['prediction'].sum()

47.0

In [43]:
x_train_full = x_train.merge(y_train, left_index=True, right_index=True)
x_train_full['prediction'] = lr_param_search.predict(x_train)
x_train_full
df_test

Unnamed: 0_level_0,n_tx,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0x9d77056156459335001147c870d10748182c6a86,278,258,20,238,364562,342838,0.000763,0.000811,0.000109,0.000092,0.0
0xf7d36a57cbe154fd3d9134cac44ab49019180e12,627,621,6,615,802066,800741,0.000782,0.000783,0.000116,0.000117,0.0
0xa5b7891cfebdcf04b62ede5d68e7ce524d4cdc86,589,586,3,583,661568,644758,0.000890,0.000914,0.000137,0.000163,0.0
0xbb29efd0a123b63d113c882111925a6f92c799a2,482,475,7,468,836480,755171,0.000576,0.000638,0.000189,0.000264,0.0
0x57a7604baae02bfb83f102eeb96f2c263d2f2ae1,451,447,4,443,805332,788581,0.000560,0.000572,0.000158,0.000184,0.0
...,...,...,...,...,...,...,...,...,...,...,...
0x54cf5fbd9a1ba2cbd23b321886d4848e3a0d47cf,631,609,22,587,531197,529464,0.001188,0.001192,0.000143,0.000173,0.0
0x532e9df581ac1073e6d74d23d97078c31c04da3c,329,322,7,315,820796,816840,0.000401,0.000403,0.000194,0.000235,0.0
0x5cf8ee727ef304616ffb98e5b74cd982c9ec433d,379,375,4,371,603385,591818,0.000628,0.000640,0.000137,0.000180,0.0
0x5d68619e8180e1eaf3d52bcdb26ac5660eddb7f6,23,21,2,19,278751,250722,0.000083,0.000092,0.000154,0.000213,0.0
