Compute a flag based on seed clustereing

In [3]:
import os
from pathlib import Path
import numpy as np
import requests
import pandas as pd
import joblib

from sbdata.FlipsideApi import FlipsideApi
from sbutils import LoadData
from sblegos.TransactionAnalyser import TransactionAnalyser as txa

In [4]:
# Set path to data folder
current_dir = Path(os.getcwd())
FOLDER_NAME = 'community_round'
DATA_DIR = os.path.join(current_dir.parent.parent, 'data-regen-rangers')
DATA_DIR_GITCOIN = os.path.join(current_dir.parent.parent, 'data-gitcoin')
GRANT_ROUND_ID = '0x984e29dCB4286c2D9cbAA2c238AfDd8A191Eefbc'
ODC_DATA_DIR = os.path.join(current_dir.parent.parent.parent, 'data')

PATH_TO_VOTES = os.path.join(DATA_DIR_GITCOIN, 'citizen-votes.csv')

CHAIN = 'optimism'
PATH_TO_EXPORT = os.path.join(current_dir.parent.parent, 'tx_data', FOLDER_NAME)

In [5]:
df_matching_address = pd.read_csv(f'../output_gitcoin/full_features/voters_features_{GRANT_ROUND_ID}.csv')
df_votes_feature = pd.read_csv(f'../output_gitcoin/full_features/votes_features_citizen_{GRANT_ROUND_ID}.csv')
df_votes = pd.read_csv(PATH_TO_VOTES).drop(columns=['__row_index'])

In [6]:
array_unique_address = df_votes['voter'].unique()

array_unique_address = np.char.lower(array_unique_address.astype(str))
print(f'Number of unique voter: {len(array_unique_address)}')

Number of unique voter: 17023


In [7]:
# Load data
data_loader = LoadData.LoadData(PATH_TO_EXPORT)
df_tx = data_loader.create_df_tx(CHAIN, array_unique_address)

In [8]:
arr_unique_to_address = df_tx['to_address'].str.lower().unique()

In [9]:
len(arr_unique_to_address)

46252

There are too many addresses to use from_address in (list of address) thus with hard code the query for that round, it is also a lot faster

In [17]:
sql_query_features_interact_address = """
with
  git_eth_donates as (
    select
      a.ORIGIN_FROM_ADDRESS as user
    from
      optimism.core.ez_eth_transfers a
      left join optimism.core.ez_eth_transfers b on a.tx_hash = b.tx_hash
    where
      a.ORIGIN_TO_ADDRESS = '0x984e29dcb4286c2d9cbaa2c238afdd8a191eefbc'
      and a.ORIGIN_FUNCTION_SIGNATURE = '0x7aa54b68'
      and a.ETH_FROM_ADDRESS = a.ORIGIN_FROM_ADDRESS
      and a.BLOCK_TIMESTAMP >= '2023-06-01'
      and b.ETH_FROM_ADDRESS = '0x0e5e1f6a82d1ec6ce5c6d5568096fca96ecde651'
  ),
  git_dai_donates as (
    select
      ORIGIN_FROM_ADDRESS as user
    from
      optimism.core.ez_token_transfers
    where
      ORIGIN_TO_ADDRESS = '0x984e29dcb4286c2d9cbaa2c238afdd8a191eefbc'
      and ORIGIN_FUNCTION_SIGNATURE = '0x7aa54b68'
      and CONTRACT_ADDRESS = '0xda10009cbd5d07dd0cecc66161fc93d7c9000da1'
      and FROM_ADDRESS = ORIGIN_FROM_ADDRESS
  ),
  distinct_voter as (
    select DISTINCT
      (user) as voter
    from
      (
        select
          *
        from
          git_eth_donates
        union all
        select
          *
        from
          git_dai_donates
      )
  ), interacted_address as 
(SELECT DISTINCT address 
FROM (
SELECT FROM_ADDRESS as address
FROM optimism.core.fact_transactions
WHERE TO_ADDRESS IN (SELECT voter FROM distinct_voter)
UNION ALL
SELECT TO_ADDRESS as address
FROM optimism.core.fact_transactions
WHERE FROM_ADDRESS IN (SELECT voter FROM distinct_voter)
)),

  transactions AS (
    SELECT
      BLOCK_TIMESTAMP,
      TX_FEE,
      FROM_ADDRESS AS EOA,
      TO_ADDRESS AS COUNTERPARTY,
      ETH_VALUE,
      1 as BOOLEAN_OUT
    FROM
      optimism.core.fact_transactions
    WHERE
      FROM_ADDRESS IN (SELECT address FROM interacted_address)
    UNION ALL
    SELECT
      BLOCK_TIMESTAMP,
      TX_FEE,
      TO_ADDRESS AS EOA,
      TO_ADDRESS AS COUNTERPARTY,
      ETH_VALUE,
      0 as BOOLEAN_OUT
    FROM
      optimism.core.fact_transactions
    WHERE
      TO_ADDRESS IN (SELECT address FROM interacted_address)
  )
SELECT
  EOA,
  COUNT(*) as n_tx,
  COUNT(DISTINCT(COUNTERPARTY)) as n_counterparty,
  SUM(ETH_VALUE) as eth_volume,
  SUM(BOOLEAN_OUT) as n_tx_out,
  n_tx - n_tx_out as n_tx_in,
  n_tx_out - n_tx_in as n_tx_diff_out_in,
  TIMESTAMPDIFF(MINUTE, MIN(BLOCK_TIMESTAMP), CURRENT_TIMESTAMP()) as age,
  TIMESTAMPDIFF(MINUTE, MIN(BLOCK_TIMESTAMP), MAX(BLOCK_TIMESTAMP)) as time_alive,
  CASE WHEN age = 0 THEN 0 ELSE n_tx / age END as tx_min,
  CASE WHEN time_alive = 0 THEN 0 ELSE n_tx / time_alive END as tx_min_alive,
  AVG(TX_FEE) as avg_tx_fee,
  STDDEV(TX_FEE) as std_tx_fee
FROM
  transactions
GROUP BY
  EOA
-- HAVING
--   n_tx < 100

;

"""

In [18]:
# set up api key for flipside
api_key = os.environ['FLIPSIDE_API_KEY2']
flipside_api = FlipsideApi(api_key, timeout_minutes=60)

In [19]:
df_features_add = flipside_api.execute_query(sql_query_features_interact_address)

In [20]:
df_features_add.shape

(40454, 14)

In [21]:
df_features_add['ratio_tx_counterparties'] = df_features_add['n_tx'] / df_features_add['n_counterparty']
df_features_add.drop('__row_index', axis=1, inplace=True)
df_features_add.set_index('eoa', inplace=True)
df_features_add.head(2)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0x88df58d97e818904ae51f7f99831963f2eb3c41b,41,18,0.092383,28,13,15,329197,321399,0.000125,0.000128,9.6e-05,6.8e-05,2.277778
0x320a92f77be6f987a853555224e35add2f09ccbc,266,76,3.592935,241,25,216,671749,671157,0.000396,0.000396,0.0002,0.000201,3.5


In [22]:
df_features_add.isna().sum()

n_tx                          0
n_counterparty                0
eth_volume                    0
n_tx_out                      0
n_tx_in                       0
n_tx_diff_out_in              0
age                           0
time_alive                    0
tx_min                        0
tx_min_alive                  0
avg_tx_fee                    0
std_tx_fee                 3058
ratio_tx_counterparties       0
dtype: int64

In [23]:
df_features_add.std_tx_fee.fillna(0, inplace=True)

In [24]:
model = joblib.load('optimism_cex_dex_logistic_best.joblib')
prediction = model.predict(df_features_add)

In [25]:
print(f'number of address {df_features_add.shape[0]}')
print(f'number of address {len(prediction)}')

number of address 40454
number of address 40454


In [26]:
df_features_add['prediction'] = prediction

In [27]:
df_features_add['prediction'].value_counts()

prediction
0.0    30186
1.0    10268
Name: count, dtype: int64

In [37]:
df_features_add[df_features_add['n_tx'] < 100].value_counts('prediction')

prediction
0.0    25652
1.0     6841
Name: count, dtype: int64

In [40]:
df_features_add[np.logical_and(df_features_add['n_tx'] < 100, df_features_add['prediction'] == 1)].sort_values('ratio_tx_counterparties', ascending=False).head(10)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,prediction
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0xddec2d3877aada172dfacc4cdc3c437e6f56ce38,99,1,0.0,0,99,-99,50007,432,0.00198,0.229167,4.4e-05,1.8e-05,99.0,1.0
0xeb572117ab7ff902f4b49680015f15c1e0520867,99,1,0.434264,0,99,-99,66231,49511,0.001495,0.002,0.000363,0.000205,99.0,1.0
0xc392c31ad5a62963d05dd8ede9ece77bd05ea7f4,98,1,0.098,0,98,-98,101878,50190,0.000962,0.001953,0.000417,0.000203,98.0,1.0
0x77256d49ab301c608f8ffa466936ccf84d07a41c,98,1,0.0,0,98,-98,56932,55551,0.001721,0.001764,0.000301,0.000478,98.0,1.0
0x29a85521afda94a6cbede083d3e8b83da87f8df5,97,1,0.0388,0,97,-97,169670,460,0.000572,0.21087,0.000123,3.4e-05,97.0,1.0
0xd42b3c5c287a7733a2039edf3ddd95ed5c4d1cc2,95,1,0.02553,0,95,-95,98884,97434,0.000961,0.000975,0.000383,0.000269,95.0,1.0
0x35c9d05558da3a3f3cddbf34a8e364e59b857004,92,1,9.93968,0,92,-92,97343,10198,0.000945,0.009021,0.001058,0.001081,92.0,1.0
0xf399bef3f4a3d8c446f17e73041cd4651dd422e8,91,1,0.0,0,91,-91,106684,78712,0.000853,0.001156,0.000415,0.000147,91.0,1.0
0x91bd02006ef183a28da93062961edf2ce01e711b,91,1,0.0,0,91,-91,100045,59097,0.00091,0.00154,0.000361,0.000174,91.0,1.0
0xf2485fbcd3d3f8c4525bbb932be28f0021124290,91,1,0.0,0,91,-91,108696,81168,0.000837,0.001121,0.000395,9.9e-05,91.0,1.0


In [43]:
df_features_add[np.logical_and(df_features_add['n_tx'] < 100, df_features_add['prediction'] == 1)].value_counts('n_tx_out')

n_tx_out
0     3130
1     2929
2      365
3      158
4       64
5       48
6       45
8       23
7       19
9       10
11       8
10       7
15       6
14       5
13       5
17       4
16       2
19       2
20       2
25       2
12       1
18       1
22       1
29       1
31       1
39       1
40       1
Name: count, dtype: int64

The number of transactions out must be different from 0, if it is equals to zero it is probably a kind of nft contract

In [45]:
df_select = df_features_add[np.logical_and(np.logical_and(df_features_add['n_tx'] < 100, df_features_add['prediction'] == 1), df_features_add['n_tx_out'] != 0)].sort_values('ratio_tx_counterparties', ascending=False)

In [46]:
df_select.shape

(3711, 14)

for a deposit wallet we expect the number of incomming connection to be closed to the number of outgoing transaction, thus n_tx_fi_out_in close to 0

In [55]:
df_select['diff_tx_small'] = np.logical_and(np.abs(df_select['n_tx_diff_out_in']) < df_select['n_tx_in'] / 5, np.abs(df_select['n_tx_diff_out_in']) < df_select['n_tx_out'] / 5)

In [57]:
df_select['diff_tx_small'].value_counts()

diff_tx_small
True     2868
False     843
Name: count, dtype: int64

In [60]:
df_select[df_select['diff_tx_small'] == True].sort_values('n_counterparty', ascending=False).head(10)

Unnamed: 0_level_0,n_tx,n_counterparty,eth_volume,n_tx_out,n_tx_in,n_tx_diff_out_in,age,time_alive,tx_min,tx_min_alive,avg_tx_fee,std_tx_fee,ratio_tx_counterparties,prediction,diff_tx_small
eoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0x546543b587978dc8d137a3ab01521527477ac791,15,5,0.495149,8,7,1,121574,80423,0.000123,0.000187,0.002603,0.004127,3.0,1.0,True
0x0161d02bf4dc4f02405f81062c904fa8ab629d4f,4,3,0.063557,2,2,0,558775,11,7e-06,0.363636,0.000161,7.8e-05,1.333333,1.0,True
0xc7a8ee1b5812442aadc86be69bd47e9b4a117222,4,3,0.00013,2,2,0,428399,7,9e-06,0.571429,3.2e-05,2e-06,1.333333,1.0,True
0xeaaa6b8afabb4718da9fc4667b7bf6ea1b9d7f45,4,3,0.00329,2,2,0,187768,9,2.1e-05,0.444444,0.000266,0.000138,1.333333,1.0,True
0x8dcdcf20e7cf4be327cdd478f0bd467d4d7b420c,4,3,0.000164,2,2,0,322940,5,1.2e-05,0.8,5e-05,1e-05,1.333333,1.0,True
0x9e1e23211c3536a8d686c60b29d36d541351db73,4,3,0.115628,2,2,0,68768,21,5.8e-05,0.190476,0.000242,2.8e-05,1.333333,1.0,True
0x9c88b8607890035d32e8f7edcad1d73c4861245b,4,3,0.095459,2,2,0,141126,13,2.8e-05,0.307692,9.7e-05,4.1e-05,1.333333,1.0,True
0x708f2ddaafb1aa298d336dd4a5ec0ea4c42277f7,4,3,0.017,2,2,0,244117,14,1.6e-05,0.285714,5.7e-05,6e-06,1.333333,1.0,True
0x7e9227e4ef0bc4e223d6fafea5c6b9399ee2597d,4,3,0.011887,2,2,0,336489,7,1.2e-05,0.571429,4.5e-05,9e-06,1.333333,1.0,True
0x6cff499ffb6054d1b0b32315326a0c57b09a5046,4,3,0.005056,2,2,0,375850,4,1.1e-05,1.0,7.4e-05,2.6e-05,1.333333,1.0,True


In [61]:
deposit_address = df_select[df_select['diff_tx_small'] == True].index.to_list()

In [66]:
df_tx_in_deposit = df_tx[df_tx['to_address'].isin(deposit_address)]

In [73]:
gb_dep = df_tx_in_deposit.drop_duplicates(subset=['from_address', 'to_address']).groupby('to_address')

In [81]:
list_similar_deposit = []
for name, group in gb_dep:
    if group.shape[0] > 1:
        list_similar_deposit.append(group['from_address'].values)


In [82]:
list_similar_deposit

[array(['0x37e2bbc7b95810b064d80e02ad3421e30f428ef6',
        '0x5f96322bafb9b06e33930e68bf8db3f23f83f126'], dtype=object),
 array(['0x4eff2f739b1b0d5719662f9094a06a742ed99c43',
        '0xab62265d80c6b88a01ffe8a7edd58c2a1a7b0db6'], dtype=object),
 array(['0x0f9c4213c040a39db2ba6f833472f739e61710b4',
        '0x2de3ba92ff11bad1d8a7efc40458368abe7056a0'], dtype=object),
 array(['0x228419c1e630944bc869c143d376112bc7f5f9cf',
        '0xa5cfb6838e1e9109c8ecdf176419b75277433f33',
        '0xc4c6f9e8f0ce50a284414fbf8e722b02435baef6'], dtype=object),
 array(['0x4749f46a18f07cf8303b42faadef413c6ca63af7',
        '0xc1ca319907101604b014598c4c07689802f2e243'], dtype=object),
 array(['0x07502e888ad3e92f18dc26d0cc40a2b8a4cb0d60',
        '0x1c8dbe89479c8b2e55a921be0038f16638a52b49'], dtype=object),
 array(['0x300588b284f30439bcb32e8ac85321410074e31b',
        '0x9929485d855048f49d518457fb9980a57139bc39'], dtype=object)]