# Compare flagging methods on the citizen round results

In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Locate the data folders relative to the notebook's working directory.
# All folders hang off the directory two levels above the working directory,
# except the ODC data which sits one level higher still.
current_dir = Path(os.getcwd())
project_root = current_dir.parent.parent

DATA_DIR = os.path.join(project_root, 'data-regen-rangers')
DATA_DIR_GITCOIN = os.path.join(project_root, 'data-gitcoin')
ODC_DATA_DIR = os.path.join(project_root.parent, 'data')

# Identifier of the grant round; it is interpolated into the feature file names.
GRANT_ROUND_ID = '0x984e29dCB4286c2D9cbAA2c238AfDd8A191Eefbc'

PATH_TO_VOTES = os.path.join(DATA_DIR_GITCOIN, 'citizen-votes.csv')
# PATH_TO_GRANTS = os.path.join(DATA_DIR, "all-allo-rounds.csv")
# PATH_TO_PROJECTS = os.path.join(DATA_DIR, "projects_QmQurt.csv")

In [27]:
# Pre-computed feature tables for this grant round (one per voter, one per vote).
df_matching_address = pd.read_csv(
    f'../output_gitcoin/full_features/voters_features_{GRANT_ROUND_ID}.csv'
)
df_votes = pd.read_csv(
    f'../output_gitcoin/full_features/votes_features_citizen_last_{GRANT_ROUND_ID}.csv'
)

# Additional inputs stored in the regen-rangers data folder.
df_seed = pd.read_csv(os.path.join(DATA_DIR, 'seed_wallet_citizen.csv'))
df_trusta = pd.read_excel(os.path.join(DATA_DIR, 'citizen_trusta_lab.xlsx'))

# The JSON file is read as a Series and then turned into a two-column frame:
# the original keys first, the associated values second.
df_gray = pd.read_json(
    os.path.join(DATA_DIR, 'citizen_sybil_clusters_gray.json'),
    typ='series',
).reset_index()

In [None]:
# List the columns of the voters feature table.
df_matching_address.columns

Index(['seed_same_naive', 'seed_same', 'seed_suspicious', 'less_5_tx',
       'less_10_tx', 'interacted_other_ctbt', 'lcs', 'cluster_size_lcs',
       'mean_score_lcs', 'max_score_lcs', 'has_lcs',
       'count_interaction_with_pool', 'count_interaction_with_toxic',
       'interact_less_5tx', 'has_interaction_toxic', 'has_no_pool_interaction',
       'count_interaction_with_airdrop_m', 'is_airdrop_master',
       'count_interaction_with_tornado', 'count_interaction_with_disperse',
       'has_interaction_airdrop_m', 'has_interaction_tornado',
       'has_interaction_disperse', 'flagged', 'stakeridoo_detected',
       'doge_detected', 'really_suspicicious_cluster', 'odc_detected',
       'address'],
      dtype='object')

In [None]:
# List the columns of the votes feature table.
df_votes.columns

Index(['block_timestamp', 'tx_hash', 'voter', 'project', 'amount_usd',
       '__row_index', 'seed_same_naive', 'seed_same', 'seed_suspicious',
       'less_5_tx', 'less_10_tx', 'interacted_other_ctbt', 'lcs',
       'cluster_size_lcs', 'mean_score_lcs', 'max_score_lcs', 'has_lcs',
       'count_interaction_with_pool', 'count_interaction_with_toxic',
       'interact_less_5tx', 'has_interaction_toxic', 'has_no_pool_interaction',
       'count_interaction_with_airdrop_m', 'is_airdrop_master',
       'count_interaction_with_tornado', 'count_interaction_with_disperse',
       'has_interaction_airdrop_m', 'has_interaction_tornado',
       'has_interaction_disperse', 'flagged', 'stakeridoo_detected',
       'doge_detected', 'odc_detected', 'has_seed_cluster', 'cluster_number',
       'really_suspicicious_cluster'],
      dtype='object')

In [None]:
# Column totals, restricted to numeric and boolean columns: for the boolean flag
# columns the total is the number of votes carrying that flag. Without
# numeric_only=True the string columns (tx_hash, voter, project, ...) would be
# included in the reduction, which is meaningless for them.
df_votes.sum(numeric_only=True)

In [None]:
# Preview the first two rows of the seed-wallet table.
df_seed.head(2)

Unnamed: 0,EOA,from_address,to_address
0,0x000000006f457c0f8f560333d9c2877287d92a92,0xacd03d601e5bb1b275bb94076ff46ed9d753435a,0x000000006f457c0f8f560333d9c2877287d92a92
1,0x000128fa45d79dc9af8016da242781f12c363fd5,0xe4edb277e41dc89ab076a1f049f4a3efa700bce8,0x000128fa45d79dc9af8016da242781f12c363fd5


There are no duplicate votes in the votes data frame.

In [None]:
# NOTE(review): drop_duplicates() compares every column, including `__row_index`.
# If that column is unique per row, this check can never detect a duplicate — confirm.
n_votes = len(df_votes)
n_distinct_votes = len(df_votes.drop_duplicates())
n_voters = df_votes['voter'].nunique()

print(f'Number of votes: {n_votes}')
print(f'Number of unique votes: {n_distinct_votes}')
print(f'Number of unique voters: {n_voters}')

Number of votes: 57366
Number of unique votes: 57366
Number of unique voters: 17023


In [None]:
# Distribution of the number of votes cast by each voter.
# Selecting tx_hash before counting avoids counting every other column as well.
votes_per_voter = df_votes.groupby('voter')['tx_hash'].count()
votes_per_voter.describe()

count    17023.000000
mean         3.369911
std          4.748125
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         99.000000
Name: tx_hash, dtype: float64

## Add Trusta lab flag

In [46]:
# Flag every vote / voter whose address is listed in the Trusta file's sybil_address column.
trusta_sybil_addresses = df_trusta['sybil_address']
df_votes['trusta'] = df_votes['voter'].isin(trusta_sybil_addresses)
df_matching_address['trusta'] = df_matching_address['address'].isin(trusta_sybil_addresses)

## Add Gray Flag

In [47]:
# The second column of df_gray holds one sequence of addresses per entry
# (presumably one per sybil cluster — verify against the JSON file).
# Flatten them into a single array of distinct addresses.
gray_address_groups = df_gray.iloc[:, 1].values
address_flag_gray = np.unique(np.concatenate(gray_address_groups))

df_votes['gray'] = df_votes['voter'].isin(address_flag_gray)
df_matching_address['gray'] = df_matching_address['address'].isin(address_flag_gray)

## Add or column

In [81]:
# A vote / voter is finally flagged when at least one of these methods flags it.
or_flags = ['odc_detected', 'trusta', 'gray']
for frame in (df_votes, df_matching_address):
    frame['or_flag'] = frame[or_flags].sum(axis=1) > 0

In [82]:
# Every flag column compared in this notebook, ending with the combined or_flag.
flagging_legos = [
    'has_lcs',
    'has_interaction_toxic',
    'has_interaction_airdrop_m',
    'has_interaction_disperse',
    'is_airdrop_master',
    'interact_less_5tx',
    'flagged',
    'stakeridoo_detected',
    'doge_detected',
    'really_suspicicious_cluster',
    'odc_detected',
    'trusta',
    'gray',
    'or_flag',
]

In [85]:
# Number of flagged votes per flag.
votes_per_flag = (
    df_votes[flagging_legos]
    .sum()
    .reset_index()
    .set_axis(['flag', 'votes'], axis='columns')
)

# Number of flagged voters per flag.
voters_per_flag = (
    df_matching_address[flagging_legos]
    .sum()
    .reset_index()
    .set_axis(['flag', 'voter'], axis='columns')
)

# One row per flag with both totals side by side.
df_sum = votes_per_flag.merge(voters_per_flag, left_on='flag', right_on='flag')

In [126]:
# Average number of flagged votes per flagged voter.
# A flag that catches nothing gives 0 / 0, which shows up as NaN.
df_sum['ratio'] = df_sum['votes'] / df_sum['voter']

In [127]:
# Display the per-flag summary (flagged votes, flagged voters and their ratio).
df_sum

Unnamed: 0,flag,votes,voter,ratio
0,has_lcs,3223,1082,2.978743
1,has_interaction_toxic,0,0,
2,has_interaction_airdrop_m,2606,591,4.409475
3,has_interaction_disperse,128,37,3.459459
4,is_airdrop_master,3463,728,4.756868
5,interact_less_5tx,101,34,2.970588
6,flagged,8626,2282,3.780018
7,stakeridoo_detected,983,136,7.227941
8,doge_detected,6,2,3.0
9,really_suspicicious_cluster,2814,758,3.712401


In [132]:
# Share of votes and of voters removed by the combined or_flag.
or_flag_totals = df_sum[df_sum.flag == "or_flag"]
squelched_vote_share = or_flag_totals.votes.values[0] / df_votes.shape[0]
squelched_voter_share = or_flag_totals.voter.values[0] / df_matching_address.shape[0]

print(f'Squelching rate is of {squelched_vote_share:.2%}')
print(f'Squelching voter rate is of {squelched_voter_share:.2%}')

Squelching rate is of 22.03%
Squelching voter rate is of 20.61%


# Analysis of the distribution of the flags when a wallet is flagged

In [93]:
# Keep only the wallets caught by or_flag, restricted to the flag columns.
# The copy detaches the result from df_matching_address.
is_finally_flagged = df_matching_address['or_flag']
df_flag_address = df_matching_address.loc[is_finally_flagged, flagging_legos].copy()

In [94]:
# (number of finally flagged wallets, number of flag columns)
df_flag_address.shape

(3509, 14)

In [96]:
# Preview the first two flagged wallets and their flags.
df_flag_address.head(2)

Unnamed: 0,has_lcs,has_interaction_toxic,has_interaction_airdrop_m,has_interaction_disperse,is_airdrop_master,interact_less_5tx,flagged,stakeridoo_detected,doge_detected,really_suspicicious_cluster,odc_detected,trusta,gray,or_flag
9,False,False,False,False,False,False,False,False,False,True,True,True,False,True
10,False,False,False,False,False,False,False,False,False,True,True,True,False,True


In [120]:
from scipy.spatial.distance import pdist, squareform

def similarity_matrix(df, metric='jaccard'):
    """Return the pairwise similarity between the columns of ``df``.

    Every column is treated as one vector. The similarity of two columns is
    ``1 - distance``, where the distance is computed by
    ``scipy.spatial.distance.pdist`` with the requested metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared with each other.
    metric : str, default 'jaccard'
        Metric forwarded to ``pdist`` (for example ``'jaccard'`` or ``'hamming'``).

    Returns
    -------
    pandas.DataFrame
        Square, symmetric frame whose index and columns are ``df.columns``,
        with ones on the diagonal.
    """
    n_columns = df.shape[1]
    if n_columns == 0:
        # squareform turns an empty condensed vector into a 1x1 matrix, which
        # would not fit the empty labels; return the empty result directly.
        return pd.DataFrame(np.zeros((0, 0)), index=df.columns, columns=df.columns)

    # pdist compares rows, so transpose to compare the columns of df.
    condensed_distances = pdist(np.array(df).T, metric=metric)

    # squareform expands the condensed vector into a symmetric matrix with a
    # zero diagonal, so 1 - distance puts ones on the diagonal.
    similarity = 1 - squareform(condensed_distances)
    return pd.DataFrame(similarity, index=df.columns, columns=df.columns)

# df_flag_address only keeps wallets with or_flag == True, so its or_flag column is
# constant True and the Jaccard similarity of a flag with or_flag equals the share
# of the finally flagged wallets that this flag catches.
jaccard_similarity = similarity_matrix(df_flag_address)
jaccard_similarity

Unnamed: 0,has_lcs,has_interaction_toxic,has_interaction_airdrop_m,has_interaction_disperse,is_airdrop_master,interact_less_5tx,flagged,stakeridoo_detected,doge_detected,really_suspicicious_cluster,odc_detected,trusta,gray,or_flag
has_lcs,1.0,0.0,0.002397,0.0,0.002215,0.001795,0.474145,0.0,0.0,0.019956,0.363575,0.015296,0.045233,0.30835
has_interaction_toxic,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
has_interaction_airdrop_m,0.002397,0.0,1.0,0.0,0.157018,0.00321,0.258983,0.011127,0.001689,0.063042,0.198589,0.059615,0.0,0.168424
has_interaction_disperse,0.0,0.0,0.0,1.0,0.002621,0.0,0.016214,0.0,0.0,0.0,0.012433,0.001828,0.0,0.010544
is_airdrop_master,0.002215,0.0,0.157018,0.002621,1.0,0.0,0.319018,0.005821,0.0,0.018506,0.244624,0.018077,0.0,0.207467
interact_less_5tx,0.001795,0.0,0.00321,0.0,0.0,1.0,0.014899,0.0,0.0,0.011494,0.011425,0.0,0.0,0.009689
flagged,0.474145,0.0,0.258983,0.016214,0.319018,0.014899,1.0,0.004988,0.000438,0.047553,0.766801,0.034828,0.024649,0.650328
stakeridoo_detected,0.0,0.0,0.011127,0.0,0.005821,0.0,0.004988,1.0,0.0,0.06302,0.045699,0.083752,0.0,0.038757
doge_detected,0.0,0.0,0.001689,0.0,0.0,0.0,0.000438,0.0,1.0,0.0,0.000672,0.0,0.0,0.00057
really_suspicicious_cluster,0.019956,0.0,0.063042,0.0,0.018506,0.011494,0.047553,0.06302,0.0,1.0,0.254704,0.16422,0.012898,0.216016


In [101]:
import plotly.express as px

# Plotly heatmap of the pairwise Jaccard similarity between flags.
# The title and colour-bar label let the figure be read on its own.
fig = px.imshow(
    jaccard_similarity,
    color_continuous_scale='Blues',
    title='Jaccard similarity between flags (finally flagged wallets only)',
    labels={'color': 'Jaccard similarity'},
)
fig.show()

Mostly, each method finds a different kind of wallet.

Ranking the methods by their similarity with the final flag (`or_flag`) shows which method detects the largest number of the finally flagged wallets:

In [103]:
# Rank every flag by its Jaccard similarity with or_flag, highest first.
jaccard_similarity['or_flag'].sort_values(ascending=False)

or_flag                        1.000000
odc_detected                   0.848105
flagged                        0.650328
has_lcs                        0.308350
really_suspicicious_cluster    0.216016
is_airdrop_master              0.207467
has_interaction_airdrop_m      0.168424
trusta                         0.145626
gray                           0.119692
stakeridoo_detected            0.038757
has_interaction_disperse       0.010544
interact_less_5tx              0.009689
doge_detected                  0.000570
has_interaction_toxic          0.000000
Name: or_flag, dtype: float64

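Surprisingly, the following flags do not have a large overlap:
really_suspicicious_cluster, is_airdrop_master, has_interaction_airdrop_m
This can be explained by the way the Jaccard similarity is computed: it ignores wallets for which both flags are False, so only wallets caught by both flags count as agreement.
Looking at the Hamming distance between the flags, which also counts wallets for which both flags are False as agreement, is a better indicator of how similar the flags are.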

In [124]:
# Jaccard similarity restricted to the three flags discussed above.
compared_flags = ['really_suspicicious_cluster', 'is_airdrop_master', 'has_interaction_airdrop_m']
jaccard_similarity.loc[compared_flags, compared_flags]

Unnamed: 0,really_suspicicious_cluster,is_airdrop_master,has_interaction_airdrop_m
really_suspicicious_cluster,1.0,0.018506,0.063042
is_airdrop_master,0.018506,1.0,0.157018
has_interaction_airdrop_m,0.063042,0.157018,1.0


In [121]:
# Same pairwise comparison with the Hamming metric: the similarity is one minus
# the fraction of wallets on which the two flags disagree.
hamming_similarity = similarity_matrix(df_flag_address, metric='hamming')

In [125]:
# Heatmap of the pairwise Hamming similarity between flags.
# The title and colour-bar label let the figure be read on its own.
fig = px.imshow(
    hamming_similarity,
    color_continuous_scale='Blues',
    title='Hamming similarity between flags (finally flagged wallets only)',
    labels={'color': 'Hamming similarity'},
)
fig.show()

In [123]:
# Hamming similarity restricted to the same three flags, for comparison with Jaccard.
compared_flags = ['really_suspicicious_cluster', 'is_airdrop_master', 'has_interaction_airdrop_m']
hamming_similarity.loc[compared_flags, compared_flags]

Unnamed: 0,really_suspicicious_cluster,is_airdrop_master,has_interaction_airdrop_m
really_suspicicious_cluster,1.0,0.591907,0.661157
is_airdrop_master,0.591907,1.0,0.726133
has_interaction_airdrop_m,0.661157,0.726133,1.0
