In [2]:
import pandas as pd
import numpy as np
import os
import importlib
import sybil_functions
import trino_fetch_results
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
import networkx as nx
from cdlib import algorithms
import networkx as nx
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import OPTICS
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt


importlib.reload(sybil_functions)
importlib.reload(trino_fetch_results)


from sybil_functions import (
    read_from_multiple_csv,
    check_df,
    count_pairs,
    remove_contract_transactions,
    stretched_sigmoid,
    get_weight_df,
    plot_weight_dist,
    plot_weight_cumulative_dist,
    create_community,
    community_visualization,
    file_name_lst,
    random_rate,
    find_main_wallet,
    find_transfer_for_wallet,
    find_commu_for_wallet,
    uncommon_wallets,
    filter_community_lst,
    expand_community_lst,
    calculate_likelihoods,
    check_contract,
)

from trino_fetch_results import (
    check_frequent_wallets,
    wallets_monthly_info,
    wallets_last_info,
)

Note: to be able to use all crisp methods, you need to install some additional packages:  {'bayanpy', 'graph_tool'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'ASLPAw', 'pyclustering'}


# Import Data

In [3]:
# Load the core-wallet user table from an Excel export.
# NOTE(review): the path is absolute and machine-specific; a configurable data
# directory would let this cell run on another machine.
# NOTE(review): the preview rendered below includes an `email` column; clear or
# redact the output before sharing the notebook.
core_wallet_df = pd.read_excel(
    "/Users/floras/Desktop/unique-user/moca-airdrop/data/moca_airdrop_user_info.xlsx"
)
core_wallet_df.head()

Unnamed: 0,email,uuid,uuid.1,moca_id,id,name,abstract_account_address,web3auth_wallet_address,address,id.1,uuid.2,user_id
0,,0016e1a3-0cdf-475b-b921-2d121453bece,0016e1a3-0cdf-475b-b921-2d121453bece,kppytiuynykiboo,7418689.0,kppytiuynykiboo,0x4b5591379a65c0180103f421cce640c679d1d9d8,0xc0f32955a9b37e6c1fa235a4f330c2d23545af88,0xc0f32955a9b37e6c1fa235a4f330c2d23545af88,590112.0,2d45d118-3d6f-4c7f-930d-5c9f3d56f612,7418689.0
1,semidmitrii@gmail.com,00284435-f796-45ad-8c61-5cf4a5417043,00284435-f796-45ad-8c61-5cf4a5417043,therock,25939.0,therock,0x3f1f1694941ebea878fe9e88b048dcb6f0c8f07b,0x5335b140366404b2bd875339322de16f9cd9651c,0xe23308c15434f903825112035cc1d26914fab078,60816.0,6ee964db-f14c-49ad-84db-84d937463cb3,25939.0
2,,0048b42e-7630-42a2-9633-9ee78c35f2eb,0048b42e-7630-42a2-9633-9ee78c35f2eb,mocae,7361874.0,mocae,0x38c24c4158033ef3967f102748cba1d56079bd31,0x9c590bb4e0ffb4c9adfc03dfa2ee2484502ce067,0x9c590bb4e0ffb4c9adfc03dfa2ee2484502ce067,43567.0,5cbe41a2-b90c-43cb-9560-af5cb0cb2b2f,7361874.0
3,,00799be0-8c24-4f0b-a442-bbb1279a63d3,00799be0-8c24-4f0b-a442-bbb1279a63d3,accelerando,8000019.0,accelerando,0xd86b10033c4d718b9b5f330ab270cd17c31cba97,0xaa6c6d0ed231da328814c2b7853d2e4350b70000,0xaa6c6d0ed231da328814c2b7853d2e4350b70000,294969.0,182f98d8-9098-4763-be3d-4e47985beb8f,8000019.0
4,,00799be0-8c24-4f0b-a442-bbb1279a63d3,00799be0-8c24-4f0b-a442-bbb1279a63d3,accelerando,8000019.0,accelerando,0xd86b10033c4d718b9b5f330ab270cd17c31cba97,0xaa6c6d0ed231da328814c2b7853d2e4350b70000,0xd1f01c4bc479967f4dbde0a73f961dd14afa524f,343872.0,abe31f57-dcd4-4bec-a434-414e2c0d2181,8000019.0


In [4]:
# Core wallet addresses, one entry per row of `core_wallet_df["address"]`
# (the list keeps any repeated addresses).
core_wallet_lst = core_wallet_df["address"].tolist()
# De-duplicated view of the same addresses.
core_wallet_set = set(core_wallet_lst)

In [5]:
# Locate the raw native-transfer CSV exports.
# `file_name_lst` is imported from `sybil_functions`; presumably it returns the
# file names in the directory matching the given prefix/suffix — TODO confirm.
native_trans_path = "/Users/floras/Desktop/unique-user/moca-airdrop/data/raw-data"
native_trans_file_lst = file_name_lst(
    native_trans_path, startswith="native_m1", endswith=".csv"
)

In [97]:
# Read every matching CSV into one frame and give the columns stable names.
native_trans_df_0 = read_from_multiple_csv(native_trans_path, native_trans_file_lst)
native_trans_df_0.columns = [
    "timestamp",
    "from_address",
    "to_address",
    "tx_count",
]
# Remove self transactions. The explicit .copy() makes the result an independent
# frame, so later column assignments on it do not trigger SettingWithCopyWarning
# or touch the unfiltered data.
native_trans_df_0 = native_trans_df_0[
    native_trans_df_0["from_address"] != native_trans_df_0["to_address"]
].copy()
print(native_trans_df_0.shape)
native_trans_df_0.head()

(28137, 4)


Unnamed: 0,timestamp,from_address,to_address,tx_count
0,2024-01-01 00:00:00.000 UTC,0xf3a8ae48c5029aab66f65be26ece0ff43bd1486f,0x25c805eaa831a0c98f45580e50676e041dcc87b8,1
1,2024-01-01 00:00:00.000 UTC,0xc888aea684b1d8f3b832f356dfc84bf0a1dc3406,0x9347ea27d20da04fa3af9444ebd086c73ff6553b,2
2,2024-01-01 00:00:00.000 UTC,0xb6f7f9326d631133c2b16e8822e6320ccfc34e23,0x852c2bd9230dcbf2a704d36b96a33c45e07ca192,1
3,2024-01-01 00:00:00.000 UTC,0x6bf7891adc8822aa26de1665887cccf921d475b8,0xd4118ba3793e1c20d6136bee2539ca011691d010,2
4,2024-01-01 00:00:00.000 UTC,0x077d360f11d220e4d5d831430c81c26c9be7c4a4,0x1f1bab9c578d92a10e2a061f86a9e6f74f25a8b5,1


# remove contract/hot wallets & transactions

## Define wallets to be checked based on the number of interacted wallets

In [98]:
def columns_unique_items(df, column_name_lst):
    """Return the distinct values found across several columns of ``df``.

    Args:
        df: DataFrame to read from.
        column_name_lst: Names of the columns whose values are pooled.

    Returns:
        list: Every distinct value appearing in any of the columns. The
        order is that of a Python ``set`` and must not be relied upon.
    """
    pooled = set()
    for name in column_name_lst:
        # De-duplicate per column first, then merge into the running set.
        pooled.update(df[name].unique().tolist())
    return list(pooled)

In [99]:
# Every distinct address that appears as sender or receiver in the native transfers.
all_wallets = columns_unique_items(native_trans_df_0, ["from_address", "to_address"])

In [100]:
def columns_item_count(unique_pairs_df, w_cols=["wallet_a", "wallet_b"]):
    """Count how often each value occurs across the given columns.

    Args:
        unique_pairs_df: DataFrame holding one pair per row.
        w_cols: Columns whose values are pooled before counting.

    Returns:
        pandas.DataFrame: Columns ``wallet`` (the value) and
        ``interacted_wallets`` (its number of occurrences), in order of first
        appearance when the columns are read row by row.
    """
    # Row-major 1-D view of all selected cells.
    pooled_wallets = unique_pairs_df[w_cols].values.ravel()
    occurrences = Counter(pooled_wallets)
    return pd.DataFrame(
        list(occurrences.items()), columns=["wallet", "interacted_wallets"]
    )

In [101]:
def columns_item_unique_pairs(
    raw_transaction_df, address_cols=["from_address", "to_address"]
):
    """Return the distinct unordered address pairs found in a transaction table.

    Each row's two addresses are converted to strings and sorted, so that
    (a, b) and (b, a) collapse to the same pair; duplicates are then removed.
    The input DataFrame is left untouched.

    Args:
        raw_transaction_df: DataFrame containing the address columns.
        address_cols: The two columns holding the addresses of a transaction.

    Returns:
        pandas.DataFrame: Columns ``wallet_a`` and ``wallet_b`` with
        ``wallet_a <= wallet_b`` (string comparison), one row per distinct pair,
        sorted lexicographically.

    Raises:
        ValueError: If any of ``address_cols`` is missing from the DataFrame.
    """
    # Ensure input columns are present in the DataFrame
    if not all(col in raw_transaction_df.columns for col in address_cols):
        raise ValueError(f"Columns {address_cols} not found in the DataFrame.")

    pair_columns = ["wallet_a", "wallet_b"]
    # Convert on a derived array instead of assigning back into the caller's
    # frame: the previous in-place astype changed the caller's dtypes and raised
    # SettingWithCopyWarning when the input was a filtered slice.
    addresses = raw_transaction_df[list(address_cols)].astype(str).to_numpy(dtype=str)
    if addresses.shape[0] == 0:
        # Nothing to pair up; skip the numpy calls on a zero-row array.
        return pd.DataFrame(columns=pair_columns)

    # Create sorted pairs using numpy operations (faster than apply)
    pairs = np.sort(addresses, axis=1)
    # Deduplicate pairs using numpy.unique
    unique_pairs = np.unique(pairs, axis=0)
    return pd.DataFrame(unique_pairs, columns=pair_columns)

In [102]:
def columns_item_unique_pair_counts(
    raw_transaction_df, address_cols=["from_address", "to_address"]
):
    """Count, per wallet, the distinct unordered pairs it takes part in.

    Args:
        raw_transaction_df: Transaction table containing ``address_cols``.
        address_cols: The two address columns of a transaction.

    Returns:
        pandas.DataFrame: Columns ``wallet`` and ``interacted_wallets`` with a
        fresh 0..n-1 index.
    """
    # First collapse the transactions to distinct unordered pairs ...
    deduped_pairs = columns_item_unique_pairs(raw_transaction_df, address_cols)
    # ... then count how many of those pairs each wallet appears in.
    per_wallet_counts = columns_item_count(
        deduped_pairs, w_cols=["wallet_a", "wallet_b"]
    )
    return per_wallet_counts.reset_index(drop=True)

In [103]:
# Per-wallet count of distinct unordered pairs in the native transfers.
sample_wallet_pair_count_df = columns_item_unique_pair_counts(native_trans_df_0)
sample_wallet_pair_count_df.head()

Unnamed: 0,wallet,interacted_wallets
0,0x00000002f32c0886ee65d68059fbdb76ef6a6996,5
1,0x04f23cfd959b1f8aaf492e1d44c4c567102cf7db,3
2,0x7fd06d484a0d6ae974c3b45e1e63ff9ee5160889,3
3,0x8888889e0adf6c5afe2d5a6af91d31984365a1aa,3
4,0xe43888320d5361ce4357d0dccdb2aee4ad99b9a0,20


In [104]:
# Candidates for the contract / hot-wallet check: wallets that appear in more
# than one distinct pair.
check_contract_wallet_lst = sample_wallet_pair_count_df[
    sample_wallet_pair_count_df["interacted_wallets"] > 1
]["wallet"].tolist()
check_contract_wallet_lst[:5]

['0x00000002f32c0886ee65d68059fbdb76ef6a6996',
 '0x04f23cfd959b1f8aaf492e1d44c4c567102cf7db',
 '0x7fd06d484a0d6ae974c3b45e1e63ff9ee5160889',
 '0x8888889e0adf6c5afe2d5a6af91d31984365a1aa',
 '0xe43888320d5361ce4357d0dccdb2aee4ad99b9a0']

In [105]:
def refine_lst(lst, exclude_lst):
    """Return the items of ``lst`` that do not occur in ``exclude_lst``.

    The original order and any duplicates of ``lst`` are preserved.

    Args:
        lst: Items to filter.
        exclude_lst: Items to drop; must be hashable.

    Returns:
        list: The surviving items.
    """
    # Hash-based lookup keeps each membership test constant-time.
    blocked = frozenset(exclude_lst)
    return list(filter(lambda item: item not in blocked, lst))

In [106]:
# Remove core wallets from check_contract_wallet_lst so that only non-core
# wallets are passed to the contract / hot-wallet checks below.
# NOTE(review): this rebinds the name in place, so the cell relies on the
# previous cell having just been run.
check_contract_wallet_lst = refine_lst(check_contract_wallet_lst, core_wallet_lst)

## get contract wallets list

In [107]:
def check_contract_wallet_addresses(wallet_addresses, max_threads=10):
    """Run ``check_contract`` on every address concurrently and keep truthy results.

    Args:
        wallet_addresses: Iterable of wallet addresses to check.
        max_threads: Maximum number of worker threads.

    Returns:
        list: The truthy return values of ``check_contract``, in the order of
        ``wallet_addresses``. (What ``check_contract`` returns is defined in
        ``sybil_functions`` — presumably the address when it is a contract;
        TODO confirm.)
    """
    with ThreadPoolExecutor(max_threads) as pool:
        # Queue every lookup first so the pool can work on them in parallel ...
        pending = [pool.submit(check_contract, address) for address in wallet_addresses]
        # ... then collect the outcomes in submission order.
        outcomes = [task.result() for task in pending]
    return [outcome for outcome in outcomes if outcome]

In [108]:
# Spot-check the contract detection on the first 10 candidate wallets.
check_contract_wallet_addresses(check_contract_wallet_lst[:10])

[]

## get hot wallets list

In [109]:
# Spot-check the hot-wallet detection on five candidates over a one-day window.
# `check_frequent_wallets` is imported from `trino_fetch_results`; its criteria
# for "frequent" are not visible in this notebook.
check_frequent_wallets(
    check_contract_wallet_lst[10:15], start_date_="2024-12-01", end_date_="2024-12-02"
)

['0x3b5a23f6207d87b423c6789d2625ea620423b32d',
 '0xdd3cb5c974601bc3974d908ea4a86020f9999e0c']

## remove contract/hot transactions

In [110]:
def filter_blacklist_transactions(df, col_lst, contract_lst):
    """Drop every row whose value in any of ``col_lst`` is blacklisted.

    Args:
        df: Transaction DataFrame.
        col_lst: Columns to test against the blacklist.
        contract_lst: Blacklisted values (e.g. contract or hot-wallet addresses).

    Returns:
        pandas.DataFrame: The rows of ``df`` with no blacklisted value in the
        tested columns; ``df`` itself when ``col_lst`` is empty.
    """
    remaining = df
    for column in col_lst:
        # Narrow the frame one column at a time.
        keep_mask = ~remaining[column].isin(contract_lst)
        remaining = remaining[keep_mask]
    return remaining

# Define weight

In [111]:
def columns_item_unique_pair_sums(
    raw_transaction_df, address_cols=["from_address", "to_address"], sum_col="tx_count"
):
    """Sum ``sum_col`` over every unordered address pair.

    The two addresses of each row are sorted so that (a, b) and (b, a) fall
    into the same group. The input DataFrame is not modified (the previous
    implementation added a helper ``pair`` column to the caller's frame).

    Args:
        raw_transaction_df: Transaction table.
        address_cols: Exactly two columns holding the addresses.
        sum_col: Numeric column to aggregate per pair.

    Returns:
        pandas.DataFrame: Columns ``[sum_col, "node_a", "node_b"]`` with
        ``node_a <= node_b``, one row per distinct pair, ordered by
        (``node_a``, ``node_b``).

    Raises:
        ValueError: If ``address_cols`` does not name exactly two columns.
    """
    address_cols = list(address_cols)
    if len(address_cols) != 2:
        raise ValueError("address_cols must contain exactly two column names.")

    # Sort each row's two addresses with numpy instead of a row-wise apply;
    # working on a separate array keeps the caller's frame untouched.
    ordered = np.sort(raw_transaction_df[address_cols].to_numpy(dtype=object), axis=1)
    pair_df = pd.DataFrame(
        {
            "node_a": ordered[:, 0],
            "node_b": ordered[:, 1],
            sum_col: raw_transaction_df[sum_col].to_numpy(),
        }
    )
    unique_pair_sums = (
        pair_df.groupby(["node_a", "node_b"], sort=True)[sum_col].sum().reset_index()
    )
    # Keep the column order callers already see: value first, then the nodes.
    return unique_pair_sums[[sum_col, "node_a", "node_b"]]

In [112]:
def sigmoid_weight_transform(df, turning_point=2, col="tx_count"):
    """Add a ``weight`` column obtained by passing ``col`` through ``stretched_sigmoid``.

    The column is written onto ``df`` itself, which is also returned.

    Args:
        df: DataFrame containing ``col``.
        turning_point: Divisor of the second argument given to
            ``stretched_sigmoid`` (``2 / turning_point``). The exact meaning of
            that argument is defined in ``sybil_functions`` — TODO confirm.
        col: Column holding the raw values to transform.

    Returns:
        pandas.DataFrame: ``df`` with the extra ``weight`` column.
    """

    def _to_weight(raw_value):
        return stretched_sigmoid(raw_value, 2 / turning_point)

    df["weight"] = df[col].apply(_to_weight)
    return df

In [113]:
# Edge list of the wallet graph: total tx_count per unordered wallet pair ...
weight_df = columns_item_unique_pair_sums(native_trans_df_0)
# ... mapped to a `weight` column through the stretched sigmoid with turning_point=5.
weight_df = sigmoid_weight_transform(weight_df, 5)
weight_df.head()

Unnamed: 0,tx_count,node_a,node_b,weight
0,2,0x00000002f32c0886ee65d68059fbdb76ef6a6996,0x04f23cfd959b1f8aaf492e1d44c4c567102cf7db,0.689974
1,3,0x00000002f32c0886ee65d68059fbdb76ef6a6996,0x7fd06d484a0d6ae974c3b45e1e63ff9ee5160889,0.768525
2,1,0x00000002f32c0886ee65d68059fbdb76ef6a6996,0x8888889e0adf6c5afe2d5a6af91d31984365a1aa,0.598688
3,4,0x00000002f32c0886ee65d68059fbdb76ef6a6996,0xe43888320d5361ce4357d0dccdb2aee4ad99b9a0,0.832018
4,3,0x00000002f32c0886ee65d68059fbdb76ef6a6996,0xf36e5c04cdb9b465ce44e125c679f9d7a7fc799a,0.768525


# build community

In [114]:
def louvain_community_lst_func(
    df, resolution=1, node_cols=["node_a", "node_b"], weight_col="weight"
):
    """Detect Louvain communities on a weighted, undirected wallet graph.

    Prints the number of wallets, edges and detected communities, and displays
    summary statistics of the community sizes.

    Args:
        df: Edge list with one row per wallet pair.
        resolution: Resolution parameter forwarded to ``cdlib``'s Louvain.
        node_cols: The two columns holding the edge endpoints.
        weight_col: Column holding the edge weight.

    Returns:
        list: The communities (each a list of nodes), largest first.
    """
    unique_wallets_num = len(columns_unique_items(df, node_cols))
    num_edges = df.shape[0]
    print(f"Number of unique wallets: {unique_wallets_num}")
    print(f"Number of edges: {num_edges}")

    # Build the undirected graph straight from the edge list. The weight is
    # stored under the attribute name `weight_col`, the same name handed to
    # Louvain below. Previously edges were always stored as "weight", so any
    # other `weight_col` named an attribute that no edge carried.
    G = nx.from_pandas_edgelist(
        df, source=node_cols[0], target=node_cols[1], edge_attr=weight_col
    )

    communities = algorithms.louvain(G, weight=weight_col, resolution=resolution)

    communities_list = communities.communities
    print(f"Number of communities detected: {len(communities_list)}")
    print("-")
    display(pd.Series([len(community) for community in communities_list]).describe())
    print("")

    # Largest communities first, so index 0 is always the biggest one.
    return sorted(communities_list, key=len, reverse=True)

In [115]:
def nested_lst_to_loc_dict(nested_lst_):
    """Map every element of a nested list to the index of its sub-list.

    If an element occurs in several sub-lists, the last one wins.

    Args:
        nested_lst_: Iterable of iterables (e.g. a list of communities).

    Returns:
        dict: ``{element: position of the containing sub-list}``.
    """
    return {
        member: position
        for position, group in enumerate(nested_lst_)
        for member in group
    }

In [145]:
# Run Louvain on the weighted pair graph.
# NOTE(review): resolution=200 is a tuning constant with no rationale recorded
# here; the Louvain call is not seeded, so the communities (and the fixed
# community index used in later cells) may differ between runs — TODO confirm.
louvain_community_lst = louvain_community_lst_func(weight_df, resolution=200)

Number of unique wallets: 25198
Number of edges: 23489
Number of communities detected: 7107
-


count    7107.000000
mean        3.545519
std         5.137896
min         1.000000
25%         1.000000
50%         2.000000
75%         4.000000
max       102.000000
dtype: float64




In [234]:
def louvain_community_df_func(louvain_community_lst_):
    """Turn a list of communities into a per-wallet membership table.

    Args:
        louvain_community_lst_: List of communities, each an iterable of wallets.

    Returns:
        pandas.DataFrame: Indexed by wallet, with ``community`` (position of the
        wallet's community in the input list) and ``type`` (initialised to
        ``"normal"`` for every wallet).
    """
    wallet_to_community = nested_lst_to_loc_dict(louvain_community_lst_)
    membership_df = pd.DataFrame.from_dict(
        wallet_to_community, orient="index", columns=["community"]
    )
    # Every wallet starts out as "normal"; later steps may overwrite this label.
    return membership_df.assign(type="normal")

In [236]:
# Wallet -> community index table for all detected communities.
louvain_community_df = louvain_community_df_func(louvain_community_lst)
louvain_community_df

Unnamed: 0,community,type
0x031c605262ec85aec6f793a5dd13bf21b6464af8,0,normal
0xb5a260659533ed9fe659b8c4254e4ff37047eadd,0,normal
0x04531b9d88da0eb58bb379637d4a914bdc122fdf,0,normal
0x0886b7d1549b70144eee26f05cf937259a3002d7,0,normal
0x112bcd74c8664cedbc0357f0021dfd485c891393,0,normal
...,...,...
0xfd6c69023c610350d6135b190ffb15b04a7aa9f2,7102,normal
0xfe603af99a456eff9a34cd9b29e5a5db9ffd4927,7103,normal
0xfe8da89113f28f65d71ee7b79de3b65704784529,7104,normal
0xff60fa1389cafb68b0537d69c981f5c4409977e7,7105,normal


### plot a community

In [368]:
# Pick one community to inspect (index 85 in the size-sorted list).
# NOTE(review): this index is only meaningful for a specific Louvain run.
plot_community_lst = louvain_community_lst[85]
# Keep only the edges whose two endpoints both belong to that community.
plot_community_df = weight_df[
    weight_df["node_a"].isin(plot_community_lst)
    & weight_df["node_b"].isin(plot_community_lst)
]
print(plot_community_df.shape)
plot_community_df

(20, 4)


Unnamed: 0,tx_count,node_a,node_b,weight
1372,2,0x05415963e593fcfbab4cd6b01340fc495047423f,0x1650683e50e075efc778be4d1a6be929f3831719,0.689974
1373,1,0x05415963e593fcfbab4cd6b01340fc495047423f,0x36e242d81187123c4860c7e86874c23a8a21b6ad,0.598688
1377,2,0x05415963e593fcfbab4cd6b01340fc495047423f,0x8e202d4b77ac50343e753d898a2ef965f79abaa7,0.689974
1378,1,0x05415963e593fcfbab4cd6b01340fc495047423f,0xc132d6439a1a139f9a10e56150c00f448c370a61,0.598688
4463,1,0x1650683e50e075efc778be4d1a6be929f3831719,0x1c1bd8c213ecf004ace7866e12abc405a19b3ae3,0.598688
4465,1,0x1650683e50e075efc778be4d1a6be929f3831719,0x268766bd81e082b0a8e671fa950525f57b096b3b,0.598688
4466,1,0x1650683e50e075efc778be4d1a6be929f3831719,0x449d04edc47f0b69477f6876815230264f76e1d6,0.598688
4467,1,0x1650683e50e075efc778be4d1a6be929f3831719,0x73d05eb78a200b1400b882939aab240dfcd6a824,0.598688
4468,2,0x1650683e50e075efc778be4d1a6be929f3831719,0x8e7fef28b08e6845fbeabd972a3bd963939ab54a,0.689974
4469,1,0x1650683e50e075efc778be4d1a6be929f3831719,0xa17c6c6af848ac474f99480fda8564356896b3e3,0.598688


In [201]:
def community_visualization_2d(
    df,
    highlight_lst,
    highlight_lst_2_=None,
    source_col="node_a",
    target_col="node_b",
    weight_col="weight",
    weight_=800,
    height_=800,
    seed=None,
):
    """Draw a community's edge list as an interactive 2D Plotly network.

    Args:
        df: Edge list with ``source_col``, ``target_col`` and ``weight_col``.
        highlight_lst: Nodes drawn in "yellowgreen".
        highlight_lst_2_: Optional nodes drawn in "dodgerblue" (only when not
            already in ``highlight_lst``). All other nodes are "salmon".
        source_col: Column with the edge source.
        target_col: Column with the edge target.
        weight_col: Column with the edge weight (used by the layout and hover).
        weight_: Figure WIDTH in pixels (the name is historical).
        height_: Figure height in pixels.
        seed: Optional seed for ``networkx.spring_layout``; pass a fixed value to
            get the same layout on every run. ``None`` keeps the random layout.

    Returns:
        None. The figure is shown with ``fig.show()``.
    """
    # Create a directed graph
    Gt = nx.from_pandas_edgelist(
        df,
        source=source_col,
        target=target_col,
        edge_attr=weight_col,
        create_using=nx.DiGraph(),
    )

    # Generate 2D positions for nodes
    pos = nx.spring_layout(Gt, weight=weight_col, seed=seed)

    # Each edge contributes three points (start, end, None as a line break).
    # The hover text must follow the same three-per-edge layout, otherwise
    # Plotly pairs labels with the wrong points.
    edge_x = []
    edge_y = []
    edge_text = []
    for source, target, attrs in Gt.edges(data=True):
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        label = f"weight: {round(attrs[weight_col], 2)}"
        edge_text.extend([label, label, None])

    nodes = list(Gt.nodes())
    node_x = [pos[node][0] for node in nodes]
    node_y = [pos[node][1] for node in nodes]

    # Sets give constant-time membership tests; the highlight lists can be long.
    primary = set(highlight_lst)
    secondary = set(highlight_lst_2_) if highlight_lst_2_ is not None else set()
    node_colors = []
    for node in nodes:
        if node in primary:
            node_colors.append("yellowgreen")
        elif node in secondary:
            node_colors.append("dodgerblue")
        else:
            node_colors.append("salmon")

    # Create Plotly figure
    fig = go.Figure()

    # Add edges
    fig.add_trace(
        go.Scatter(
            x=edge_x,
            y=edge_y,
            mode="lines",
            line=dict(color="black", width=2),
            hoverinfo="text",
            text=edge_text,  # Edge weights, aligned point-by-point with x/y
        )
    )

    # Add nodes
    fig.add_trace(
        go.Scatter(
            x=node_x,
            y=node_y,
            mode="markers",
            marker=dict(size=10, color=node_colors, line=dict(width=1, color="gray")),
            text=nodes,  # Show node labels
            hoverinfo="text",
        )
    )

    # Update layout
    fig.update_layout(
        showlegend=False,
        width=weight_,
        height=height_,
        xaxis=dict(showgrid=True, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
        # plot_bgcolor="lightgray",
        title="Community Visualization",
    )

    # Show the plot
    fig.show()

In [369]:
# 2D view of the selected community with core wallets highlighted.
# `weight_` is the figure width in pixels (it is passed to the layout's `width`).
community_visualization_2d(plot_community_df, core_wallet_lst, weight_=800, height_=500)

In [204]:
def community_visualization_3d(
    df,
    highlight_lst,
    highlight_lst_2_=None,
    source_col="node_a",
    target_col="node_b",
    weight_col="weight",
    weight_=800,
    height_=800,
    seed=None,
):
    """Draw a community's edge list as an interactive 3D Plotly network.

    Args:
        df: Edge list with ``source_col``, ``target_col`` and ``weight_col``.
        highlight_lst: Nodes drawn in "yellowgreen".
        highlight_lst_2_: Optional nodes drawn in "dodgerblue" (only when not
            already in ``highlight_lst``). All other nodes are "salmon".
        source_col: Column with the edge source.
        target_col: Column with the edge target.
        weight_col: Column with the edge weight (used by the layout and hover).
        weight_: Figure WIDTH in pixels (the name is historical).
        height_: Figure height in pixels.
        seed: Optional seed for ``networkx.spring_layout``; pass a fixed value to
            get the same layout on every run. ``None`` keeps the random layout.

    Returns:
        None. The figure is shown with ``fig.show()``.
    """
    # Create a directed graph
    Gt = nx.from_pandas_edgelist(
        df,
        source=source_col,
        target=target_col,
        edge_attr=weight_col,
        create_using=nx.DiGraph(),
    )

    # Generate 3D positions
    pos = nx.spring_layout(Gt, dim=3, weight=weight_col, seed=seed)

    # Each edge contributes three points (start, end, None as a line break).
    # The hover text must follow the same three-per-edge layout, otherwise
    # Plotly pairs labels with the wrong points.
    edge_x = []
    edge_y = []
    edge_z = []
    edge_text = []
    for source, target, attrs in Gt.edges(data=True):
        x0, y0, z0 = pos[source]
        x1, y1, z1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        edge_z.extend([z0, z1, None])
        # Rounded to two decimals, as in the 2D view.
        label = f"weight: {round(attrs[weight_col], 2)}"
        edge_text.extend([label, label, None])

    nodes = list(Gt.nodes())
    node_x = [pos[node][0] for node in nodes]
    node_y = [pos[node][1] for node in nodes]
    node_z = [pos[node][2] for node in nodes]

    # Sets give constant-time membership tests; the highlight lists can be long.
    primary = set(highlight_lst)
    secondary = set(highlight_lst_2_) if highlight_lst_2_ is not None else set()
    node_colors = []
    for node in nodes:
        if node in primary:
            node_colors.append("yellowgreen")
        elif node in secondary:
            node_colors.append("dodgerblue")
        else:
            node_colors.append("salmon")

    # Create Plotly figure
    fig = go.Figure()

    # Add edges
    fig.add_trace(
        go.Scatter3d(
            x=edge_x,
            y=edge_y,
            z=edge_z,
            mode="lines",
            line=dict(color="black", width=2),
            text=edge_text,  # Edge weights, aligned point-by-point with x/y/z
            hoverinfo="text",
        )
    )

    # Add nodes
    fig.add_trace(
        go.Scatter3d(
            x=node_x,
            y=node_y,
            z=node_z,
            mode="markers",
            marker=dict(size=5, color=node_colors),
            text=nodes,  # Show node labels on hover
            hoverinfo="text",
            textfont=dict(size=5),
            # hoverinfo="none",
        )
    )

    # `title.font` replaces the deprecated `titlefont` axis attribute.
    axis_setting = dict(
        showbackground=True,
        title=dict(font=dict(size=10)),
        tickfont=dict(size=10),
    )
    # Update layout
    fig.update_layout(
        showlegend=False,
        width=weight_,
        height=height_,
        scene=dict(
            xaxis=axis_setting,
            yaxis=axis_setting,
            zaxis=axis_setting,
        ),
    )

    # Show the plot
    fig.show()

In [205]:
# 3D view of the same community with core wallets highlighted.
# `weight_` is the figure width in pixels (it is passed to the layout's `width`).
community_visualization_3d(plot_community_df, core_wallet_lst, weight_=600, height_=600)

# Get clustering variables

## Community Data

Calculate how many wallets each wallet has interacted with within the community

In [123]:
def clustering_community_info(w_lst, weight_df_=None):
    """Count, for each wallet of a community, its edges inside that community.

    Args:
        w_lst: Wallets forming the community.
        weight_df_: Edge list with ``node_a`` / ``node_b`` columns. When omitted,
            the notebook-level ``weight_df`` is looked up at call time. The
            previous default (``weight_df_=weight_df``) was bound once at
            definition time and went stale whenever ``weight_df`` was rebuilt.

    Returns:
        pandas.DataFrame: Indexed by ``wallet``, with an ``interacted_wallets``
        column holding the number of internal edges the wallet appears in.
        Wallets without any internal edge are absent.
    """
    if weight_df_ is None:
        weight_df_ = weight_df

    # Keep only edges whose two endpoints both belong to the community.
    internal_weight_df = weight_df_[
        weight_df_["node_a"].isin(w_lst) & weight_df_["node_b"].isin(w_lst)
    ]
    wallet_internal_interaction_df = columns_item_count(
        internal_weight_df, w_cols=["node_a", "node_b"]
    )
    # Index by wallet so the result can be joined with other per-wallet tables.
    return wallet_internal_interaction_df.set_index("wallet")

## Trino data 

"xxx_address_monthly_stats_full"
- address active days
- total gas fee in usd
- number of transaction
- wallets interacted
- last transaction time

-- 

(The function is implemented, but it is unclear whether it should be used, given the computational power required.)

"xxx_address_last_stats_full"
- number of token types on different chains (ETH on Polygon and ETH on ethereum are different)

--

"xxx_token_transfers_full" (not used yet, given the computational power required)
- interval in seconds




In [125]:
def clustering_trino_info(w_lst, start_date_, end_date_):
    """Fetch per-wallet statistics via ``wallets_monthly_info`` as a DataFrame.

    Args:
        w_lst: Wallets to query.
        start_date_: Start of the query window, forwarded unchanged.
        end_date_: End of the query window, forwarded unchanged.

    Returns:
        pandas.DataFrame: The transposed result of ``wallets_monthly_info`` with
        eight named statistic columns. Presumably one row per wallet, keyed by
        address — TODO confirm against ``trino_fetch_results``.
    """
    stat_columns = [
        "active_days",
        "total_gas_fee_in_usd",
        "txn_count",
        "wallets_interacted",
        "last_txn_date_int",
        "last_txn_date",
        "start_date",
        "end_date",
    ]
    monthly_stats = wallets_monthly_info(w_lst, start_date_, end_date_)
    # Transpose the fetched structure, then label the columns positionally.
    stats_df = pd.DataFrame(monthly_stats).transpose()
    stats_df.columns = stat_columns
    return stats_df

### Combine data from two sources

In [127]:
def clustering_info(w_lst, start_date_, end_date_):
    """Combine in-community edge counts with Trino statistics for a community.

    Args:
        w_lst: Wallets forming the community.
        start_date_: Start of the Trino query window.
        end_date_: End of the Trino query window.

    Returns:
        pandas.DataFrame: Inner join, on the index, of
        ``clustering_community_info`` and ``clustering_trino_info``; only wallets
        present in both sources are kept.
    """
    community_side = clustering_community_info(w_lst)
    trino_side = clustering_trino_info(w_lst, start_date_, end_date_)
    return pd.merge(community_side, trino_side, left_index=True, right_index=True)

In [258]:
def full_community_clustering_info(
    louvain_community_lst_,
    start_date_,
    end_date_,
    output_dir="/Users/floras/Desktop/unique-user/-Data/clustering_info",
):
    """Compute clustering features for every community and append them to a CSV.

    Communities with fewer than three wallets are skipped. All other
    communities are written to
    ``<output_dir>/clustering_info_<start_date_>_<end_date_>.csv``; the header
    is written only when the file does not exist yet.

    NOTE: the file is opened in append mode, so re-running the function with
    the same dates adds the same rows again.

    Args:
        louvain_community_lst_: List of communities (each a list of wallets).
        start_date_: Start of the query window; also part of the file name.
        end_date_: End of the query window; also part of the file name.
        output_dir: Directory receiving the CSV. Defaults to the location that
            was previously hard-coded; it is created on first write if missing.

    Returns:
        None.
    """
    # The target file does not depend on the community, so build the path once.
    file_path = os.path.join(
        output_dir, f"clustering_info_{start_date_}_{end_date_}.csv"
    )
    for commu in louvain_community_lst_:
        if len(commu) < 3:
            continue
        clustering_info_df = clustering_info(commu, start_date_, end_date_)
        if output_dir:
            # Created lazily so that nothing is touched when there is nothing to write.
            os.makedirs(output_dir, exist_ok=True)
        # Append mode creates the file when needed; the header goes in only once.
        write_header = not os.path.isfile(file_path)
        clustering_info_df.to_csv(
            file_path, mode="a", header=write_header, index=True
        )

# clustering

**Why do we scale the data?**

- If the dataset has features with vastly different scales (e.g., one feature ranges from 0 to 1, while another ranges from 0 to 1000), the feature with the larger scale will dominate the distance calculations. Normalizing data ensures that each feature contributes equally to the distance calculation. This is particularly important in multi-dimensional datasets where features may not have the same units or magnitude.

- We use standardization (z-score scaling), which rescales each feature to have mean 0 and standard deviation 1.

In [154]:
def isolation_forest(
    clustering_info_df, clustring_features, contamination_, random_state=None
):
    """Flag outliers with an Isolation Forest and store the labels in place.

    Adds an ``isolation_forest`` column to ``clustering_info_df`` (the frame is
    modified and returned): ``-1`` marks an outlier, ``1`` an inlier.

    Args:
        clustering_info_df: Per-wallet feature table.
        clustring_features: Feature columns used by the model.
        contamination_: Expected share of outliers. Values above 0.5 are
            outside the range accepted by scikit-learn, so every row is marked
            as an outlier (``-1``) without fitting a model.
        random_state: Optional seed forwarded to ``IsolationForest``. The model
            is randomized, so pass a fixed value for reproducible labels;
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        pandas.DataFrame: ``clustering_info_df`` with the extra column.
    """
    if contamination_ > 0.5:
        clustering_info_df["isolation_forest"] = -1
        return clustering_info_df

    # Normalize the data
    scaled_features = StandardScaler().fit_transform(
        clustering_info_df[clustring_features]
    )

    # Apply Isolation Forest
    iso = IsolationForest(contamination=contamination_, random_state=random_state)
    clustering_info_df["isolation_forest"] = iso.fit_predict(scaled_features)

    return clustering_info_df

In [158]:
def optics_clustering(clustering_info_df, clustring_features, min_samples_=3):
    """Cluster the rows with OPTICS and store the labels in place.

    Adds an ``optics`` column to ``clustering_info_df`` (the frame is modified
    and returned). When there are fewer rows than ``min_samples_`` no model is
    fitted and every row receives label ``0``.

    Args:
        clustering_info_df: Per-wallet feature table.
        clustring_features: Feature columns used for clustering.
        min_samples_: ``min_samples`` parameter of OPTICS.

    Returns:
        pandas.DataFrame: ``clustering_info_df`` with the extra column.
    """
    if len(clustering_info_df) >= min_samples_:
        # Standardize the features before clustering them.
        standardized = StandardScaler().fit_transform(
            clustering_info_df[clustring_features]
        )
        labels = OPTICS(min_samples=min_samples_).fit_predict(standardized)
    else:
        # Too few rows to cluster: put everything into a single group.
        labels = 0

    clustering_info_df["optics"] = labels
    return clustering_info_df

In [228]:
def two_step_clustering(
    clustering_info_df,
    iso_features_,
    optics_features_,
    core_num=2,
    optics_min_samples=3,
):
    """Label wallets as ``sybil_core``, ``sybil`` or ``normal`` in two steps.

    1. An Isolation Forest on ``iso_features_`` marks about ``core_num`` wallets
       as outliers; these become ``sybil_core``.
    2. The remaining wallets are clustered with OPTICS on ``optics_features_``:
       OPTICS label ``-1`` becomes ``normal``, any other label ``sybil``.

    The input frame is not modified.

    Args:
        clustering_info_df: Per-wallet feature table (index = wallet).
        iso_features_: Feature columns for the Isolation Forest.
        optics_features_: Feature columns for OPTICS.
        core_num: Desired number of core wallets; capped at half the row count.
        optics_min_samples: ``min_samples`` parameter of OPTICS.

    Returns:
        pandas.DataFrame: Copy of the input with ``isolation_forest``,
        ``optics`` (NaN for core wallets) and ``type`` columns.

    Raises:
        ValueError: If the (capped) core number is below 1, e.g. for fewer than
            two rows.
    """
    n_wallets = clustering_info_df.shape[0]
    core_num = min(core_num, n_wallets // 2)
    if core_num < 1:
        raise ValueError("Core number should be at least 1")
    contamination_ = core_num / n_wallets

    clustered_df = isolation_forest(
        clustering_info_df.copy(), iso_features_, contamination_
    )
    # Explicit copy: optics_clustering writes a column onto the frame it is
    # given, and a filtered slice would trigger SettingWithCopyWarning.
    middle_info_df = clustered_df[clustered_df["isolation_forest"] != -1].copy()
    clusted_middle_info_df = optics_clustering(
        middle_info_df, optics_features_, optics_min_samples
    )
    # merge result back to original df
    clustered_df = pd.merge(
        clustered_df,
        clusted_middle_info_df["optics"],
        how="left",
        left_index=True,
        right_index=True,
    )

    # Vectorized labelling; the outlier flag takes precedence over OPTICS.
    is_core = clustered_df["isolation_forest"] == -1
    is_noise = clustered_df["optics"] == -1
    clustered_df["type"] = np.where(
        is_core, "sybil_core", np.where(is_noise, "normal", "sybil")
    )
    return clustered_df

In [370]:
# Feature table for the community selected above.
# NOTE(review): both dates are None — presumably this means "no date filter" in
# `wallets_monthly_info`; confirm in `trino_fetch_results`.
test_df = clustering_info(plot_community_lst, None, None)
test_df

Unnamed: 0,interacted_wallets,active_days,total_gas_fee_in_usd,txn_count,wallets_interacted,last_txn_date_int,last_txn_date,start_date,end_date
0x05415963e593fcfbab4cd6b01340fc495047423f,4,2539,2622.214339,3843,487,739221,2024-12-12,,
0x1650683e50e075efc778be4d1a6be929f3831719,10,12889,15997.713794,16227,6667,739231,2024-12-12,,
0x36e242d81187123c4860c7e86874c23a8a21b6ad,1,19,14.259276,19,13,739136,2024-09-07,,
0x8e202d4b77ac50343e753d898a2ef965f79abaa7,1,90,1499.396391,102,32,739213,2024-11-23,,
0xc132d6439a1a139f9a10e56150c00f448c370a61,1,21,4.223352,24,3,738947,2024-11-29,,
0x1c1bd8c213ecf004ace7866e12abc405a19b3ae3,2,249,181.450406,445,125,739209,2024-11-26,,
0x268766bd81e082b0a8e671fa950525f57b096b3b,1,832,240.042779,1053,252,739101,2024-11-04,,
0x449d04edc47f0b69477f6876815230264f76e1d6,2,830,236.382807,1199,190,739174,2024-10-15,,
0x73d05eb78a200b1400b882939aab240dfcd6a824,2,877,325.799701,1326,173,739222,2024-12-02,,
0x8e7fef28b08e6845fbeabd972a3bd963939ab54a,1,194,407.960157,244,94,739043,2024-12-08,,


In [371]:
# Step 1 (Isolation Forest) only looks at the in-community degree.
iso_features = ["interacted_wallets"]
# Step 2 (OPTICS) uses the in-community degree plus the Trino activity statistics.
optics_features = [
    "interacted_wallets",
    "active_days",
    "total_gas_fee_in_usd",
    "txn_count",
    "wallets_interacted",
    "last_txn_date_int",
]

# NOTE(review): core_num=5 and optics_min_samples=3 are hand-picked for this
# community; the Isolation Forest is unseeded, so labels may vary between runs.
clustered_df = two_step_clustering(
    test_df, iso_features, optics_features, core_num = 5, optics_min_samples = 3
)
print(len(clustered_df))
clustered_df.sort_values(by=['optics'])

21


Unnamed: 0,interacted_wallets,active_days,total_gas_fee_in_usd,txn_count,wallets_interacted,last_txn_date_int,last_txn_date,start_date,end_date,isolation_forest,optics,type
0x8e202d4b77ac50343e753d898a2ef965f79abaa7,1,90,1499.396391,102,32,739213,2024-11-23,,,1,-1.0,normal
0x41b44a79dbf04ae0d82abc1cb9da87f2e4cda591,1,120,423.887492,138,64,739077,2024-11-08,,,1,-1.0,normal
0x268766bd81e082b0a8e671fa950525f57b096b3b,1,832,240.042779,1053,252,739101,2024-11-04,,,1,-1.0,normal
0xebcd250474c27cbad3c56f3f34e08f97b370ac2d,2,2193,1137.160924,4069,513,739231,2024-12-11,,,1,-1.0,normal
0x8e7fef28b08e6845fbeabd972a3bd963939ab54a,1,194,407.960157,244,94,739043,2024-12-08,,,1,-1.0,normal
0x36e242d81187123c4860c7e86874c23a8a21b6ad,1,19,14.259276,19,13,739136,2024-09-07,,,1,0.0,sybil
0x40c4fc6b017b91b501a5218f6f3868498b94fb3d,1,61,10.134146,96,33,739140,2024-09-11,,,1,0.0,sybil
0x32e1e3bf7e47c3d26245499c4effaf556b07fcee,1,80,3.597563,85,13,739131,2024-11-23,,,1,0.0,sybil
0xa17c6c6af848ac474f99480fda8564356896b3e3,1,199,33.538301,330,100,739092,2024-12-08,,,1,0.0,sybil
0x647530bb6b399caeaaf27670c2b9b5f7431fbb96,1,62,251.723502,70,38,739170,2024-11-08,,,1,0.0,sybil


In [380]:
# Wallets labelled "sybil_core" and "normal" by the two-step clustering.
clustered_core_wallets = clustered_df[
    clustered_df["type"] == "sybil_core"
].index.tolist()
clustered_normal_wallets = clustered_df[clustered_df["type"] == "normal"].index.tolist()

# Colours in the plot: core wallets = yellowgreen, normal wallets = dodgerblue,
# all remaining wallets (labelled "sybil") = salmon.
community_visualization_2d(
    plot_community_df,
    clustered_core_wallets,
    clustered_normal_wallets,
    weight_=600,
    height_=600,
)

In [232]:
# Start from the full wallet -> community table with every wallet "normal" ...
ttt = louvain_community_df.copy()
ttt["type"] = "normal"
# ... then overwrite the label of the wallets that were clustered above.
# Assigning through .loc writes into `ttt` itself; the previous chained
# `ttt["type"].update(...)` acts on a temporary Series and silently does nothing
# when pandas Copy-on-Write is enabled.
clustered_wallets = ttt.index.intersection(clustered_df.index)
ttt.loc[clustered_wallets, "type"] = clustered_df.loc[clustered_wallets, "type"]

In [233]:
# Number of wallets per label after merging in the clustering result.
ttt["type"].value_counts()

normal        25194
sybil             3
sybil_core        1
Name: type, dtype: int64