In [1]:
# Sanity check: print the path of the `python` executable that the notebook's
# shell resolves, to confirm which environment the shell commands run in.
!which python

/home/user/jfayzullaev/stellar-clustering/.venv-vis/bin/python


In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    normalized_mutual_info_score as NMI,
    adjusted_rand_score as ARI,
    accuracy_score,
    f1_score,
)

In [15]:
# Relative CSV paths, one per network (transaction / trustline).
# NOTE(review): presumably the SSLPA community assignments; these names are not
# referenced by any cell shown in this section of the notebook, so confirm usage.
LPA_COMM_TX_FILE = "transaction/sslpa_tx_lcc_lpa_communities.csv"
LPA_COMM_TL_FILE = "trustline/sslpa_trust_lcc_lpa_communities.csv"


In [None]:
# Pickled graph files, one per network. "~" is expanded to the current user's
# home directory here, so the constants hold absolute paths.
# Each is passed to load_communities_fixed_resolution() in the cells below.
TX_GRAPH_PKL = os.path.expanduser(
    "~/stellar-clustering/network/LCC/transactions/LCC_G_tx_undirected_weighted.pkl"
)
TL_GRAPH_PKL = os.path.expanduser(
    "~/stellar-clustering/network/LCC/trustlines/trust_proj_LCC_idf.pkl"
)

In [16]:
# CSV of labelled accounts; it is passed as `labels_path` to
# evaluate_fixed_resolution_cv(), which reads its `account_id` column and the
# chosen label column.
NORM_LABELS_PATH = os.path.expanduser(
    "~/stellar-clustering/network/labled-data/labels/label-normalization/labels_entities_normalized.csv"
)

In [17]:
# Cross-validation settings handed to evaluate_fixed_resolution_cv():
# number of stratified folds and the seed used when shuffling the folds.
N_SPLITS = 5
RANDOM_STATE = 42

In [18]:
def overall_purity(df_comm_name: pd.DataFrame) -> float:
    """Return the overall purity of a predicted labelling.

    For every predicted label the most frequent true label is taken as the
    "majority"; purity is the number of rows that fall into their predicted
    label's majority class divided by the total number of counted rows.

    Parameters
    ----------
    df_comm_name : pd.DataFrame
        One row per item, with columns ``pred_label`` and ``true_label``.

    Returns
    -------
    float
        Purity as computed above, or ``np.nan`` when the frame is empty.
    """
    if df_comm_name.empty:
        return np.nan

    # Size of every (predicted, true) label combination, as a flat table.
    pair_counts = (
        df_comm_name
        .groupby(['pred_label', 'true_label'])
        .size()
        .rename('cnt')
        .reset_index()
    )

    # Per predicted label: total rows and rows in the majority true label.
    cluster_stats = pair_counts.groupby('pred_label')['cnt'].agg(['sum', 'max'])
    majority_rows = cluster_stats['max'].sum()
    all_rows = cluster_stats['sum'].sum()
    return float(majority_rows / all_rows)

In [None]:
def load_communities_fixed_resolution(graph_pkl_path: str):
    """Load a pickled graph from disk, converting directed graphs to undirected.

    Despite its name, this function returns the unpickled graph object itself,
    not a table of communities.

    Parameters
    ----------
    graph_pkl_path : str
        Path of the pickle file; a leading ``~`` is expanded.

    Returns
    -------
    object
        The unpickled object. If it is an ``nx.DiGraph`` it is rebuilt as an
        ``nx.Graph``; anything else is returned unchanged.

    Raises
    ------
    FileNotFoundError
        If the expanded path does not exist.
    """
    graph_pkl_path = os.path.expanduser(graph_pkl_path)
    if not os.path.exists(graph_pkl_path):
        raise FileNotFoundError(f"no file: {graph_pkl_path}")

    # SECURITY: pickle.load can execute arbitrary code embedded in the file,
    # so only point this at pickles produced by a trusted pipeline.
    with open(graph_pkl_path, "rb") as handle:
        graph = pickle.load(handle)

    return nx.Graph(graph) if isinstance(graph, nx.DiGraph) else graph

In [None]:
def _sslpa_propagate(G, seeds: dict, max_iter: int = 50) -> dict:
    """Propagate seed labels over ``G`` while keeping the seeds frozen.

    Seed nodes keep their label for the whole run. Every other node starts
    with a unique placeholder ``"UNLAB_<node>"`` and is then repeatedly given
    the label with the largest summed edge weight among its neighbours (an
    edge without a ``weight`` attribute counts as 1.0). Updates are written
    immediately, so nodes later in a sweep already see earlier changes.

    Parameters
    ----------
    G : graph object (networkx-style)
        Must support ``G.nodes()``, ``G.degree(n)`` and ``G[n].items()``.
    seeds : dict
        Mapping node -> fixed label.
    max_iter : int
        Maximum number of sweeps over the non-seed nodes.

    Returns
    -------
    dict
        Mapping node -> final label for every node of ``G``.
    """
    frozen = set(seeds)
    labels_hat = {n: (seeds[n] if n in frozen else f"UNLAB_{n}") for n in G.nodes()}

    # Visit high-degree nodes first; the visiting order is fixed for all sweeps.
    unlabeled_nodes = [n for n in G.nodes() if n not in frozen]
    unlabeled_nodes.sort(key=lambda node: G.degree(node), reverse=True)

    for _ in range(max_iter):
        changed = 0
        for n in unlabeled_nodes:
            # Total edge weight per neighbouring label. Every node has an entry
            # in labels_hat, so no missing-label check is needed.
            acc = defaultdict(float)
            for nbr, edge_data in G[n].items():
                acc[labels_hat[nbr]] += edge_data.get('weight', 1.0)
            if not acc:
                continue  # isolated node: keeps its current label
            best = max(acc.values())
            cands = [lbl for lbl, val in acc.items() if val == best]
            if labels_hat[n] in cands:
                continue  # current label is among the winners: keep it
            # Deterministic tie-break: smallest candidate by string form.
            labels_hat[n] = min(cands, key=str)
            changed += 1
        if changed == 0:
            break  # converged
    return labels_hat


def evaluate_fixed_resolution_cv(
    labels_path: str,
    comm_df,
    label_col: str = "name",
    n_splits: int = 5,
    random_state: int = 42,
    max_iter: int = 50,
):
    """Cross-validate semi-supervised label propagation on a graph.

    The labelled accounts that occur in the graph are split into stratified
    folds. For each fold the training labels are used as frozen seeds, labels
    are propagated over the whole graph, and NMI / ARI / purity are computed
    between the true and propagated labels of the train and test accounts.

    Notes
    -----
    * Seeds are never relabelled, so the ``*_train`` metrics are 1.0 by
      construction; only the ``*_test`` columns measure generalisation.
    * A test account that is not reached by any seed label ends up with an
      ``UNLAB_*`` placeholder, which the metrics treat as an ordinary
      predicted label.

    Parameters
    ----------
    labels_path : str
        CSV with an ``account_id`` column and the ``label_col`` column.
        A leading ``~`` is expanded.
    comm_df : graph object (networkx-style)
        Despite the name, this is the graph to propagate on, e.g. the return
        value of ``load_communities_fixed_resolution``.
    label_col : str
        Column holding the ground-truth label.
    n_splits : int
        Number of stratified folds.
    random_state : int
        Seed for the fold shuffling.
    max_iter : int
        Maximum number of propagation sweeps per fold.

    Returns
    -------
    per_fold_df : pd.DataFrame
        One row per fold with sizes and train/test NMI, ARI and purity.
    averages : dict
        ``nanmean`` of each metric column over the folds.
    coverage_info : dict
        ``n_labeled`` (labelled accounts in the CSV), ``n_joined`` (those
        present in the graph) and ``coverage`` (their ratio).

    Raises
    ------
    FileNotFoundError
        If ``labels_path`` does not exist.
    ValueError
        If no labelled account is present in the graph.
    """
    G = comm_df

    # Expand "~" so this behaves like the graph loader for home-relative paths.
    labels_path = os.path.expanduser(labels_path)
    if not os.path.exists(labels_path):
        raise FileNotFoundError(f"no labels: {labels_path}")

    labels_raw = (
        pd.read_csv(labels_path, usecols=['account_id', label_col])
          .dropna(subset=['account_id', label_col])
          .drop_duplicates(subset=['account_id'])
          .rename(columns={label_col: 'name'})
    )

    # Prefer integer ids; fall back to strings when the ids are not numeric.
    try:
        labels_raw['account_id'] = labels_raw['account_id'].astype('int64')
    except (ValueError, TypeError, OverflowError):
        labels_raw['account_id'] = labels_raw['account_id'].astype(str)

    # Keep only labelled accounts that are nodes of the graph.
    in_graph = labels_raw['account_id'].isin(G.nodes())
    labels = labels_raw.loc[in_graph].copy()

    n_labeled = len(labels_raw)
    n_joined = len(labels)
    coverage = (n_joined / n_labeled) if n_labeled else 0.0
    if n_joined == 0:
        raise ValueError("no labled in the graph")

    # Stratify on integer-encoded labels.
    names = labels['name'].values
    X_ids = labels['account_id'].values
    y_all = LabelEncoder().fit_transform(names)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rows = []

    for fold, (tr_idx, te_idx) in enumerate(skf.split(X_ids, y_all), start=1):
        train_nodes = X_ids[tr_idx]
        test_nodes = X_ids[te_idx]
        train_labels = names[tr_idx]
        test_true = names[te_idx]

        # Train-fold labels are the frozen seeds; propagate over the whole graph.
        seeds = dict(zip(train_nodes, train_labels))
        pred_all = _sslpa_propagate(G, seeds, max_iter=max_iter)

        y_pred_tr = [pred_all[n] for n in train_nodes]
        y_pred_te = [pred_all.get(n, f"UNLAB_{n}") for n in test_nodes]

        # Encode true and predicted labels with one shared encoder per subset.
        enc_tr = LabelEncoder().fit(list(set(train_labels) | set(y_pred_tr)))
        nmi_tr = NMI(enc_tr.transform(train_labels), enc_tr.transform(y_pred_tr))
        ari_tr = ARI(enc_tr.transform(train_labels), enc_tr.transform(y_pred_tr))

        enc_te = LabelEncoder().fit(list(set(test_true) | set(y_pred_te)))
        nmi_te = NMI(enc_te.transform(test_true), enc_te.transform(y_pred_te))
        ari_te = ARI(enc_te.transform(test_true), enc_te.transform(y_pred_te))

        purity_tr = overall_purity(pd.DataFrame({'pred_label': y_pred_tr, 'true_label': train_labels}))
        purity_te = overall_purity(pd.DataFrame({'pred_label': y_pred_te, 'true_label': test_true}))

        rows.append({
            'fold': fold,
            'n_train': len(train_nodes),
            'n_test': len(test_nodes),
            'train_frac': len(train_nodes) / len(X_ids),
            'NMI_train': nmi_tr,
            'ARI_train': ari_tr,
            'Purity_train': purity_tr,
            'NMI_test': nmi_te,
            'ARI_test': ari_te,
            'Purity_test': purity_te,
        })

    per_fold_df = pd.DataFrame(rows)
    averages = {
        'Avg_NMI_train': float(np.nanmean(per_fold_df['NMI_train'].values)),
        'Avg_ARI_train': float(np.nanmean(per_fold_df['ARI_train'].values)),
        'Avg_Purity_train': float(np.nanmean(per_fold_df['Purity_train'].values)),
        'Avg_NMI_test': float(np.nanmean(per_fold_df['NMI_test'].values)),
        'Avg_ARI_test': float(np.nanmean(per_fold_df['ARI_test'].values)),
        'Avg_Purity_test': float(np.nanmean(per_fold_df['Purity_test'].values)),
        'Avg_train_frac': float(np.nanmean(per_fold_df['train_frac'].values)),
    }
    coverage_info = {'n_labeled': int(n_labeled), 'n_joined': int(n_joined), 'coverage': float(coverage)}
    return per_fold_df, averages, coverage_info

# Transactions 

In [23]:
# Load the transaction graph. `comm_df` holds the graph object returned by the
# loader (not a DataFrame); it is passed on to the evaluation below.
comm_df = load_communities_fixed_resolution(TX_GRAPH_PKL)

print(
    f"Loaded graph from: {TX_GRAPH_PKL}",
    f"Nodes: {comm_df.number_of_nodes():,}  |  Edges: {comm_df.number_of_edges():,}",
    sep="\n",
)


Loaded graph from: /home/user/jfayzullaev/stellar-clustering/network/Community Detection/Louvian/transaction_graph/TX-LCC/LCC_G_tx_undirected_weighted.pkl
Nodes: 206,451  |  Edges: 264,829


In [None]:
# Cross-validate label propagation on the transaction graph loaded above.
norm_per_fold, norm_avg, norm_cov = evaluate_fixed_resolution_cv(
    labels_path=NORM_LABELS_PATH,
    comm_df=comm_df,
    label_col="name",
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE,
)

# Heading wording kept identical to the trustline section ("per-fold").
# The table and the averages also contain the train-side columns.
print("NORMALIZED labels: per-fold metrics (TEST)")
display(norm_per_fold)

print("\nNORMALIZED labels: averages (TEST)")
for k, v in norm_avg.items():
    print(f"{k}: {v:.6f}")
print(f"Coverage: {norm_cov['coverage']:.2%}  ({norm_cov['n_joined']}/{norm_cov['n_labeled']})")




=== NORMALIZED labels: per-fold metrics (TEST) ===


Unnamed: 0,fold,n_train,n_test,train_frac,NMI_train,ARI_train,Purity_train,NMI_test,ARI_test,Purity_test
0,1,317,80,0.798489,1.0,1.0,1.0,0.505885,0.370615,0.8125
1,2,317,80,0.798489,1.0,1.0,1.0,0.555383,0.458054,0.825
2,3,318,79,0.801008,1.0,1.0,1.0,0.598518,0.405019,0.848101
3,4,318,79,0.801008,1.0,1.0,1.0,0.540902,0.394479,0.835443
4,5,318,79,0.801008,1.0,1.0,1.0,0.60128,0.572208,0.810127



=== NORMALIZED labels: averages (TEST) ===
Avg_NMI_train: 1.000000
Avg_ARI_train: 1.000000
Avg_Purity_train: 1.000000
Avg_NMI_test: 0.560393
Avg_ARI_test: 0.440075
Avg_Purity_test: 0.826234
Avg_train_frac: 0.800000
Coverage: 4.90%  (397/8102)


In [None]:
# Write the transaction-graph CV results next to the notebook.
TX_PATH = 'transaction/cross-validation'
os.makedirs(TX_PATH, exist_ok=True)

tx_per_fold_csv = f"{TX_PATH}/sslpa_cv_results_norm_per_fold.csv"
tx_summary_csv = f"{TX_PATH}/sslpa_cv_results_norm_summary.csv"

norm_per_fold.to_csv(tx_per_fold_csv, index=False)

# One-row summary: the fold averages merged with the label-coverage counts.
tx_summary_row = {**norm_avg, **norm_cov}
pd.DataFrame([tx_summary_row]).to_csv(tx_summary_csv, index=False)

print("\nSaved:", tx_per_fold_csv + ",", tx_summary_csv)



Saved: transaction/cross-validation/sslpa_cv_results_norm_per_fold.csv, transaction/cross-validation/sslpa_cv_results_norm_summary.csv


# Trustlines

In [26]:
# Load the trustline graph. This rebinds `comm_df`, so the transaction graph
# from the previous section is no longer referenced under this name.
comm_df = load_communities_fixed_resolution(TL_GRAPH_PKL)

print(
    f"Loaded graph from: {TL_GRAPH_PKL}",
    f"Nodes: {comm_df.number_of_nodes():,}  |  Edges: {comm_df.number_of_edges():,}",
    sep="\n",
)


Loaded graph from: /home/user/jfayzullaev/stellar-clustering/network/Community Detection/Louvian/trustline_graph/trust_proj_LCC_idf/trust_proj_LCC_idf.pkl
Nodes: 24,586  |  Edges: 210,923


In [None]:
# Cross-validate label propagation on the trustline graph loaded above.
# The result names are reused from the transaction section and overwritten here.
norm_per_fold, norm_avg, norm_cov = evaluate_fixed_resolution_cv(
    NORM_LABELS_PATH,
    comm_df,
    label_col="name",
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE,
)

# Per-fold table first, then the averaged metrics and the label coverage.
print("NORMALIZED labels: per-fold metrics (TEST)")
display(norm_per_fold)

print("\nNORMALIZED labels: averages (TEST)")
for k, v in norm_avg.items():
    print(f"{k}: {v:.6f}")
print(f"Coverage: {norm_cov['coverage']:.2%}  ({norm_cov['n_joined']}/{norm_cov['n_labeled']})")




=== NORMALIZED labels: per-fold metrics (TEST) ===


Unnamed: 0,fold,n_train,n_test,train_frac,NMI_train,ARI_train,Purity_train,NMI_test,ARI_test,Purity_test
0,1,111,28,0.798561,1.0,1.0,1.0,0.349233,0.091936,0.928571
1,2,111,28,0.798561,1.0,1.0,1.0,0.424271,0.23284,0.928571
2,3,111,28,0.798561,1.0,1.0,1.0,0.358386,0.118988,0.928571
3,4,111,28,0.798561,1.0,1.0,1.0,0.333619,0.144009,0.892857
4,5,112,27,0.805755,1.0,1.0,1.0,0.153023,0.047583,0.925926



=== NORMALIZED labels: averages (TEST) ===
Avg_NMI_train: 1.000000
Avg_ARI_train: 1.000000
Avg_Purity_train: 1.000000
Avg_NMI_test: 0.323706
Avg_ARI_test: 0.127071
Avg_Purity_test: 0.920899
Avg_train_frac: 0.800000
Coverage: 1.72%  (139/8102)


In [None]:
# Write the trustline-graph CV results next to the notebook.
TL_PATH = 'trustline/cross-validation'
os.makedirs(TL_PATH, exist_ok=True)

tl_per_fold_csv = f"{TL_PATH}/sslpa_cv_results_norm_per_fold.csv"
tl_summary_csv = f"{TL_PATH}/sslpa_cv_results_norm_summary.csv"

norm_per_fold.to_csv(tl_per_fold_csv, index=False)

# One-row summary: the fold averages merged with the label-coverage counts.
tl_summary_row = {**norm_avg, **norm_cov}
pd.DataFrame([tl_summary_row]).to_csv(tl_summary_csv, index=False)

print("\nSaved:", tl_per_fold_csv + ",", tl_summary_csv)



Saved: trustline/cross-validation/sslpa_cv_results_norm_per_fold.csv, trustline/cross-validation/sslpa_cv_results_norm_summary.csv
