In [1]:
# Environment sanity check: print the `python` executable that is first on the shell PATH.
# NOTE(review): this reports the shell's python, which may differ from the interpreter the
# kernel is actually running — `sys.executable` would confirm.
!which python

/home/user/jfayzullaev/stellar-clustering/.venv-vis/bin/python


In [2]:
import os
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import normalized_mutual_info_score as NMI, adjusted_rand_score as ARI


In [3]:
# Relative path to the CSV with the transactions DBSCAN results; it is read with pd.read_csv
# in the Transactions section and is expected to contain `account_id` plus `dbscan_*` label columns.
DBSCAN_FILE_TX   = "transactions/tx_role2vec_dbscan_cosine_pca64_kgrid_test.csv"


In [4]:
# Relative path to the CSV with the trustlines DBSCAN results; it is read with pd.read_csv
# in the Trustlines section and is expected to contain `account_id` plus `dbscan_*` label columns.
DBSCAN_FILE_TL   = "trustlines/tl_role2vec_dbscan_cosine_pca64_kgrid_test.csv"

In [5]:
# Ground-truth labels CSV. The leading `~` is NOT expanded here; cells below call
# os.path.expanduser on it before use. The evaluation reads its `account_id` and `name` columns.
NORM_LABELS_PATH = "~/stellar-clustering/network/labled-data/labels/label-normalization/labels_entities_normalized.csv"  

In [6]:
# Settings forwarded to evaluate_fixed_dbscan_cv (and from there to StratifiedKFold):
# number of folds and the seed used when shuffling accounts into folds.
N_SPLITS     = 5
RANDOM_STATE = 42

In [7]:
def overall_purity_comm(df_comm_name: pd.DataFrame, comm_col: str, name_col: str = "name") -> float:
    """Compute the overall purity of a cluster assignment against reference names.

    For every cluster (value of `comm_col`) the size of its most frequent `name_col`
    value is taken; purity is the sum of those sizes divided by the number of counted rows.

    Parameters
    ----------
    df_comm_name : DataFrame holding at least `comm_col` and `name_col`.
    comm_col     : column with the cluster / community id.
    name_col     : column with the reference label (default "name").

    Returns
    -------
    float in [0, 1]; NaN when the frame is empty.
    """
    if df_comm_name.empty:
        return np.nan

    # One entry per (cluster, name) pair, valued with the number of rows in that pair.
    pair_sizes = df_comm_name.groupby([comm_col, name_col]).size()

    # Largest pair inside each cluster = number of rows carrying the cluster's majority name.
    dominant = pair_sizes.groupby(level=0).max()

    return float(dominant.sum() / pair_sizes.sum())


In [None]:
def load_dbscan_fixed_label(dbscan_file: str, label_col: str, include_noise: bool = True) -> pd.DataFrame:
    """Load one DBSCAN labelling from a results CSV as (account_id, cluster) pairs.

    Parameters
    ----------
    dbscan_file   : path to a CSV containing `account_id` and one or more label columns.
    label_col     : name of the label column to extract; it is renamed to `cluster`.
    include_noise : when False, rows whose cluster equals -1 are removed.

    Returns
    -------
    DataFrame with columns `account_id` and `cluster`, without missing values and
    without duplicated (account_id, cluster) rows. `account_id` is cast to int when
    every value allows it, otherwise to str.

    Raises
    ------
    FileNotFoundError : `dbscan_file` does not exist.
    ValueError        : `label_col` is not a column of the file.
    """
    if not os.path.exists(dbscan_file):
        raise FileNotFoundError(f"no file: {dbscan_file}")

    df = pd.read_csv(dbscan_file)

    if label_col not in df.columns:
        # Help the caller pick a valid column by listing the DBSCAN ones that do exist.
        db_cols = sorted(c for c in df.columns if c.startswith('dbscan_'))
        raise ValueError(
                f"no cols '{label_col}'. "
                f"available cols: {db_cols}"
        )

    assignments = (
        df[['account_id', label_col]]
        .dropna()
        .drop_duplicates()
        .rename(columns={label_col: 'cluster'})
    )

    # Prefer integer ids; fall back to strings when any id is not integer-convertible.
    ids = assignments['account_id']
    try:
        ids = ids.astype(int)
    except Exception:
        ids = ids.astype(str)
    assignments['account_id'] = ids

    if include_noise:
        return assignments
    return assignments[assignments['cluster'] != -1]

In [None]:
def _score_partition(part: pd.DataFrame, y_true: np.ndarray) -> tuple:
    """Return (NMI, ARI, purity) of the `cluster` column of `part` against the true labels."""
    y_pred = part['cluster'].values
    purity = overall_purity_comm(part[['cluster', 'name']], comm_col='cluster', name_col='name')
    return NMI(y_true, y_pred), ARI(y_true, y_pred), purity


def evaluate_fixed_dbscan_cv(
    labels_path: str,
    clusters_df: pd.DataFrame,
    label_col: str = "name",
    n_splits: int = 5,
    random_state: int = 42,
    include_noise: bool = True
):
    """Score a fixed clustering against ground-truth labels on stratified folds.

    No model is fitted here: the cluster assignment is fixed, so "train" and "test"
    are simply the two partitions produced by each StratifiedKFold split of the
    labeled accounts, and the same metrics (NMI, ARI, purity) are computed on both.

    Parameters
    ----------
    labels_path   : CSV with at least `account_id` and `label_col`.
    clusters_df   : DataFrame with `account_id` and `cluster` columns.
    label_col     : column of the labels file holding the ground-truth label.
    n_splits      : number of stratified folds.
    random_state  : seed for fold shuffling.
    include_noise : when False, accounts with cluster == -1 are dropped before joining.

    Returns
    -------
    (per_fold_df, averages, coverage_info)
        per_fold_df   : one row per fold with sizes and train/test metrics.
        averages      : dict of NaN-ignoring means of the per-fold columns.
        coverage_info : dict with labeled/joined counts, coverage ratio and noise count.

    Raises
    ------
    FileNotFoundError : `labels_path` does not exist.
    ValueError        : no labeled account is present in `clusters_df`
                        (StratifiedKFold may also raise ValueError for unsplittable data).
    KeyError          : a required column is missing from either input.
    """
    if not os.path.exists(labels_path):
        raise FileNotFoundError(f"Labels not found: {labels_path}")

    # Keep only the two columns that are needed. Extra columns in the labels file (for
    # example an existing `name` when label_col differs, or a `cluster` column) would
    # otherwise end up duplicated or suffixed after the rename/merge and break the
    # `joined['name']` / `joined['cluster']` lookups below.
    labels = (pd.read_csv(labels_path)[['account_id', label_col]]
                .dropna(subset=['account_id', label_col])
                .drop_duplicates(subset=['account_id'])
                .rename(columns={label_col: 'name'}))

    # Same reasoning on the clustering side; the copy keeps the caller's frame untouched.
    clu = clusters_df[['account_id', 'cluster']].copy()

    # Sync the key dtype on both sides: integers when possible, strings otherwise.
    try:
        labels['account_id'] = labels['account_id'].astype(int)
        clu['account_id'] = clu['account_id'].astype(int)
    except (ValueError, TypeError, OverflowError):
        labels['account_id'] = labels['account_id'].astype(str)
        clu['account_id'] = clu['account_id'].astype(str)

    if not include_noise:
        clu = clu[clu['cluster'] != -1].copy()

    joined = labels.merge(clu, on='account_id', how='inner')

    n_labeled = len(labels)
    n_joined  = len(joined)
    coverage  = (n_joined / n_labeled) if n_labeled else 0.0

    if n_joined == 0:
        raise ValueError("No labeled accounts to join")

    # Integer-encode the names once; folds are stratified on these codes and each
    # partition reuses the matching slice instead of re-encoding.
    y_all = LabelEncoder().fit_transform(joined['name'].values)
    X_ids = joined['account_id'].values

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rows = []

    for fold, (tr_idx, te_idx) in enumerate(skf.split(X_ids, y_all), start=1):
        df_tr = joined.iloc[tr_idx]
        df_te = joined.iloc[te_idx]

        nmi_tr, ari_tr, purity_tr = _score_partition(df_tr, y_all[tr_idx])
        nmi_te, ari_te, purity_te = _score_partition(df_te, y_all[te_idx])

        rows.append({
            'fold': fold,
            'n_train': len(df_tr),
            'n_test': len(df_te),
            'train_frac': len(df_tr) / len(joined),

            'NMI_train': nmi_tr,
            'ARI_train': ari_tr,
            'Purity_train': purity_tr,

            'NMI_test': nmi_te,
            'ARI_test': ari_te,
            'Purity_test': purity_te,
        })

    per_fold_df = pd.DataFrame(rows)

    # Key order is part of the output (it drives the printed summary and the CSV columns).
    averaged_cols = [
        'NMI_train', 'ARI_train', 'Purity_train',
        'NMI_test', 'ARI_test', 'Purity_test',
        'train_frac',
    ]
    averages = {
        f'Avg_{col}': float(np.nanmean(per_fold_df[col].values))
        for col in averaged_cols
    }

    coverage_info = {
        'n_labeled': int(n_labeled),
        'n_joined': int(n_joined),
        'coverage': float(coverage),
        'include_noise': bool(include_noise),
        'n_noise_in_eval': int((joined['cluster'] == -1).sum()) if include_noise else 0,
    }

    return per_fold_df, averages, coverage_info


## Transactions

In [10]:
# Read the transactions DBSCAN results and list the label columns available for evaluation
# (every column whose name starts with "dbscan_").
df_tx = pd.read_csv(DBSCAN_FILE_TX)
dbscan_tx_cols = list(filter(lambda col: col.startswith("dbscan_"), df_tx.columns))

display(dbscan_tx_cols)

['dbscan_ms5_p70_eps_0.416885',
 'dbscan_ms5_p80_eps_0.436465',
 'dbscan_ms5_p85_eps_0.446631',
 'dbscan_ms5_p90_eps_0.458462',
 'dbscan_ms5_p95_eps_0.473889',
 'dbscan_ms10_p70_eps_0.439453',
 'dbscan_ms10_p80_eps_0.458728',
 'dbscan_ms10_p85_eps_0.468802',
 'dbscan_ms10_p90_eps_0.480132',
 'dbscan_ms10_p95_eps_0.495112',
 'dbscan_ms15_p70_eps_0.452201',
 'dbscan_ms15_p80_eps_0.471395',
 'dbscan_ms15_p85_eps_0.481190',
 'dbscan_ms15_p90_eps_0.492474',
 'dbscan_ms15_p95_eps_0.507361']

In [11]:
# DBSCAN labelling to evaluate for transactions: one of the `dbscan_*` columns listed above.
# NOTE: this name is reassigned in the Trustlines section, so the cells must be run in order.
DBSCAN_LABEL_COL = 'dbscan_ms10_p85_eps_0.468802'

In [12]:
# Load the (account_id, cluster) pairs of the chosen transactions labelling; noise points
# (cluster == -1) are kept. The prints report row count and number of distinct accounts.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt — re-run it before
# trusting the transactions results below.
clusters_df = load_dbscan_fixed_label(
    dbscan_file=DBSCAN_FILE_TX,
    label_col=DBSCAN_LABEL_COL,
    include_noise=True 
)
print(f"Loaded DBSCAN assignments from: {DBSCAN_FILE_TX}")
print(f"DBSCAN column: {DBSCAN_LABEL_COL}  |  Rows: {len(clusters_df):,}  |  Unique accounts: {clusters_df['account_id'].nunique():,}")


KeyboardInterrupt: 

In [None]:
# Expand the leading `~` so the path can be checked with os.path.exists and read by pandas.
norm_labels_path = os.path.expanduser(NORM_LABELS_PATH)

In [None]:
# Score the transactions clustering against the normalized labels on stratified folds.
# Returns the per-fold metrics table, the fold averages and the label-coverage info.
# Noise points (cluster == -1) stay in the evaluation.
norm_per_fold, norm_avg, norm_cov = evaluate_fixed_dbscan_cv(
    labels_path=norm_labels_path,
    clusters_df=clusters_df,
    label_col="name",
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE,
    include_noise=True 
)




In [None]:
# Show the per-fold metrics table (one row per fold, train and test partitions side by side).
print(f"\n DBSCAN {DBSCAN_LABEL_COL} | Normalized Labels per fold metrics")
display(norm_per_fold)


 DBSCAN dbscan_ms10_p85_eps_0.468802 | Normalized Labels per fold metrics


Unnamed: 0,fold,n_train,n_test,train_frac,NMI_train,ARI_train,Purity_train,NMI_test,ARI_test,Purity_test
0,1,317,80,0.798489,0.028715,0.013653,0.719243,0.0,0.0,0.7125
1,2,317,80,0.798489,0.0,0.0,0.716088,0.09732,0.0524,0.725
2,3,318,79,0.801008,0.027841,0.013399,0.716981,0.0,0.0,0.721519
3,4,318,79,0.801008,0.028581,0.01344,0.716981,0.0,0.0,0.721519
4,5,318,79,0.801008,0.028959,0.013682,0.720126,0.0,0.0,0.708861


In [None]:
# Print the fold-averaged metrics, then the label coverage: the share of labeled accounts
# that were found in the clustering (joined / labeled).
print(f"\n DBSCAN {DBSCAN_LABEL_COL} | Normalized Labels avgs")
for k, v in norm_avg.items():
    print(f"{k}: {v:.6f}")
print(f"Coverage: {norm_cov['coverage']:.2%}  ({norm_cov['n_joined']}/{norm_cov['n_labeled']})")



 DBSCAN dbscan_ms10_p85_eps_0.468802 | Normalized Labels avgs
Avg_NMI_train: 0.022819
Avg_ARI_train: 0.010835
Avg_Purity_train: 0.717884
Avg_NMI_test: 0.019464
Avg_ARI_test: 0.010480
Avg_Purity_test: 0.717880
Avg_train_frac: 0.800000
Coverage: 4.90%  (397/8102)


In [None]:
# Output directory for the transactions cross-validation results; created if it does not exist.
TX_PATH = "transactions/cross-validation"
os.makedirs(TX_PATH, exist_ok=True)

In [None]:
# Persist the transactions results. First file: the per-fold metrics table.
norm_per_fold.to_csv(f"{TX_PATH}/tx_cv_{DBSCAN_LABEL_COL}_norm_per_fold.csv", index=False)


# Second file: a one-row summary combining the fold averages, the coverage info and
# provenance (which DBSCAN column and which source file produced the numbers).
pd.DataFrame([{
    **norm_avg,
    **norm_cov,
    "label_col": DBSCAN_LABEL_COL,
    "source_file": os.path.basename(DBSCAN_FILE_TX)
}]).to_csv(f"{TX_PATH}/tx_cv_{DBSCAN_LABEL_COL}_norm_summary.csv", index=False)

print(f"Saved to {TX_PATH}")

Saved to transactions/cross-validation


## Trustlines

In [None]:
# Read the trustlines DBSCAN results and list the label columns available for evaluation
# (every column whose name starts with "dbscan_").
# Trustlines-specific names are used so that the transactions frame and column list
# (df_tx / dbscan_tx_cols) are not silently overwritten with trustlines data.
df_tl = pd.read_csv(DBSCAN_FILE_TL)
dbscan_tl_cols = [col for col in df_tl.columns if col.startswith("dbscan_")]

display(dbscan_tl_cols)

['dbscan_ms5_p70_eps_0.290769',
 'dbscan_ms5_p80_eps_0.320623',
 'dbscan_ms5_p85_eps_0.340257',
 'dbscan_ms5_p90_eps_0.362472',
 'dbscan_ms5_p95_eps_0.395772',
 'dbscan_ms10_p70_eps_0.321141',
 'dbscan_ms10_p80_eps_0.353638',
 'dbscan_ms10_p85_eps_0.374160',
 'dbscan_ms10_p90_eps_0.398799',
 'dbscan_ms10_p95_eps_0.432206',
 'dbscan_ms15_p70_eps_0.339022',
 'dbscan_ms15_p80_eps_0.373094',
 'dbscan_ms15_p85_eps_0.393900',
 'dbscan_ms15_p90_eps_0.419475',
 'dbscan_ms15_p95_eps_0.453141']

In [None]:
# DBSCAN labelling to evaluate for trustlines: one of the `dbscan_*` columns listed above.
# NOTE: this overwrites the value chosen in the Transactions section.
DBSCAN_LABEL_COL = 'dbscan_ms5_p85_eps_0.340257'

In [None]:

# Load the (account_id, cluster) pairs of the chosen trustlines labelling; noise points
# (cluster == -1) are kept. This replaces the transactions `clusters_df` from the section above.
clusters_df = load_dbscan_fixed_label(
    dbscan_file=DBSCAN_FILE_TL,
    label_col=DBSCAN_LABEL_COL,
    include_noise=True   
)
print(f"Loaded DBSCAN assignments from: {DBSCAN_FILE_TL}")
print(f"DBSCAN column: {DBSCAN_LABEL_COL}  |  Rows: {len(clusters_df):,}  |  Unique accounts: {clusters_df['account_id'].nunique():,}")


Loaded DBSCAN assignments from: trustlines/tl_role2vec_dbscan_cosine_pca64_kgrid_test.csv
DBSCAN column: dbscan_ms5_p85_eps_0.340257  |  Rows: 24,586  |  Unique accounts: 24,586


In [None]:
# Expand the leading `~` in the labels path (same value as computed in the Transactions
# section; repeated so this section can be run without it).
norm_labels_path = os.path.expanduser(NORM_LABELS_PATH)



In [None]:
# Score the trustlines clustering against the normalized labels on stratified folds.
# Overwrites norm_per_fold / norm_avg / norm_cov from the Transactions section.
# Noise points (cluster == -1) stay in the evaluation.
norm_per_fold, norm_avg, norm_cov = evaluate_fixed_dbscan_cv(
    labels_path=norm_labels_path,
    clusters_df=clusters_df,
    label_col="name",
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE,
    include_noise=True 
)



In [None]:
# Show the per-fold metrics table (one row per fold, train and test partitions side by side).
print(f"\n DBSCAN {DBSCAN_LABEL_COL} | Normalized Labels per fold metrics")
display(norm_per_fold)


 DBSCAN dbscan_ms5_p85_eps_0.340257 | Normalized Labels per fold metrics


Unnamed: 0,fold,n_train,n_test,train_frac,NMI_train,ARI_train,Purity_train,NMI_test,ARI_test,Purity_test
0,1,111,28,0.798561,0.444122,0.508287,0.882883,0.199525,0.157656,0.857143
1,2,111,28,0.798561,0.429322,0.474746,0.882883,0.251554,0.237711,0.857143
2,3,111,28,0.798561,0.336027,0.373409,0.882883,0.670674,0.612332,0.928571
3,4,111,28,0.798561,0.384935,0.44261,0.882883,0.404275,0.344569,0.892857
4,5,112,27,0.805755,0.294636,0.328871,0.866071,1.0,1.0,1.0


In [None]:
# Print the fold-averaged metrics, then the label coverage: the share of labeled accounts
# that were found in the clustering (joined / labeled).
print(f"\n DBSCAN {DBSCAN_LABEL_COL} | Normalized Labels avgs")
for k, v in norm_avg.items():
    print(f"{k}: {v:.6f}")
print(f"Coverage: {norm_cov['coverage']:.2%}  ({norm_cov['n_joined']}/{norm_cov['n_labeled']})")


 DBSCAN dbscan_ms5_p85_eps_0.340257 | Normalized Labels avgs
Avg_NMI_train: 0.377808
Avg_ARI_train: 0.425584
Avg_Purity_train: 0.879521
Avg_NMI_test: 0.505205
Avg_ARI_test: 0.470454
Avg_Purity_test: 0.907143
Avg_train_frac: 0.800000
Coverage: 1.72%  (139/8102)


In [None]:
# Output directory for the trustlines cross-validation results; created if it does not exist.
TL_PATH = 'trustlines/cross-validation'
os.makedirs(TL_PATH, exist_ok=True)


In [None]:
# Persist the trustlines results. Files carry the `tl_cv_` prefix: the earlier `tx_cv_`
# prefix was copied from the Transactions section and mislabelled trustlines results as
# transaction results. Downstream readers of the old file names need the new prefix.

# First file: the per-fold metrics table.
norm_per_fold.to_csv(f"{TL_PATH}/tl_cv_{DBSCAN_LABEL_COL}_norm_per_fold.csv", index=False)


# Second file: a one-row summary combining the fold averages, the coverage info and
# provenance (which DBSCAN column and which source file produced the numbers).
pd.DataFrame([{
    **norm_avg,
    **norm_cov,
    "label_col": DBSCAN_LABEL_COL,
    "source_file": os.path.basename(DBSCAN_FILE_TL)
}]).to_csv(f"{TL_PATH}/tl_cv_{DBSCAN_LABEL_COL}_norm_summary.csv", index=False)

print(f"Saved to {TL_PATH}")

Saved to trustlines/cross-validation
