In [1]:
!which python

/home/user/jfayzullaev/stellar-clustering/.venv-vis/bin/python


In [10]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import normalized_mutual_info_score as NMI, adjusted_rand_score as ARI


In [11]:
RES_FILE = "../louvain_result_res1.2.csv"


RAW_LABELS_PATH  = os.path.expanduser("~/stellar-clustering/network/labled-data/labels/labels_map.csv")
NORM_LABELS_PATH = os.path.expanduser("~/stellar-clustering/network/labled-data/labels/label-normalization/labels_entities_normalized.csv")


In [12]:
N_SPLITS = 5
RANDOM_STATE = 42

In [None]:
def overall_purity(df_comm_name: pd.DataFrame) -> float:
    if df_comm_name.empty:
        return np.nan
    counts = df_comm_name.groupby(['community', 'name']).size().reset_index(name='cnt')
    totals = counts.groupby('community')['cnt'].sum()
    max_per_comm = counts.groupby('community')['cnt'].max()
    return float(max_per_comm.sum() / totals.sum())


def load_communities_fixed_resolution(res_file: str) -> pd.DataFrame:

    if not os.path.exists(res_file):
        raise FileNotFoundError(f"no file: {res_file}")
    df = pd.read_csv(res_file)
    if 'account_id' not in df.columns and 'node' in df.columns:
        df = df.rename(columns={'node': 'account_id'})
    df = df[['account_id', 'community']].dropna().drop_duplicates()

    try:
        df['account_id'] = df['account_id'].astype(int)
    except Exception:
        df['account_id'] = df['account_id'].astype(str)
    return df


In [None]:
def evaluate_fixed_resolution_cv(
    labels_path: str,
    comm_df: pd.DataFrame,
    label_col: str = "name",
    n_splits: int = 5,
    random_state: int = 42
):
    if not os.path.exists(labels_path):
        raise FileNotFoundError(f"Labels file not found: {labels_path}")

    labels = (pd.read_csv(labels_path)
                .dropna(subset=['account_id', label_col])
                .drop_duplicates(subset=['account_id'])
                .rename(columns={label_col: 'name'}))


    try:
        labels['account_id'] = labels['account_id'].astype(int)
        comm_cast = comm_df.copy()
        comm_cast['account_id'] = comm_cast['account_id'].astype(int)
    except Exception:
        labels['account_id'] = labels['account_id'].astype(str)
        comm_cast = comm_df.copy()
        comm_cast['account_id'] = comm_cast['account_id'].astype(str)


    joined = labels.merge(comm_cast, on='account_id', how='inner')

    n_labeled = len(labels)
    n_joined  = len(joined)
    coverage  = (n_joined / n_labeled) if n_labeled else 0.0


    le = LabelEncoder()
    y_all = le.fit_transform(joined['name'].values)
    X_ids = joined['account_id'].values

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rows = []

    for fold, (tr_idx, te_idx) in enumerate(skf.split(X_ids, y_all), start=1):
        df_tr = joined.iloc[tr_idx].copy()
        df_te = joined.iloc[te_idx].copy()


        y_true_tr = le.transform(df_tr['name'])
        y_pred_tr = df_tr['community'].values
        nmi_tr = NMI(y_true_tr, y_pred_tr)
        ari_tr = ARI(y_true_tr, y_pred_tr)
        purity_tr = overall_purity(df_tr[['community', 'name']])

        y_true_te = le.transform(df_te['name'])
        y_pred_te = df_te['community'].values
        nmi_te = NMI(y_true_te, y_pred_te)
        ari_te = ARI(y_true_te, y_pred_te)
        purity_te = overall_purity(df_te[['community', 'name']])

        rows.append({
            'fold': fold,
            'n_train': len(df_tr),
            'n_test': len(df_te),
            'train_frac': len(df_tr) / len(joined),
            'NMI_train': nmi_tr,
            'ARI_train': ari_tr,
            'Purity_train': purity_tr,
            'NMI_test': nmi_te,
            'ARI_test': ari_te,
            'Purity_test': purity_te,
        })

    per_fold_df = pd.DataFrame(rows)

    averages = {
        'Avg_NMI_train': float(np.nanmean(per_fold_df['NMI_train'].values)),
        'Avg_ARI_train': float(np.nanmean(per_fold_df['ARI_train'].values)),
        'Avg_Purity_train': float(np.nanmean(per_fold_df['Purity_train'].values)),
        'Avg_NMI_test': float(np.nanmean(per_fold_df['NMI_test'].values)),
        'Avg_ARI_test': float(np.nanmean(per_fold_df['ARI_test'].values)),
        'Avg_Purity_test': float(np.nanmean(per_fold_df['Purity_test'].values)),
        'Avg_train_frac': float(np.nanmean(per_fold_df['train_frac'].values)),
    }

    coverage_info = {
        'n_labeled': int(n_labeled),
        'n_joined': int(n_joined),
        'coverage': float(coverage),
    }

    return per_fold_df, averages, coverage_info


In [15]:
comm_df = load_communities_fixed_resolution(RES_FILE)
print(f"Loaded communities from: {RES_FILE}")
print(f"Rows: {len(comm_df):,}  |  Unique accounts: {comm_df['account_id'].nunique():,}")


Loaded communities from: ../louvain_result_res1.2.csv
Rows: 24,586  |  Unique accounts: 24,586


### RAW LABELS

In [16]:
raw_per_fold, raw_avg, raw_cov = evaluate_fixed_resolution_cv(
    labels_path=RAW_LABELS_PATH,
    comm_df=comm_df,
    label_col="name",
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE
)

print("RAW labels: per-fold metrics (TEST)")
display(raw_per_fold)

print("\nRAW labels: averages (TEST)")
for k, v in raw_avg.items():
    print(f"{k}: {v:.6f}")
print(f"Coverage: {raw_cov['coverage']:.2%}  ({raw_cov['n_joined']}/{raw_cov['n_labeled']})")

# Save
raw_per_fold.to_csv("cv_results_raw_per_fold.csv", index=False)
pd.DataFrame([{
    **raw_avg,
    **raw_cov
}]).to_csv("cv_results_raw_summary.csv", index=False)
print("\nSaved: cv_results_raw_per_fold.csv, cv_results_raw_summary.csv")


RAW labels: per-fold metrics (TEST)




Unnamed: 0,fold,n_train,n_test,train_frac,NMI_train,ARI_train,Purity_train,NMI_test,ARI_test,Purity_test
0,1,111,28,0.798561,0.364933,0.033611,0.441441,0.578568,-0.0075,0.607143
1,2,111,28,0.798561,0.375847,0.025932,0.45045,0.576259,0.038963,0.535714
2,3,111,28,0.798561,0.399746,0.048946,0.459459,0.544344,0.031902,0.571429
3,4,111,28,0.798561,0.403217,0.061593,0.486486,0.585776,0.014727,0.642857
4,5,112,27,0.805755,0.392553,0.02854,0.4375,0.623935,0.166469,0.740741



RAW labels: averages (TEST)
Avg_NMI_train: 0.387259
Avg_ARI_train: 0.039724
Avg_Purity_train: 0.455068
Avg_NMI_test: 0.581776
Avg_ARI_test: 0.048912
Avg_Purity_test: 0.619577
Avg_train_frac: 0.800000
Coverage: 1.72%  (139/8102)

Saved: cv_results_raw_per_fold.csv, cv_results_raw_summary.csv


### NORMALIZED LABELS

In [17]:
norm_per_fold, norm_avg, norm_cov = evaluate_fixed_resolution_cv(
    labels_path=NORM_LABELS_PATH,
    comm_df=comm_df,
    label_col="name",
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE
)

print("NORMALIZED labels: per-fold metrics (TEST)")
display(norm_per_fold)

print("\nNORMALIZED labels: averages (TEST)")
for k, v in norm_avg.items():
    print(f"{k}: {v:.6f}")
print(f"Coverage: {norm_cov['coverage']:.2%}  ({norm_cov['n_joined']}/{norm_cov['n_labeled']})")

# Save
norm_per_fold.to_csv("cv_results_norm_per_fold.csv", index=False)
pd.DataFrame([{
    **norm_avg,
    **norm_cov
}]).to_csv("cv_results_norm_summary.csv", index=False)
print("\nSaved: cv_results_norm_per_fold.csv, cv_results_norm_summary.csv")


NORMALIZED labels: per-fold metrics (TEST)




Unnamed: 0,fold,n_train,n_test,train_frac,NMI_train,ARI_train,Purity_train,NMI_test,ARI_test,Purity_test
0,1,111,28,0.798561,0.223019,0.026513,0.873874,0.271247,0.005641,0.892857
1,2,111,28,0.798561,0.226855,0.008266,0.891892,0.332551,0.067158,0.892857
2,3,111,28,0.798561,0.226383,0.011505,0.882883,0.338523,0.041898,0.928571
3,4,111,28,0.798561,0.216474,0.014506,0.882883,0.316611,0.022845,0.928571
4,5,112,27,0.805755,0.23665,0.020481,0.866071,0.109109,-0.018951,0.925926



NORMALIZED labels: averages (TEST)
Avg_NMI_train: 0.225876
Avg_ARI_train: 0.016254
Avg_Purity_train: 0.879521
Avg_NMI_test: 0.273608
Avg_ARI_test: 0.023718
Avg_Purity_test: 0.913757
Avg_train_frac: 0.800000
Coverage: 1.72%  (139/8102)

Saved: cv_results_norm_per_fold.csv, cv_results_norm_summary.csv


In [19]:
compare = pd.DataFrame([
    {
        "labels": "RAW",
        "Avg_NMI_test": raw_avg["Avg_NMI_test"],
        "Avg_ARI_test": raw_avg["Avg_ARI_test"],
        "Avg_Purity_test": raw_avg["Avg_Purity_test"],
        "coverage": raw_cov["coverage"],
        "n_labeled": raw_cov["n_labeled"],
        "n_joined": raw_cov["n_joined"],
    },
    {
        "labels": "NORMALIZED",
        "Avg_NMI_test": norm_avg["Avg_NMI_test"],
        "Avg_ARI_test": norm_avg["Avg_ARI_test"],
        "Avg_Purity_test": norm_avg["Avg_Purity_test"],
        "coverage": norm_cov["coverage"],
        "n_labeled": norm_cov["n_labeled"],
        "n_joined": norm_cov["n_joined"],
    },
])

compare["Diff_NMI"] = compare["Avg_NMI_test"].diff().fillna(np.nan)
compare["Diff_NMI_ARI"] = compare["Avg_ARI_test"].diff().fillna(np.nan)
compare["Diff_NMI_Purity"] = compare["Avg_Purity_test"].diff().fillna(np.nan)

print("RAW vs NORMALIZED (averages, TEST)")
display(compare)

compare.to_csv("cv_results_compare_raw_vs_normalized.csv", index=False)
print("\nSaved: cv_results_compare_raw_vs_normalized.csv")


RAW vs NORMALIZED (averages, TEST)


Unnamed: 0,labels,Avg_NMI_test,Avg_ARI_test,Avg_Purity_test,coverage,n_labeled,n_joined,Diff_NMI,Diff_NMI_ARI,Diff_NMI_Purity
0,RAW,0.581776,0.048912,0.619577,0.017156,8102,139,,,
1,NORMALIZED,0.273608,0.023718,0.913757,0.017156,8102,139,-0.308168,-0.025194,0.29418



Saved: cv_results_compare_raw_vs_normalized.csv
