set working directory to project root

In [17]:
from pathlib import Path
import os

# Find project root by searching upward for repo markers
markers = ["src", "data", ".git", "README.md"]

here = Path.cwd()
root = None
for p in [here] + list(here.parents):
    if any((p / m).exists() for m in markers):
        root = p
        break

if root is None:
    raise RuntimeError("Could not locate project root. Open notebook from your repo folder.")

os.chdir(root)
print("✅ CWD set to project root:", Path.cwd())

✅ CWD set to project root: /Users/samyuktareddy/Desktop/MOF Gas absorption


imports and file finder

In [18]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

def find_first(pattern: str, root: str = "."):
    hits = list(Path(root).rglob(pattern))
    return hits[0] if hits else None

print("Ready ✅")

Ready ✅


In [19]:
# Optional (descriptor baselines)
DESC_BASELINES = find_first("outputs/baselines/metrics.json") or find_first("metrics.json")

# Optional (BH-feature baseline across runs)
BH_PER_RUN = find_first("outputs/baselines_bh_features/per_run_metrics.csv") or find_first("per_run_metrics.csv")

# Required (real vs shuffled comparison)
REAL_SHUF_CSV = (
    find_first("outputs/figures/05_gnn_vs_shuffled/model_comparison_real_vs_shuffled.csv")
    or find_first("model_comparison_real_vs_shuffled.csv")
)

print("DESC_BASELINES:", DESC_BASELINES)
print("BH_PER_RUN:", BH_PER_RUN)
print("REAL_SHUF_CSV:", REAL_SHUF_CSV)

DESC_BASELINES: outputs/baselines/metrics.json
BH_PER_RUN: outputs/baselines_bh_features/per_run_metrics.csv
REAL_SHUF_CSV: outputs/figures/05_gnn_vs_shuffled/model_comparison_real_vs_shuffled.csv


load descriptor baselines

In [20]:
desc_df = None

if DESC_BASELINES is not None and Path(DESC_BASELINES).exists():
    d = json.loads(Path(DESC_BASELINES).read_text())
    rows = []
    for model_name, block in d.items():
        if isinstance(block, dict) and "val" in block and "test" in block:
            rows.append({
                "model": model_name.upper(),
                "val_rmse": block["val"]["rmse"],
                "val_mae": block["val"]["mae"],
                "val_r2": block["val"]["r2"],
                "test_rmse": block["test"]["rmse"],
                "test_mae": block["test"]["mae"],
                "test_r2": block["test"]["r2"],
            })
    if rows:
        desc_df = pd.DataFrame(rows).sort_values("val_rmse")

desc_df

Unnamed: 0,model,val_rmse,val_mae,val_r2,test_rmse,test_mae,test_r2
0,RIDGE,0.120002,0.090989,-0.012239,0.437148,0.17825,0.065915
2,RF,0.141599,0.069065,-0.409369,0.38965,0.146692,0.257871
1,KNN,0.156687,0.079781,-0.725712,0.408945,0.163669,0.182552


load BH feature baseline

In [21]:
bh = None
bh_summary = None

if BH_PER_RUN is not None and Path(BH_PER_RUN).exists():
    bh = pd.read_csv(BH_PER_RUN)
    bh_summary = bh.agg({
        "val_rmse": ["mean","std"],
        "val_mae": ["mean","std"],
        "val_r2": ["mean","std"],
        "test_rmse": ["mean","std"],
        "test_mae": ["mean","std"],
        "test_r2": ["mean","std"],
    })

bh_summary

Unnamed: 0,val_rmse,val_mae,val_r2,test_rmse,test_mae,test_r2
mean,0.099852,0.050317,0.466082,0.377918,0.125377,0.093713
std,0.003456,0.002917,0.049614,0.002196,0.002759,0.013341


real vs shuffled comparison

In [22]:
assert REAL_SHUF_CSV is not None and Path(REAL_SHUF_CSV).exists(), \
    "Missing model_comparison_real_vs_shuffled.csv. Run: python src/viz/plot_real_vs_shuffled.py"

cmp = pd.read_csv(REAL_SHUF_CSV)
cmp.head()

Unnamed: 0,model,split,rmse_mean,rmse_std,mae_mean,mae_std,r2_mean,r2_std
0,BH-features,val,0.099852,0.002993,0.050317,0.002526,0.466082,0.042967
1,BH-features,test,0.377918,0.001902,0.125377,0.00239,0.093713,0.011553
2,GNN_real,val,0.15467,0.004744,0.092308,0.004381,-0.277667,0.040441
3,GNN_real,test,0.415657,0.003375,0.159476,0.005074,-0.096321,0.017592
4,GNN_shuffled,val,0.167378,0.021759,0.101161,0.017774,-0.507493,0.351259


comparison table:

In [23]:
def fmt_from_cmp(split, metric):
    sub = cmp[cmp["split"] == split][["model", f"{metric}_mean", f"{metric}_std"]].copy()
    out = {}
    for _, r in sub.iterrows():
        out[r["model"]] = f"{r[f'{metric}_mean']:.4f} ± {r[f'{metric}_std']:.4f}"
    return out

val_rmse = fmt_from_cmp("val", "rmse")
val_mae  = fmt_from_cmp("val", "mae")
val_r2   = fmt_from_cmp("val", "r2")

test_rmse = fmt_from_cmp("test", "rmse")
test_mae  = fmt_from_cmp("test", "mae")
test_r2   = fmt_from_cmp("test", "r2")

rows = []

# Best descriptor baseline (optional)
if desc_df is not None and len(desc_df) > 0:
    best = desc_df.iloc[0].to_dict()
    rows.append({
        "Model": f"Descriptor baseline ({best['model']})",
        "Uses graph": "No",
        "VAL RMSE": best["val_rmse"],
        "VAL MAE": best["val_mae"],
        "VAL R2": best["val_r2"],
        "TEST RMSE": best["test_rmse"],
        "TEST MAE": best["test_mae"],
        "TEST R2": best["test_r2"],
    })

# BH-feature baseline (optional)
if bh is not None:
    rows.append({
        "Model": "BlackHole features baseline (mean±std across runs)",
        "Uses graph": "Indirect (graph-derived features)",
        "VAL RMSE": f"{bh['val_rmse'].mean():.4f} ± {bh['val_rmse'].std(ddof=0):.4f}",
        "VAL MAE":  f"{bh['val_mae'].mean():.4f} ± {bh['val_mae'].std(ddof=0):.4f}",
        "VAL R2":   f"{bh['val_r2'].mean():.4f} ± {bh['val_r2'].std(ddof=0):.4f}",
        "TEST RMSE": f"{bh['test_rmse'].mean():.4f} ± {bh['test_rmse'].std(ddof=0):.4f}",
        "TEST MAE":  f"{bh['test_mae'].mean():.4f} ± {bh['test_mae'].std(ddof=0):.4f}",
        "TEST R2":   f"{bh['test_r2'].mean():.4f} ± {bh['test_r2'].std(ddof=0):.4f}",
    })

# GNN real + shuffled (required)
rows.append({
    "Model": "GNN (real graph) (mean±std across runs)",
    "Uses graph": "Yes",
    "VAL RMSE": val_rmse.get("GNN_real"),
    "VAL MAE":  val_mae.get("GNN_real"),
    "VAL R2":   val_r2.get("GNN_real"),
    "TEST RMSE": test_rmse.get("GNN_real"),
    "TEST MAE":  test_mae.get("GNN_real"),
    "TEST R2":   test_r2.get("GNN_real"),
})

rows.append({
    "Model": "GNN (shuffled graph) (mean±std across runs)",
    "Uses graph": "Yes (random topology)",
    "VAL RMSE": val_rmse.get("GNN_shuffled"),
    "VAL MAE":  val_mae.get("GNN_shuffled"),
    "VAL R2":   val_r2.get("GNN_shuffled"),
    "TEST RMSE": test_rmse.get("GNN_shuffled"),
    "TEST MAE":  test_mae.get("GNN_shuffled"),
    "TEST R2":   test_r2.get("GNN_shuffled"),
})

summary_table = pd.DataFrame(rows)
summary_table

Unnamed: 0,Model,Uses graph,VAL RMSE,VAL MAE,VAL R2,TEST RMSE,TEST MAE,TEST R2
0,Descriptor baseline (RIDGE),No,0.120002,0.090989,-0.012239,0.437148,0.17825,0.065915
1,BlackHole features baseline (mean±std across r...,Indirect (graph-derived features),0.0999 ± 0.0030,0.0503 ± 0.0025,0.4661 ± 0.0430,0.3779 ± 0.0019,0.1254 ± 0.0024,0.0937 ± 0.0116
2,GNN (real graph) (mean±std across runs),Yes,0.1547 ± 0.0047,0.0923 ± 0.0044,-0.2777 ± 0.0404,0.4157 ± 0.0034,0.1595 ± 0.0051,-0.0963 ± 0.0176
3,GNN (shuffled graph) (mean±std across runs),Yes (random topology),0.1674 ± 0.0218,0.1012 ± 0.0178,-0.5075 ± 0.3513,0.4200 ± 0.0146,0.1500 ± 0.0098,-0.1208 ± 0.0848


In [24]:
out_csv = Path("outputs/summary/main_comparison_table.csv")
out_csv.parent.mkdir(parents=True, exist_ok=True)

summary_table.to_csv(out_csv, index=False)
print("Saved:", out_csv.resolve())

Saved: /Users/samyuktareddy/Desktop/MOF Gas absorption/outputs/summary/main_comparison_table.csv


In [26]:
# If you want a quick Markdown table for thesis writing / notes
md = summary_table.to_markdown(index=False)
print(md)

| Model                                              | Uses graph                        | VAL RMSE            | VAL MAE             | VAL R2                | TEST RMSE          | TEST MAE            | TEST R2             |
|:---------------------------------------------------|:----------------------------------|:--------------------|:--------------------|:----------------------|:-------------------|:--------------------|:--------------------|
| Descriptor baseline (RIDGE)                        | No                                | 0.12000245035217807 | 0.09098922826248051 | -0.012239298362148432 | 0.4371479045328399 | 0.17825004080797874 | 0.06591517216686493 |
| BlackHole features baseline (mean±std across runs) | Indirect (graph-derived features) | 0.0999 ± 0.0030     | 0.0503 ± 0.0025     | 0.4661 ± 0.0430       | 0.3779 ± 0.0019    | 0.1254 ± 0.0024     | 0.0937 ± 0.0116     |
| GNN (real graph) (mean±std across runs)            | Yes                               | 0.1547 ± 0.00