In [11]:
#
import kumoai.experimental.rfm as rfm, os
import pandas as pd
from pathlib import Path
from kumoai.experimental import rfm
root = Path("datasets/cleaned_small")
root = Path("datasets/cleaned")
dfs = {
    p.stem: pd.read_csv(p)
    for p in root.glob("*.csv")
}
dfs

{}

In [2]:
{k: list(df.columns) for k, df in dfs.items()}


{}

In [3]:
metadata_name = next((k for k in dfs if k.lower() == "metadata"), None)
metadata_name


In [4]:
def building_key_stats(df):
    if "building_key" not in df.columns:
        return None
    s = df["building_key"]
    return {
        "n_rows": len(df),
        "n_building_keys": s.nunique(dropna=True),
        "n_missing_building_key": int(s.isna().sum())
    }

stats = {k: building_key_stats(df) for k, df in dfs.items()}
stats


{}

In [5]:
utility_files = [k for k in dfs if k not in ("metadata", "weather")]
keysets = {
    k: set(dfs[k]["building_key"].dropna().astype(int).unique())
    for k in utility_files
    if "building_key" in dfs[k].columns
}

# Intersection / union sizes
common = set.intersection(*keysets.values()) if keysets else set()
union = set.union(*keysets.values()) if keysets else set()

len(common), len(union)

missing_by_file = {
    k: sorted(list(union - ks))[:50]  # show first 50 missing keys
    for k, ks in keysets.items()
}

{k: len(union - ks) for k, ks in keysets.items()}  # counts missing



{}

In [6]:
md = dfs.get("metadata")
if md is not None and "building_key" in md.columns:
    md_keys = set(md["building_key"].dropna().astype(int).unique())
    missing_in_md = sorted(list(union - md_keys))[:50]
    extra_in_md = sorted(list(md_keys - union))[:50]
    print("missing in metadata:", len(union - md_keys), missing_in_md[:10])
    print("extra in metadata:", len(md_keys - union), extra_in_md[:10])
else:
    print("metadata missing or has no building_key column")


metadata missing or has no building_key column


In [7]:
def check_label_to_key_consistency(df, name):
    if not {"all_building", "building_key"}.issubset(df.columns):
        return None
    tmp = df[["all_building", "building_key"]].dropna()
    bad = tmp.groupby("all_building")["building_key"].nunique()
    bad = bad[bad > 1]
    if len(bad):
        print(f"❌ {name}: {len(bad)} labels map to multiple building_keys")
        display(bad.head(20))
    else:
        print(f"✅ {name}: label → key mapping is consistent")

for k in utility_files:
    check_label_to_key_consistency(dfs[k], k)


In [8]:
{k: list(dfs[k].columns) for k in ["metadata", "electricity_cleaned", "gas_cleaned"] if k in dfs}


{}

In [9]:
import pandas as pd

md = dfs["metadata"].copy()

assert "building_key" in md.columns, "metadata has no building_key"

md_keys = set(md["building_key"].dropna().astype(int).unique())

results = []
for name, df in dfs.items():
    if name in ("metadata", "weather"):
        continue
    if "building_key" not in df.columns:
        continue

    util_keys = set(df["building_key"].dropna().astype(int).unique())
    missing_in_md = util_keys - md_keys
    extra_in_md = md_keys - util_keys  # not an error; just shows coverage

    results.append({
        "file": name,
        "util_keys": len(util_keys),
        "missing_keys_in_metadata": len(missing_in_md),
        "example_missing_keys": sorted(list(missing_in_md))[:10],
        "metadata_keys_not_in_file": len(extra_in_md),
    })

pd.DataFrame(results).sort_values("missing_keys_in_metadata", ascending=False)


KeyError: 'metadata'

In [None]:
[c for c in md.columns if "build" in c.lower() or "name" in c.lower() or "label" in c.lower()]


['building_id', 'building_id_kaggle', 'building_key']

In [None]:
label_col = "building_id"  # <-- change this to the correct column in metadata

md_labels = md[["building_key", label_col]].dropna().drop_duplicates()
md_labels[label_col] = md_labels[label_col].astype(str).str.strip().str.lower()

label_results = []
for name, df in dfs.items():
    if name in ("metadata", "weather"):
        continue
    if not {"building_key", "all_building"}.issubset(df.columns):
        continue

    util_labels = df[["building_key", "all_building"]].dropna().drop_duplicates()
    util_labels["all_building_norm"] = util_labels["all_building"].astype(str).str.strip().str.lower()

    merged = util_labels.merge(md_labels, on="building_key", how="left")
    missing_md_label = merged[label_col].isna().sum()

    # cases where both present but differ (rough check)
    mismatch = merged[(merged[label_col].notna()) & (merged["all_building_norm"] != merged[label_col])]

    label_results.append({
        "file": name,
        "n_keys": merged["building_key"].nunique(),
        "n_missing_metadata_label": int(missing_md_label),
        "n_label_mismatches": int(len(mismatch)),
        "example_mismatches": mismatch.head(5)[["building_key","all_building",label_col]].to_dict("records"),
    })

pd.DataFrame(label_results).sort_values("n_label_mismatches", ascending=False)


Unnamed: 0,file,n_keys,n_missing_metadata_label,n_label_mismatches,example_mismatches
0,chilledwater_cleaned,555,0,0,[]
1,electricity_cleaned,1578,0,0,[]
2,gas_cleaned,177,0,0,[]
3,hotwater_cleaned,185,0,0,[]
4,irrigation_cleaned,37,0,0,[]
5,solar_cleaned,5,0,0,[]
6,steam_cleaned,370,0,0,[]
7,water_cleaned,146,0,0,[]


In [None]:
sample = (
    dfs["electricity_cleaned"][["building_key", "all_building"]]
    .drop_duplicates()
    .sample(20, random_state=0)
    .sort_values("building_key")
)

# choose metadata label column
label_col = "building_id"  # change if needed
sample.merge(md[["building_key", "building_id", label_col]].drop_duplicates(), on="building_key", how="left")


Unnamed: 0,building_key,all_building,building_id,building_id.1
0,2,Bear_assembly_Beatrice,Bear_assembly_Beatrice,Bear_assembly_Beatrice
1,8,Bear_assembly_Roxy,Bear_assembly_Roxy,Bear_assembly_Roxy
2,110,Bobcat_education_Rodrick,Bobcat_education_Rodrick,Bobcat_education_Rodrick
3,170,Bull_education_Gregory,Bull_education_Gregory,Bull_education_Gregory
4,208,Bull_education_Venita,Bull_education_Venita,Bull_education_Venita
5,284,Cockatoo_education_Flora,Cockatoo_education_Flora,Cockatoo_education_Flora
6,347,Cockatoo_lodging_Tessie,Cockatoo_lodging_Tessie,Cockatoo_lodging_Tessie
7,547,Fox_education_Maureen,Fox_education_Maureen,Fox_education_Maureen
8,580,Fox_lodging_Jina,Fox_lodging_Jina,Fox_lodging_Jina
9,616,Fox_public_Denny,Fox_public_Denny,Fox_public_Denny
