In [1]:
import sys
from pathlib import Path
import pandas as pd

# ------------------------------------------------------------------
# 0.  Locate the project root so that src/ is importable
# ------------------------------------------------------------------
# `__file__` exists only when this notebook has been exported to a .py script;
# a live Jupyter kernel does not define it.
if "__file__" in globals():
    # script mode: this file sits one directory below the project root
    project_root = Path(__file__).resolve().parents[1]
else:
    # notebook mode: the working directory sits one level below the root
    project_root = Path.cwd().parents[0]

# Put the root at the front of sys.path exactly once so `src` resolves.
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

# ------------------------------------------------------------------
# 1.  Imports from src/
# ------------------------------------------------------------------
from src.thermo import add_CT
from src.impute import impute_all
from src import omp

# ------------------------------------------------------------------
# 2.  Load data → add CT → nutrient + KNN imputation
# ------------------------------------------------------------------
DATA_PATH = project_root.joinpath("data", "interim", "water_co2.parquet")

# Read the interim table, then run it through the two src/ preprocessing
# steps in order:
#   add_CT      - Conservative Temperature (needs lon/lat/p)
#   impute_all  - ½ MDL for nutrients + KNN (k=5) for the rest
df = (
    pd.read_parquet(DATA_PATH)
    .pipe(add_CT)
    .pipe(impute_all, k=5)
)

# ------------------------------------------------------------------
# 3.  Load end-member means + tracer σ
# ------------------------------------------------------------------
MEANS_YML = project_root.joinpath("data", "metadata", "endmembers.yml")

# load_swts returns a pair: the end-member means table and the tracer sigmas.
means, sigmas = omp.load_swts(yml=MEANS_YML)

# Tracer names are the columns of the means table, e.g. ['CT','sal_wat','o2'].
tracers = [*means.columns]

# ------------------------------------------------------------------
# 4.  Ensure tracers are numeric & drop rows missing ALL tracers
# ------------------------------------------------------------------
# `display` is injected by IPython only.  When this notebook is run as a plain
# .py script (a case the project-root lookup at the top explicitly supports),
# fall back to print so the diagnostics below do not raise NameError.
try:
    display
except NameError:
    display = print

# Coerce every tracer column to numeric; values that cannot be parsed become
# NaN (errors="coerce") instead of raising.
df[tracers] = df[tracers].apply(pd.to_numeric, errors="coerce")

print("Missing counts after cast:")
display(df[tracers].isna().sum().to_frame("NaNs"))

# Only rows where *every* tracer is NaN are removed; rows that are missing
# just some of the tracers are kept as they are.
empty_rows = df[tracers].isna().all(axis=1)
if empty_rows.any():
    print(f"Dropping {empty_rows.sum()} completely empty rows")
    # reset_index so the surviving rows are numbered 0..n-1 again
    df = df.loc[~empty_rows].reset_index(drop=True)

# ------------------------------------------------------------------
# 5.  Run the OMP solver (uses tracer-specific sigmas)
# ------------------------------------------------------------------

# Solve the mixing problem for every row of `df` against the end-member
# means, passing the tracer sigmas loaded alongside them.
df_mix = omp.solve_df(df, means, sigmas)

# ------------------------------------------------------------------
# Convert mixing fractions to percentages
# ------------------------------------------------------------------
# The solver output is read here as one "<name>_frac" column per end-member,
# where the end-member names are the index of `means`.
frac_cols = [f"{name}_frac" for name in means.index]

# Build the "_pct" names straight from the end-member names rather than with
# str.replace on the fraction names: replace() rewrites every occurrence, so an
# end-member whose own name contained "_frac" would have been mangled.
pct_cols = [f"{name}_pct" for name in means.index]

# add percentage columns, keeping two-decimal precision
df_mix[pct_cols] = (df_mix[frac_cols] * 100).round(2)

display(df_mix.head())

# ------------------------------------------------------------------
# 6.  Save processed output
# ------------------------------------------------------------------
OUT_PATH = project_root.joinpath("data", "processed", "water_co2_OMP.parquet")

# Create data/processed/ (and any missing parents) before writing; an
# already-existing directory is not an error.
OUT_PATH.parent.mkdir(exist_ok=True, parents=True)

# The row index is not written to the file.
df_mix.to_parquet(OUT_PATH, index=False)
print(f"\n✔ OMP output written to {OUT_PATH}")

Missing counts after cast:


Unnamed: 0,NaNs
CT,0
sal_wat,0
o2,0


Unnamed: 0,station,date,time,lat,lon,loc,season,depth_m,depth_desc,sample_id,...,ammonium_imputed,phosphate_imputed,silicate_imputed,chl_imputed,LUW_frac,OOTW_frac,RSW_frac,LUW_pct,OOTW_pct,RSW_pct
0,S1,2025-04-23,13:23:00,5.49,-0.02,Sakumono,First warm / stratified,1.18023,surface,P4504-S1-250423-D1m-OA,...,False,False,False,False,0.0,1.0,0.0,0.0,100.0,0.0
1,S1,2025-04-23,13:50:00,5.49,-0.02,Sakumono,First warm / stratified,4.099514,dcm,P4504-S1-250423-D4m-OA,...,False,False,False,False,0.0,1.0,0.0,0.0,100.0,0.0
2,S1,2025-04-23,14:32:00,5.49,-0.02,Sakumono,First warm / stratified,29.117887,bdcm,P4504-S1-250423-D30m-OA,...,False,False,False,False,0.716401,0.248754,0.034845,71.64,24.88,3.48
3,S1,2025-04-23,14:15:00,5.49,-0.02,Sakumono,First warm / stratified,46.363425,bdcm,P4504-S1-250423-D50m-OA,...,True,True,True,True,0.828745,0.171255,0.0,82.87,17.13,0.0
4,S2,2025-04-23,16:25:00,5.4,0.025,Sakumono,First warm / stratified,0.99197,surface,P4504-S2-250423-D1m-OA,...,False,False,False,False,0.081341,0.70198,0.216679,8.13,70.2,21.67



✔ OMP output written to C:\Users\OA_2023-03\Documents\dev\ghana_carbonate_OMI\data\processed\water_co2_OMP.parquet
