In [1]:
# 01_exploratory_cruise_eda.ipynb
import sys
from pathlib import Path

# ------------------------------------------------------------------
# Detect project root whether we are in a notebook or a .py script
# ------------------------------------------------------------------
def find_project_root(start, marker="src"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Parameters
    ----------
    start : str or pathlib.Path
        Directory where the upward search begins (typically the kernel's CWD).
    marker : str, default "src"
        Name of a sub-directory that identifies the repository root. The
        default matches the ``src`` package imported further down.

    Returns
    -------
    pathlib.Path
        The first of ``start`` and its ancestors that has a ``marker``
        sub-directory. If none does, ``start.parent`` is returned, which
        preserves the previous "CWD is <repo>/notebooks" assumption.
    """
    start = Path(start).resolve()
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    # No marker anywhere above us: fall back to the historical behaviour of
    # treating the directory one level above the CWD as the root.
    return start.parent


try:
    # __file__ exists when this code runs as a plain .py script (for example
    # after exporting the notebook to a script); Jupyter kernels do not
    # define it, which raises NameError and sends us to the fallback below.
    project_root = Path(__file__).resolve().parents[1]
except NameError:
    # Search upward instead of assuming CWD == <repo>/notebooks, so the
    # notebook also works when the kernel is started from the repo root.
    project_root = find_project_root(Path.cwd())

# Add the repo root to Python's import search path once per kernel
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# ------------------------------------------------------------------
# Now normal imports resolve
# ------------------------------------------------------------------
from src.impute import impute_all
import pandas as pd
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt



# ------------------------------------------------------------------
# Load the interim dataset and impute missing values
# ------------------------------------------------------------------
# Anchor on project_root (resolved at the top of this cell) instead of
# recomputing from the CWD, so the data path and the import path always
# agree - including when this code runs as a script from another directory.
DATA_DIR = project_root / "data" / "interim"
df_raw = pd.read_parquet(DATA_DIR / "water_co2.parquet")

# Keep df_raw untouched; impute_all returns the frame used from here on.
df = impute_all(df_raw, k=5)             # ← nutrient half-MDL + KNN

# Quick look at flags: columns ending in "_imputed" are summed to count
# how many values were filled per variable.
flag_cols = [c for c in df.columns if c.endswith("_imputed")]
display(df[flag_cols].sum())

# A bare `df.head()` only renders when it is the last expression of a cell;
# this one sits mid-cell, so it must go through display() to be shown.
display(df.head())

# ---------- quick profile (takes 5–20 s depending on rows) ----------
# Build an HTML profiling report of the imputed frame.
profile = ProfileReport(
    df,
    title="Cruise EDA – water_co2",
    explorative=True,
)

# Write under the resolved project root (not the CWD's parent) so the report
# lands in the repository regardless of where the kernel/script was started.
REPORT_DIR = project_root / "results" / "reports"
REPORT_DIR.mkdir(parents=True, exist_ok=True)

profile_path = REPORT_DIR / "water_co2_profile.html"
profile.to_file(profile_path)

print("✔ Profile saved to:", profile_path)

import seaborn as sns
import matplotlib.pyplot as plt

# Figures are written under the resolved project root so their location does
# not depend on the directory the kernel/script was started from.
FIG_DIR = project_root / "results" / "figures"
# parents=True: do not rely on an earlier step having created "results/".
FIG_DIR.mkdir(parents=True, exist_ok=True)

# 3-a  depth profile (temp vs depth)
# NOTE(review): sns.lineplot sorts and aggregates along x by default, i.e.
# along temperature here, while depth is the independent variable of a
# profile. On seaborn >= 0.12 consider passing orient="y" - confirm the
# installed version before changing.
fig1, ax1 = plt.subplots()
sns.lineplot(data=df, x="temp_wat", y="depth_m",
             marker="o", hue="cruise", ax=ax1)
ax1.invert_yaxis()                       # depth increases downwards
ax1.set_xlabel("Water temperature (°C)")
ax1.set_ylabel("Depth (m)")
fig1.savefig(FIG_DIR / "temp_depth_profile.png", dpi=300, bbox_inches="tight")

# 3-b  θ-S diagram: temperature against salinity, coloured by depth
fig2, ax2 = plt.subplots()
sns.scatterplot(data=df, x="sal_wat", y="temp_wat",
                hue="depth_m", palette="viridis_r", ax=ax2)
ax2.set_xlabel("Salinity (PSU)")
ax2.set_ylabel("Temperature (°C)")
fig2.savefig(FIG_DIR / "theta_s_diagram.png", dpi=300, bbox_inches="tight")

nitrate_nitrite_imputed    2
ammonium_imputed           2
phosphate_imputed          2
silicate_imputed           2
chl_imputed                4
dtype: int64

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|███████████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 1920.49it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✔ Profile saved to: C:\Users\OA_2023-03\Documents\dev\ghana_carbonate_OMI\results\reports\water_co2_profile.html
