# Sensor Plot Builder v3d (robust timestamps + interpolation)

This version fixes `OutOfBoundsDatetime` by sanitizing `timestamp_unix` per file:
- Auto-detects **seconds vs. milliseconds** since epoch
- Drops rows whose timestamp is non-numeric or falls outside a plausible window (2010-01-01 … 2030-12-31 UTC)
- Interpolates numeric series for continuous lines (plotting only; **source CSVs are untouched**)
- Keeps the existing per-sensor input folders (relative paths that also work on Windows) and enables grid + legend on every plot


In [7]:
# --- This cell performs data processing / plotting step ---
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# === CONFIG ===
# One input folder per sensor type, relative to the current working directory.
BASE_DIRS = [
    Path("./" + folder_name)
    for folder_name in ("tensiomark", "climavi", "plantobelly", "smt100", "watermark")
]

# Folder that receives the generated PDF files.
OUTPUT_DIR = Path("./plots")

# Fixed x-axis range applied to every plot (timezone-aware, UTC).
X_START = pd.Timestamp("2024-01-01", tz="UTC")
X_END = pd.Timestamp("2025-12-31 23:59:59", tz="UTC")

# Plausible timestamp window, expressed as float seconds since the Unix epoch.
TS_MIN, TS_MAX = (
    pd.Timestamp(bound, tz="UTC").timestamp()
    for bound in ("2010-01-01", "2030-12-31 23:59:59")
)

# Echo the effective configuration as the cell result.
(
    BASE_DIRS,
    OUTPUT_DIR,
    X_START,
    X_END,
    int(TS_MIN),
    int(TS_MAX),
)


([WindowsPath('tensiomark'),
  WindowsPath('climavi'),
  WindowsPath('plantobelly'),
  WindowsPath('smt100'),
  WindowsPath('watermark')],
 WindowsPath('plots'),
 Timestamp('2024-01-01 00:00:00+0000', tz='UTC'),
 Timestamp('2025-12-31 23:59:59+0000', tz='UTC'),
 1262304000,
 1924991999)

In [8]:
# --- This cell performs data processing / plotting step ---
import re
from typing import Dict, List, Tuple

def list_all_csvs(base_dirs: List[Path]) -> List[Path]:
    """Collect the CSV files found directly inside each base directory.

    Directories are visited in the order given; within a directory the files
    are returned in sorted order. A directory that does not exist is reported
    with a warning on stdout and skipped. Files named
    ``created_files_manifest.csv`` are never included.

    Args:
        base_dirs: Directories to scan (non-recursive, ``*.csv`` only).

    Returns:
        The matching CSV paths.
    """
    found: List[Path] = []
    for folder in base_dirs:
        if folder.exists():
            found.extend(
                csv_path
                for csv_path in sorted(folder.glob("*.csv"))
                if csv_path.name != "created_files_manifest.csv"
            )
        else:
            print(f"[warn] base dir not found: {folder}")
    return found

def group_by_sensor(files: List[Path]) -> Dict[str, List[Path]]:
    """Bucket files by the sensor name encoded in their file stem.

    The sensor name is the part of the stem before the first space; a stem
    without any space is used as the sensor name unchanged.

    Args:
        files: Paths to group.

    Returns:
        A dict mapping each sensor name to its paths, with both the keys and
        the per-sensor lists kept in first-seen order.
    """
    grouped: Dict[str, List[Path]] = {}
    for csv_path in files:
        # Everything before the first space (or the whole stem if none).
        sensor_name = csv_path.stem.partition(" ")[0]
        if sensor_name in grouped:
            grouped[sensor_name].append(csv_path)
        else:
            grouped[sensor_name] = [csv_path]
    return grouped

def depth_sorted_columns(cols: List[str]) -> List[str]:
    """Return the column labels in the fixed order used for plotting.

    The known labels come first, in the order ``-10, -30, -45, 10, 30, 45,
    Ballen``; every other label follows, sorted by its string form.

    Args:
        cols: Column labels (compared via ``str``).

    Returns:
        A new, sorted list; the input is not modified.
    """
    known_order = ("-10", "-30", "-45", "10", "30", "45", "Ballen")
    rank = {label: position for position, label in enumerate(known_order, start=1)}

    def sort_key(col):
        label = str(col)
        # Unknown labels share rank 999 and fall back to alphabetical order.
        return rank.get(label, 999), label

    return sorted(cols, key=sort_key)


In [9]:
# --- This cell performs data processing / plotting step ---
from typing import Tuple

def sanitize_timestamp_series(s: pd.Series, ts_min: float, ts_max: float) -> Tuple[pd.Series, dict]:
    """Coerce raw epoch timestamps to seconds and blank out implausible ones.

    The values are converted to numbers (anything unparseable becomes NaN).
    If the series as a whole looks like milliseconds since the epoch, it is
    divided by 1000. Values outside ``[ts_min, ts_max]`` are replaced by NaN.

    The seconds-vs-milliseconds decision is based on the *median* rather than
    the maximum, so a single corrupt, very large value in a seconds-based file
    no longer causes every valid timestamp to be rescaled and discarded.

    Args:
        s: Raw timestamp column.
        ts_min: Lower bound of the plausible window, seconds since epoch (inclusive).
        ts_max: Upper bound of the plausible window, seconds since epoch (inclusive).

    Returns:
        A tuple ``(series, stats)``. ``series`` holds seconds since epoch with
        NaN for rejected entries and keeps the index of ``s``. ``stats`` has
        the keys ``n_total``, ``n_non_numeric`` (NaN after numeric coercion,
        which includes values that were already missing), ``ms_detected`` and
        ``n_out_of_window`` (numeric values outside the window).
    """
    s_num = pd.to_numeric(s, errors="coerce")
    stats = {
        "n_total": int(s.shape[0]),
        "n_non_numeric": int(s_num.isna().sum()),
        "ms_detected": False,
        "n_out_of_window": 0,
    }

    # Median is NaN for an empty / all-NaN series; treat that as "not ms".
    typical = s_num.median(skipna=True)
    if pd.notna(typical) and typical > 1e12:
        s_num = s_num / 1000.0
        stats["ms_detected"] = True

    mask_in = (s_num >= ts_min) & (s_num <= ts_max)
    # NaN compares False against both bounds, so exclude it from this count.
    stats["n_out_of_window"] = int((~mask_in & s_num.notna()).sum())
    s_num = s_num.where(mask_in, np.nan)
    return s_num, stats

def load_interp_csv_robust(path: Path, ts_min: float, ts_max: float) -> Tuple[pd.DataFrame, dict]:
    """Load one sensor CSV and prepare it for plotting.

    Steps: read the file, sanitize ``timestamp_unix``, convert it to a UTC
    datetime column ``__dt``, drop rows without a usable timestamp, coerce all
    remaining columns to numbers, sort by time and fill gaps by linear
    interpolation in both directions. Only the in-memory frame is changed.

    Args:
        path: CSV file to read.
        ts_min: Lower bound of the plausible timestamp window (seconds since epoch).
        ts_max: Upper bound of the plausible timestamp window (seconds since epoch).

    Returns:
        A tuple ``(frame, info)``. ``frame`` has ``__dt`` first, followed by
        the value columns in ``depth_sorted_columns`` order. ``info`` has the
        keys ``file``, ``drops``, ``ms_detected``, ``non_numeric`` and
        ``out_of_window``.

    Raises:
        ValueError: If the CSV has no ``timestamp_unix`` column.
    """
    frame = pd.read_csv(path)
    if "timestamp_unix" not in frame.columns:
        raise ValueError(f"'timestamp_unix' missing in {path}")

    seconds, ts_stats = sanitize_timestamp_series(frame["timestamp_unix"], ts_min, ts_max)
    frame["__dt"] = pd.to_datetime(seconds, unit="s", utc=True, errors="coerce")

    # Keep only rows whose timestamp survived sanitation.
    rows_read = len(frame)
    has_time = frame["__dt"].notna()
    frame = frame[has_time].copy()

    info = {
        "file": str(path),
        "drops": rows_read - len(frame),
        "ms_detected": ts_stats["ms_detected"],
        "non_numeric": ts_stats["n_non_numeric"],
        "out_of_window": ts_stats["n_out_of_window"],
    }

    value_cols = [col for col in frame.columns if col not in ("timestamp_unix", "__dt")]
    for col in value_cols:
        frame[col] = pd.to_numeric(frame[col], errors="coerce")

    # Interpolation runs over rows in chronological order.
    frame = frame.sort_values("__dt")
    if value_cols:
        filled = frame[value_cols].interpolate(method="linear", limit_direction="both")
        frame[value_cols] = filled

    return frame[["__dt", *depth_sorted_columns(value_cols)]], info


In [10]:
# --- This cell performs data processing / plotting step ---
def build_pdf_for_sensor(sensor: str, paths: List[Path], x_start, x_end, output_dir: Path, ts_min: float, ts_max: float) -> Path:
    """Write one multi-page PDF for a sensor, one page per input CSV.

    Each CSV is loaded through ``load_interp_csv_robust`` and every value
    column is drawn as a line over ``__dt``. After the PDF is written, a
    per-file timestamp sanitation summary is shown (via IPython ``display``
    when available, otherwise printed).

    Args:
        sensor: Sensor name; used in the plot titles and the PDF file name.
        paths: CSV files of this sensor; processed in order of their stem.
        x_start: Left x-axis limit.
        x_end: Right x-axis limit.
        output_dir: Target directory; created if it does not exist.
        ts_min: Lower bound of the plausible timestamp window (seconds since epoch).
        ts_max: Upper bound of the plausible timestamp window (seconds since epoch).

    Returns:
        Path of the written PDF (``<output_dir>/<sensor>_plots.pdf``).
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    pdf_path = output_dir / f"{sensor}_plots.pdf"
    logs = []
    with PdfPages(pdf_path) as pdf:
        for p in sorted(paths, key=lambda x: x.stem):
            # Site label is whatever follows the first space in the stem.
            site = p.stem.split(" ", 1)[1] if " " in p.stem else p.stem
            df, info = load_interp_csv_robust(p, ts_min, ts_max)
            logs.append(info)
            fig, ax = plt.subplots(figsize=(10, 5))
            try:
                series_cols = [c for c in df.columns if c != "__dt"]
                for c in series_cols:
                    ax.plot(df["__dt"], df[c], label=c)
                ax.set_title(f"{sensor} – {site}")
                ax.set_xlabel("Datum (UTC)")
                ax.set_ylabel("Messwert")
                ax.set_xlim([x_start, x_end])
                # Skip the legend when there is nothing to label.
                if series_cols:
                    ax.legend(loc="best", fontsize="small", ncol=2)
                ax.grid(True, which="both", linestyle="--", alpha=0.5)
                fig.tight_layout()
                pdf.savefig(fig)
            finally:
                # Always release the figure, even if plotting or saving fails.
                plt.close(fig)

    display_cols = ["file", "ms_detected", "non_numeric", "out_of_window", "drops"]
    # Passing `columns` keeps the summary well-formed even when `paths` is empty.
    log_df = pd.DataFrame(logs, columns=display_cols)
    try:
        from IPython.display import display
    except ImportError:
        print("=== Timestamp sanitation summary (per file) ===")
        print(log_df.to_string(index=False))
    else:
        display(log_df)
    return pdf_path


In [11]:
# Find every input CSV, then bucket the files by sensor name.
all_files = list_all_csvs(BASE_DIRS)
groups = group_by_sensor(all_files)

# Cell result: total file count and number of files per sensor.
len(all_files), {sensor_name: len(members) for sensor_name, members in groups.items()}


(21,
 {'Tensiomark': 3,
  'Climavi': 5,
  'Plantobelly': 3,
  'SMT100': 5,
  'Watermark': 5})

In [12]:
# --- Build one PDF per sensor and remember where each one was written ---
generated = {}
for sensor, paths in groups.items():
    out_pdf = build_pdf_for_sensor(
        sensor=sensor,
        paths=paths,
        x_start=X_START,
        x_end=X_END,
        output_dir=OUTPUT_DIR,
        ts_min=TS_MIN,
        ts_max=TS_MAX,
    )
    generated[sensor] = str(out_pdf)

# Cell result: sensor name -> PDF path (as string).
generated


Unnamed: 0,file,ms_detected,non_numeric,out_of_window,drops
0,tensiomark\Tensiomark 2.csv,False,0,1,1
1,tensiomark\Tensiomark 3.csv,False,0,1,1
2,tensiomark\Tensiomark 5.csv,False,0,1,1


Unnamed: 0,file,ms_detected,non_numeric,out_of_window,drops
0,climavi\Climavi 1.csv,False,0,0,0
1,climavi\Climavi 2.csv,False,0,0,0
2,climavi\Climavi 3.csv,False,0,0,0
3,climavi\Climavi 4.csv,False,0,0,0
4,climavi\Climavi 5.csv,False,0,0,0


Unnamed: 0,file,ms_detected,non_numeric,out_of_window,drops
0,plantobelly\Plantobelly 2.csv,False,0,0,0
1,plantobelly\Plantobelly 3.csv,False,0,0,0
2,plantobelly\Plantobelly 5.csv,False,0,0,0


Unnamed: 0,file,ms_detected,non_numeric,out_of_window,drops
0,smt100\SMT100 1.csv,False,0,0,0
1,smt100\SMT100 2.csv,False,0,0,0
2,smt100\SMT100 3.csv,False,0,0,0
3,smt100\SMT100 4.csv,False,0,0,0
4,smt100\SMT100 5.csv,False,0,0,0


Unnamed: 0,file,ms_detected,non_numeric,out_of_window,drops
0,watermark\Watermark 1.csv,False,0,0,0
1,watermark\Watermark 2.csv,False,0,0,0
2,watermark\Watermark 3.csv,False,0,0,0
3,watermark\Watermark 4.csv,False,0,0,0
4,watermark\Watermark 5.csv,False,0,0,0


{'Tensiomark': 'plots\\Tensiomark_plots.pdf',
 'Climavi': 'plots\\Climavi_plots.pdf',
 'Plantobelly': 'plots\\Plantobelly_plots.pdf',
 'SMT100': 'plots\\SMT100_plots.pdf',
 'Watermark': 'plots\\Watermark_plots.pdf'}