In [1]:
#!/usr/bin/env python
"""
compile_toa5.py
Combine one or many Campbell Scientific TOA5 *.dat files into a single
pandas DataFrame.

* Works with any number of files in a directory.
* Finds the column-header line automatically (looks for “TIMESTAMP…”, 
  “TIMESTAMP_START”, or “TIMESTAMP_END”).
* Skips the metadata record plus the two rows that hold units / sample counts.
* Parses timestamp columns to datetime and converts everything else to
  numbers where possible.
* Verifies that all files share an identical column set (raises otherwise).
"""

from __future__ import annotations

import csv
import re
from pathlib import Path
from typing import Iterable, List

import pandas as pd

# Tokens passed to ``pd.read_csv(na_values=...)`` so they are read as missing.
NA_VALUES = ["NAN", "nan", "NaN", ""]   # extend if needed


# ----------------------------------------------------------------------
def _discover_layout(path: Path) -> tuple[List[str], List[int]]:
    """
    Inspect *path* and return ``(column_names, rows_to_skip)``.

    The algorithm is:
      1. Scan at most the first 20 lines.
      2. The first line that contains TIMESTAMP(/_START/_END) is taken
         as the column-name row.
      3. All lines before that row are skipped.
      4. The two immediately following lines are also skipped.

    Parameters
    ----------
    path : Path
        File to inspect. It is opened as latin-1 text.

    Returns
    -------
    tuple[list[str], list[int]]
        The column names parsed from the header row, and the 0-based line
        numbers to skip. The header row itself is NOT in the skip list.

    Raises
    ------
    ValueError
        If no line among the scanned ones contains ``TIMESTAMP``.
    """
    header_row = None
    header_line = ""

    with path.open("r", encoding="latin1") as fh:  # latin-1 handles µ etc.
        for i in range(20):
            line = fh.readline()
            if not line:                      # reached EOF early
                break
            if re.search(r'"?TIMESTAMP(_START|_END)?', line):
                header_row, header_line = i, line
                break

    if header_row is None:
        raise ValueError(f"Couldn’t find a TIMESTAMP column in {path.name}")

    # Parse the header with the csv module rather than a bare split(","):
    # a quoted field name may itself contain a comma (e.g. "T(1,2)"), and a
    # naive split would turn it into two names and misalign every column.
    columns = next(csv.reader([header_line.strip()]))

    # rows to skip:   0 … header_row-1   +   header_row+1, header_row+2
    skip = list(range(header_row)) + [header_row + 1, header_row + 2]
    return columns, skip


# ----------------------------------------------------------------------
def _numeric_or_original(series: pd.Series) -> pd.Series:
    """Return *series* converted to a numeric dtype, or unchanged if it cannot be."""
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        # At least one value is not a number: keep the column as it was.
        return series


def read_toa5(path: Path) -> pd.DataFrame:
    """
    Read one TOA5 file into a DataFrame with parsed timestamps and numbers.

    Parameters
    ----------
    path : Path
        File to read; its layout is determined by ``_discover_layout``.

    Returns
    -------
    pd.DataFrame
        One row per data record. Columns whose name starts with
        ``TIMESTAMP`` (case-insensitive) are datetimes, with unparseable
        values set to NaT. Remaining object columns are converted to numeric
        when every value converts, and are otherwise left untouched.
    """
    columns, skiprows = _discover_layout(path)

    df = pd.read_csv(
        path,
        names=columns,
        header=0,            # first remaining row is the header; `names` replaces it
        skiprows=skiprows,
        na_values=NA_VALUES,
        quotechar='"',
        low_memory=False,    # keeps dtype inference robust
    )

    # Parse timestamp columns
    ts_cols = [c for c in df.columns if c.upper().startswith("TIMESTAMP")]
    for c in ts_cols:
        df[c] = pd.to_datetime(df[c], errors="coerce")

    # Convert everything else that still looks like "object" to numeric.
    # pd.to_numeric(errors="ignore") is deprecated, so failures are caught
    # explicitly per column instead.
    obj_cols = df.select_dtypes(include="object").columns.difference(ts_cols)
    for c in obj_cols:
        df[c] = _numeric_or_original(df[c])

    return df


# ----------------------------------------------------------------------
def compile_directory(
    directory: Path | str,
    pattern: str = "TOA5_*_CSFormat_*.dat",
    sort_by: str | Iterable[str] | None = None,
) -> pd.DataFrame:
    """
    Read every file matching *pattern* in *directory* and concatenate them.

    Parameters
    ----------
    directory : Path or str
        Folder to search (non-recursive glob).
    pattern : str
        Glob pattern selecting the files to read.
    sort_by : str, iterable of str, or None
        Column name(s) to sort the combined frame by. ``None`` keeps the
        concatenation order (files sorted by path, rows in file order).

    Returns
    -------
    pd.DataFrame
        All files stacked vertically with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If no file matches *pattern*.
    ValueError
        If any file's columns differ from those of the first file.
    """
    directory = Path(directory)
    files = sorted(directory.glob(pattern))
    if not files:
        raise FileNotFoundError(f"No files matching {pattern} in {directory}")

    dfs = [read_toa5(p) for p in files]

    # Sanity check: every file must have the same columns in the same order.
    cols0 = dfs[0].columns
    if any(not cols0.equals(df.columns) for df in dfs[1:]):
        raise ValueError("Column sets differ across files – check headers.")

    out = pd.concat(dfs, ignore_index=True)

    if sort_by is not None:
        # sort_values wraps any non-list `by` as ONE label, so a tuple or
        # generator of names would raise KeyError; normalise to a list first.
        sort_keys = [sort_by] if isinstance(sort_by, str) else list(sort_by)
        out = out.sort_values(sort_keys).reset_index(drop=True)

    return out



In [6]:
# Spot-check a single file by reading it directly with pandas.
# `Path` comes from the imports in the first cell; the bare `pathlib` module is
# only imported in a later cell, so using it here fails on Restart & Run All.
test_file = Path(r"G:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTW\Flux_CSFormat\TOA5_21025_Flux_CSFormat_18_2023_12_04_0030.dat")
# Skip line 0 and lines 2-3 so that line 1 is used as the column header.
df = pd.read_csv(test_file, skiprows=[0,2,3], na_values=["NAN", "nan", "NaN"], quotechar='"', low_memory=False)
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], errors='coerce')
df

Unnamed: 0,TIMESTAMP,RECORD,FC_mass,FC_QC,FC_samples,LE,LE_QC,LE_samples,H,H_QC,...,G_1_1_2,SG_1_1_1,SG_1_1_2,FETCH_MAX,FETCH_90,FETCH_55,FETCH_40,UPWND_DIST_INTRST,FP_DIST_INTRST,FP_EQUATION
0,2023-12-04 00:30:00,3531,-0.047333,8,18000,9.180949,8,18000,6.428024,8,...,,,,25.500060,175.61250,55.361410,40.337520,302.6379,95.67057,KormannMeixner
1,2023-12-04 01:00:00,3532,-0.106157,8,18000,11.053280,8,18000,7.367119,8,...,,,,4.054108,54.94184,14.040040,7.958910,302.6379,98.55187,KormannMeixner
2,2023-12-04 01:30:00,3533,0.062420,8,18000,5.443553,8,18000,1.302324,8,...,,,,10.155610,124.57700,29.580140,19.326610,302.6379,96.44489,KormannMeixner
3,2023-12-04 02:00:00,3534,0.042157,8,18000,-0.376746,8,18000,-1.396814,8,...,,,,3.533708,72.96345,14.976570,8.191458,302.6379,97.42684,KormannMeixner
4,2023-12-04 02:30:00,3535,0.063322,7,18000,-9.541075,7,18000,-11.067350,7,...,,,,121.718000,351.27360,175.896100,132.837800,302.6379,84.45523,Kljun et al
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,2023-12-05 09:00:00,3596,0.042639,9,18000,33.684590,8,18000,12.358530,8,...,,,,23.665650,176.48430,53.386430,38.400250,302.6379,95.49059,KormannMeixner
66,2023-12-05 09:30:00,3597,0.051271,8,18000,25.765530,8,18000,3.162924,8,...,,,,14.798050,196.88530,43.266390,28.913730,302.6379,93.77501,KormannMeixner
67,2023-12-05 10:00:00,3598,-0.001505,5,18000,75.812000,5,18000,22.718070,5,...,,,,15.575830,117.52690,35.624310,25.349730,302.6379,97.57819,KormannMeixner
68,2023-12-05 10:30:00,3599,-0.031856,3,18000,110.072800,3,18000,49.486990,3,...,,,,4.076138,36.60402,11.164580,6.952931,302.6379,99.52071,KormannMeixner


In [2]:
import pathlib

# Site code -> folder label. Only the keys are used below (as sub-folder names
# under `raw_fold`); un-comment an entry to include that site in the run.
site_folders = {#'US-UTD':'Dugout_Ranch',
                #'US-UTB':'BSF',
                #'US-UTJ':'Bluff',
                #'US-UTW':'Wellington',
                #'US-UTE':'Escalante',
                #'US-UTM':'Matheson',
                'US-UTP':'Phrag',
                #'US-CdM':'Cedar_mesa',
                #'US-UTV':'Desert_View_Myton',
                #'US-UTN':'Juab',
                #'US-UTG':'Green_River',
                #'US-UTL':'Pelican_Lake',
                }

# Combined frame per site, keyed by site code.
stationdf = {}

raw_fold = pathlib.Path('G:/Shared drives/UGS_Flux/Data_Downloads/compiled')

for key in site_folders:

    parent_fold = raw_fold / key / "Flux_CSFormat"

    # One frame per input file, keyed by file stem; the stem becomes the outer
    # index level after pd.concat.
    dfs = {}
    # sorted() makes the processing (and therefore row) order deterministic.
    for file in sorted(parent_fold.glob("TOA5_*_CSFormat_*.dat")):
        if not file.is_file():
            print(f"Skipping {file} (not a file)")
            continue
        print(f"Processing {file}...")

        # Skip line 0 and lines 2-3 so that line 1 is used as the column header.
        df = pd.read_csv(file, skiprows=[0,2,3], na_values=["NAN", "nan", "NaN"], quotechar='"', low_memory=False)
        df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], errors='coerce')
        dfs[file.stem] = df

    if not dfs:
        # pd.concat raises on an empty mapping; report and move on instead.
        print(f"Skipping {key}: no matching files in {parent_fold}")
        continue

    stationdf[key] = pd.concat(dfs)
    print(stationdf[key].head())
    print(stationdf[key].dtypes.head())

    # Use the true time span rather than the first/last row: rows follow file
    # order, not time order, and min()/max() also ignore NaT from coercion.
    begtime = stationdf[key]['TIMESTAMP'].min()
    endtime = stationdf[key]['TIMESTAMP'].max()
    if pd.isna(begtime) or pd.isna(endtime):
        print(f"Skipping export for {key}: no valid TIMESTAMP values")
        continue
    stationdf[key].to_csv(f"../../out_data/station_data/{key}-CSFormat_HH_{begtime:%Y%m%d%H%M}_{endtime:%Y%m%d%H%M}.csv")


Processing G:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTP\Flux_CSFormat\TOA5_8441_Flux_CSFormat_0_1_2023_01_26_1730.dat...
Processing G:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTP\Flux_CSFormat\TOA5_8441_Flux_CSFormat_1_1_2023_02_07_0030.dat...
Processing G:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTP\Flux_CSFormat\TOA5_8441_Flux_CSFormat_2_1_2023_11_28_0130.dat...
Processing G:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTP\Flux_CSFormat\TOA5_8441_Flux_CSFormat_4_1_2023_11_28_0200.dat...
Processing G:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTP\Flux_CSFormat\TOA5_8442_Flux_CSFormat_0_2024_06_07_0930.dat...
Processing G:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTP\Flux_CSFormat\TOA5_8442_Flux_CSFormat_1_2024_06_07_2030.dat...
Processing G:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTP\Flux_CSFormat\TOA5_8442_Flux_CSFormat_2_2024_07_24_2100.dat...
Processing G:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTP\Flux_CS