## First program: Create empty CSV files tracking profile status

In [None]:
"""
Generate CSV file tracking CTD temperature profile status for OOI RCA Slope Base shallow profiler.
Creates rca_sb_ctd_temp_profile_status.csv with daily profile availability (2014-2025).
"""

import csv
import datetime
from pathlib import Path

def is_leap_year(year):
    """Return True when *year* is a leap year under the Gregorian rules.

    Years divisible by 400 are leap years, other century years are not, and
    any remaining year is a leap year exactly when it is divisible by 4.
    """
    if year % 400 == 0:
        return True
    if year % 100 == 0:
        return False
    return year % 4 == 0

def get_days_in_year(year):
    """Return the length of *year* in days: 366 for leap years, otherwise 365."""
    if is_leap_year(year):
        return 366
    return 365

def julian_to_date(year, julian_day):
    """Format a day-of-year as an upper-case ``dd-MON-yyyy`` string.

    ``julian_day`` is the ordinal day within ``year`` (1 = 1 January), not an
    astronomical Julian date. The month abbreviation comes from ``strftime``'s
    ``%b`` directive and is upper-cased, e.g. ``(2016, 60)`` -> ``"29-FEB-2016"``.
    """
    new_years_day = datetime.datetime(year, 1, 1)
    offset = datetime.timedelta(days=julian_day - 1)
    formatted = (new_years_day + offset).strftime("%d-%b-%Y")
    return formatted.upper()

def generate_profile_status_csv(output_file="rca_sb_ctd_temp_profile_status.csv",
                                start_year=2014, end_year=2025):
    """Write an all-zero profile status CSV with one row per calendar day.

    Each row holds the year, the day of year, the date as ``dd-MON-yyyy``,
    nine per-profile columns (``1`` .. ``9``), a ``Total`` column and the
    ``Noon`` / ``Midnight`` profile indices. Every value other than the date
    fields is written as 0; the file is a template to be populated later from
    actual profile data. A short summary is printed once the file is written.

    Args:
        output_file: Destination path (str or Path). Defaults to
            ``rca_sb_ctd_temp_profile_status.csv`` in the working directory.
        start_year: First calendar year to include.
        end_year: Last calendar year to include (inclusive).

    Returns:
        None.
    """
    output_file = Path(output_file)

    # Column headers: date fields, profile slots 1-9, then summary columns.
    headers = (
        ['year', 'julian_day', 'date']
        + [str(slot) for slot in range(1, 10)]
        + ['Total', 'Noon', 'Midnight']
    )

    total_days = 0
    total_profiles = 0

    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)

        for year in range(start_year, end_year + 1):
            for julian_day in range(1, get_days_in_year(year) + 1):
                date_str = julian_to_date(year, julian_day)

                # Profile columns start at 0; they are filled in when actual
                # data is processed.
                profiles = [0] * 9
                total_profiles_day = sum(profiles)

                # Placeholders: the noon/midnight profile indices are
                # determined later from actual profile timing.
                noon_profile = 0
                midnight_profile = 0

                writer.writerow(
                    [year, julian_day, date_str]
                    + profiles
                    + [total_profiles_day, noon_profile, midnight_profile]
                )

                total_days += 1
                total_profiles += total_profiles_day

    # An empty year range (start_year > end_year) writes only the header;
    # avoid dividing by zero in that case.
    mean_profiles = total_profiles / total_days if total_days else 0.0

    # Print diagnostics
    print(f"Generated {output_file}")
    print(f"Total days: {total_days}")
    print(f"Date range: {start_year} - {end_year}")
    print(f"Years covered: {end_year - start_year + 1}")
    print(f"Current mean profiles per day: {mean_profiles:.2f}")
    print("Expected profiles per day when populated: 9")
    print("File ready for population with actual profile data")

# Generate the CSV only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    generate_profile_status_csv()

## Update the profile status program, write extracted profile files, create a timeline file 

In [None]:
"""
Extract individual temperature profiles from CTD NetCDF files to redux files.
"""

import pandas as pd
import xarray as xr
from pathlib import Path

def analyze_source_file(netcdf_file):
    """Open a source NetCDF file, index it by time and print a short summary.

    The dataset's ``obs`` dimension is swapped for ``time``. The first and
    last time values are taken as the file's time range, and the number of
    profiles is estimated at 9 per day over that range.

    Args:
        netcdf_file: Path of the NetCDF file to open.

    Returns:
        Tuple ``(dataset, start_time, end_time)``: the time-indexed dataset
        (left open for the caller) and the first and last timestamps.
    """
    dataset = xr.open_dataset(netcdf_file).swap_dims({'obs': 'time'})

    time_values = dataset.time.values
    first_sample = pd.to_datetime(time_values[0])
    last_sample = pd.to_datetime(time_values[-1])

    # Whole days between first and last sample, counting both end days.
    span_days = (last_sample - first_sample).days + 1
    profile_estimate = span_days * 9

    report_lines = [
        "=== SOURCE FILE ANALYSIS ===",
        f"File: {netcdf_file}",
        f"Start time: {first_sample}",
        f"End time: {last_sample}",
        f"Time range: {span_days} days",
        f"Estimated profiles (9/day): {profile_estimate}",
        "================================\n",
    ]
    for line in report_lines:
        print(line)

    return dataset, first_sample, last_sample

def load_profile_indices(year):
    """Read the profile index table for *year*.

    The table is expected at ``~/profileIndices/RS01SBPS_profiles_<year>.csv``.

    Returns:
        The CSV contents as a DataFrame, or None when the file does not exist.
    """
    index_csv = Path(f"~/profileIndices/RS01SBPS_profiles_{year}.csv").expanduser()
    return pd.read_csv(index_csv) if index_csv.exists() else None

def extract_profiles(ds, start_time, end_time, output_dir):
    """Write one temperature NetCDF file per indexed profile found in *ds*.

    For every calendar year from ``start_time.year`` through ``end_time.year``
    the profile index table is loaded with ``load_profile_indices``; years
    without a table are reported and skipped. Each table row supplies a
    ``profile`` number and ``start`` / ``peak`` timestamps. The samples of
    *ds* whose ``time`` falls in that window are selected and, when the
    selection is non-empty and contains ``sea_water_temperature``, saved in
    *output_dir* with that variable renamed to ``temperature``.

    Args:
        ds: Dataset indexed by ``time``.
        start_time: First timestamp of the dataset; only its year is used.
        end_time: Last timestamp of the dataset; only its year is used.
        output_dir: Directory (Path) that receives the profile files.

    Returns:
        Tuple ``(attempted, successful)``. ``attempted`` counts every row of
        every index table visited, whether or not it overlaps the dataset;
        ``successful`` counts the files written.
    """
    n_attempted = 0
    n_written = 0

    for year in range(start_time.year, end_time.year + 1):
        index_table = load_profile_indices(year)
        if index_table is None:
            print(f"No profile indices for {year}")
            continue

        # Number of profiles seen so far on each start date; gives every
        # profile its running sequence number within the day.
        seen_per_day = {}

        for _, row in index_table.iterrows():
            n_attempted += 1

            profile_index = row['profile']
            start_raw, peak_raw = row['start'], row['peak']
            window_start = pd.to_datetime(start_raw)
            window_peak = pd.to_datetime(peak_raw)

            day = window_start.date()
            seen_per_day[day] = seen_per_day.get(day, 0) + 1
            daily_sequence = seen_per_day[day]

            try:
                segment = ds.sel(time=slice(window_start, window_peak))

                # Nothing to write when the window has no samples or the
                # temperature variable is missing.
                if len(segment.time) == 0 or 'sea_water_temperature' not in segment.data_vars:
                    continue

                # Keep only the temperature variable, under its short name.
                temp_ds = xr.Dataset({'temperature': segment['sea_water_temperature']})

                # Carry the depth coordinate along when the source has one.
                if 'depth' in segment.coords:
                    temp_ds = temp_ds.assign_coords(depth=segment['depth'])

                # Filename pattern: AAA_SSS_TTT_BBB_YYYY_DDD_PPPP_Q_VVVV.nc
                day_of_year = window_start.timetuple().tm_yday
                target = output_dir / (
                    f"RCA_OSB_Profiler_Temp_{year}_{day_of_year:03d}_{profile_index}_{daily_sequence}_V1.nc"
                )

                temp_ds.to_netcdf(target)
                n_written += 1

                # Progress message every 50 files.
                if n_written % 50 == 0:
                    print(f"Extracted {n_written} profiles...")

            except Exception as e:
                # Report the failure and move on to the next profile.
                print(f"Error processing profile {profile_index}: {e}")
                continue

    return n_attempted, n_written

def main():
    """Extract temperature profiles from one hard-coded CTD file into ``~/redux``.

    Creates the output directory if needed, checks that the source file
    exists (printing a message and returning if it does not), prints the
    source file analysis, extracts the profiles and prints a summary with
    the attempted/successful counts and the success rate.
    """
    redux_dir = Path("~/redux").expanduser()
    redux_dir.mkdir(exist_ok=True)

    source_path = Path("~/ooidata/rca/sb/scalar/2015_2025_ctd/deployment0004_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20180208T000000.840174-20180226T115959.391002.nc").expanduser()

    # Guard: nothing to do without the source file.
    if not source_path.exists():
        print(f"CTD file not found: {source_path}")
        return

    # Open and summarize the source file, then extract its profiles.
    dataset, first_time, last_time = analyze_source_file(source_path)
    n_attempted, n_written = extract_profiles(dataset, first_time, last_time, redux_dir)

    # Summary
    print("\n=== EXTRACTION COMPLETE ===")
    print(f"Profiles attempted: {n_attempted}")
    print(f"Profiles successfully extracted: {n_written}")
    if n_attempted > 0:
        print(f"Success rate: {n_written/n_attempted*100:.1f}%")
    else:
        print("No profiles attempted")
    print(f"Redux files written to: {redux_dir}")

# Run the extraction only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


In [9]:
import xarray as xr

# Commented-out alternative: open a single redux profile file instead.
#ds = xr.open_dataset('~/redux2018/RCA_sb_sp_temperature_2018_048_5440_9_V1.nc')
# Open one CTD source file and one "do_fast_sample" (DOFSTA) source file from
# the 2016 folder so their contents can be compared.
ds_ctd = xr.open_dataset('~/ooidata/rca/sb/scalar/2016_ctd/deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160707T000000.194092-20160716T111049.607585.nc')
ds_do  = xr.open_dataset('~/ooidata/rca/sb/scalar/2016_ctd/deployment0002_RS01SBPS-SF01A-2A-DOFSTA102-streamed-do_fast_sample_20160511T235959.098689-20160716T120000.633855.nc')
# Last expression of the cell: shows the data-variable names of the CTD dataset.
ds_ctd.data_vars.keys()

KeysView(Data variables:
    sea_water_pressure_qc_results                      (obs) uint8 ...
    sea_water_pressure                                 (obs) float64 ...
    sea_water_electrical_conductivity_qartod_results   (obs) uint8 ...
    corrected_dissolved_oxygen                         (obs) float64 ...
    sea_water_pressure_qc_executed                     (obs) uint8 ...
    sea_water_practical_salinity_qc_executed           (obs) uint8 ...
    driver_timestamp                                   (obs) datetime64[ns] ...
    id                                                 (obs) |S36 ...
    conductivity                                       (obs) float64 ...
    temperature                                        (obs) float64 ...
    sea_water_temperature_qartod_results               (obs) uint8 ...
    corrected_dissolved_oxygen_qc_executed             (obs) uint8 ...
    corrected_dissolved_oxygen_qc_results              (obs) uint8 ...
    pressure_temp                   

In [10]:
# Show the data-variable names of the "do_fast_sample" dataset (ds_do).
ds_do.data_vars.keys()

KeysView(Data variables:
    preferred_timestamp                     (obs) object ...
    ingestion_timestamp                     (obs) datetime64[ns] ...
    port_timestamp                          (obs) datetime64[ns] ...
    deployment                              (obs) int32 ...
    corrected_dissolved_oxygen_qc_executed  (obs) uint8 ...
    id                                      (obs) |S36 ...
    corrected_dissolved_oxygen              (obs) float64 ...
    corrected_dissolved_oxygen_qc_results   (obs) uint8 ...
    internal_timestamp                      (obs) datetime64[ns] ...
    ext_volt0                               (obs) float64 ...
    driver_timestamp                        (obs) datetime64[ns] ...)

In [None]:
"""
Plot temperature profiles with temperature on x-axis and depth on y-axis.
"""

import matplotlib.pyplot as plt
import xarray as xr
from pathlib import Path
import sys

    
# Redux profile file to plot. Defined once so the loader and the plot title
# always refer to the same file.
profile_path = Path('~/redux/RCA_OSB_Profiler_Temp_2018_048_5440_9_V1.nc')

# Load the profile data
ds = xr.open_dataset(profile_path.expanduser())

# Extract temperature and depth as plain arrays
temperature = ds['temperature'].values
depth = ds['depth'].values

# Create the plot: temperature on the x-axis, depth on the y-axis
plt.figure(figsize=(8, 10))
plt.plot(temperature, depth, 'b-', linewidth=2, marker='o', markersize=2)

# Set up axes. The degree sign is written as an escape so the label cannot be
# corrupted by a file-encoding mix-up.
plt.xlabel('Temperature (\u00b0C)', fontsize=12)
plt.ylabel('Depth (m)', fontsize=12)
plt.ylim(200, 0)  # 200m at bottom, 0m at top
plt.grid(True, alpha=0.3)

# Add title with filename (without the .nc extension)
profile_name = profile_path.stem
plt.title(f'Temperature Profile: {profile_name}', fontsize=14)

# Tight layout and show
plt.tight_layout()
plt.show()


## Generate Temperature Mixed Layer Depth estimates: Interactive 


This code does not run in a Jupyter notebook, apparently because of how mouse events are handled there.
It does run in IDLE or from the PowerShell command line.
The file is called `tmld_selector.py`.
The output file is `tmld_estimates.csv`.
It lives in the home directory of the `argosy` repository.
Eventually it will be renamed MLDSelector.py for Mixed Layer Depth Selector.


There is, however, a major **bug** in the code: the bundle plotter gets the profile index
wrong, so the MLD shows up in the wrong place.


`Use regular Python`

## Shard a collection of source files into redux profile files

In [2]:
import pandas as pd
import xarray as xr
from pathlib import Path
import numpy as np

def get_input_with_default(prompt, default):
    """Prompt on stdin and return the normalized answer, or *default* if empty.

    The prompt is shown followed by a single space. The reply is stripped of
    surrounding whitespace and lower-cased; a blank reply yields *default*.
    """
    answer = input(f"{prompt} ")
    normalized = answer.strip().lower()
    return normalized or default

def load_profile_indices(year):
    """Return the profile index table for *year*, or None if it is missing.

    The table is read from ``~/profileIndices/RS01SBPS_profiles_<year>.csv``.
    """
    index_csv = Path(f"~/profileIndices/RS01SBPS_profiles_{year}.csv").expanduser()
    if index_csv.exists():
        return pd.read_csv(index_csv)
    return None

# Sensor mapping: variable name in the source dataset -> short name used for
# the output variable and as the sensor tag in the output filename.
# NOTE(review): the dissolved-oxygen key carries a "do_fast_sample-" prefix;
# confirm it matches the variable name in every source file (some CTD files
# may expose it as plain "corrected_dissolved_oxygen").
SENSOR_MAP = {
    'sea_water_temperature': 'temperature',
    'sea_water_practical_salinity': 'salinity',
    'sea_water_density': 'density',
    'do_fast_sample-corrected_dissolved_oxygen': 'dissolvedoxygen'
}

# Calendar years scanned for "<year>_ctd" source folders and "~/redux<year>"
# output folders.
_REDUX_YEARS = range(2014, 2027)


def _choose_years(base_folder):
    """Ask the user, per year folder containing CTD files, whether to process it."""
    chosen = []
    for year in _REDUX_YEARS:
        source_folder = base_folder / f"{year}_ctd"
        if not source_folder.exists():
            continue
        file_count = sum(1 for _ in source_folder.glob("*CTDPF*.nc"))
        if file_count == 0:
            continue
        print(f"  {year}_ctd: {file_count} files")
        response = get_input_with_default(f"    Process {year}? [y/n] (default y):", "y")
        if response == 'y':
            chosen.append(year)
    return chosen


def _profile_windows(year, cache):
    """Return ``(profile, start, peak, daily_sequence)`` tuples for *year*, cached.

    Returns None when the year has no profile index file. ``daily_sequence``
    is the profile's running number among the rows sharing its start date.
    """
    if year in cache:
        return cache[year]

    profiles_df = load_profile_indices(year)
    if profiles_df is None:
        cache[year] = None
        return None

    windows = []
    seen_per_day = {}
    for _, profile_row in profiles_df.iterrows():
        start = pd.to_datetime(profile_row['start'])
        peak = pd.to_datetime(profile_row['peak'])
        day = start.date()
        seen_per_day[day] = seen_per_day.get(day, 0) + 1
        windows.append((profile_row['profile'], start, peak, seen_per_day[day]))

    cache[year] = windows
    return windows


def _write_sensor_files(profile_data, output_dir, file_tag, stats):
    """Write one NetCDF file per SENSOR_MAP entry for a single profile."""
    for input_var, output_var in SENSOR_MAP.items():
        stats[output_var]['attempted'] += 1

        output_path = output_dir / f"RCA_sb_sp_{output_var}_{file_tag}_V1.nc"

        # Never overwrite a file written by an earlier run or source file.
        if output_path.exists():
            stats[output_var]['skipped'] += 1
            continue

        # This source file does not carry the sensor variable.
        if input_var not in profile_data.data_vars:
            continue

        # Single-variable dataset under the short output name.
        sensor_ds = xr.Dataset({output_var: profile_data[input_var]})

        if 'depth' in profile_data.coords:
            sensor_ds = sensor_ds.assign_coords(depth=profile_data['depth'])

        # Remove unwanted variables; names that are absent are ignored.
        sensor_ds = sensor_ds.drop_vars(['lat', 'lon', 'obs'], errors='ignore')

        sensor_ds.to_netcdf(output_path)
        stats[output_var]['written'] += 1


def _process_ctd_file(file_path, stats, window_cache):
    """Shard one CTD source file into per-profile, per-sensor redux files."""
    # The context manager closes the file handle once the file is done.
    with xr.open_dataset(file_path) as raw:
        ds = raw.swap_dims({'obs': 'time'})

        start_time = pd.to_datetime(ds.time.values[0])
        end_time = pd.to_datetime(ds.time.values[-1])

        # A file may span a year boundary; consult each year's index table.
        for year in range(start_time.year, end_time.year + 1):
            windows = _profile_windows(year, window_cache)
            if windows is None:
                continue

            for profile_index, profile_start, profile_peak, daily_sequence in windows:
                try:
                    # Cheap pre-check: a window entirely outside this file's
                    # time span cannot select any samples.
                    if profile_peak < start_time or profile_start > end_time:
                        continue

                    profile_data = ds.sel(time=slice(profile_start, profile_peak))
                    if len(profile_data.time) == 0:
                        continue

                    # Output folder and filename follow the profile's own
                    # start date, not the source folder's year.
                    profile_year = profile_start.year
                    output_dir = Path(f"~/redux{profile_year}").expanduser()
                    julian_day = profile_start.timetuple().tm_yday
                    file_tag = f"{profile_year}_{julian_day:03d}_{profile_index}_{daily_sequence}"

                    _write_sensor_files(profile_data, output_dir, file_tag, stats)
                except Exception as e:
                    # Best effort: report the failure and keep going.
                    print(f"  Error processing profile {profile_index} in {file_path.name}: {e}")


def _print_report(stats):
    """Print per-sensor counters and the number of redux files per year."""
    print("\n=== Processing Complete ===")
    for sensor, counts in stats.items():
        print(f"\n{sensor}:")
        print(f"  Attempted: {counts['attempted']}")
        print(f"  Written: {counts['written']}")
        print(f"  Skipped (already exist): {counts['skipped']}")

    print("\n=== Files by Year ===")
    for year in _REDUX_YEARS:
        output_dir = Path(f"~/redux{year}").expanduser()
        if not output_dir.exists():
            continue

        sensor_counts = {}
        for sensor in SENSOR_MAP.values():
            count = sum(1 for _ in output_dir.glob(f"*_{sensor}_*.nc"))
            if count > 0:
                sensor_counts[sensor] = count

        if sensor_counts:
            print(f"\n{year}:")
            for sensor, count in sensor_counts.items():
                print(f"  {sensor}: {count}")


def process_multi_sensor_redux():
    """Shard CTD source files into per-profile redux files for several sensors.

    Steps:
      1. Scan ``~/ooidata/rca/sb/scalar/<year>_ctd`` for ``*CTDPF*.nc`` files
         and ask on stdin, per year found, whether to process it.
      2. Create the ``~/redux<year>`` output folders.
      3. For each selected source file, cut out every indexed profile window
         (``start`` to ``peak``) and write one NetCDF file per SENSOR_MAP
         entry, named
         ``RCA_sb_sp_<sensor>_<year>_<day-of-year>_<profile>_<n>_V1.nc``.
         Existing output files are counted as skipped and left untouched.
      4. Print per-sensor counters and the number of files per year folder.

    Failures while handling a profile or a source file are printed and
    processing continues with the next item. Each source file is closed
    after it has been processed, and each year's profile index table is
    parsed only once per run.

    Returns:
        None. Returns early, after printing a message, when no year is
        selected.
    """
    base_folder = Path("~/ooidata/rca/sb/scalar").expanduser()

    print("Scanning for source folders...")
    available_years = _choose_years(base_folder)

    if not available_years:
        print("No years selected")
        return

    print(f"\nSelected years: {available_years}")

    # Output folders are created for the whole year range because a profile
    # is filed under its own start year.
    for year in _REDUX_YEARS:
        Path(f"~/redux{year}").expanduser().mkdir(exist_ok=True)

    stats = {sensor: {'attempted': 0, 'written': 0, 'skipped': 0} for sensor in SENSOR_MAP.values()}

    # Parsed profile windows per year, shared by all source files.
    window_cache = {}

    for folder_year in available_years:
        source_folder = base_folder / f"{folder_year}_ctd"
        ctd_files = sorted(source_folder.glob("*CTDPF*.nc"))

        print(f"\n=== Processing {folder_year}_ctd ({len(ctd_files)} files) ===")

        for file_idx, file_path in enumerate(ctd_files, 1):
            if file_idx % 5 == 0:
                print(f"  File {file_idx}/{len(ctd_files)}")

            try:
                _process_ctd_file(file_path, stats, window_cache)
            except Exception as e:
                # Best effort: report the unreadable file and continue.
                print(f"  Error processing {file_path.name}: {e}")

    _print_report(stats)

# Run the processing as soon as this cell executes; it prompts on stdin for
# each year folder found.
process_multi_sensor_redux()


Scanning for source folders...
  2015_ctd: 19 files


    Process 2015? [y/n] (default y):  


  2016_ctd: 11 files


    Process 2016? [y/n] (default y):  n


  2018_ctd: 17 files


    Process 2018? [y/n] (default y):  


  2019_ctd: 15 files


    Process 2019? [y/n] (default y):  


  2020_ctd: 8 files


    Process 2020? [y/n] (default y):  



Selected years: [2015, 2018, 2019, 2020]

=== Processing 2015_ctd (19 files) ===
  File 5/19
  File 10/19
  File 15/19

=== Processing 2018_ctd (17 files) ===
  File 5/17
  File 10/17
  File 15/17

=== Processing 2019_ctd (15 files) ===
  File 5/15
  File 10/15
  File 15/15

=== Processing 2020_ctd (8 files) ===
  File 5/8

=== Processing Complete ===

temperature:
  Attempted: 7012
  Written: 5894
  Skipped (already exist): 212

salinity:
  Attempted: 7012
  Written: 5894
  Skipped (already exist): 212

density:
  Attempted: 7012
  Written: 5894
  Skipped (already exist): 212

dissolvedoxygen:
  Attempted: 7012
  Written: 5894
  Skipped (already exist): 212

=== Files by Year ===

2015:
  temperature: 659
  salinity: 659
  density: 659
  dissolvedoxygen: 659

2018:
  temperature: 1849
  salinity: 1849
  density: 1849
  dissolvedoxygen: 1849

2019:
  temperature: 2105
  salinity: 2105
  density: 2105
  dissolvedoxygen: 2105

2020:
  temperature: 1281
  salinity: 1281
  density: 1281
 