In [8]:
%cd ./datasets/

[WinError 2] The system cannot find the file specified: './datasets/'
c:\Personal Files\Arya\Programs\Github\float-chat\floatchat\backend\datasets


In [9]:
import xarray as xr
import pandas as pd
import numpy as np
import os
import json

# ARGO's JULD variable counts days from this reference date.
_ARGO_REFERENCE_DATE = pd.Timestamp('1950-01-01')


def _decode_text(value) -> str:
    """Return *value* as stripped text, decoding NetCDF byte strings first."""
    if isinstance(value, bytes):  # np.bytes_ is a bytes subclass
        value = value.decode('utf-8', errors='replace')
    return str(value).strip()


def _to_float(value):
    """Return *value* as a Python float, or None when it is missing (NaN)."""
    return None if pd.isna(value) else float(value)


def _to_int(value):
    """Return *value* as a Python int, or None when it is missing (NaN)."""
    return None if pd.isna(value) else int(value)


def _juld_to_timestamp(juld):
    """Convert one JULD entry to a pandas Timestamp, or None when missing.

    Handles both forms the value can take: an already decoded
    ``numpy.datetime64`` and a raw number of days since 1950-01-01.
    """
    if pd.isna(juld):
        return None
    if isinstance(juld, np.datetime64):
        return pd.Timestamp(juld)
    try:
        return _ARGO_REFERENCE_DATE + pd.Timedelta(days=float(juld))
    except (TypeError, ValueError, OverflowError):
        # Unsupported type (e.g. a non-numeric date object) or out of range.
        return None


def _profile_values(ds, name):
    """Return variable *name* as an array with at least 1 dim, or None."""
    if name not in ds:
        return None
    return np.atleast_1d(ds[name].values)


def _level_matrix(ds, name, n_prof):
    """Return variable *name* as a float array indexable by [profile, level].

    A variable without a profile dimension is broadcast so that every
    profile sees the same per-level values. Returns None when absent.
    """
    if name not in ds:
        return None
    values = np.asarray(ds[name].values, dtype=float)
    if values.ndim <= 1:
        values = np.atleast_1d(values)
        values = np.broadcast_to(values[np.newaxis, :], (n_prof, values.size))
    return values


def parse_argo_profile(nc_path: str):
    """
    Parse ARGO NetCDF profile file into structured DataFrames.

    Args:
        nc_path: Path of the NetCDF profile file to read.

    Returns:
        profiles_df: DataFrame with profile metadata (one row per profile):
            profile_id, float_id, cycle_number, lat, lon, observation_date,
            qc_summary.
        measurements_df: DataFrame with depth measurements (multiple rows per
            profile): profile_id, level, depth, plus temperature / salinity
            when the file has TEMP / PSAL, and density. Levels without a
            valid PRES value are skipped; the frame is empty when the file
            has no PRES variable.
    """
    # The context manager closes the file even if parsing fails midway.
    with xr.open_dataset(nc_path, decode_timedelta=False) as ds:
        # `sizes` is the supported mapping of dimension name -> length
        # (reading lengths through `dims` emits a FutureWarning).
        n_prof = ds.sizes.get('N_PROF', 1)
        n_levels = ds.sizes.get('N_LEVELS', 0)

        # Fetch every array once instead of once per profile/level.
        platform = _profile_values(ds, 'PLATFORM_NUMBER')
        cycle = _profile_values(ds, 'CYCLE_NUMBER')
        lat = _profile_values(ds, 'LATITUDE')
        lon = _profile_values(ds, 'LONGITUDE')
        juld = _profile_values(ds, 'JULD')

        pres = _level_matrix(ds, 'PRES', n_prof)
        temp = _level_matrix(ds, 'TEMP', n_prof)
        psal = _level_matrix(ds, 'PSAL', n_prof)

    # Extract profile metadata
    profiles_data = []
    for i in range(n_prof):
        profiles_data.append({
            'profile_id': f"prof_{i:03d}",
            'float_id': _decode_text(platform[i]) if platform is not None else f"float_{i}",
            'cycle_number': _to_int(cycle[i]) if cycle is not None else i,
            'lat': _to_float(lat[i]) if lat is not None else None,
            'lon': _to_float(lon[i]) if lon is not None else None,
            'observation_date': _juld_to_timestamp(juld[i]) if juld is not None else None,
            # QC summary placeholder
            'qc_summary': 'Data available',
        })

    profiles_df = pd.DataFrame(profiles_data)

    # Extract measurements data
    measurements_data = []
    if pres is not None:
        for prof_idx in range(n_prof):
            profile_id = f"prof_{prof_idx:03d}"

            for level_idx in range(n_levels):
                depth = pres[prof_idx, level_idx]
                if np.isnan(depth):
                    continue  # Skip levels without a valid pressure

                measurement = {
                    'profile_id': profile_id,
                    'level': level_idx,
                    'depth': float(depth),  # Pressure as depth proxy
                }

                if temp is not None:
                    measurement['temperature'] = _to_float(temp[prof_idx, level_idx])
                if psal is not None:
                    measurement['salinity'] = _to_float(psal[prof_idx, level_idx])

                temperature = measurement.get('temperature')
                salinity = measurement.get('salinity')
                # Compare against None explicitly: 0.0 is a valid reading.
                if temperature is not None and salinity is not None:
                    # Simple density calculation (rough linear approximation)
                    measurement['density'] = (
                        1000 + (salinity - 35) * 0.8 - (temperature - 10) * 0.2
                    )
                else:
                    measurement['density'] = None

                measurements_data.append(measurement)

    measurements_df = pd.DataFrame(measurements_data)

    return profiles_df, measurements_df

def parse_argo_profile_with_summary(nc_path: str):
    """
    Parse ARGO NetCDF profile file into:
      - profile metadata (profiles_df)
      - depth measurements (measurements_df)
      - vector DB summaries (vector_records)

    Args:
        nc_path: Path of the NetCDF profile file, passed to
            ``parse_argo_profile``.

    Returns:
        A ``(profiles_df, measurements_df, vector_records)`` tuple.
        ``vector_records`` holds one dict per profile that has at least one
        measurement row, with keys ``id``, ``text`` (human-readable summary)
        and ``metadata`` (plain Python values; missing statistics are None).
        Statistics that cannot be computed appear as ``n/a`` in the text.
    """
    profiles_df, measurements_df = parse_argo_profile(nc_path)

    vector_records = []
    if profiles_df.empty or measurements_df.empty:
        return profiles_df, measurements_df, vector_records

    def column_mean(frame, column):
        """Mean of *column* as a float, or None if absent or all missing."""
        if column not in frame:
            return None
        mean = pd.to_numeric(frame[column], errors="coerce").mean()
        return float(mean) if pd.notna(mean) else None

    def fmt(value, spec):
        """Format *value* with *spec*, or 'n/a' when there is no value."""
        return format(value, spec) if value is not None else "n/a"

    # One pass over the measurements instead of one boolean scan per profile.
    groups = {
        profile_id: rows
        for profile_id, rows in measurements_df.groupby("profile_id", sort=False)
    }

    # Keep the first metadata row per profile id, in file order.
    unique_profiles = profiles_df.drop_duplicates(subset="profile_id", keep="first")

    for _, meta in unique_profiles.iterrows():
        profile_id = meta["profile_id"]
        subset = groups.get(profile_id)
        if subset is None:
            continue  # Profile has no valid measurements

        # Compute derived stats
        min_depth = float(subset["depth"].min())
        max_depth = float(subset["depth"].max())
        mean_temp = column_mean(subset, "temperature")
        mean_salinity = column_mean(subset, "salinity")
        mean_density = column_mean(subset, "density")

        # Create summary text
        summary_text = (
            f"Profile {profile_id} from float {meta['float_id']} observed on {meta['observation_date']} "
            f"at location ({meta['lat']}, {meta['lon']}). "
            f"Cycle number {meta['cycle_number']}. "
            f"Depth range: {min_depth:.1f}m to {max_depth:.1f}m. "
            f"Mean temperature: {fmt(mean_temp, '.2f')}°C, mean salinity: {fmt(mean_salinity, '.2f')} PSU, "
            f"mean density: {fmt(mean_density, '.2f')} kg/m³. "
            f"QC summary: {meta['qc_summary']}."
        )

        # Create vector DB record
        vector_records.append({
            "id": profile_id,
            "text": summary_text,
            "metadata": {
                "float_id": meta["float_id"],
                "cycle_number": int(meta["cycle_number"]) if pd.notna(meta["cycle_number"]) else None,
                "date": str(meta["observation_date"]) if pd.notna(meta["observation_date"]) else None,
                "lat": float(meta["lat"]) if pd.notna(meta["lat"]) else None,
                "lon": float(meta["lon"]) if pd.notna(meta["lon"]) else None,
                "min_depth": min_depth,
                "max_depth": max_depth,
                "mean_temp": mean_temp,
                "mean_salinity": mean_salinity,
                "mean_density": mean_density,
            },
        })

    return profiles_df, measurements_df, vector_records


# -------------------------------
# Example usage: parse one file, report sizes, save the three outputs
# -------------------------------
nc_file = "1900054_prof.nc"  # replace with your file

profiles_df, measurements_df, vector_records = parse_argo_profile_with_summary(nc_file)

# Report how much was parsed, one line per output.
print(
    f"✅ Profiles: {profiles_df.shape}",
    f"✅ Measurements: {measurements_df.shape}",
    f"✅ Vector records: {len(vector_records)}",
    sep="\n",
)

# Save outputs: two CSV tables plus the JSON records for the vector DB.
profiles_df.to_csv("profiles_metadata.csv", index=False)
measurements_df.to_csv("measurements_data.csv", index=False)
with open("vector_metadata.json", "w") as f:
    f.write(json.dumps(vector_records, indent=2))

print(
    "\n📁 Saved:",
    "- profiles_metadata.csv",
    "- measurements_data.csv",
    "- vector_metadata.json (for vector DB ingestion)",
    sep="\n",
)


  n_prof = ds.dims.get('N_PROF', 1)
  n_levels = ds.dims.get('N_LEVELS', 0)


✅ Profiles: (331, 7)
✅ Measurements: (16755, 6)
✅ Vector records: 331

📁 Saved:
- profiles_metadata.csv
- measurements_data.csv
- vector_metadata.json (for vector DB ingestion)
