I'll provide a Python script to convert radiosonde data files to NetCDF format. Since I haven't seen your specific file format, I'll make some assumptions based on common radiosonde data structures. You may need to adjust the script to match your exact file format.

In [15]:
import re
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime
import os

def _starts_with_number(line):
    """Return True when the first whitespace-separated token of *line* is numeric."""
    try:
        float(line.split()[0])
    except (IndexError, ValueError):
        return False
    return True


def parse_radiosonde_file(file_path):
    """Parse a radiosonde text file into a metadata dict and a data table.

    The file is read line by line (blank lines are ignored) in three phases:

    1. Metadata: leading ``key: value`` lines. The phase ends at the first
       non-blank line that contains no colon.
    2. Header: the first line containing one of the column keywords
       (``press``, ``hght``, ``temp``, ``dwpt``, case-insensitive). It is
       split into column names on runs of two or more spaces.
    3. Data: every later line is split on whitespace into one row.

    Lines that never become data rows:

    * divider lines starting with ``---``;
    * lines starting with ``Standard`` or ``Significant``;
    * later lines containing a column keyword (treated as a repeated header;
      the first header is kept);
    * the line directly after a header line when it does not start with a
      number (presumably a units row -- verify against your files).

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(metadata, df)`` tuple: ``metadata`` maps each metadata key to its
        string value, and ``df`` is a ``pandas.DataFrame`` with one column per
        header name. Columns that convert cleanly are numeric; the rest stay
        as strings.

    Raises:
        ValueError: From ``pandas.DataFrame`` when a data row does not have
            the same number of fields as the header.
    """
    header_keywords = ('press', 'hght', 'temp', 'dwpt')

    metadata = {}
    header = []
    data = []
    in_metadata = True
    check_units_row = False

    with open(file_path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            if in_metadata:
                if ':' in line:
                    key, value = line.split(':', 1)
                    metadata[key.strip()] = value.strip()
                    continue
                in_metadata = False

            # Dividers carry neither column names nor data. Treating them as
            # header lines would overwrite the real column names.
            if line.startswith('---'):
                continue

            if any(word in line.lower() for word in header_keywords):
                # Only the first header defines the columns; later matches
                # are repeated headers or commentary.
                if not header:
                    header = re.split(r'\s{2,}', line)
                check_units_row = True
                continue

            # Anything between the metadata and the header is ignored.
            if not header:
                continue

            if check_units_row:
                check_units_row = False
                # Skip a units row, but keep the line when it is already data
                # so the first observation is not lost.
                if not _starts_with_number(line):
                    continue

            if line.startswith(('Standard', 'Significant')):
                continue

            data.append(line.split())

    df = pd.DataFrame(data, columns=header)

    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Non-numeric column: leave the original strings in place.
            pass

    return metadata, df

def create_netcdf(metadata, df, output_path):
    """Build an xarray Dataset from parsed sounding data and write it as NetCDF.

    Numeric columns whose name contains ``hght``/``height`` or ``press``
    become the ``height`` / ``pressure`` variables, each on a dimension of
    the same name. The last such column found is the primary dimension for
    every other numeric column; without one, an integer ``index`` dimension
    is used. Non-numeric columns are dropped.

    NOTE(review): when both a height and a pressure column exist they end up
    on two separate dimensions -- confirm this is the intended layout.

    Args:
        metadata: Mapping of metadata keys to strings. ``'Launch Time'``,
            ``'Station Number'`` and ``'Station Name'`` are read; missing
            keys default to ``''``.
        df: ``pandas.DataFrame`` with one column per measured quantity.
        output_path: Destination path of the NetCDF file.

    Returns:
        The ``xarray.Dataset`` that was written to ``output_path``.
    """
    launch_time = metadata.get('Launch Time', '')
    station_id = metadata.get('Station Number', '')
    station_name = metadata.get('Station Name', '')

    try:
        launch_dt = datetime.strptime(launch_time, '%Y-%m-%d %H:%M:%S')
    except (ValueError, TypeError):
        # Best-effort fallback kept from the original behaviour: an
        # unparseable launch time is replaced by the conversion time.
        launch_dt = datetime.now()

    ds = xr.Dataset()

    ds.attrs['title'] = f"Radiosonde Data - {station_name}"
    ds.attrs['station_id'] = station_id
    ds.attrs['station_name'] = station_name
    ds.attrs['launch_time'] = launch_time
    ds.attrs['source'] = 'Radiosonde'
    ds.attrs['history'] = f"Created {datetime.now().isoformat()}"

    numeric_columns = [
        col for col in df.columns
        if np.issubdtype(df[col].dtype, np.number)
    ]

    # First pass: promote height / pressure columns to dimensions and
    # remember which columns were consumed.
    primary_dim = None
    dimension_columns = set()
    for col in numeric_columns:
        lowered = col.lower()
        if 'hght' in lowered or 'height' in lowered:
            ds['height'] = ('height', df[col].values)
            ds['height'].attrs['units'] = 'm'
            ds['height'].attrs['long_name'] = 'Height above sea level'
            primary_dim = 'height'
            dimension_columns.add(col)
        elif 'press' in lowered:
            ds['pressure'] = ('pressure', df[col].values)
            ds['pressure'].attrs['units'] = 'hPa'
            ds['pressure'].attrs['long_name'] = 'Atmospheric pressure'
            primary_dim = 'pressure'
            dimension_columns.add(col)

    if primary_dim is None:
        ds['index'] = ('index', np.arange(len(df)))
        primary_dim = 'index'

    # Second pass: every remaining numeric column becomes a data variable.
    # Skipping by the recorded set (not by exact name) keeps columns such as
    # 'Press_hPa' from being written a second time.
    for col in numeric_columns:
        if col in dimension_columns:
            continue

        var_name = col.lower().replace(' ', '_')
        ds[var_name] = (primary_dim, df[col].values)

        if 'temp' in var_name:
            ds[var_name].attrs['units'] = 'Celsius'
            ds[var_name].attrs['long_name'] = 'Temperature'
        elif 'dwpt' in var_name or 'dew' in var_name:
            ds[var_name].attrs['units'] = 'Celsius'
            ds[var_name].attrs['long_name'] = 'Dew point temperature'
        elif 'rh' in var_name:
            ds[var_name].attrs['units'] = '%'
            ds[var_name].attrs['long_name'] = 'Relative humidity'
        elif 'wind' in var_name:
            ds[var_name].attrs['units'] = 'm/s'
            ds[var_name].attrs['long_name'] = 'Wind speed'

    # The launch time is stored as a scalar variable.
    ds['time'] = launch_dt
    ds['time'].attrs['long_name'] = 'Launch time'

    ds.to_netcdf(output_path)
    return ds

def process_directory(input_dir, output_dir):
    """Convert every ``.txt`` file in *input_dir* to NetCDF and combine them.

    Each file is parsed with ``parse_radiosonde_file`` and written by
    ``create_netcdf`` to ``<output_dir>/<stem>.nc``. A file that fails is
    reported and skipped so the rest of the batch still runs. If at least
    one file succeeded, the datasets are concatenated along ``time`` and
    saved as ``combined_radiosonde_data.nc`` in *output_dir*.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.txt`` files.
        output_dir: Directory for the NetCDF files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    all_datasets = []

    # os.listdir returns entries in arbitrary order. Sorting makes the
    # processing order, and so the order along the combined time axis,
    # reproducible.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.txt'):  # adjust extension as needed
            continue

        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.nc")

        print(f"Processing {filename}...")
        try:
            metadata, df = parse_radiosonde_file(input_path)
            ds = create_netcdf(metadata, df, output_path)
            all_datasets.append(ds)
            print(f"Saved to {output_path}")
        except Exception as e:
            # Batch boundary: report the failure and continue with the
            # remaining files.
            print(f"Error processing {filename}: {str(e)}")

    if all_datasets:
        combined_path = os.path.join(output_dir, "combined_radiosonde_data.nc")
        combined_ds = xr.concat(all_datasets, dim='time')
        combined_ds.to_netcdf(combined_path)
        print(f"Combined all files into {combined_path}")

# Example usage
if __name__ == "__main__":
    # Both directories default to the current working directory. Replace
    # them with real locations (for example "path/to/your/radiosonde/files"
    # and "path/to/output/netcdf/files") before converting your own data.
    input_directory = output_directory = "."

    process_directory(input_directory, output_directory)

How to Use This Script
1. Save the script as radiosonde_to_netcdf.py.

2. Set input_directory and output_directory at the bottom of the script to your own paths (both default to the current directory).

3. Run the script: python radiosonde_to_netcdf.py

Customizing the Script
Since I haven't seen your exact file format, you may need to adjust:

Metadata parsing: Modify the parse_radiosonde_file function to match how your metadata is structured

Column names: Update the variable detection logic in create_netcdf to match your column headers

Data sections: Adjust how the script identifies the start/end of data sections if your format differs

Combining Multiple Files
The script automatically combines all processed files into a single NetCDF file named combined_radiosonde_data.nc in the output directory. The combination is done along the time dimension.

Alternative Approach for Large Datasets
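If you have hundreds of files and memory becomes an issue, you might want to process them in batches or use Dask for out-of-core computation. The function below takes the Dask route: instead of keeping every dataset in memory, it re-reads the per-file NetCDF outputs with xarray's open_mfdataset, which loads the data lazily (Dask must be installed):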

In [None]:
import xarray as xr
import glob

def combine_netcdf_files(output_dir):
    """Combine every per-file NetCDF in *output_dir* into one file.

    All ``*.nc`` files in *output_dir* are opened together with
    ``xr.open_mfdataset(..., combine='by_coords')`` and written to
    ``combined_radiosonde_data.nc`` in the same directory. A previous
    combined file is left out of the inputs.

    Args:
        output_dir: Directory holding the NetCDF files to combine.

    Returns:
        None. Prints a message and returns early when there is nothing to
        combine.
    """
    combined_name = "combined_radiosonde_data.nc"
    combined_path = os.path.join(output_dir, combined_name)

    # Compare the base name exactly so an unrelated file that merely ends
    # with the same suffix is not dropped. Sorting keeps the input order
    # reproducible.
    files = sorted(
        path
        for path in glob.glob(os.path.join(output_dir, "*.nc"))
        if os.path.basename(path) != combined_name
    )

    if not files:
        print("No NetCDF files found to combine")
        return

    # The context manager closes the source files once the combined file is
    # written. Unclosed handles keep the inputs locked, which can surface
    # later as PermissionError on Windows.
    with xr.open_mfdataset(files, combine='by_coords') as combined_ds:
        combined_ds.to_netcdf(combined_path)

    print(f"Combined {len(files)} files into {combined_path}")

In [10]:
import pandas as pd
import xarray as xr
from datetime import datetime
from io import StringIO
import os

def _header_value(line):
    """Return the text after the first colon of a ``label: value`` line."""
    # maxsplit=1 keeps values that themselves contain a colon intact.
    return line.split(':', 1)[1].strip()


def convert_radiosonde_to_netcdf(file_path, output_dir="."):
    """Convert one radiosonde summary file to a NetCDF file.

    The first five lines are read as ``label: value`` pairs, in this order:
    station, location (``lat lon alt``), launch time
    (``%Y %m %d at %H %M UTC``), radiosonde type, radiosonde number. The
    data block starts at the first line whose first non-blank character is
    ``0`` and is read as twelve whitespace-separated columns. Rows with no
    pressure value are dropped. Each remaining row is indexed by launch
    time plus its elapsed ``min``/``sec``.

    Args:
        file_path: Path of the summary file to read.
        output_dir: Directory for the ``.nc`` file; created if missing.

    Returns:
        Path of the NetCDF file that was written.

    Raises:
        IndexError: If the file has fewer than five lines, one of those
            lines has no colon, or the location has fewer than three fields.
        ValueError: If the location or launch time cannot be parsed, or no
            data block is found.
        PermissionError: Raised by the write when the target cannot be
            opened for writing. NOTE(review): on Windows this is presumably
            an existing ``.nc`` still open elsewhere -- confirm.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        all_lines = f.readlines()

    station = _header_value(all_lines[0])

    location_parts = _header_value(all_lines[1]).split()
    lat = float(location_parts[0])
    lon = float(location_parts[1])
    alt = float(location_parts[2])  # altitude in meters

    launch_time = datetime.strptime(
        _header_value(all_lines[2]), "%Y %m %d at %H %M UTC"
    )

    rs_type = _header_value(all_lines[3])
    rs_number = _header_value(all_lines[4])

    # The first data row starts with minute 0.
    data_start_idx = next(
        (i for i, line in enumerate(all_lines) if line.strip().startswith('0')),
        None,
    )
    if data_start_idx is None:
        raise ValueError(f"No data block found in {file_path}")
    data_lines = all_lines[data_start_idx:]

    columns = ['min', 'sec', 'Hght_gpm', 'Press_hPa', 'Temp_degC', 'RH_percent',
               'DewPoint_degC', 'MixingRatio_gpkg', 'WindDir_deg', 'WindSpd_ms',
               'Latitude_deg', 'Longitude_deg']

    # sep=r'\s+' replaces the deprecated delim_whitespace=True, which
    # triggers a FutureWarning on recent pandas.
    df = pd.read_csv(StringIO(''.join(data_lines)),
                     sep=r'\s+',
                     names=columns,
                     engine='python')

    # Drop rows with missing Pressure (filtering interpolated/special levels)
    df = df.dropna(subset=['Press_hPa'])

    elapsed_seconds = df['min'] * 60 + df['sec']
    time = pd.to_timedelta(elapsed_seconds, unit='s') + pd.Timestamp(launch_time)

    # Source column for each output variable.
    variable_columns = {
        "Pressure": "Press_hPa",
        "Temperature": "Temp_degC",
        "Humidity": "RH_percent",
        "DewPoint": "DewPoint_degC",
        "MixingRatio": "MixingRatio_gpkg",
        "WindDirection": "WindDir_deg",
        "WindSpeed": "WindSpd_ms",
        "Height": "Hght_gpm",
        "Latitude": "Latitude_deg",
        "Longitude": "Longitude_deg",
    }

    # Plain arrays are passed so the result does not depend on how xarray
    # interprets pandas objects and their index.
    ds = xr.Dataset(
        {
            name: ("time", df[column].to_numpy())
            for name, column in variable_columns.items()
        },
        coords={"time": ("time", time.to_numpy())},
        attrs={
            "station": station,
            "latitude": lat,
            "longitude": lon,
            "altitude_m": alt,
            "launch_time": str(launch_time),
            "RS_type": rs_type,
            "RS_number": rs_number,
        },
    )

    # Create the target directory up front so a missing directory does not
    # abort the write.
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.splitext(os.path.basename(file_path))[0] + ".nc"
    output_path = os.path.join(output_dir, filename)
    ds.to_netcdf(output_path)
    print(f"NetCDF saved to: {output_path}")
    return output_path

In [None]:
# Single file conversion
convert_radiosonde_to_netcdf("summary_2407130557.EDT", output_dir="./output")

  df = pd.read_csv(StringIO(''.join(data_lines)),


PermissionError: [Errno 13] Permission denied: 'd:\\Projects\\my_projects\\work\\radio_sonde\\output\\summary_2407130557.nc'