In [8]:
import netCDF4 as nc
import pandas as pd
import xarray as xr

In [9]:
%cd ../argo_data/

c:\Personal Files\Arya\Programs\FloatChat\argo_data


In [36]:
try:
    ds = xr.open_dataset("1900054_prof.nc", engine="netcdf4",decode_timedelta=False)
except ValueError:
    ds = xr.open_dataset("1900054_prof.nc", engine="scipy",decode_timedelta=False)


In [37]:
ds

In [30]:
print(ds)

<xarray.Dataset> Size: 1MB
Dimensions:                              (N_PARAM: 3, N_MEASUREMENT: 6653,
                                          N_CYCLE: 332, N_HISTORY: 25)
Dimensions without coordinates: N_PARAM, N_MEASUREMENT, N_CYCLE, N_HISTORY
Data variables: (12/102)
    DATA_TYPE                            object 8B ...
    FORMAT_VERSION                       object 8B ...
    HANDBOOK_VERSION                     object 8B ...
    REFERENCE_DATE_TIME                  object 8B ...
    DATE_CREATION                        object 8B ...
    DATE_UPDATE                          object 8B ...
    ...                                   ...
    HISTORY_PARAMETER                    (N_HISTORY) object 200B ...
    HISTORY_PREVIOUS_VALUE               (N_HISTORY) float32 100B ...
    HISTORY_INDEX_DIMENSION              (N_HISTORY) object 200B ...
    HISTORY_START_INDEX                  (N_HISTORY) float64 200B ...
    HISTORY_STOP_INDEX                   (N_HISTORY) float64 200B ...
   

In [31]:
ds=ds.to_dataframe()

KeyboardInterrupt: 

In [17]:
ds.to_csv('13857_prof.csv',index=False)

In [32]:
# Handle large NetCDF files - Memory efficient approach
print("=== MEMORY-EFFICIENT NETCDF PROCESSING ===")

# First, let's examine the dataset structure without loading all data
print("Dataset dimensions:")
for dim_name, dim_size in ds.dims.items():
    print(f"  {dim_name}: {dim_size}")

print("\nData variables and their sizes:")
total_memory = 0
for var_name, var in ds.data_vars.items():
    size_mb = var.nbytes / (1024 * 1024)  # Convert to MB
    total_memory += size_mb
    print(f"  {var_name}: {var.shape} - {size_mb:.2f} MB")

print(f"\nTotal estimated memory needed: {total_memory:.2f} MB ({total_memory/1024:.2f} GB)")

# Check which variables are causing the memory issue
large_vars = []
small_vars = []
for var_name, var in ds.data_vars.items():
    size_mb = var.nbytes / (1024 * 1024)
    if size_mb > 100:  # Variables larger than 100MB
        large_vars.append((var_name, size_mb, var.shape))
    else:
        small_vars.append((var_name, size_mb, var.shape))

print(f"\nLarge variables (>100MB):")
for name, size, shape in large_vars:
    print(f"  {name}: {size:.2f} MB, shape: {shape}")

print(f"\nSmaller variables (<100MB):")
for name, size, shape in small_vars:
    print(f"  {name}: {size:.2f} MB, shape: {shape}")

=== MEMORY-EFFICIENT NETCDF PROCESSING ===
Dataset dimensions:
  N_PARAM: 3
  N_MEASUREMENT: 6653
  N_CYCLE: 332
  N_HISTORY: 25

Data variables and their sizes:
  DATA_TYPE: () - 0.00 MB
  FORMAT_VERSION: () - 0.00 MB
  HANDBOOK_VERSION: () - 0.00 MB
  REFERENCE_DATE_TIME: () - 0.00 MB
  DATE_CREATION: () - 0.00 MB
  DATE_UPDATE: () - 0.00 MB
  PLATFORM_NUMBER: () - 0.00 MB
  PROJECT_NAME: () - 0.00 MB
  PI_NAME: () - 0.00 MB
  TRAJECTORY_PARAMETERS: (3,) - 0.00 MB
  DATA_CENTRE: () - 0.00 MB
  DATA_STATE_INDICATOR: () - 0.00 MB
  PLATFORM_TYPE: () - 0.00 MB
  FLOAT_SERIAL_NO: () - 0.00 MB
  FIRMWARE_VERSION: () - 0.00 MB
  WMO_INST_TYPE: () - 0.00 MB
  POSITIONING_SYSTEM: () - 0.00 MB
  JULD: (6653,) - 0.05 MB
  JULD_STATUS: (6653,) - 0.05 MB
  JULD_QC: (6653,) - 0.05 MB
  JULD_ADJUSTED: (6653,) - 0.05 MB
  JULD_ADJUSTED_STATUS: (6653,) - 0.05 MB
  JULD_ADJUSTED_QC: (6653,) - 0.05 MB
  LATITUDE: (6653,) - 0.05 MB
  LONGITUDE: (6653,) - 0.05 MB
  POSITION_ACCURACY: (6653,) - 0.05 MB
 

  for dim_name, dim_size in ds.dims.items():


In [34]:
# Strategy 1: Extract only small variables first
print("=== EXTRACTING SMALLER VARIABLES ===")

small_data = {}

# Add coordinates (usually small)
for coord_name, coord in ds.coords.items():
    try:
        small_data[coord_name] = coord.values
        print(f"✅ Loaded coordinate: {coord_name} - {coord.shape}")
    except MemoryError:
        print(f"❌ Memory error loading coordinate: {coord_name}")

# Add small data variables
for name, size, shape in small_vars:
    try:
        small_data[name] = ds[name].values
        print(f"✅ Loaded variable: {name} - {shape}")
    except MemoryError:
        print(f"❌ Memory error loading variable: {name}")

if small_data:
    # Check if arrays have compatible lengths
    lengths = []
    for key, values in small_data.items():
        if hasattr(values, 'shape'):
            if len(values.shape) == 1:
                lengths.append(len(values))
            else:
                lengths.append(values.size)  # Total number of elements
        else:
            lengths.append(1)
    
    print(f"\nArray lengths: {set(lengths)}")
    
    # Create DataFrame with small variables
    if len(set(lengths)) <= 2:  # At most 2 different lengths
        # Find the most common length
        from collections import Counter
        length_counts = Counter(lengths)
        target_length = length_counts.most_common(1)[0][0]
        
        filtered_data = {}
        for key, values in small_data.items():
            if hasattr(values, 'shape') and len(values.shape) == 1 and len(values) == target_length:
                filtered_data[key] = values
            elif hasattr(values, 'shape') and values.size == target_length:
                filtered_data[key] = values.flatten()
        
        if filtered_data:
            df_small = pd.DataFrame(filtered_data)
            print(f"\n✅ Created DataFrame with small variables: {df_small.shape}")
            print(f"Columns: {list(df_small.columns)}")
            
            # Save small variables to CSV
            df_small.to_csv('argo_small_variables.csv', index=False)
            print("✅ Saved small variables to: argo_small_variables.csv")
            print(df_small.head())
        else:
            print("No compatible small variables found")
    else:
        print("Variables have too many different dimensions to combine easily")
else:
    print("No small variables could be loaded")

=== EXTRACTING SMALLER VARIABLES ===
✅ Loaded variable: DATA_TYPE - ()
✅ Loaded variable: FORMAT_VERSION - ()
✅ Loaded variable: HANDBOOK_VERSION - ()
✅ Loaded variable: REFERENCE_DATE_TIME - ()
✅ Loaded variable: DATE_CREATION - ()
✅ Loaded variable: DATE_UPDATE - ()
✅ Loaded variable: PLATFORM_NUMBER - ()
✅ Loaded variable: PROJECT_NAME - ()
✅ Loaded variable: PI_NAME - ()
✅ Loaded variable: TRAJECTORY_PARAMETERS - (3,)
✅ Loaded variable: DATA_CENTRE - ()
✅ Loaded variable: DATA_STATE_INDICATOR - ()
✅ Loaded variable: PLATFORM_TYPE - ()
✅ Loaded variable: FLOAT_SERIAL_NO - ()
✅ Loaded variable: FIRMWARE_VERSION - ()
✅ Loaded variable: WMO_INST_TYPE - ()
✅ Loaded variable: POSITIONING_SYSTEM - ()
✅ Loaded variable: JULD - (6653,)
✅ Loaded variable: JULD_STATUS - (6653,)
✅ Loaded variable: JULD_QC - (6653,)
✅ Loaded variable: JULD_ADJUSTED - (6653,)
✅ Loaded variable: JULD_ADJUSTED_STATUS - (6653,)
✅ Loaded variable: JULD_ADJUSTED_QC - (6653,)
✅ Loaded variable: LATITUDE - (6653,)
✅ Lo

In [35]:
# Strategy 2: Handle large variables by chunking or sampling
print("=== HANDLING LARGE VARIABLES ===")

for name, size, shape in large_vars:
    print(f"\nProcessing large variable: {name} ({size:.2f} MB)")
    var = ds[name]
    
    # Option 1: Take a sample/subset of the data
    if len(shape) >= 2:
        print(f"  Dimensions: {var.dims}")
        print(f"  Shape: {shape}")
        
        # Try to take a smaller subset
        try:
            if len(shape) == 4:  # e.g., (time, depth, lat, lon)
                # Take first time slice, every 10th depth, every 10th lat/lon
                sample = var.isel({var.dims[0]: 0, var.dims[1]: slice(0, None, 10), 
                                  var.dims[2]: slice(0, None, 10), var.dims[3]: slice(0, None, 10)})
                sample_data = sample.values
                print(f"  ✅ Sampled to shape: {sample_data.shape}")
                
                # Flatten and save
                flattened = sample_data.flatten()
                with open(f'{name}_sample.csv', 'w') as f:
                    f.write(f"{name}_sample\n")
                    for value in flattened:
                        f.write(f"{value}\n")
                print(f"  ✅ Saved sample to: {name}_sample.csv")
                
            elif len(shape) == 3:  # e.g., (time, lat, lon)
                # Take first time slice, every 5th lat/lon
                sample = var.isel({var.dims[0]: 0, var.dims[1]: slice(0, None, 5), var.dims[2]: slice(0, None, 5)})
                sample_data = sample.values
                print(f"  ✅ Sampled to shape: {sample_data.shape}")
                
                # Flatten and save
                flattened = sample_data.flatten()
                with open(f'{name}_sample.csv', 'w') as f:
                    f.write(f"{name}_sample\n")
                    for value in flattened:
                        f.write(f"{value}\n")
                print(f"  ✅ Saved sample to: {name}_sample.csv")
                
            elif len(shape) == 2:  # e.g., (lat, lon)
                # Take every 5th point in each dimension
                sample = var.isel({var.dims[0]: slice(0, None, 5), var.dims[1]: slice(0, None, 5)})
                sample_data = sample.values
                print(f"  ✅ Sampled to shape: {sample_data.shape}")
                
                # Save as 2D structure
                pd.DataFrame(sample_data).to_csv(f'{name}_sample.csv', index=False)
                print(f"  ✅ Saved sample to: {name}_sample.csv")
                
        except MemoryError:
            print(f"  ❌ Still too large, try smaller subset")
        except Exception as e:
            print(f"  ❌ Error processing {name}: {e}")
    
print(f"\n=== SUMMARY ===")
print("Created files:")
import os
csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
for f in csv_files:
    size = os.path.getsize(f) / 1024  # KB
    print(f"  {f}: {size:.1f} KB")

=== HANDLING LARGE VARIABLES ===

=== SUMMARY ===
Created files:
  13857_prof.csv: 0.9 KB
  13857_prof_converted.csv: 0.9 KB


In [7]:
# Method 1: Direct conversion using to_dataframe()
try:
    # Convert xarray dataset to pandas DataFrame
    df = ds.to_dataframe()
    print("✅ Successfully converted using to_dataframe()")
    print(f"DataFrame shape: {df.shape}")
    print(f"Index levels: {df.index.names}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst few rows:")
    print(df.head())
    
except Exception as e:
    print(f"❌ Error with to_dataframe(): {e}")
    print("This might happen if the dataset has incompatible dimensions")
    
    # Method 2: Reset index to flatten the DataFrame
    try:
        df = ds.to_dataframe().reset_index()
        print("✅ Successfully converted using to_dataframe().reset_index()")
        print(f"DataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print("\nFirst few rows:")
        print(df.head())
    except Exception as e2:
        print(f"❌ Error with reset_index(): {e2}")
        print("Let's try a manual approach...")

✅ Successfully converted using to_dataframe()
DataFrame shape: (0, 58)
Index levels: ['N_PROF', 'N_PARAM', 'N_LEVELS', 'N_CALIB', 'N_HISTORY']
Columns: ['DATA_TYPE', 'FORMAT_VERSION', 'HANDBOOK_VERSION', 'REFERENCE_DATE_TIME', 'DATE_CREATION', 'DATE_UPDATE', 'PLATFORM_NUMBER', 'PROJECT_NAME', 'PI_NAME', 'STATION_PARAMETERS', 'CYCLE_NUMBER', 'DIRECTION', 'DATA_CENTRE', 'DC_REFERENCE', 'DATA_STATE_INDICATOR', 'DATA_MODE', 'PLATFORM_TYPE', 'FLOAT_SERIAL_NO', 'FIRMWARE_VERSION', 'WMO_INST_TYPE', 'JULD', 'JULD_QC', 'JULD_LOCATION', 'LATITUDE', 'LONGITUDE', 'POSITION_QC', 'POSITIONING_SYSTEM', 'PROFILE_PRES_QC', 'PROFILE_TEMP_QC', 'VERTICAL_SAMPLING_SCHEME', 'CONFIG_MISSION_NUMBER', 'PRES', 'PRES_QC', 'PRES_ADJUSTED', 'PRES_ADJUSTED_QC', 'PRES_ADJUSTED_ERROR', 'TEMP', 'TEMP_QC', 'TEMP_ADJUSTED', 'TEMP_ADJUSTED_QC', 'TEMP_ADJUSTED_ERROR', 'PARAMETER', 'SCIENTIFIC_CALIB_EQUATION', 'SCIENTIFIC_CALIB_COEFFICIENT', 'SCIENTIFIC_CALIB_COMMENT', 'SCIENTIFIC_CALIB_DATE', 'HISTORY_INSTITUTION', 'HISTO

In [None]:
# Method 3: Manual conversion (if automatic methods fail)
print("=== MANUAL CONVERSION APPROACH ===")

# Create a dictionary to store all data
data_dict = {}

# Add coordinates as columns
for coord_name, coord in ds.coords.items():
    print(f"Coordinate: {coord_name} - shape: {coord.shape}")
    if coord.ndim == 1:
        data_dict[coord_name] = coord.values

# Add data variables as columns  
for var_name, var in ds.data_vars.items():
    print(f"Variable: {var_name} - shape: {var.shape} - dims: {var.dims}")
    
    if var.ndim == 1:
        # 1D variable - add directly
        data_dict[var_name] = var.values
    elif var.ndim == 2:
        # 2D variable - flatten or handle differently
        data_dict[var_name] = var.values.flatten()
    else:
        # Higher dimensions - take first slice or flatten
        data_dict[var_name] = var.values.flatten()

# Check if we have data to work with
if data_dict:
    # Make sure all arrays have the same length
    lengths = [len(v) for v in data_dict.values()]
    print(f"\nArray lengths: {lengths}")
    
    if len(set(lengths)) == 1:
        # All same length - perfect!
        df_manual = pd.DataFrame(data_dict)
        print(f"✅ Manual conversion successful!")
        print(f"DataFrame shape: {df_manual.shape}")
        print(f"Columns: {list(df_manual.columns)}")
        print("\nFirst few rows:")
        print(df_manual.head())
    else:
        print(f"⚠️  Arrays have different lengths: {set(lengths)}")
        print("Need to handle this - let's see the structure first...")
else:
    print("No data found in the dataset!")

In [18]:
# Save the DataFrame to CSV
if 'df' in locals():
    # Use the successfully created DataFrame
    csv_filename = '13857_prof_converted.csv'
    df.to_csv(csv_filename, index=False)
    print(f"✅ DataFrame saved to: {csv_filename}")
    
    # Show some statistics about the data
    print(f"\n=== DATA SUMMARY ===")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"\nData types:")
    print(df.dtypes)
    print(f"\nBasic statistics:")
    print(df.describe())
    
elif 'df_manual' in locals():
    # Use the manually created DataFrame
    csv_filename = '13857_prof_converted.csv'
    df_manual.to_csv(csv_filename, index=False)
    print(f"✅ DataFrame saved to: {csv_filename}")
    
    # Show some statistics about the data
    print(f"\n=== DATA SUMMARY ===")
    print(f"Shape: {df_manual.shape}")
    print(f"Columns: {list(df_manual.columns)}")
    print(f"\nData types:")
    print(df_manual.dtypes)
    print(f"\nBasic statistics:")
    print(df_manual.describe())
    
else:
    print("❌ No DataFrame was successfully created. Let's debug the dataset structure...")
    print("\nDataset info:")
    print(ds.info())

✅ DataFrame saved to: 13857_prof_converted.csv

=== DATA SUMMARY ===
Shape: (0, 58)
Columns: ['DATA_TYPE', 'FORMAT_VERSION', 'HANDBOOK_VERSION', 'REFERENCE_DATE_TIME', 'DATE_CREATION', 'DATE_UPDATE', 'PLATFORM_NUMBER', 'PROJECT_NAME', 'PI_NAME', 'STATION_PARAMETERS', 'CYCLE_NUMBER', 'DIRECTION', 'DATA_CENTRE', 'DC_REFERENCE', 'DATA_STATE_INDICATOR', 'DATA_MODE', 'PLATFORM_TYPE', 'FLOAT_SERIAL_NO', 'FIRMWARE_VERSION', 'WMO_INST_TYPE', 'JULD', 'JULD_QC', 'JULD_LOCATION', 'LATITUDE', 'LONGITUDE', 'POSITION_QC', 'POSITIONING_SYSTEM', 'PROFILE_PRES_QC', 'PROFILE_TEMP_QC', 'VERTICAL_SAMPLING_SCHEME', 'CONFIG_MISSION_NUMBER', 'PRES', 'PRES_QC', 'PRES_ADJUSTED', 'PRES_ADJUSTED_QC', 'PRES_ADJUSTED_ERROR', 'TEMP', 'TEMP_QC', 'TEMP_ADJUSTED', 'TEMP_ADJUSTED_QC', 'TEMP_ADJUSTED_ERROR', 'PARAMETER', 'SCIENTIFIC_CALIB_EQUATION', 'SCIENTIFIC_CALIB_COEFFICIENT', 'SCIENTIFIC_CALIB_COMMENT', 'SCIENTIFIC_CALIB_DATE', 'HISTORY_INSTITUTION', 'HISTORY_STEP', 'HISTORY_SOFTWARE', 'HISTORY_SOFTWARE_RELEASE', '