
# ERA5 All Subfolders Check

This notebook scans **all** `nc_*` subfolders under `DATA_ROOT`, loads them lazily with xarray, and builds a side‑by‑side summary:
- file counts
- time range & length
- spatial coverage & resolution
- variable names
- basic per‑variable stats (first time slice only; optional & limited for speed)
- pairwise alignment checks (do time/lat/lon shapes match?)

> Tip: If you have many files, this still stays fairly light because we use `open_mfdataset(..., combine='by_coords')` lazily.


In [1]:

from pathlib import Path
import numpy as np
import pandas as pd
import xarray as xr

# ------------ CONFIG -------------
# Change this to your path:
# DATA_ROOT = Path('/home/ryukomura/Desktop/WeatherForcastWithHrmAndSatSwinMAE/raw_data')
DATA_ROOT = Path('./raw_data')

# NetCDF engine. Set to None to let xarray decide automatically.
NETCDF_ENGINE = None  # 'netcdf4' or 'h5netcdf' or None

# Whether to compute quick per-variable stats on first time slice for each subfolder.
# Set to 0 to skip, or a positive integer to limit to the first N variables in that folder.
PER_VAR_STATS_LIMIT = 6  # e.g., 6; set 0 to disable
# ---------------------------------

print('DATA_ROOT =', DATA_ROOT.resolve())


DATA_ROOT = /home/ryukomura/Desktop/WeatherForcastWithHrmAndSatSwinMAE/dataset/raw_data


In [2]:

def list_nc_subdirs(root: Path):
    return sorted([p for p in root.iterdir() if p.is_dir() and p.name.startswith('nc_')])

def list_files(folder: Path, pattern='*.nc'):
    return sorted([str(p) for p in folder.glob(pattern)])

def open_mf(files, engine=None):
    if not files:
        return None
    if engine:
        return xr.open_mfdataset(files, combine='by_coords', engine=engine)
    return xr.open_mfdataset(files, combine='by_coords')

def safe_float(x):
    try:
        return float(x)
    except Exception:
        return None


## Scan all subfolders and build a summary table

In [3]:

subs = list_nc_subdirs(DATA_ROOT)
print(f'Found {len(subs)} subfolders:', [s.name for s in subs])

rows = []
pervar_rows = []

for s in subs:
    files = list_files(s)
    rec = {
        'subdir': s.name,
        'n_files': len(files),
        'time_len': None, 'time_start': None, 'time_end': None,
        'lat_len': None, 'lon_len': None,
        'lat_min': None, 'lat_max': None,
        'lon_min': None, 'lon_max': None,
        'lat_res': None, 'lon_res': None,
        'variables': None,
        'error': None,
    }
    if not files:
        rows.append(rec)
        continue
    ds = None
    try:
        ds = open_mf(files, NETCDF_ENGINE)
        
        print("####################################################")
        print(f'Processing {s.name} with {len(files)} files...')
        print(ds)
        
        # Dims
        dims = dict(ds.dims)
        rec['time_len'] = int(dims.get('time', -1)) if 'time' in dims else None
        rec['lat_len']  = int(dims.get('latitude', dims.get('lat', -1)))
        rec['lon_len']  = int(dims.get('longitude', dims.get('lon', -1)))

        # Coords and ranges
        latn = 'latitude' if 'latitude' in ds.coords else ('lat' if 'lat' in ds.coords else None)
        lonn = 'longitude' if 'longitude' in ds.coords else ('lon' if 'lon' in ds.coords else None)

        if 'time' in ds.coords:
            tvals = ds['time'].values
            if len(tvals) > 0:
                import pandas as pd
                rec['time_start'] = pd.to_datetime(tvals[0]).to_pydatetime()
                rec['time_end']   = pd.to_datetime(tvals[-1]).to_pydatetime()

        if latn:
            lat = ds[latn].values
            rec['lat_min'] = safe_float(lat.min())
            rec['lat_max'] = safe_float(lat.max())
            if len(lat) > 1:
                rec['lat_res'] = safe_float(np.abs(lat[1] - lat[0]))

        if lonn:
            lon = ds[lonn].values
            rec['lon_min'] = safe_float(lon.min())
            rec['lon_max'] = safe_float(lon.max())
            if len(lon) > 1:
                rec['lon_res'] = safe_float(np.abs(lon[1] - lon[0]))

        # Variables
        vnames = list(ds.data_vars)
        rec['variables'] = ','.join(vnames[:12]) + ('...' if len(vnames) > 12 else '')

        # Optional quick per-variable stats (first time slice only)
        if PER_VAR_STATS_LIMIT and len(vnames) > 0:
            limit = min(PER_VAR_STATS_LIMIT, len(vnames))
            for vn in vnames[:limit]:
                try:
                    da = ds[vn]
                    if 'time' in da.dims:
                        da0 = da.isel(time=0)
                    else:
                        da0 = da
                    a = da0.values
                    pervar_rows.append({
                        'subdir': s.name, 'var': vn,
                        'min': float(np.nanmin(a)),
                        'max': float(np.nanmax(a)),
                        'mean': float(np.nanmean(a)),
                        'std': float(np.nanstd(a)),
                        'shape': str(list(a.shape)),
                        'dtype': str(a.dtype),
                    })
                except Exception as e:
                    pervar_rows.append({'subdir': s.name, 'var': vn, 'error': str(e)})
    except Exception as e:
        rec['error'] = str(e)
    finally:
        if ds is not None:
            ds.close()
    rows.append(rec)

summary_df = pd.DataFrame(rows).sort_values('subdir').reset_index(drop=True)
summary_df

Found 7 subfolders: ['nc_cp', 'nc_r', 'nc_sp', 'nc_ssrd', 'nc_t', 'nc_u10', 'nc_v10']
####################################################
Processing nc_cp with 25 files...
<xarray.Dataset> Size: 104MB
Dimensions:     (valid_time: 25, latitude: 721, longitude: 1440)
Coordinates:
    number      int64 8B 0
  * valid_time  (valid_time) datetime64[ns] 200B 2025-08-01 ... 2025-08-02
  * latitude    (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
  * longitude   (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
    expver      <U4 16B '0005'
Data variables:
    cp          (valid_time, latitude, longitude) float32 104MB dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:         

  dims = dict(ds.dims)


####################################################
Processing nc_r with 25 files...
<xarray.Dataset> Size: 623MB
Dimensions:         (valid_time: 25, pressure_level: 6, latitude: 721,
                     longitude: 1440)
Coordinates:
    number          int64 8B 0
  * valid_time      (valid_time) datetime64[ns] 200B 2025-08-01 ... 2025-08-02
  * pressure_level  (pressure_level) float64 48B 1e+03 850.0 ... 400.0 300.0
  * latitude        (latitude) float64 6kB 90.0 89.75 89.5 ... -89.75 -90.0
  * longitude       (longitude) float64 12kB 0.0 0.25 0.5 ... 359.2 359.5 359.8
    expver          <U4 16B '0005'
Data variables:
    r               (valid_time, pressure_level, latitude, longitude) float32 623MB dask.array<chunksize=(1, 3, 361, 720), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             Europea

  dims = dict(ds.dims)


####################################################
Processing nc_sp with 25 files...
<xarray.Dataset> Size: 104MB
Dimensions:     (valid_time: 25, latitude: 721, longitude: 1440)
Coordinates:
    number      int64 8B 0
  * valid_time  (valid_time) datetime64[ns] 200B 2025-08-01 ... 2025-08-02
  * latitude    (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
  * longitude   (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
    expver      <U4 16B '0005'
Data variables:
    sp          (valid_time, latitude, longitude) float32 104MB dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2025-08-12T01:22 GRIB to CDM+CF via cfgrib-0.9.1...


  dims = dict(ds.dims)


####################################################
Processing nc_ssrd with 25 files...
<xarray.Dataset> Size: 104MB
Dimensions:     (valid_time: 25, latitude: 721, longitude: 1440)
Coordinates:
    number      int64 8B 0
  * valid_time  (valid_time) datetime64[ns] 200B 2025-08-01 ... 2025-08-02
  * latitude    (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
  * longitude   (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
    expver      <U4 16B '0005'
Data variables:
    ssrd        (valid_time, latitude, longitude) float32 104MB dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2025-08-12T02:11 GRIB to CDM+CF via cfgrib-0.9.1...


  dims = dict(ds.dims)


####################################################
Processing nc_t with 310 files...
<xarray.Dataset> Size: 922MB
Dimensions:         (valid_time: 37, pressure_level: 6, latitude: 721,
                     longitude: 1440)
Coordinates:
    number          int64 8B 0
  * valid_time      (valid_time) datetime64[ns] 296B 2024-08-01T12:00:00 ... ...
  * pressure_level  (pressure_level) float64 48B 1e+03 850.0 ... 400.0 300.0
  * latitude        (latitude) float64 6kB 90.0 89.75 89.5 ... -89.75 -90.0
  * longitude       (longitude) float64 12kB 0.0 0.25 0.5 ... 359.2 359.5 359.8
    expver          (valid_time) <U4 592B '0001' '0001' '0001' ... '0005' '0005'
Data variables:
    t               (valid_time, pressure_level, latitude, longitude) float32 922MB dask.array<chunksize=(1, 3, 361, 720), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:       

  dims = dict(ds.dims)


####################################################
Processing nc_u10 with 25 files...
<xarray.Dataset> Size: 104MB
Dimensions:     (valid_time: 25, latitude: 721, longitude: 1440)
Coordinates:
    number      int64 8B 0
  * valid_time  (valid_time) datetime64[ns] 200B 2025-08-01 ... 2025-08-02
  * latitude    (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
  * longitude   (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
    expver      <U4 16B '0005'
Data variables:
    u10         (valid_time, latitude, longitude) float32 104MB dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2025-08-12T01:34 GRIB to CDM+CF via cfgrib-0.9.1...


  dims = dict(ds.dims)


####################################################
Processing nc_v10 with 25 files...
<xarray.Dataset> Size: 104MB
Dimensions:     (valid_time: 25, latitude: 721, longitude: 1440)
Coordinates:
    number      int64 8B 0
  * valid_time  (valid_time) datetime64[ns] 200B 2025-08-01 ... 2025-08-02
  * latitude    (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
  * longitude   (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
    expver      <U4 16B '0005'
Data variables:
    v10         (valid_time, latitude, longitude) float32 104MB dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2025-08-12T01:43 GRIB to CDM+CF via cfgrib-0.9.1...


  dims = dict(ds.dims)


Unnamed: 0,subdir,n_files,time_len,time_start,time_end,lat_len,lon_len,lat_min,lat_max,lon_min,lon_max,lat_res,lon_res,variables,error
0,nc_cp,25,,,,721,1440,-90.0,90.0,0.0,359.75,0.25,0.25,cp,
1,nc_r,25,,,,721,1440,-90.0,90.0,0.0,359.75,0.25,0.25,r,
2,nc_sp,25,,,,721,1440,-90.0,90.0,0.0,359.75,0.25,0.25,sp,
3,nc_ssrd,25,,,,721,1440,-90.0,90.0,0.0,359.75,0.25,0.25,ssrd,
4,nc_t,310,,,,721,1440,-90.0,90.0,0.0,359.75,0.25,0.25,t,
5,nc_u10,25,,,,721,1440,-90.0,90.0,0.0,359.75,0.25,0.25,u10,
6,nc_v10,25,,,,721,1440,-90.0,90.0,0.0,359.75,0.25,0.25,v10,


### Optional: per-variable quick stats table (first time slice only)

In [4]:

import pandas as pd
if len(pervar_rows):
    pervar_df = pd.DataFrame(pervar_rows)
    # show a few rows; you can view full DataFrame in VS Code or export to CSV
    pervar_df.head(20)
else:
    print("PER_VAR_STATS_LIMIT == 0 or no variables found")


## Pairwise alignment checks

In [5]:

# Build a pairwise table indicating whether time/lat/lon lengths match across subdirs
subs_names = list(summary_df['subdir'].values)
n = len(subs_names)
time_eq = np.full((n,n), True)
lat_eq  = np.full((n,n), True)
lon_eq  = np.full((n,n), True)

tlen = summary_df.set_index('subdir')['time_len'].to_dict()
llen = summary_df.set_index('subdir')['lat_len'].to_dict()
mlen = summary_df.set_index('subdir')['lon_len'].to_dict()

for i,a in enumerate(subs_names):
    for j,b in enumerate(subs_names):
        time_eq[i,j] = (tlen.get(a) == tlen.get(b))
        lat_eq[i,j]  = (llen.get(a) == llen.get(b))
        lon_eq[i,j]  = (mlen.get(a) == mlen.get(b))

time_eq_df = pd.DataFrame(time_eq, index=subs_names, columns=subs_names)
lat_eq_df  = pd.DataFrame(lat_eq,  index=subs_names, columns=subs_names)
lon_eq_df  = pd.DataFrame(lon_eq,  index=subs_names, columns=subs_names)

print("time length equal (True/False)")
display(time_eq_df)
print("lat length equal (True/False)")
display(lat_eq_df)
print("lon length equal (True/False)")
display(lon_eq_df)


time length equal (True/False)


Unnamed: 0,nc_cp,nc_r,nc_sp,nc_ssrd,nc_t,nc_u10,nc_v10
nc_cp,True,True,True,True,True,True,True
nc_r,True,True,True,True,True,True,True
nc_sp,True,True,True,True,True,True,True
nc_ssrd,True,True,True,True,True,True,True
nc_t,True,True,True,True,True,True,True
nc_u10,True,True,True,True,True,True,True
nc_v10,True,True,True,True,True,True,True


lat length equal (True/False)


Unnamed: 0,nc_cp,nc_r,nc_sp,nc_ssrd,nc_t,nc_u10,nc_v10
nc_cp,True,True,True,True,True,True,True
nc_r,True,True,True,True,True,True,True
nc_sp,True,True,True,True,True,True,True
nc_ssrd,True,True,True,True,True,True,True
nc_t,True,True,True,True,True,True,True
nc_u10,True,True,True,True,True,True,True
nc_v10,True,True,True,True,True,True,True


lon length equal (True/False)


Unnamed: 0,nc_cp,nc_r,nc_sp,nc_ssrd,nc_t,nc_u10,nc_v10
nc_cp,True,True,True,True,True,True,True
nc_r,True,True,True,True,True,True,True
nc_sp,True,True,True,True,True,True,True
nc_ssrd,True,True,True,True,True,True,True
nc_t,True,True,True,True,True,True,True
nc_u10,True,True,True,True,True,True,True
nc_v10,True,True,True,True,True,True,True


In [6]:
# Print all variable names for each file in every subfolder
import xarray as xr

for s in subs:
    files = list_files(s)
    print(f"\nSubfolder: {s.name}")
    for f in files:
        try:
            ds = xr.open_dataset(f, engine=NETCDF_ENGINE)
            vnames = list(ds.data_vars)
            print(f"  File: {f}")
            print(f"    Variables: {vnames}")
            ds.close()
        except Exception as e:
            print(f"  File: {f}")
            print(f"    Error: {e}")


Subfolder: nc_cp
  File: raw_data/nc_cp/ERA5_2025-08-01T00_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T01_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T02_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T03_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T04_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T05_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T06_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T07_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T08_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T09_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T10_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T11_00_00_cp.nc
    Variables: ['cp']
  File: raw_data/nc_cp/ERA5_2025-08-01T12_00_00_cp.nc
    Variables: [