In [9]:
import geopandas as gpd
import xarray as xr
import numpy as np
import pandas as pd
from shapely.geometry import box

In [10]:
# Open the ERA5 "SL" NetCDF file and print its summary.
# The path uses forward slashes (like the other paths in this notebook): the
# previous non-raw literal with backslash separators contained invalid escape
# sequences ("\D", "\I", ...) and could only resolve on Windows.
ds = xr.open_dataset("../../Data/Interim/Meteorological_data/ERA5_NetCDF/ERA5_meteo_SL_c.nc")
print(ds)

<xarray.Dataset> Size: 497MB
Dimensions:      (valid_time: 2486, latitude: 61, longitude: 41)
Coordinates:
  * valid_time   (valid_time) datetime64[ns] 20kB 2015-08-03T14:00:00 ... 202...
  * latitude     (latitude) float64 488B 43.0 42.9 42.8 42.7 ... 37.2 37.1 37.0
  * longitude    (longitude) float64 328B -10.0 -9.9 -9.8 ... -6.2 -6.1 -6.0
Data variables: (12/21)
    u100         (valid_time, latitude, longitude) float32 25MB ...
    v100         (valid_time, latitude, longitude) float32 25MB ...
    cbh          (valid_time, latitude, longitude) float32 25MB ...
    hcc          (valid_time, latitude, longitude) float32 25MB ...
    lcc          (valid_time, latitude, longitude) float32 25MB ...
    mcc          (valid_time, latitude, longitude) float32 25MB ...
    ...           ...
    t2m          (valid_time, latitude, longitude) float32 25MB ...
    u10          (valid_time, latitude, longitude) float32 25MB ...
    v10          (valid_time, latitude, longitude) float32 25MB .

In [11]:
# Open the ERA5 "PL" NetCDF file with the netCDF4 backend and print its summary.
ds_PL = xr.open_dataset(
    r"../../Data/Interim/Meteorological_data/ERA5_NetCDF/ERA5_meteo_PL_c.nc",
    engine="netcdf4",
)
print(ds_PL)

<xarray.Dataset> Size: 746MB
Dimensions:         (valid_time: 2486, pressure_level: 5, latitude: 61,
                     longitude: 41)
Coordinates:
  * valid_time      (valid_time) datetime64[ns] 20kB 2015-08-03T14:00:00 ... ...
  * pressure_level  (pressure_level) float64 40B 950.0 850.0 700.0 500.0 300.0
  * latitude        (latitude) float64 488B 43.0 42.9 42.8 ... 37.2 37.1 37.0
  * longitude       (longitude) float64 328B -10.0 -9.9 -9.8 ... -6.2 -6.1 -6.0
Data variables:
    z               (valid_time, pressure_level, latitude, longitude) float32 124MB ...
    r               (valid_time, pressure_level, latitude, longitude) float32 124MB ...
    t               (valid_time, pressure_level, latitude, longitude) float32 124MB ...
    u               (valid_time, pressure_level, latitude, longitude) float32 124MB ...
    v               (valid_time, pressure_level, latitude, longitude) float32 124MB ...
    w               (valid_time, pressure_level, latitude, longitude) float3

In [None]:
# ============================================================
# 1. Read the shapefile and parse its start/end date columns
# ============================================================
shp = gpd.read_file(r"../../Data/Interim/PT-FireSprd_v2.1/L2_FireBehavior/PT-FireProg_v2.1_L2_valid.shp")
# shp = gpd.read_file(r"..\..\PT-FireProg_v2.1_L2_p_meteo_short.shp")

# Values that cannot be parsed become NaT (errors="coerce") ...
for date_column in ("sdate", "edate"):
    shp[date_column] = pd.to_datetime(shp[date_column], errors="coerce")

# ... and rows missing either date are discarded.
has_both_dates = shp["sdate"].notna() & shp["edate"].notna()
shp = shp[has_both_dates]

# ============================================================
# 2. BUILD THE GRID-CELL POLYGONS
# ============================================================
shp_4326 = shp.to_crs("EPSG:4326")
lats = ds.latitude.values
lons = ds.longitude.values
# Cell size in degrees; 0.1 matches the spacing of the latitude/longitude
# values printed when the dataset was opened.
lat_res = 0.1
lon_res = 0.1

# One rectangle per grid node, centred on the node. Cells are generated
# latitude-major (every longitude of the first latitude, then the next
# latitude, ...), the same order produced by np.repeat / np.tile below.
half_lat = lat_res/2
half_lon = lon_res/2
cell_polys = [
    box(lon - half_lon, lat - half_lat, lon + half_lon, lat + half_lat)
    for lat in lats
    for lon in lons
]

cells_gdf = gpd.GeoDataFrame(
    {
        "latitude": np.repeat(lats, len(lons)),
        "longitude": np.tile(lons, len(lats)),
        "geometry": cell_polys,
    },
    crs="EPSG:4326",
)

# ============================================================
# 3. GENERATE (cell, time) COMBINATIONS FOR EVERY POLYGON + EXTENT
# ============================================================
all_times = pd.to_datetime(ds.valid_time.values)
all_combinations = []

# Only timestamps that fall exactly on the hour are ever used, so select them
# once here instead of re-testing every timestamp for every polygon.
hourly_times = all_times[all_times.minute == 0]

for polygon_id, polygon_row in shp_4326.iterrows():

    # Time window of the polygon (both ends inclusive)
    sdate = polygon_row["sdate"]
    edate = polygon_row["edate"]

    # Hourly dataset timestamps inside the window (vectorised comparison)
    polygon_times = hourly_times[(hourly_times >= sdate) & (hourly_times <= edate)]

    # Nothing to record for this polygon: skip the spatial test altogether
    if len(polygon_times) == 0:
        continue

    # Extent (bounding box) of the polygon, stored with every combination
    minx, miny, maxx, maxy = polygon_row.geometry.bounds

    # Grid cells that intersect the polygon
    intersecting_cells = cells_gdf[cells_gdf.intersects(polygon_row.geometry)]

    # If no cell intersects directly, retry with the polygon buffered by 0.05
    # (in CRS units, i.e. degrees for EPSG:4326)
    if len(intersecting_cells) == 0:
        buffered_poly = polygon_row.geometry.buffer(0.05)
        intersecting_cells = cells_gdf[cells_gdf.intersects(buffered_poly)]

    # One record per (cell, time) pair
    for lat, lon in zip(intersecting_cells["latitude"], intersecting_cells["longitude"]):
        for time in polygon_times:
            all_combinations.append({
                'latitude': lat,
                'longitude': lon,
                'time': time,
                'polygon_id': polygon_id,
                'minx': minx,
                'miny': miny,
                'maxx': maxx,
                'maxy': maxy
            })

# ============================================================
# 4. FINAL OUTPUT
# ============================================================
print(f"Total de combina√ß√µes geradas: {len(all_combinations)}")
print(f"N√∫mero de pol√≠gonos processados: {len(shp_4326)}")

# The two optional steps below are disabled. They are kept as real comments
# rather than bare triple-quoted strings: a string literal is still an
# expression, and as the last statement of the cell it would be echoed as the
# cell's result.

# Preview of the first 5 combinations (the slice keeps it to 5 entries):
# print("\nPrimeiras 5 combina√ß√µes:")
# for i, combo in enumerate(all_combinations[:5]):
#     print(
#         f"{i+1}: Polygon {combo['polygon_id']} - "
#         f"lat={combo['latitude']:.3f}, lon={combo['longitude']:.3f}, "
#         f"time={combo['time'].strftime('%Y-%m-%d %H:%M')}, "
#         f"extent=({combo['minx']:.3f}, {combo['miny']:.3f}, {combo['maxx']:.3f}, {combo['maxy']:.3f})"
#     )

# Save to CSV:
# df_combinations = pd.DataFrame(all_combinations)
# df_combinations.to_csv(r"PT-FireProg_v2.1_L2_combinations_with_extent.csv", index=False)


  shp["edate"] = pd.to_datetime(shp["edate"], errors="coerce")


Total de combina√ß√µes geradas: 15890
N√∫mero de pol√≠gonos processados: 1715


In [13]:
import numpy as np

# ============================================================
# BUILD THE MASK OF (time, lat, lon) COMBINATIONS TO KEEP
# ============================================================

# Boolean mask, initialised to False, with the same dims/coords as u100
# (u100 is only used as a template).
keep_mask = xr.full_like(ds.u100, False, dtype=bool)

# Marks are collected in a plain NumPy array and copied into the DataArray
# once at the end, instead of assigning into the DataArray element by element.
mask_values = np.zeros(keep_mask.shape, dtype=bool)

# Dataset times as a pandas DatetimeIndex, plus the raw coordinate arrays
ds_times = pd.to_datetime(ds.valid_time.values)
lat_values = ds.latitude.values
lon_values = ds.longitude.values

# Loop-invariant part of the time selection: only on-the-hour timestamps are
# eligible. If a timestamp were repeated on the time axis, only its first
# occurrence is eligible, which is what looking each time up by equality and
# taking the first match selects.
eligible_time = (ds_times.minute == 0) & ~ds_times.duplicated()

print(f"Processando {len(shp_4326)} pol√≠gonos...")

# For every polygon in the shapefile
for polygon_id, polygon_row in shp_4326.iterrows():
    # Time window of the polygon (both ends inclusive)
    sdate = polygon_row["sdate"]
    edate = polygon_row["edate"]

    # Positions on the time axis that fall inside the window. They are taken
    # directly from ds_times, so an exact match always exists and no
    # nearest-time fallback is required.
    time_positions = np.flatnonzero(
        eligible_time & (ds_times >= sdate) & (ds_times <= edate)
    )
    if time_positions.size == 0:
        # Nothing to mark: skip the spatial test for this polygon
        continue

    # NetCDF grid cells that intersect the polygon
    intersecting_cells = cells_gdf[cells_gdf.intersects(polygon_row.geometry)]

    # If no cell intersects directly, retry with a 0.05 buffer around the polygon
    if len(intersecting_cells) == 0:
        buffered_poly = polygon_row.geometry.buffer(0.05)
        intersecting_cells = cells_gdf[cells_gdf.intersects(buffered_poly)]

    # Mark every selected time step of every intersecting cell
    for cell_lat, cell_lon in zip(intersecting_cells["latitude"], intersecting_cells["longitude"]):
        # Exact position of the cell on the NetCDF latitude/longitude axes
        lat_idx = np.flatnonzero(lat_values == cell_lat)[0]
        lon_idx = np.flatnonzero(lon_values == cell_lon)[0]
        mask_values[time_positions, lat_idx, lon_idx] = True

# Copy the collected marks into the DataArray in a single assignment
keep_mask.values = mask_values

print(f"Combina√ß√µes a manter: {keep_mask.sum().values}")
print(f"Combina√ß√µes a descartar: {(~keep_mask).sum().values}")

# ============================================================
# SET EVERY COMBINATION THAT IS NOT KEPT TO NaN
# ============================================================

# Work on a copy of the original dataset
ds_filtered = ds.copy()

# Names of all data variables except 'spatial_ref'
data_vars = list(filter(lambda var: var != 'spatial_ref', ds.data_vars))

print(f"Aplicando NaN a {len(data_vars)} vari√°veis...")

# Values where keep_mask is False are replaced by NaN; the rest are untouched
for var_name in data_vars:
    print(f"Processando {var_name}...")
    masked_variable = ds[var_name].where(keep_mask)
    ds_filtered[var_name] = masked_variable

# 'spatial_ref' was not masked and is removed from the result
ds_filtered = ds_filtered.drop_vars('spatial_ref')

# ============================================================
# CHECK THE RESULT
# ============================================================

print(f"\n‚úÖ PROCESSO CONCLU√çDO!")
print(f"Dataset original: {ds.nbytes / 1024 / 1024:.1f} MB")
print(f"Dataset filtrado: {ds_filtered.nbytes / 1024 / 1024:.1f} MB")

# Count the non-NaN values that remain (first 3 variables only)
for var_name in data_vars[:3]:
    var_values = ds_filtered[var_name].values
    non_nan_count = np.logical_not(np.isnan(var_values)).sum()
    total_count = var_values.size
    print(f"{var_name}: {non_nan_count}/{total_count} valores n√£o-NaN ({non_nan_count/total_count*100:.2f}%)")

# Show the resulting dataset
print(f"\nüìä DATASET FILTRADO:")
print(ds_filtered)


Processando 1715 pol√≠gonos...


Combina√ß√µes a manter: 11091
Combina√ß√µes a descartar: 6206395
Aplicando NaN a 20 vari√°veis...
Processando u100...
Processando v100...
Processando cbh...
Processando hcc...
Processando lcc...
Processando mcc...
Processando tcc...
Processando blh...
Processando cape...
Processando cin...
Processando swvl1...
Processando swvl2...
Processando swvl3...
Processando swvl4...
Processando d2m...
Processando t2m...
Processando u10...
Processando v10...
Processando sp...
Processando z...

‚úÖ PROCESSO CONCLU√çDO!
Dataset original: 474.4 MB
Dataset filtrado: 474.4 MB
u100: 11091/6217486 valores n√£o-NaN (0.18%)
v100: 11091/6217486 valores n√£o-NaN (0.18%)
cbh: 3490/6217486 valores n√£o-NaN (0.06%)

üìä DATASET FILTRADO:
<xarray.Dataset> Size: 497MB
Dimensions:     (valid_time: 2486, latitude: 61, longitude: 41)
Coordinates:
  * valid_time  (valid_time) datetime64[ns] 20kB 2015-08-03T14:00:00 ... 2025...
  * latitude    (latitude) float64 488B 43.0 42.9 42.8 42.7 ... 37.2 37.1 37.0
  * longitu

In [14]:
# ============================================================
# SAVE THE DATASET WITHOUT DROPPING EMPTY DIMENSIONS
# ============================================================

output_path = "../../Data/Interim/Meteorological_data/ERA5_NetCDF/ERA5_meteo_SL_c_short.nc"

# Written as-is: the dimensions are neither trimmed nor otherwise optimised
ds_filtered.to_netcdf(path=output_path, engine="netcdf4")

print(f"\nüíæ Dataset salvo em: {output_path}")
print(f"\nüéØ DATASET FINAL (DIMENS√ïES INTACTAS):")
print(ds_filtered)



üíæ Dataset salvo em: ../../Data/Interim/Meteorological_data/ERA5_NetCDF/ERA5_meteo_SL_c_short.nc

üéØ DATASET FINAL (DIMENS√ïES INTACTAS):
<xarray.Dataset> Size: 497MB
Dimensions:     (valid_time: 2486, latitude: 61, longitude: 41)
Coordinates:
  * valid_time  (valid_time) datetime64[ns] 20kB 2015-08-03T14:00:00 ... 2025...
  * latitude    (latitude) float64 488B 43.0 42.9 42.8 42.7 ... 37.2 37.1 37.0
  * longitude   (longitude) float64 328B -10.0 -9.9 -9.8 -9.7 ... -6.2 -6.1 -6.0
Data variables: (12/20)
    u100        (valid_time, latitude, longitude) float32 25MB nan nan ... nan
    v100        (valid_time, latitude, longitude) float32 25MB nan nan ... nan
    cbh         (valid_time, latitude, longitude) float32 25MB nan nan ... nan
    hcc         (valid_time, latitude, longitude) float32 25MB nan nan ... nan
    lcc         (valid_time, latitude, longitude) float32 25MB nan nan ... nan
    mcc         (valid_time, latitude, longitude) float32 25MB nan nan ... nan
    ...     

In [15]:
import numpy as np
import pandas as pd

# ============================================================
# 1. Build the mask for ds_PL
# ============================================================

# Report the dimensions of ds_PL and the length of each coordinate
print("üîç VERIFICANDO DIMENS√ïES DO ds_PL:")
print(f"Dimens√µes: {ds_PL.dims}")
print(f"Tamanho valid_time: {len(ds_PL['valid_time'])}")
print(f"Tamanho pressure_level: {len(ds_PL['pressure_level'])}")
print(f"Tamanho latitude: {len(ds_PL['latitude'])}")
print(f"Tamanho longitude: {len(ds_PL['longitude'])}")
print(f"Valores de pressure_level: {ds_PL['pressure_level'].values}")

# Boolean mask, initialised to False, with the same 4 dimensions as ds_PL.u
keep_mask_PL = xr.full_like(ds_PL.u, False, dtype=bool)

# Marks are collected in a plain NumPy array and copied into the DataArray
# once at the end, instead of assigning into the DataArray element by element.
mask_values_PL = np.zeros(keep_mask_PL.shape, dtype=bool)

# Dataset times as a pandas DatetimeIndex, plus the raw coordinate arrays
ds_PL_times = pd.to_datetime(ds_PL.valid_time.values)
lat_values_PL = ds_PL.latitude.values
lon_values_PL = ds_PL.longitude.values
n_levels = len(ds_PL.pressure_level)

# Loop-invariant parts of the time selection
on_the_hour = ds_PL_times.minute == 0
# If a timestamp were repeated on the time axis, only its first occurrence is
# marked, which is what looking each time up by equality and taking the first
# match selects.
first_occurrence = ~ds_PL_times.duplicated()

print(f"\nüîÑ Processando {len(shp_4326)} pol√≠gonos para ds_PL...")

# Running total of (polygon, cell, time, level) combinations; cells shared by
# several polygons are counted once per polygon.
valid_combinations_PL = 0

# For every polygon
for polygon_id, polygon_row in shp_4326.iterrows():
    sdate = polygon_row["sdate"]
    edate = polygon_row["edate"]

    # On-the-hour dataset times inside the polygon's window (ends inclusive).
    # They are taken directly from ds_PL_times, so an exact match always
    # exists and no nearest-time fallback is required.
    in_window = on_the_hour & (ds_PL_times >= sdate) & (ds_PL_times <= edate)
    n_window_times = int(in_window.sum())
    if n_window_times == 0:
        # Nothing to mark: skip the spatial test for this polygon
        continue
    time_positions = np.flatnonzero(in_window & first_occurrence)

    # Grid cells that intersect the polygon
    intersecting_cells = cells_gdf[cells_gdf.intersects(polygon_row.geometry)]

    # If no cell intersects directly, retry with a 0.05 buffer around the polygon
    if len(intersecting_cells) == 0:
        buffered_poly = polygon_row.geometry.buffer(0.05)
        intersecting_cells = cells_gdf[cells_gdf.intersects(buffered_poly)]

    # For every intersecting cell
    for cell_lat, cell_lon in zip(intersecting_cells["latitude"], intersecting_cells["longitude"]):
        lat_idx = np.flatnonzero(lat_values_PL == cell_lat)[0]
        lon_idx = np.flatnonzero(lon_values_PL == cell_lon)[0]

        # Mark all pressure levels of every selected time step at once
        mask_values_PL[time_positions, :, lat_idx, lon_idx] = True
        valid_combinations_PL += n_window_times * n_levels

# Copy the collected marks into the DataArray in a single assignment
keep_mask_PL.values = mask_values_PL

print(f"‚úÖ Combina√ß√µes v√°lidas no ds_PL: {valid_combinations_PL}")
print(f"üìç C√©lulas marcadas na m√°scara: {keep_mask_PL.sum().values}")

# ============================================================
# 2. Set every combination outside the mask to NaN
# ============================================================

# Work on a copy of ds_PL; every data variable except 'spatial_ref' is masked
ds_PL_filtered = ds_PL.copy()
data_vars_PL = list(filter(lambda var: var != 'spatial_ref', ds_PL.data_vars))

# Values where keep_mask_PL is False are replaced by NaN
for var_name in data_vars_PL:
    masked_variable = ds_PL[var_name].where(keep_mask_PL)
    ds_PL_filtered[var_name] = masked_variable

# 'spatial_ref' was not masked and is removed from the result
ds_PL_filtered = ds_PL_filtered.drop_vars('spatial_ref')

# ============================================================
# 3. Check the result
# ============================================================

print(f"\n‚úÖ PROCESSO CONCLU√çDO PARA ds_PL!")
print(f"Dataset PL original: {ds_PL.nbytes / 1024 / 1024:.1f} MB")
print(f"Dataset PL filtrado: {ds_PL_filtered.nbytes / 1024 / 1024:.1f} MB")

# Optional per-variable count of the remaining non-NaN values (disabled).
# Kept as real comments instead of a bare triple-quoted string, which is an
# expression statement rather than a comment.
# for var_name in data_vars_PL[:4]:
#     non_nan_count = (~np.isnan(ds_PL_filtered[var_name].values)).sum()
#     total_count = ds_PL_filtered[var_name].size
#     percentage = (non_nan_count / total_count) * 100
#     print(f"   {var_name}: {non_nan_count:,}/{total_count:,} valores n√£o-NaN ({percentage:.4f}%)")

print(f"\nüìä DATASET PL FILTRADO:")
print(ds_PL_filtered)


üîç VERIFICANDO DIMENS√ïES DO ds_PL:
Tamanho valid_time: 2486
Tamanho pressure_level: 5
Tamanho latitude: 61
Tamanho longitude: 41
Valores de pressure_level: [950. 850. 700. 500. 300.]

üîÑ Processando 1715 pol√≠gonos para ds_PL...
‚úÖ Combina√ß√µes v√°lidas no ds_PL: 79450
üìç C√©lulas marcadas na m√°scara: 55455

‚úÖ PROCESSO CONCLU√çDO PARA ds_PL!
Dataset PL original: 711.6 MB
Dataset PL filtrado: 711.6 MB

üìä DATASET PL FILTRADO:
<xarray.Dataset> Size: 746MB
Dimensions:         (valid_time: 2486, pressure_level: 5, latitude: 61,
                     longitude: 41)
Coordinates:
  * valid_time      (valid_time) datetime64[ns] 20kB 2015-08-03T14:00:00 ... ...
  * pressure_level  (pressure_level) float64 40B 950.0 850.0 700.0 500.0 300.0
  * latitude        (latitude) float64 488B 43.0 42.9 42.8 ... 37.2 37.1 37.0
  * longitude       (longitude) float64 328B -10.0 -9.9 -9.8 ... -6.2 -6.1 -6.0
Data variables:
    z               (valid_time, pressure_level, latitude, longitude) flo

In [16]:
# ============================================================
# SAVE ds_PL WITHOUT DROPPING EMPTY DIMENSIONS
# ============================================================

output_path_PL = "../../Data/Interim/Meteorological_data/ERA5_NetCDF/ERA5_meteo_PL_c_short.nc"

# Written as-is: the dimensions are neither trimmed nor otherwise optimised
ds_PL_filtered.to_netcdf(path=output_path_PL, engine="netcdf4")

print(f"\nüíæ Dataset PL salvo em: {output_path_PL}")
print(f"\nüéØ DATASET PL FINAL (DIMENS√ïES INTACTAS):")
print(ds_PL_filtered)



üíæ Dataset PL salvo em: ../../Data/Interim/Meteorological_data/ERA5_NetCDF/ERA5_meteo_PL_c_short.nc

üéØ DATASET PL FINAL (DIMENS√ïES INTACTAS):
<xarray.Dataset> Size: 746MB
Dimensions:         (valid_time: 2486, pressure_level: 5, latitude: 61,
                     longitude: 41)
Coordinates:
  * valid_time      (valid_time) datetime64[ns] 20kB 2015-08-03T14:00:00 ... ...
  * pressure_level  (pressure_level) float64 40B 950.0 850.0 700.0 500.0 300.0
  * latitude        (latitude) float64 488B 43.0 42.9 42.8 ... 37.2 37.1 37.0
  * longitude       (longitude) float64 328B -10.0 -9.9 -9.8 ... -6.2 -6.1 -6.0
Data variables:
    z               (valid_time, pressure_level, latitude, longitude) float32 124MB ...
    r               (valid_time, pressure_level, latitude, longitude) float32 124MB ...
    t               (valid_time, pressure_level, latitude, longitude) float32 124MB ...
    u               (valid_time, pressure_level, latitude, longitude) float32 124MB ...
    v          