In [None]:
import os
import pandas as pd
import numpy as np
import h5py
from scipy.ndimage import label
import multiprocessing

In [None]:
_CLIMATE_VARS = ["Precipitation", "Humidity", "Temperature", "Wind"]
_OUTPUT_COLUMNS = ["Zone_ID"] + _CLIMATE_VARS + ["Num_Fires"]


def _dominant_zone(zone_ids):
    """Return the most frequent non-null value in ``zone_ids``, or NaN if there is none.

    ``Series.mode()`` is empty when every value is NaN (a fire cluster whose cells
    all failed to match the zone table), so indexing it unconditionally with
    ``.iloc[0]`` would raise ``IndexError``.
    """
    modes = zone_ids.mode()
    return modes.iloc[0] if not modes.empty else np.nan


def _load_fire_grid(fire_path, nrows, ncols):
    """Read one day's fire target file into a 0/1 integer grid of shape (nrows, ncols).

    Returns ``None`` when ``fire_path`` does not exist.
    """
    if not os.path.exists(fire_path):
        return None

    with h5py.File(fire_path, "r") as h5_file:
        fire_table = h5_file["values/table"][:]

    fire_index = fire_table["index"]
    fire_flags = fire_table["values_block_0"].flatten()

    # NOTE(review): assumes one flag per index entry (single-column values block) — confirm.
    burning = fire_flags == 1
    fire_grid = np.zeros((nrows, ncols), dtype=int)
    # The flat cell index is row-major: row = index // ncols, col = index % ncols.
    fire_grid[fire_index[burning] // ncols, fire_index[burning] % ncols] = 1
    return fire_grid


def process_year(args):
    """Aggregate one year of daily climate/fire grids to per-zone rows and write a CSV.

    For every ``.h5`` file in the year's climate folder the function:

    1. builds a per-cell climate table and averages it per ``Zone_ID``;
    2. loads the same-named file from the fire target folder (if present),
       labels 8-connected fire clusters and assigns each cluster to the zone
       that most of its cells fall in;
    3. emits one row per zone with ``Num_Fires`` and the four climate variables.
       Zones with at least one fire get the mean over their fire clusters;
       all other zones get the mean over all of their cells.

    Parameters
    ----------
    args : tuple
        ``(year, aggregated_folder)``. ``year`` is the name of the year
        sub-folder; ``aggregated_folder`` is the directory that receives
        ``{year}.csv``. Packed in a single tuple so the function can be used
        with ``multiprocessing.Pool.map``.

    Returns
    -------
    None
        The result is written to ``{aggregated_folder}/{year}.csv``. Nothing is
        written when the year folder contains no ``.h5`` files.
    """
    year, aggregated_folder = args
    df_cell_zones = pd.read_parquet("../cell_zones.parquet")
    print(f"Processing year: {year}")

    climate_dir = f"../../../Calabria_dataset/InputReteGood/Climatic/{year}/"
    fire_dir = f"../../../Calabria_dataset/InputReteGood/Target/{year}/"

    # Loop-invariant helpers, built once per year instead of once per day.
    zone_lookup = df_cell_zones[["X_Coord", "Y_Coord", "Zone_ID"]]
    all_zones = pd.DataFrame(df_cell_zones["Zone_ID"].unique(), columns=["Zone_ID"])
    # Full 3x3 structuring element: diagonal neighbours belong to the same cluster.
    structure = np.ones((3, 3), dtype=int)

    climate_files = sorted(f for f in os.listdir(climate_dir) if f.endswith(".h5"))
    year_daily_data = []

    for climate_file in climate_files:
        date_str = climate_file.split(".")[0]
        climate_path = os.path.join(climate_dir, climate_file)
        # The fire target for a day uses the same file name as the climate file.
        fire_path = os.path.join(fire_dir, climate_file)

        with h5py.File(climate_path, "r") as h5_file:
            values_table = h5_file["values/table"][:]
            attributes_table = h5_file["attributes/table"][:]

        # Each attribute record is (name as bytes, values); only the first value is used.
        attributes_dict = {attr[0].decode(): attr[1][0] for attr in attributes_table}

        ncols = int(attributes_dict["ncols"])
        nrows = int(attributes_dict["nrows"])
        cellsize = attributes_dict["cellsize"]
        xllcorner = attributes_dict["xllcorner"]
        yllcorner = attributes_dict["yllcorner"]

        index_values = values_table["index"]
        climate_values = values_table["values_block_0"]
        row_coords = index_values // ncols
        col_coords = index_values % ncols

        df_climate = pd.DataFrame({
            "Row": row_coords,
            "Column": col_coords,
            "Precipitation": climate_values[:, 0],
            "Humidity": climate_values[:, 1],
            "Temperature": climate_values[:, 2],
            "Wind": climate_values[:, 3],
            "X_Coord": xllcorner + (col_coords * cellsize),
            # Row 0 is the top of the grid, while yllcorner is the bottom edge.
            "Y_Coord": yllcorner + ((nrows - 1 - row_coords) * cellsize),
        })
        # Snap Y to whole units so it can be used as a float join key.
        df_climate["Y_Coord"] = df_climate["Y_Coord"].round(0).astype(float)

        df_zone_climate_all = (
            zone_lookup.merge(df_climate, on=["X_Coord", "Y_Coord"], how="inner")
            .groupby("Zone_ID")
            .agg({var: "mean" for var in _CLIMATE_VARS})
            .reset_index()
            .rename(columns={var: f"{var}_all" for var in _CLIMATE_VARS})
        )

        fire_grid = _load_fire_grid(fire_path, nrows, ncols)

        if fire_grid is None or not fire_grid.any():
            # No fire today: every zone simply gets its all-cell climate mean.
            # NOTE(review): this branch only lists zones that matched a climate cell,
            # while the fire branch below lists every zone in the zone table — confirm
            # that the differing row sets are intended.
            df_zone_day = (
                df_zone_climate_all
                .rename(columns={f"{var}_all": var for var in _CLIMATE_VARS})
                .assign(Num_Fires=0)[_OUTPUT_COLUMNS]
                .assign(Date=date_str)
            )
            year_daily_data.append(df_zone_day)
            continue

        labeled_fire_grid, _ = label(fire_grid, structure=structure)

        # np.nonzero walks the grid in row-major order, the same order a nested
        # row/column loop would visit the labelled cells.
        fire_rows, fire_cols = np.nonzero(labeled_fire_grid)
        df_fire_clusters = pd.DataFrame({
            "Cluster_ID": labeled_fire_grid[fire_rows, fire_cols],
            "Row": fire_rows,
            "Column": fire_cols,
            "X_Coord": xllcorner + (fire_cols * cellsize),
            "Y_Coord": yllcorner + ((nrows - 1 - fire_rows) * cellsize),
        })
        df_fire_clusters["Y_Coord"] = df_fire_clusters["Y_Coord"].round(0).astype(float)

        df_fire_clusters_zones = df_fire_clusters.merge(
            zone_lookup, on=["X_Coord", "Y_Coord"], how="left"
        )

        # A cluster may straddle zones; attribute the whole cluster to its most
        # common zone. Clusters with no matching zone get NaN and are dropped by
        # the Zone_ID group-bys below instead of raising IndexError.
        dominant_zone = (
            df_fire_clusters_zones.groupby("Cluster_ID")["Zone_ID"].agg(_dominant_zone)
        )
        df_fire_clusters_zones["Zone_ID"] = (
            df_fire_clusters_zones["Cluster_ID"].map(dominant_zone)
        )

        fires_per_zone = (
            df_fire_clusters_zones[["Cluster_ID", "Zone_ID"]]
            .drop_duplicates()
            .groupby("Zone_ID")
            .size()
            .reset_index(name="Num_Fires")
        )
        df_zone_fire_counts = all_zones.merge(fires_per_zone, on="Zone_ID", how="left")
        df_zone_fire_counts["Num_Fires"] = (
            df_zone_fire_counts["Num_Fires"].fillna(0).astype(int)
        )

        df_fire_with_climate = df_fire_clusters_zones.merge(
            df_climate[["X_Coord", "Y_Coord"] + _CLIMATE_VARS],
            on=["X_Coord", "Y_Coord"],
            how="left",
        )

        # Two-stage mean: first per cluster, then per zone, so that every cluster
        # weighs the same regardless of how many cells it covers.
        cluster_agg = {"Zone_ID": "first"}
        cluster_agg.update({var: "mean" for var in _CLIMATE_VARS})
        df_cluster_climate = (
            df_fire_with_climate.groupby("Cluster_ID").agg(cluster_agg).reset_index()
        )

        df_zone_fire_climate = (
            df_cluster_climate.groupby("Zone_ID")
            .agg({var: "mean" for var in _CLIMATE_VARS})
            .reset_index()
            .rename(columns={var: f"{var}_fire" for var in _CLIMATE_VARS})
        )

        df_zone_day = (
            df_zone_fire_counts
            .merge(df_zone_climate_all, on="Zone_ID", how="left")
            .merge(df_zone_fire_climate, on="Zone_ID", how="left")
        )

        # Prefer the fire-cluster climate where the zone burned and a value exists;
        # otherwise fall back to the all-cell zone mean.
        has_fire = df_zone_day["Num_Fires"] > 0
        for var in _CLIMATE_VARS:
            fire_values = df_zone_day[f"{var}_fire"]
            df_zone_day[var] = fire_values.where(
                has_fire & fire_values.notna(), df_zone_day[f"{var}_all"]
            )

        df_zone_day = df_zone_day[_OUTPUT_COLUMNS].assign(Date=date_str)
        year_daily_data.append(df_zone_day)

    if not year_daily_data:
        # pd.concat raises ValueError on an empty list; report and skip instead.
        print(f"No climate files found for year: {year}")
        return

    df_year = pd.concat(year_daily_data, ignore_index=True)
    df_year.to_csv(f"{aggregated_folder}/{year}.csv", index=False)
    print(f"Finished year: {year}")


In [None]:
if __name__ == "__main__":
    # Driver: aggregate every year folder in parallel, one task per year.
    aggregated_folder = "Aggregated_Data"
    os.makedirs(aggregated_folder, exist_ok=True)

    climate_root = "../../../Calabria_dataset/InputReteGood/Climatic/"
    # Only sub-directories are years; stray files (e.g. .DS_Store) would make
    # process_year fail when it tries to list them as a folder.
    years = sorted(
        entry for entry in os.listdir(climate_root)
        if os.path.isdir(os.path.join(climate_root, entry))
    )
    args = [(year, aggregated_folder) for year in years]

    if args:
        # Pool(processes=0) raises ValueError, and more workers than cores only
        # adds memory pressure, so cap the pool at the CPU count.
        n_workers = min(len(args), os.cpu_count() or 1)
        with multiprocessing.Pool(processes=n_workers) as pool:
            pool.map(process_year, args)
    else:
        print(f"No year folders found in {climate_root}")


Processing year: 2016
Processing year: 2009
Processing year: 2008Processing year: 2011

Processing year: 2010
Processing year: 2018Processing year: 2017
Processing year: 2012
Processing year: 2013

Processing year: 2014
Processing year: 2015
Finished year: 2009
Finished year: 2018
Finished year: 2013
Finished year: 2014
Finished year: 2010
Finished year: 2016
Finished year: 2011
Finished year: 2008
Finished year: 2015
Finished year: 2012
Finished year: 2017
