In [1]:
import xarray as xr
import os
import glob
import pandas as pd
import geopandas as gpd
from rasterio import features
import rasterio
from shapely.geometry import shape
import os

# Directory that is scanned for the raw input NetCDF files (*.nc).
input_folder = "./raw/pr"
# Directory under which this notebook writes its derived files
# (per-year summaries, merged/rolling datasets, GeoJSON exports).
output_folder = "./processed/pr"

In [None]:
  
# Sorted so the file order is deterministic regardless of the filesystem.
nc_file_list = sorted(glob.glob(os.path.join(input_folder, "*.nc")))

if nc_file_list:
    # The per-year files are written into output_folder, so make sure the
    # directory exists before the first to_netcdf call.
    os.makedirs(output_folder, exist_ok=True)

    # Open all files as a single dataset along the time dimension. The context
    # manager closes the underlying files even if processing a year fails.
    with xr.open_mfdataset(nc_file_list, combine='by_coords') as combined_ds:
        # Group by calendar year and write one summary file per year.
        for year, yearly_ds in combined_ds.groupby('time.year'):

            # Maximum over the year's time steps for every variable in the
            # dataset (so 'pr' in the output is the annual maximum of 'pr').
            augmented_yearly_data = yearly_ds.max(dim='time', keep_attrs=True)

            # Annual total of 'pr'.
            yearly_sum = yearly_ds['pr'].sum(dim='time', keep_attrs=True)

            # Largest sum over 3 consecutive time steps. The window is applied
            # to this year's slice only, so windows that straddle a year
            # boundary are not considered.
            rolling_3day_sum = yearly_ds['pr'].rolling(time=3, min_periods=3).sum()
            max_3day_sum = rolling_3day_sum.max(dim='time', keep_attrs=True)

            augmented_yearly_data['max_3day_sum'] = max_3day_sum
            augmented_yearly_data['sum'] = yearly_sum

            # Optional indicators, currently disabled:
            # heavy_rainfall = (yearly_ds['pr'] > 20).sum(dim='time', keep_attrs=True)
            # wet_days = (yearly_ds['pr'] > 5).sum(dim='time', keep_attrs=True)
            # rolling_5day_sum = yearly_ds['pr'].rolling(time=5, min_periods=5).sum()
            # max_5day_sum = rolling_5day_sum.max(dim='time', keep_attrs=True)
            # augmented_yearly_data['max_5day_sum'] = max_5day_sum
            # augmented_yearly_data['over_20'] = heavy_rainfall
            # augmented_yearly_data['wet_days'] = wet_days

            out_path = os.path.join(output_folder, f"pr_year_{year}.nc")
            augmented_yearly_data.to_netcdf(out_path)



In [3]:
# Collect only the per-year summary files (pr_year_<year>.nc). A plain "*.nc"
# pattern would also pick up merged.nc and the other derived files that are
# written into the same folder, which breaks a re-run of this cell.
# Sorting matters: the files are concatenated in list order along the new
# 'year' dimension, and glob returns them in arbitrary order (4-digit years
# sort chronologically by name).
nc_files = sorted(glob.glob(os.path.join(output_folder, "pr_year_*.nc")))
if not nc_files:
    raise FileNotFoundError(
        f"No per-year files matching 'pr_year_*.nc' found in {output_folder!r}"
    )

# Open the per-year datasets and stack them along a new 'year' dimension
merged_ds = xr.open_mfdataset(nc_files, combine='nested', concat_dim='year')

# Save the merged dataset so later cells can read it back from disk
merged_ds.to_netcdf(os.path.join(output_folder, "merged.nc"))

In [5]:
# Read the merged dataset from where the merge cell writes it
# (<output_folder>/merged.nc). The previous hard-coded location
# (<output_folder>/climate_data/merged.nc) is not produced by any cell in this
# notebook, so it is only used as a fallback when the fresh file is absent.
merged_path = os.path.join(output_folder, "merged.nc")
if not os.path.exists(merged_path):
    merged_path = os.path.join(output_folder, "climate_data/merged.nc")
merged_pr = xr.open_dataset(merged_path)

# Width (in positions along 'year') of the centred rolling-mean window.
rolling_period = 30

# Centred rolling means along 'year'; min_periods=1 keeps the edge years, where
# the window is only partially filled, instead of turning them into NaN.
rolling_pr_sum = merged_pr['sum'].rolling(year=rolling_period, center=True, min_periods=1).mean()
# rolling_over_20 = merged_pr['over_20'].rolling(year=5, center=True, min_periods=1).mean()
rolling_max_3day_sum = merged_pr['max_3day_sum'].rolling(year=rolling_period, center=True, min_periods=1).mean()
# rolling_max_5day_sum = merged_pr['max_5day_sum'].rolling(year=5, center=True, min_periods=1).mean()
rolling_pr = merged_pr['pr'].rolling(year=rolling_period, center=True, min_periods=1).mean()
# rolling_wet_days = merged_pr['wet_days'].rolling(year=5, center=True, min_periods=1).mean()
rolling_values = xr.Dataset({
    'sum_rolling': rolling_pr_sum,
    # 'over_20_rolling': rolling_over_20,
    'max_3day_sum_rolling': rolling_max_3day_sum,
    # 'max_5day_sum_rolling': rolling_max_5day_sum,
    'pr_rolling': rolling_pr,
    # 'wet_days_rolling': rolling_wet_days
    })

# NOTE(review): the file name says "_10" although rolling_period is 30. The name
# is left unchanged in case other tooling reads this file - confirm and rename.
out_rolling_path = os.path.join(output_folder, "rolling_values_10.nc")
rolling_values.to_netcdf(out_rolling_path)

In [6]:
def change_from_baseline(data, output, name, period=15):
    """Write the change of ``data`` relative to its first ``period`` years.

    The baseline is the mean over the first ``period`` positions along the
    'year' dimension. It is subtracted from every year, the baseline years
    themselves are dropped, and the remainder is saved as NetCDF.

    Parameters
    ----------
    data : object with a 'year' dimension supporting ``isel``, ``mean`` and
        subtraction (e.g. an xarray Dataset or DataArray).
    output : str
        Directory the result is written to.
    name : str
        File name of the result inside ``output``.
    period : int, default 15
        Number of leading years that form the baseline. The same number of
        years is trimmed from the start of the result.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # Mean over the baseline window (the first `period` years).
    baseline_mean = data.isel(year=slice(0, period)).mean(dim='year')

    # Difference of every year from the baseline mean.
    anomaly = data - baseline_mean

    # Drop the baseline years themselves. This previously used a hard-coded 15,
    # which silently ignored any other `period` passed by the caller.
    trimmed_change = anomaly.isel(year=slice(period, None))

    change_path = os.path.join(output, name)
    trimmed_change.to_netcdf(change_path)
    
# Save the change of the rolling means relative to the first 15 years.
change_from_baseline(
    data=rolling_values,
    output=output_folder,
    name="rolling_change_from_baseline.nc",
    period=15,
)
# Disabled variant for the un-smoothed merged data:
# change_from_baseline(merged_ds, output_folder, "single_change_from_baseline_10.nc", period=15)


In [12]:
def _band_labels(src):
    """Return one label per band: its description, or the 'timeband' tag for single-band files."""
    if src.count > 1:
        return [desc if desc else f"band_{idx + 1}" for idx, desc in enumerate(src.descriptions)]
    timeband = src.tags().get('timeband', None)
    return [timeband if timeband else "unknown"]


def _valid_data_mask(image, nodata):
    """Return a boolean mask that is True on valid pixels, or None when no nodata value is set."""
    if nodata is None:
        return None
    if nodata != nodata:
        # nodata is NaN: `image != nan` is True everywhere, so test for NaN
        # explicitly (a value equals itself unless it is NaN).
        return image == image
    return image != nodata


def _is_nodata(value, nodata):
    """Tell whether a polygon value equals the nodata value, treating NaN as equal to NaN."""
    if nodata is None:
        return False
    if nodata != nodata:
        return value != value
    return value == nodata


def tif_to_geojson(tif_folder, geojson_folder, change_years=None, abs_years=None):
    """Polygonize selected bands of every GeoTIFF in a folder into GeoJSON files.

    Band labels come from the band descriptions (multi-band files) or from the
    'timeband' tag (single-band files). Every band whose label contains one of
    the wanted year tokens is polygonized and written to
    ``<geojson_folder>/<tif name without extension>_<label>.geojson``. Each
    feature carries the pixel ``value`` and the band label (``timeband``) as
    properties. Bands without any valid polygon are skipped.

    Parameters
    ----------
    tif_folder : str
        Folder that is searched (non-recursively) for ``*.tif`` files.
    geojson_folder : str
        Output folder; created if it does not exist.
    change_years : iterable, optional
        Year tokens used for files whose name does not contain 'Absolute'.
        Defaults to ``[35, 45, 55, 65, 75, 85]``.
    abs_years : iterable, optional
        Year tokens used for files whose name contains 'Absolute'.
        Defaults to ``[50, 60, 70, 80, 90, 100]``.

    Returns
    -------
    None
    """
    if change_years is None:
        change_years = [35, 45, 55, 65, 75, 85]
    if abs_years is None:
        abs_years = [50, 60, 70, 80, 90, 100]

    os.makedirs(geojson_folder, exist_ok=True)

    for tif_file in glob.glob(os.path.join(tif_folder, "*.tif")):
        tif_name = os.path.basename(tif_file)
        years = abs_years if 'Absolute' in tif_name else change_years
        year_tokens = [str(y) for y in years]

        with rasterio.open(tif_file) as src:
            timebands = _band_labels(src)
            nodata = src.nodata

            # NOTE(review): this is a substring match, so the token 35 also
            # matches labels such as "1935" or "band_35" - confirm that the
            # label format makes this unambiguous.
            selected_indices = [
                idx for idx, label in enumerate(timebands)
                if any(token in str(label) for token in year_tokens)
            ]

            for band_idx in selected_indices:
                label = timebands[band_idx]
                image = src.read(band_idx + 1)  # rasterio band numbers start at 1
                mask = _valid_data_mask(image, nodata)

                geoms = [
                    {
                        'geometry': shape(geom),
                        'properties': {'value': value, 'timeband': label},
                    }
                    for geom, value in features.shapes(image, mask=mask, transform=src.transform)
                    if not _is_nodata(value, nodata)
                ]
                if not geoms:
                    # Nothing valid in this band; skip it rather than building
                    # and writing an empty layer.
                    continue

                gdf = gpd.GeoDataFrame.from_features(geoms, crs=src.crs)
                geojson_path = os.path.join(
                    geojson_folder,
                    f"{os.path.splitext(tif_name)[0]}_{label}.geojson"
                )
                gdf.to_file(geojson_path, driver="GeoJSON")

In [13]:
# Convert the GeoTIFFs in <output_folder>/masked_data to GeoJSON files in <output_folder>/geojson.
tif_to_geojson(
    tif_folder=os.path.join(output_folder, "masked_data"),
    geojson_folder=os.path.join(output_folder, "geojson"),
)