In [None]:
### First we have downloaded a net CDF from the Copernicus Data Store: https://cds.climate.copernicus.eu

In [None]:
file = '/Users/gbenz/Downloads/tx10pETCCDI_mon_HadGEM3-GC31-LL_historical_r1i1p1f3_b1981-2010_v20190624_185001-201412_v2-0.nc'

In [22]:
import pandas as pd
import xarray as xr
from rasterstats import zonal_stats
import numpy as np

from utils.unzip import unzip_etccdi_package
from utils.correct_longitude import transform_longitudinal_values
from utils.give_metadata import give_metadata
from utils.etccdi_to_pg import generate_etccdi_temporal_tables
from utils.temporal_index import find_etccdi_timeindex
from utils.define_request import generate_and_validate_request

## Access with Copernicus Data Store API:

#### Objective by Oct. 30 is to have this process begin by retreiving (predefined and approved) ETCCDI data parameters from an API

- Works on Mon 28 GB

The proceeding code provides a correct output but requires a unique user API key. This cannot be simplified much further.

The desirable output is to exclusively change:
- 'variable' 
- product_type
- period
#### ------------------
- start_year = '1995'
- start_month = '01'
- end_year = '2000'
- end_month = '12'

Then if you select 'cold days' a decision tree will be printed with the optional parameters that could be selected for product type and period
Other parameters will be kept standard. 

In [48]:
reference_df_y = pd.read_csv('/Users/gbenz/Documents/Climate Data/climate_extremes/data/processed/pg__y.csv', index_col=None)
reference_df_y = reference_df_y.drop(columns=['Unnamed: 0'], errors='ignore')

# Convert 'year' to string
reference_df_y['year'] = reference_df_y['year'].astype(str)

print(reference_df_y.dtypes)

reference_df_m = pd.read_csv('/Users/gbenz/Documents/Climate Data/climate_extremes/data/processed/pg__m.csv', index_col=None)
reference_df_m = reference_df_m.drop(columns=['Unnamed: 0'], errors='ignore')

# Convert 'year' to string
reference_df_m['year'] = reference_df_m['year'].astype(str)

# Convert 'month' to an integer first (removes decimals) and then to string
reference_df_m['month'] = reference_df_m['month'].astype(int).astype(str)

print(reference_df_m.dtypes)

In [50]:
print(reference_df_m)
print(reference_df_m.dtypes)
reference_df_m.to_csv('/Users/gbenz/Documents/Climate Data/climate_extremes/data/processed/pg___m.csv')

print(reference_df_y)
print(reference_df_y.dtypes)

reference_df_y.to_csv('/Users/gbenz/Documents/Climate Data/climate_extremes/data/processed/pg___y.csv')

          month_id  priogrid_gid  year month
0                1         62356  1980     1
1                1         79599  1980     1
2                1         79600  1980     1
3                1         79601  1980     1
4                1         80317  1980     1
...            ...           ...   ...   ...
11169715       852        190496  2050    12
11169716       852        190507  2050    12
11169717       852        190508  2050    12
11169718       852        190510  2050    12
11169719       852        190511  2050    12

[11169720 rows x 4 columns]
month_id         int64
priogrid_gid     int64
year            object
month           object
dtype: object
        priogrid_gid  year
0              62356  1980
1              79599  1980
2              79600  1980
3              79601  1980
4              80317  1980
...              ...   ...
930805        190496  2050
930806        190507  2050
930807        190508  2050
930808        190510  2050
930809        190511  2050



In [51]:
ref = pd.read_csv('/Users/gbenz/Documents/Climate Data/climate_extremes/data/processed/pg___y.csv')

print(ref.dtypes)

Unnamed: 0      int64
priogrid_gid    int64
year            int64
dtype: object


In [52]:


# Now, calling the function will generate and validate the request
request = generate_and_validate_request(
    variable="consecutive_dry_days",
    product_type="base_independent",
    experiment="historical",
    temporal_aggregation="yearly"
)

display(request)

#-------------------------------------------------------------------
# Load a clean PG dataframe at a consistent temporal resolution
# to the request built
#-------------------------------------------------------------------

temporal_aggregation_value = request['temporal_aggregation'][0]

if temporal_aggregation_value == 'yearly':
    reference_df = pd.read_csv('/Users/gbenz/Documents/Climate Data/climate_extremes/data/processed/pg__y.csv')

    reference_df= reference_df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Convert 'year' to string
    reference_df['year'] = reference_df['year'].astype(str)

    print(reference_df.dtypes)

else:
    reference_df = pd.read_csv('/Users/gbenz/Documents/Climate Data/climate_extremes/data/processed/pg__m.csv')

    reference_df = reference_df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Convert 'year' to string
    reference_df['year'] = reference_df['year'].astype(str)

    # Convert 'month' to an integer first (removes decimals) and then to string
    reference_df['month'] = reference_df['month'].astype(int).astype(str)

    print(reference_df.dtypes)


Request is valid.


{'variable': ['consecutive_dry_days'],
 'product_type': ['base_independent'],
 'model': ['hadgem3_gc31_ll'],
 'ensemble_member': ['r1i1p1f3'],
 'experiment': ['historical'],
 'temporal_aggregation': ['yearly'],
 'period': ['1850_2014'],
 'version': ['2_0'],
 'data_format': 'netcdf'}

priogrid_gid     int64
year            object
dtype: object


In [26]:
import cdsapi

dataset = "sis-extreme-indices-cmip6"

# Extract the desired elements from the request dictionary
variable = request["variable"][0]
temporal_aggregation = request["temporal_aggregation"][0]
period = request["period"][0]

# Concatenate them with an underscore or any other separator you prefer
zip_file_name = f"{variable}_{temporal_aggregation}_{period}.zip"

client = cdsapi.Client()
client.retrieve(dataset, request, target=zip_file_name)


2024-11-12 09:11:12,180 INFO [2024-09-28T00:00:00] **Welcome to the New Climate Data Store (CDS)!** This new system is in its early days of full operations and still undergoing enhancements and fine tuning. Some disruptions are to be expected. Your 
[feedback](https://jira.ecmwf.int/plugins/servlet/desk/portal/1/create/202) is key to improve the user experience on the new CDS for the benefit of everyone. Thank you.
2024-11-12 09:11:12,181 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2024-11-12 09:11:12,182 INFO [2024-09-16T00:00:00] Remember that you need to have an ECMWF account to use the new CDS. **Your old CDS credentials will not work in new CDS!**
2024-11-12 09:11:12,480 INFO Request ID is 2d6de506-a660-4246-825c-ef5b60341df8
2024-11-12 09:11:12,585 INFO status has been updated to accepted
2024-11-12 09:11:23,672 INFO status has been updated to successful


432716f0a4b4f4f0049e5f147da23357.zip:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

'consecutive_dry_days_yearly_1850_2014.zip'

In [27]:
netcdf_file, etccdi_index = unzip_etccdi_package(zip_file_name)

cddETCCDI
Extracted file names: cddETCCDI_yr_HadGEM3-GC31-LL_historical_r1i1p1f3_no-base_v20190624_1850-2014_v2-0.nc


### Report Metadata from the selected ETTCDI netCDF file:

Move the pg shapefile to the github repo so this can be accessed without references to local paths

Accomplishes:
- checks to ensure the correct netcdf file is being processed
- provides spatial and temporal metadata

From preprocessing, we know that the ETCCDI climate data is not packaged in a desirable format, that is, the original longitudinal range is: 0.9375 to 359.0625
- Adjust the Longitude range 
- save an 'adjusted netcdf' file.


28-10 -- What would perhaps be most desirable is to first transform, then, report metadata with two seperate functions.

In [28]:
etccdi = transform_longitudinal_values(etccdi_index, netcdf_file)

The variable 'cddETCCDI' was found in the file path and the world continues to spin.
Original Latitude range: -89.375 to 89.375
Original Longitude range: 0.9375 to 359.0625
Adjusted Longitude range: -179.0625 to 179.0625
Adjusted dataset saved to: /Users/gbenz/Downloads/adjusted_cddETCCDI_yr_HadGEM3-GC31-LL_historical_r1i1p1f3_no-base_v20190624_1850-2014_v2-0.nc.nc


In [29]:
give_metadata(etccdi)

Latitude range: -89.375 to 89.375
Longitude range: -179.0625 to 179.0625
Latitude resolution: 1.25
Longitude resolution: 1.875
Global Metadata:
CDI: Climate Data Interface version 1.8.0 (http://mpimet.mpg.de/cdi)
history: Tue Nov 24 08:58:40 2020: cdo mergetime tasmax_day_HadGEM3-GC31-LL_historical_r1i1p1f3_gn_18500101-19491230.nc tasmax_day_HadGEM3-GC31-LL_historical_r1i1p1f3_gn_19500101-20141230.nc ./merged/tasmax_day_HadGEM3-GC31-LL_historical_r1i1p1f3_gn_18500101-20141230.nc
2019-06-19T11:16:23Z ; CMOR rewrote data to be consistent with CMIP6, CF-1.7 CMIP-6.2 and CF standards.;
2019-06-19T11:07:16Z MIP Convert v1.1.0, Python v2.7.12, Iris v1.13.0, Numpy v1.13.3, netcdftime v1.4.1.
source: HadGEM3-GC31-LL (2016): 
aerosol: UKCA-GLOMAP-mode
atmos: MetUM-HadGEM3-GA7.1 (N96; 192 x 144 longitude/latitude; 85 levels; top level 85 km)
atmosChem: none
land: JULES-HadGEM3-GL7.1
landIce: none
ocean: NEMO-HadGEM3-GO6.0 (eORCA1 tripolar primarily 1 deg with meridional refinement down to 1/3 de

In [45]:
print(reference_df.dtypes)

Unnamed: 0      int64
priogrid_gid    int64
year            int64
dtype: object


In [55]:
#-----------------------------------------------------------
# Define Start Year & Month
#-----------------------------------------------------------
start_year = '1990'
start_month = '01'
#-----------------------------------------------------------
# Define End Year & Month
end_year = '1993'
end_month = '12'
#-----------------------------------------------------------
#-----------------------------------------------------------
# Establish Start and End index values:
start_index_val, loc_start_month, loc_start_year =  find_etccdi_timeindex(start_year, start_month, etccdi)
end_index_val, loc_end_month, loc_end_year = find_etccdi_timeindex(end_year, end_month, etccdi)
#-----------------------------------------------------------
print(f'The start index is: {start_index_val}, referencing Month: {loc_start_month} and Year: {loc_start_year}')
print(f'The end index is: {end_index_val}, referencing Month: {loc_end_month} and Year: {loc_end_year}')
#-----------------------------------------------------------
index_list = list(range(start_index_val, end_index_val + 1))
#-----------------------------------------------------------

#-----------------------------------------------------------
# Filter the PG reference file to the temporal parameters now established:
#-----------------------------------------------------------

# For annual:
# the etccdi dataframe will contain a month field but this is irrelevant because the temporal resolution is 1-year

# For monthly:
# why don't you filter for a monthly attribute?: Because all months will be included when subsetting by year.
reference_filtered_time = reference_df.loc[(reference_df['year'] >= loc_start_year) & (reference_df['year'] <= loc_end_year)]
#-----------------------------------------------------------
print(reference_filtered_time)

No data found for 01 of the year 1990 but located data for the first available month.
Validation: Found data for Year: 1990, Month: 06 at index 140.
No data found for 12 of the year 1993 but located data for the first available month.
Validation: Found data for Year: 1993, Month: 06 at index 143.
The start index is: 140, referencing Month: 06 and Year: 1990
The end index is: 143, referencing Month: 06 and Year: 1993
        priogrid_gid  year
131100         62356  1990
131101         79599  1990
131102         79600  1990
131103         79601  1990
131104         80317  1990
...              ...   ...
183535        190496  1993
183536        190507  1993
183537        190508  1993
183538        190510  1993
183539        190511  1993

[52440 rows x 2 columns]


### Testing on the first n elements:

In [54]:
sub_index = index_list[:2]

time_length_subset = len(sub_index)
time_length = len(index_list)

print(time_length_subset)
print(time_length)

print(sub_index)

2
25
[140, 141]


### Puts it all together

Parameters:
1. references the sub_index which supplies the list (or sublist) of indexes to iterate over. Index specifically references time. This can be confusing because the ETCCDI variables are themselves climate indices.
2. Creates a single geotiff from the current time selection. We do this because the NetCDF itself is not a format that can be incorporated into rigorous analysis so as we iterate through the time series we convert the working item to a geotiff which is a format that can be operated on.


#### Parameters:

1. NetCDF file
2. (TEMPORAL) sub_index or full index (specify index to loop over)
3. etccdi index ex(tx10pETCCDI)


ADF -- Decision to just save to ONE WORKING raster that will continously be rewritten 
rationale: The purpose of having unique tifs is to visualize holes in the data. However, this is not worth the space. If holes appear in the tabular dataset, a new geotiff corresponding to that month / year can quickly be produced!

ADF -- Rationalize why this is best:
    # Resample the raster data to the new resolution
    resampled_raster = raster_data.rio.reproject(
        raster_data.rio.crs,
        shape=(
            int(raster_data.shape[1] * 10),  # Increase number of rows by a factor of 10
            int(raster_data.shape[2] * 10)   # Increase number of columns by a factor of 10
        ),
        resampling=Resampling.bilinear  # Use the correct resampling method
    )


### Params:

- time_index_list,
- netcdf, climate_index, 
- shapefile_path


In [68]:
import rioxarray
import rasterio
from rasterio.enums import Resampling
import matplotlib.pyplot as plt
import os
import geopandas as gpd
from rasterstats import zonal_stats
import pandas as pd
import numpy as np

def generate_etccdi_temporal_tables(param_time_index_list, param_netcdf, param_climate_index, param_shapefile_path):
    all_stats = []

    # Retrieve the first and last time indices for file naming
    first_time_index = param_time_index_list[0]
    last_time_index = param_time_index_list[-1]

    for i in param_time_index_list:
        print(f"Processing time index: {i}")
        
        # Select the data for the specified climate index
        data = param_netcdf[param_climate_index]
        
        # Check the data type and process accordingly
        data_type = data.dtype
        if data_type == 'timedelta64[ns]':
            data_days = data / np.timedelta64(1, 'D')  # Convert to days if it's timedelta
            raster_data = data_days.isel(time=i)
        elif data_type == 'float32':
            raster_data = data.isel(time=i)  # Use as-is if it's already float32
        else:
            raise TypeError(f"Unsupported data type '{data_type}' for variable '{param_climate_index}'. Expected 'timedelta64[ns]' or 'float32'.")
        
        # Plotting and other processing steps
        plt.figure(figsize=(10, 6))
        raster_data.plot(cmap='viridis')
        plt.title(f'{param_climate_index} at Time Index {i}')
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')
        plt.show()

        # Convert spatial dimensions
        raster_data = raster_data.rename({'lon': 'x', 'lat': 'y'})
        raster_data = raster_data.rio.set_spatial_dims(x_dim='x', y_dim='y')

        # Get the date and time information
        date_time = str(param_netcdf['time'].isel(time=i).values.item())
        year = date_time.split('-')[0]
        month = date_time.split('-')[1]
        print("Year:", date_time)

        # Set CRS if not already defined
        if not raster_data.rio.crs:
            print("CRS is not set. Setting CRS to EPSG:4326")
            raster_data = raster_data.rio.write_crs("EPSG:4326")

        # Check bounds and transform
        print("Bounds before saving:", raster_data.rio.bounds())
        print("Transform before saving:", raster_data.rio.transform())

        # Convert to float for consistent data type in raster operations
        raster_data = raster_data.astype('float32')

        # Check for NaN values and mask if needed (Do not overwrite NaNs to 0 here)
        print("NaN values check before masking:", raster_data.isnull().sum().item())
        raster_data = raster_data.where(~np.isnan(raster_data), other=np.nan)  # Keep NaNs for now
        
        # Save the raster to GeoTIFF
        raster_file_path = 'working_etccdi_file.tif'
        raster_data.rio.to_raster(raster_file_path)
        print(f"GeoTIFF saved at: {raster_file_path}")

        # Resample Raster (with NaNs preserved)
        raster_data = rioxarray.open_rasterio(raster_file_path)

        # Calculate current and new resolutions
        current_resolution_x = abs(raster_data.x[1] - raster_data.x[0])
        current_resolution_y = abs(raster_data.y[1] - raster_data.y[0])
        new_resolution_x = current_resolution_x / 10
        new_resolution_y = current_resolution_y / 10

        # Resample without introducing NoData values
        print("Resampling raster...")
        resampled_raster = raster_data.rio.reproject(
            raster_data.rio.crs,
            shape=(
                int(raster_data.shape[1] * 10),  
                int(raster_data.shape[2] * 10)  
            ),
            resampling=Resampling.bilinear
        )

        # Check bounds and transform after resampling
        print("Bounds after resampling:", resampled_raster.rio.bounds())
        print("Transform after resampling:", resampled_raster.rio.transform())

        # Save resampled raster
        resampled_raster_path = 'working_etccdi_file_resampled.tif'
        resampled_raster.rio.to_raster(resampled_raster_path)
        print(f"Resampled GeoTIFF saved at: {resampled_raster_path}")

        # Calculate zonal statistics
        gdf = gpd.read_file(param_shapefile_path)
        gdf = gdf[['gid', 'geometry', 'xcoord', 'ycoord']]
        stats = zonal_stats(gdf, resampled_raster_path, stats='mean', geojson_out=True)
        stats_gdf = gpd.GeoDataFrame.from_features(stats)

        # Add Year and Month fields to stats_gdf
        stats_gdf['year'] = year
        stats_gdf['month'] = month
        stats_gdf.rename(columns={'mean': param_climate_index}, inplace=True)

        # Plot the zonal statistics
        fig, ax = plt.subplots(figsize=(10, 6))
        stats_gdf.plot(column=param_climate_index, ax=ax, legend=True, cmap='viridis', edgecolor='none')
        ax.set_title(f'{param_climate_index} Statistics by Region - {year}-{month}')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        plt.show()

        # Append the stats_gdf to the all_stats list
        all_stats.append(stats_gdf)

    # Concatenate all DataFrames into one
    final_gdf = pd.concat(all_stats, ignore_index=True)

    # Construct the output filename
    first_date_time = str(param_netcdf['time'].isel(time=first_time_index).values.item())
    last_date_time = str(param_netcdf['time'].isel(time=last_time_index).values.item())
    first_year, first_month = first_date_time.split('-')[0], first_date_time.split('-')[1]
    last_year, last_month = last_date_time.split('-')[0], last_date_time.split('-')[1]
    
    # Save the final DataFrame to a CSV file
    folder = 'etccdi_out_files'
    os.makedirs(folder, exist_ok=True)
    output_file_path = os.path.join(folder, f"{param_climate_index}_{first_year}_{first_month}__{last_year}_{last_month}.csv")
    
    final_gdf.to_csv(output_file_path, index=False)
    print(f"Final DataFrame saved to: {output_file_path}")


In [113]:
import rioxarray
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from rasterstats import zonal_stats
import xarray as xr
import rasterio
from rasterio.enums import Resampling
import geopandas as gpd
from shapely.geometry import Polygon

def generate_etccdi_temporal_tables(param_time_index_list, param_netcdf, param_climate_index, param_shapefile_path):
    all_stats = []

    # Retrieve the first and last time indices for file naming
    first_time_index = param_time_index_list[0]
    last_time_index = param_time_index_list[-1]

    for i in param_time_index_list:
        print(f"Processing time index: {i}")
        
        # Select the data for the specified climate index
        data = param_netcdf[param_climate_index]
        
        # Check the data type and process accordingly
        data_type = data.dtype
        if data_type == 'timedelta64[ns]':
            data_days = data / np.timedelta64(1, 'D')  # Convert to days if it's timedelta
            raster_data = data_days.isel(time=i)
        elif data_type == 'float32':
            raster_data = data.isel(time=i)  # Use as-is if it's already float32
        else:
            raise TypeError(f"Unsupported data type '{data_type}' for variable '{param_climate_index}'. Expected 'timedelta64[ns]' or 'float32'.")
        
        # Convert spatial dimensions
        raster_data = raster_data.rename({'lon': 'x', 'lat': 'y'})
        raster_data = raster_data.rio.set_spatial_dims(x_dim='x', y_dim='y')

        # Get the date and time information
        date_time = str(param_netcdf['time'].isel(time=i).values.item())
        year, month = date_time.split('-')[:2]
        print("Year:", year, "Month:", month)

        # Set CRS if not already defined
        if not raster_data.rio.crs:
            print("CRS is not set. Setting CRS to EPSG:4326")
            raster_data = raster_data.rio.write_crs("EPSG:4326")

        # Resample with interpolation to fill NaN values
        def resample_and_interpolate(raster_data, factor=3):
            # Perform resampling using bilinear interpolation (to fill NaN values)
            upsampled_raster = raster_data.rio.reproject(
                raster_data.rio.crs,
                shape=(
                    int(raster_data.sizes['y'] * factor),
                    int(raster_data.sizes['x'] * factor)
                ),
                resampling=Resampling.bilinear
            )
            
            # Interpolate to fill NaN values
            upsampled_raster = upsampled_raster.interpolate_na(dim='x', method='linear').interpolate_na(dim='y', method='linear')
            
            return upsampled_raster

        # Resample with interpolation
        upsampled_raster = resample_and_interpolate(raster_data, factor=3)

        # Plot the resampled raster with NaNs filled
        plt.figure(figsize=(10, 6))
        upsampled_raster.plot(cmap='viridis')
        plt.title(f'Upsampled and Interpolated {param_climate_index} at Time Index {i}')
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')
        plt.show()

        # Save the resampled raster to GeoTIFF
        upsampled_raster_path = 'working_etccdi_file_upsampled_interpolated.tif'
        upsampled_raster.rio.to_raster(upsampled_raster_path)
        print(f"Upsampled GeoTIFF saved at: {upsampled_raster_path}")

        # Create a null mask for the original raster
        null_mask = raster_data.isnull()

        # Convert the null mask into polygons (vectorize the mask)
        null_mask_poly = null_mask.rio.to_raster('null_mask.tif')
        null_mask_gdf = gpd.read_file('null_mask.tif')

        # Load the shapefile for zonal statistics
        gdf = gpd.read_file(param_shapefile_path)
        gdf = gdf[['gid', 'geometry', 'xcoord', 'ycoord']]

        # Calculate zonal statistics on the upsampled raster
        stats = zonal_stats(gdf, upsampled_raster_path, stats='mean', geojson_out=True)
        stats_gdf = gpd.GeoDataFrame.from_features(stats)

        # Assign NaN values to zonal statistics if the zone falls within the null mask
        for index, row in stats_gdf.iterrows():
            # Check if the centroid of the zone is inside the null mask
            zone_centroid = row.geometry.centroid
            if null_mask_gdf.geometry.contains(zone_centroid).any():
                stats_gdf.at[index, param_climate_index] = np.nan

        # Add year and month fields
        stats_gdf['year'] = year
        stats_gdf['month'] = month
        stats_gdf.rename(columns={'mean': param_climate_index}, inplace=True)

        # Ensure stats_gdf has valid geometry and data
        stats_gdf = stats_gdf[stats_gdf.geometry.notnull() & stats_gdf[param_climate_index].notnull()]

        # Plot the zonal statistics if there is data
        if not stats_gdf.empty:
            fig, ax = plt.subplots(figsize=(10, 6))
            stats_gdf.plot(column=param_climate_index, ax=ax, legend=True, cmap='viridis', edgecolor='none')
            ax.set_title(f'{param_climate_index} Statistics by Region - {year}-{month}')
            ax.set_xlabel('Longitude')
            ax.set_ylabel('Latitude')
            plt.show()
        else:
            print(f"No valid zonal statistics data to plot for {param_climate_index} at time index {i}")

        # Append to list
        all_stats.append(stats_gdf)

    # Concatenate all DataFrames
    final_gdf = pd.concat(all_stats, ignore_index=True)

    # Save final DataFrame to CSV
    folder = 'etccdi_out_files'
    os.makedirs(folder, exist_ok=True)
    output_file_path = os.path.join(folder, f"{param_climate_index}_{first_time_index}_{last_time_index}.csv")
    
    final_gdf.to_csv(output_file_path, index=False)
    print(f"Final DataFrame saved to: {output_file_path}")


ADF: Treating NaN values in the Raster 

### Options
Mask Out NaNs in the Original Raster Before Resampling:
- Create a mask to identify NaNs in the original raster.
- Use an interpolation method that only processes non-NaN cells, allowing it to "skip" over NaN values.

In [114]:
generate_etccdi_temporal_tables(index_list, etccdi, etccdi_index, '/Users/gbenz/Downloads/pg_extent/pgm_viewser_extent.shp')

Processing time index: 140
Year: 1990 Month: 06
CRS is not set. Setting CRS to EPSG:4326


ValueError: Index 'y' must be monotonically increasing

### Review:

#### Validate completeness of the output index at PG resolution:

Temporally: check that all total number of time periods match
- Spatially: Full extent of PG (for each temporal unit!)

How to do this:

1. Load the 'compiled' etccdi index .csv
2. check for null values

Total length should be: X

check length of temporal units should be: X (dependent on input parameters)
check length of spatial units should be: X

3. Plot the data

### What we need:

A complete 'clean' dataframe to reference from VIEWSER! 11.11


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_monthly_average_tx10pETCCDI(dataframe):
    # Group by 'date' and calculate the average 'tx10pETCCDI' for each date
    monthly_avg = dataframe.groupby('date')['tx10pETCCDI'].mean().reset_index()
    
    # Plot the data
    plt.figure(figsize=(12, 6))
    plt.plot(monthly_avg['date'], monthly_avg['tx10pETCCDI'], marker='o', linestyle='-')
    plt.title('Average tx10pETCCDI by Year-Month')
    plt.xlabel('Date')
    plt.ylabel('Average tx10pETCCDI')
    plt.grid(True)
    plt.show()

# Assuming your DataFrame is named df


In [None]:
import geopandas as gpd

# Specify the path to your shapefile or other spatial data file
file_path = '/Users/gbenz/Downloads/pg_extent/pgm_viewser_extent.shp'

# Load the data into a GeoDataFrame
gdf = gpd.read_file(file_path)

spatial_extent = len(pd.unique(gdf['gid']))

In [81]:
validate_etccdi = pd.read_csv('/Users/gbenz/Documents/Climate Data/climate_extremes/etccdi_out_files/cddETCCDI_1990_06__1993_06.csv')
validate_etccdi['date'] = validate_etccdi['year'].astype(str) + '-' + validate_etccdi['month'].astype(str).str.zfill(2)
validate_etccdi['year'] = validate_etccdi['year'].astype(str)

etccdi_time_length = len(pd.unique(validate_etccdi['date']))
etccdi_spatial_length = len(pd.unique(validate_etccdi['gid']))

#-------------------------------------------------------------------------------------------------------------------------------------------------
# Report null values:

# Specify the columns to check for null values
columns_to_check = ['gid', 'year', 'month', 'date', etccdi_index]
print('This prints a summary of Null values:')
# Count null values in the specified columns
null_counts = validate_etccdi[columns_to_check].isnull().sum()
#-------------------------------------------------------------------------------------------------------------------------------------------------
print()
display(null_counts)

print()
print('Summary of temporal and spatial units:')
print()
print(f'This dataset expects to see {time_length} and found {etccdi_time_length}')
print(f'This dataset expects to see {spatial_extent} and found {etccdi_spatial_length}')
print()
print('Average over time to expose any temporal gaps')
#plot_monthly_average_tx10pETCCDI(validate_etccdi)

This prints a summary of Null values:



gid             0
year            0
month           0
date            0
cddETCCDI    2936
dtype: int64


Summary of temporal and spatial units:

This dataset expects to see 25 and found 4


NameError: name 'spatial_extent' is not defined

In [82]:
# Perform a left join on the 'priogrid_gid' and 'gid' columns, and 'year'
merged_df = reference_filtered_time.merge(
    validate_etccdi,
    how='left',  # Keeps all rows from reference_filtered_time
    left_on=['priogrid_gid', 'year'],  # Columns in reference_filtered_time
    right_on=['gid', 'year'],  # Columns in validate_etccdi
    suffixes=('_reference', '_validate')  # Optional: Adds suffixes to distinguish overlapping columns
)

# Check the result
print(merged_df.head())


   priogrid_gid  year                                           geometry  \
0         62356  1990  POLYGON ((37.499999999800195 -47.0000000003996...   
1         79599  1990  POLYGON ((18.999999999900126 -35.0000000000998...   
2         79600  1990  POLYGON ((19.499999999800195 -35.0000000000998...   
3         79601  1990  POLYGON ((19.999999999700265 -35.0000000000998...   
4         80317  1990  POLYGON ((18.000000000099988 -34.5000000001998...   

     gid  xcoord  ycoord  cddETCCDI  month     date  
0  62356   37.75  -46.75   8.174884      6  1990-06  
1  79599   19.25  -34.75  22.465351      6  1990-06  
2  79600   19.75  -34.75  21.393824      6  1990-06  
3  79601   20.25  -34.75  21.253667      6  1990-06  
4  80317   18.25  -34.25  28.487264      6  1990-06  


In [84]:
# Filter the merged DataFrame to find rows where the 'cddETCCDI' field is NaN
null_cddETCCDI_rows = merged_df[merged_df['cddETCCDI'].isna()]

# Get a summary of the rows with NaN in the 'cddETCCDI' field
summary_null_cddETCCDI = null_cddETCCDI_rows[['priogrid_gid', 'year', 'cddETCCDI']]  # Adjust columns to display
print(summary_null_cddETCCDI)


       priogrid_gid  year  cddETCCDI
7472         154458  1990        NaN
7473         154459  1990        NaN
7474         154460  1990        NaN
7475         154461  1990        NaN
7476         154462  1990        NaN
...             ...   ...        ...
50604        173221  1993        NaN
50605        173222  1993        NaN
50606        173223  1993        NaN
50607        173224  1993        NaN
50608        173225  1993        NaN

[2936 rows x 3 columns]


### Resample to Fine (Granular) Pixel 

Parameters to consider: 
1. What is an appropriate resolution
2. What is the most appropriate resampling method
