# Notebook to explore netCDF files and change resolution, plus Python plotting
These files are downloaded from the [Copernicus Climate Data Store](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land-monthly-means?tab=download) using `cdsapi`. Get the data by running the Python script `inout.py`:
```
python onehealth_db/inout.py
```

The downloaded files are stored in `data/in`. The `area` option uses values `90`, `90`, `-90`, `-90` for `North`, `East`, `South`, `West`, respectively.

Question: What is the coordinate reference system of the ERA5 dataset? The NUTS3 data is available in EPSG:3035, EPSG:4326, or EPSG:3857.

-> According to [ERA5-Land's documentation](https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation):
> The data is referenced in the horizontal with respect to the WGS84 ellipse (which defines the major/minor axes) and in the vertical it is referenced to the EGM96 geoid over land but over ocean it is referenced to mean sea level, with the approximation that this is assumed to be coincident with the geoid. 

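Then, according to [this page](https://spatialreference.org/ref/epsg/9707/), it seems that the coordinate reference system for ERA5-Land is EPSG:9707 (WGS 84 + EGM96 height). Its horizontal component is WGS 84 (EPSG:4326), which is the CRS assigned to the point data further below.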

> ERA5-Land produces a total of 50 variables describing the water and energy cycles over land, globally, hourly, and at a spatial resolution of 9 km, matching the ECMWF triangular–cubic–octahedral (TCo1279) operational grid (Malardel et al., 2016).

In [None]:
from pathlib import Path
import xarray as xr
from matplotlib import pyplot as plt
import geopandas as gpd
from shapely.geometry import Point
import numpy as np

The following cells explore the structure of the data.

In [None]:
# Relative path to the directory that holds the downloaded input files
folder_path = Path("../data/in")

#### ERA5-Land from CDS

In [None]:
# Two ERA5-Land monthly files covering the same request; going by the file
# names, the second one is the variant written after a conversion to Celsius.
f_area_before_celsius = folder_path.joinpath("era5_data_2024_01_02_2t_tp_monthly_area.nc")
f_area_after_celsius = folder_path.joinpath("era5_data_2024_01_02_2t_tp_monthly_area_celsius.nc")

In [None]:
# Open both netCDF files as xarray datasets, in the same order as the paths
ds_area_before_celsius, ds_area_after_celsius = (
    xr.open_dataset(nc_file)
    for nc_file in (f_area_before_celsius, f_area_after_celsius)
)

In [None]:
# Show the summary of the dataset loaded from the file without the "_celsius" suffix
ds_area_before_celsius

In [None]:
# First five rows at the grid cell nearest to latitude 20, longitude 10
(
    ds_area_before_celsius
    .sel(latitude=20.0, longitude=10.0, method="nearest")
    .to_dataframe()
    .head(5)
)

In [None]:
# Metadata attributes attached to the "tp" variable
ds_area_before_celsius["tp"].attrs

In [None]:
# Show the summary of the dataset loaded from the "_celsius" file
ds_area_after_celsius

In [None]:
# Metadata attributes attached to the "tp" variable in the "_celsius" dataset
ds_area_after_celsius["tp"].attrs

In [None]:
# First five rows at the grid cell nearest to latitude 20, longitude 10
(
    ds_area_after_celsius
    .sel(latitude=20.0, longitude=10.0, method="nearest")
    .to_dataframe()
    .head(5)
)

In [None]:
# Time series of t2m at the grid cell closest to the chosen coordinates
lat, lon = 20.0, 10.0
t2m_at_point = ds_area_after_celsius["t2m"].sel(latitude=lat, longitude=lon, method="nearest")
t2m_at_point.plot(color="blue", marker="o")
plt.title(f"2m temperature in 2024 at lat-{lat}, lon-{lon}")
plt.show()


In [None]:
# Map of t2m for the first month (index 0 along the leading dimension)
ds_area_after_celsius["t2m"][0].plot(size=7)

In [None]:
# Same map for the "tp" variable (index 0 along the leading dimension)
ds_area_after_celsius["tp"][0].plot(size=7)

In [None]:
# Flatten the dataset into a table; reset_index turns the coordinate
# index levels into ordinary columns
df = (
    ds_area_after_celsius
    .to_dataframe()
    .reset_index()
)
df

#### Population data from ISIMIP

In [None]:
# Population file from the input folder, opened as an xarray dataset
f_popu_data = folder_path.joinpath("population_histsoc_30arcmin_annual_1901_2021.nc")
ds_popu_data = xr.open_dataset(f_popu_data)

In [None]:
# Show the summary of the population dataset
ds_popu_data

In [None]:
# Metadata attributes attached to the "total-population" variable
ds_popu_data["total-population"].attrs

In [None]:
# Population values at the grid cell nearest to lat 20, lon 10, as a table
test_popu_data = (
    ds_popu_data
    .sel(lat=20.0, lon=10.0, method="nearest")
    .to_dataframe()
)
test_popu_data.head(n=5)

In [None]:
# Line plot of the "total-population" column for the single grid cell selected above
test_popu_data["total-population"].plot()

In [None]:
# Map of the last entry along the leading dimension of "total-population"
# (presumably the most recent year in the file; confirm against the time axis)
ds_popu_data["total-population"][-1].plot(figsize=(9, 5))

In [None]:
# Two density files from the provided materials
f_popu_dens_2024 = folder_path.joinpath("pop_dens_2024_global_0.5.nc")
f_dens_example = folder_path.joinpath("dens_example.nc")

# decode_times=False: opening this file with time decoding enabled raises an error
ds_popu_dens_2024 = xr.open_dataset(f_popu_dens_2024, decode_times=False)
ds_dens_example = xr.open_dataset(f_dens_example)

In [None]:
# Show the summary of the dataset loaded from pop_dens_2024_global_0.5.nc
ds_popu_dens_2024

In [None]:
# Metadata attributes attached to the "dens" variable
ds_popu_dens_2024["dens"].attrs

In [None]:
# First five rows at the grid cell nearest to lat 20, lon 10
(
    ds_popu_dens_2024
    .sel(lat=20.0, lon=10.0, method="nearest")
    .to_dataframe()
    .head(5)
)

In [None]:
# Show the summary of the dataset loaded from dens_example.nc
ds_dens_example

In [None]:
# Convert the whole example dataset to a table and show its first five rows
ds_dens_example.to_dataframe().head(5)

## Downsampling of the data and setting the correct accuracy for the dataframe

In [None]:
# Aggregate the data to a 1/2 degree grid (about 50 km x 50 km).
# The stored coordinate values carry floating-point noise, so the native
# spacing is rounded to two decimals before it is used.
output_grid_resolution = 1/2
# abs(): the coarsening factor must be positive even if the axis is stored
# in descending order.
input_grid_resolution = np.round(
    abs((ds_area_after_celsius.longitude[1] - ds_area_after_celsius.longitude[0]).item()),
    2,
)
print(f"Initial grid resolution is {input_grid_resolution}, downsampling to {output_grid_resolution} degree resolution")
# Round the quotient before taking the ceiling: a ratio such as 0.9 / 0.3
# evaluates to 3.0000000000000004 and would otherwise be bumped up to 4.
weight = int(np.ceil(np.round(output_grid_resolution / input_grid_resolution, 6)))
print(f"Weight is {weight}")

In [None]:
# Coarsen both horizontal axes in a single pass so that every output cell is
# the mean of all valid input cells inside its weight x weight window.
# Averaging longitude first and latitude second yields a mean of row means,
# which over-weights rows that contain few valid (non-NaN) cells.
# boundary="pad" fills incomplete windows at the edges with NaN instead of raising.
ds_area_after_celsius_resampled = ds_area_after_celsius.coarsen(
    longitude=weight, latitude=weight, boundary="pad"
).mean()
ds_area_after_celsius_resampled

In [None]:
# Sanity check: spacing between the first two longitudes of the coarsened grid
downsampled_grid = float(
    ds_area_after_celsius_resampled["longitude"][1]
    - ds_area_after_celsius_resampled["longitude"][0]
)
print(f"Downsampled grid resolution is {downsampled_grid}")

In [None]:
# Map of the coarsened t2m field for the first month (index 0 along the leading dimension)
ds_area_after_celsius_resampled["t2m"][0].plot(size=5)

In [None]:
# Flatten the coarsened dataset into a table with the coordinates as columns
df = (
    ds_area_after_celsius_resampled
    .to_dataframe()
    .reset_index()
)
df

In [None]:
# Keep time, position and t2m only, restricted to the 2024-02-01 time step
is_february = df["valid_time"] == "2024-02-01"
out_data = df.loc[is_february, ["valid_time", "latitude", "longitude", "t2m"]]

In [None]:
# Remove every row that still contains a NaN; the restriction to a single
# time step has already been applied to out_data
out_data_clean = out_data.dropna(axis=0, how="any")
out_data_clean

In [None]:
# Write the February slice twice: once including the NaN rows, once without.
# to_csv does not create missing directories, so make sure the target exists.
out_dir = Path("../data/out")
out_dir.mkdir(parents=True, exist_ok=True)
out_data.to_csv(out_dir / "era5_data_2024_01_02_monthly_area_celsius_with_NaN_february_resampled_05degree.csv", index=False)
out_data_clean.to_csv(out_dir / "era5_data_2024_01_02_monthly_area_celsius_february_resampled_05degree.csv", index=False)

## Export to geopandas for other plotting options and geospatial analysis

In [None]:
# xarray data to geopandas
# Build all point geometries in one vectorised call instead of creating a
# shapely Point per row in a Python-level loop.
geometry = gpd.points_from_xy(out_data_clean["longitude"], out_data_clean["latitude"])

# Create the GeoDataFrame and declare its coordinate reference system
# (EPSG:4326, i.e. WGS84 longitude/latitude) directly in the constructor
gdf = gpd.GeoDataFrame(out_data_clean, geometry=geometry, crs="EPSG:4326")

# Save to a GeoJSON file
gdf.to_file("../data/out/era5_data_2024_01_02_monthly_area_celsius_February_resampled_05degree.geojson", driver="GeoJSON")

# Display the GeoDataFrame
gdf.head()

In [None]:
# Scatter the February points coloured by t2m and store the figure as a PDF
fig, ax = plt.subplots(figsize=(8, 5))
gdf.plot(column="t2m", ax=ax, markersize=0.5, legend=True)
fig.tight_layout()
fig.savefig("february.pdf")

## Resample to NUTS3 level
Use the same `crs` for geopandas export and the shapefile export from Eurostat.

In [None]:
# Read the Eurostat NUTS shapefile (already in EPSG:4326, matching the point data)
shapefile_path = Path("../data/in/NUTS_RG_20M_2024_4326.shp")
nuts3 = gpd.read_file(shapefile_path)
# NOTE(review): the GISCO file without a "_LEVL_" suffix is expected to bundle
# all NUTS levels (0-3) - confirm against the download. Keep only level 3 so
# that the spatial join and the per-region mean below really are NUTS3 results
# and each point matches at most one region per level hierarchy.
if "LEVL_CODE" in nuts3.columns:
    nuts3 = nuts3[nuts3["LEVL_CODE"] == 3].reset_index(drop=True)
nuts3

In [None]:
# Show the point GeoDataFrame that is joined against the NUTS polygons next
gdf

In [None]:
# Attach the attributes of the containing polygon to every point; with
# how="left" points outside all polygons are kept with empty NUTS columns
merge = gpd.sjoin(gdf, nuts3, how="left")

# Keep only the points that actually fell inside a polygon
has_region = merge["NUTS_NAME"].notna()
matched = merge[has_region]
# Preview the result
matched.head()

In [None]:
# Average t2m over all points that were matched to the same NUTS_ID
aggregated_by_NUTS3 = (
    matched
    .groupby("NUTS_ID")["t2m"]
    .agg("mean")
    .reset_index()
)
aggregated_by_NUTS3

In [None]:
# Join the per-region means back onto the polygons and keep only the
# identifier, the geometry and the averaged t2m
nuts = (
    nuts3
    .merge(aggregated_by_NUTS3, on="NUTS_ID")
    .filter(["NUTS_ID", "geometry", "t2m"])
)
nuts

In [None]:
# Choropleth of the regions coloured by their mean t2m, stored as a PDF
fig, ax = plt.subplots(figsize=(8, 5))
nuts.plot(column="t2m", ax=ax, cmap="coolwarm", markersize=0.5, legend=True)
fig.tight_layout()
fig.savefig("nuts3_export.pdf")

In [None]:
# Write the per-region table (NUTS_ID, geometry, t2m) to CSV without the row index
nuts.to_csv(
    "../data/out/era5_data_2024_01_02_monthly_area_celsius_february_resampled_05degree_NUTS3.csv",
    index=False,
)