# ERA5-Land Hourly Data

ERA5-Land is a high-resolution reanalysis dataset that provides a consistent and detailed view of land variables over several decades, combining model data with atmospheric forcing from ERA5 to ensure accuracy. By correcting input variables for altitude differences and leveraging indirect observational influences, it offers enhanced precision for land surface applications like flood and drought forecasting. Despite some inherent uncertainties, ERA5-Land's extensive temporal and spatial resolution makes it a valuable resource for decision-making and environmental analysis.

**Information on Dataset:**
* Source: [ERA5-Land Hourly Data](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land?tab=overview)
* Author:
* Notebook Version: 1.1 (Updated: December 09, 2024)

## 1. Specifying the paths and working directories

In [1]:
import os

# ---- Specify the directories here ----
# The paths are assembled with os.path.join so that the native separator is
# used on every platform. On Windows this yields exactly the former
# ".\data\era5-land-hourly-data\..." strings; on POSIX systems a backslash
# would otherwise become part of a single, oddly named directory.
data_root = os.path.join(".", "data", "era5-land-hourly-data")
download_folder = os.path.join(data_root, "download")
working_folder = os.path.join(data_root, "working")
geotiff_folder = os.path.join(data_root, "geotiff")
csv_folder = os.path.join(data_root, "csv")
output_folder = os.path.join(data_root, "output")
# ---- End of user input ----

# Create all folders up front; exist_ok=True makes re-running the cell harmless.
for folder in (download_folder, working_folder, geotiff_folder,
               csv_folder, output_folder):
    os.makedirs(folder, exist_ok=True)

## 2. Download and Extract Dataset

### 2.1 Authentication

In [2]:
import cdsapi

def main():
    """Create the CDS API client used for all downloads in this notebook.

    Credentials are deliberately not stored in the notebook. The API key is
    taken from the ``CDSAPI_KEY`` environment variable and the endpoint from
    ``CDSAPI_URL`` (defaulting to the public CDS endpoint).

    Returns:
        cdsapi.Client: client configured with the resolved URL and key.
    """
    # SECURITY: a personal API key used to be hard-coded here. Any key that was
    # ever committed should be treated as leaked and regenerated in the CDS
    # user profile.
    api_url = os.environ.get("CDSAPI_URL", "https://cds.climate.copernicus.eu/api")
    api_key = os.environ.get("CDSAPI_KEY")
    # NOTE(review): with key=None cdsapi is expected to fall back to the
    # ~/.cdsapirc file and to raise if no key can be found - confirm against
    # the installed cdsapi version.
    client = cdsapi.Client(url=api_url, key=api_key)
    return client

### 2.2 Request Definition and Download

#### 1. Create list of variables

The list of variables is available at [ERA5-Land Hourly](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land?tab=download)

*Please check the predefined variable names against that page and correct them if a request fails because of an unrecognised variable name.*

In [3]:
# Variable groups: each list holds the request names of the variables that
# belong to one thematic group of the dataset.

# Air, skin and soil temperatures
var_group_temperature = [
    "2m_dewpoint_temperature",
    "2m_temperature",
    "skin_temperature",
    "soil_temperature_level_1",
    "soil_temperature_level_2",
    "soil_temperature_level_3",
    "soil_temperature_level_4",
]

# Lake variables
var_group_lake = [
    "lake_bottom_temperature",
    "lake_ice_depth",
    "lake_ice_temperature",
    "lake_mix_layer_depth",
    "lake_mix_layer_temperature",
    "lake_shape_factor",
    "lake_total_layer_temperature",
]

# Snow variables
var_group_snow = [
    "snow_albedo",
    "snow_cover",
    "snow_density",
    "snow_depth",
    "snow_depth_water_equivalent",
    "snowfall",
    "snowmelt",
    "temperature_of_snow_layer",
]

# Soil water variables
var_group_soil_water = [
    "skin_reservoir_content",
    "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4",
]

# Radiation and heat variables
var_group_radiation_and_heat = [
    "forecast_albedo",
    "surface_latent_heat_flux",
    "surface_net_solar_radiation",
    "surface_net_thermal_radiation",
    "surface_sensible_heat_flux",
    "surface_solar_radiation_downwards",
    "surface_thermal_radiation_downwards",
]

# Evaporation and runoff variables
var_group_evaporation_and_runoff = [
    "evaporation_from_bare_soil",
    "evaporation_from_open_water_surfaces_excluding_oceans",
    "evaporation_from_the_top_of_canopy",
    "evaporation_from_vegetation_transpiration",
    "potential_evaporation",
    "runoff",
    "snow_evaporation",
    "sub_surface_runoff",
    "surface_runoff",
    "total_evaporation",
]

# Wind, pressure and precipitation variables
var_group_wind_pressure_and_precipitation = [
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
    "surface_pressure",
    "total_precipitation",
]

# Vegetation variables
var_group_vegetation = [
    "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation",
]

# Names of all variable groups offered in the group dropdown. The entries are
# the *names* of the list variables, because the selection is later resolved
# with a lookup by name. 'var_group_snow' is included so that the snow
# variables can actually be selected.
var_group_list = ['var_group_temperature',
                  'var_group_lake',
                  'var_group_snow',
                  'var_group_soil_water',
                  'var_group_radiation_and_heat',
                  'var_group_evaporation_and_runoff',
                  'var_group_wind_pressure_and_precipitation',
                  'var_group_vegetation']

#### 2. Define request parameters for download

In [4]:
import ipywidgets as widgets

# Dropdown for choosing the variable group; the first group is preselected.
selected_variable_group = widgets.Dropdown(
    description="Select a variable group",
    options=var_group_list,
    value=var_group_list[0],
    layout=widgets.Layout(width='50%'),
    style={'description_width': 'initial'},
)

# Bare expression as the last line so the notebook renders the widget
selected_variable_group

Dropdown(description='Select a variable group', layout=Layout(width='50%'), options=('var_group_temperature', …

In [5]:
# Resolve the chosen group name to the list variable of the same name
current_variable_group = globals().get(selected_variable_group.value)

# Dropdown for the individual variable; the second entry of the group is preselected.
selected_variable = widgets.Dropdown(
    description="Select the variable of interest",
    options=current_variable_group,
    value=current_variable_group[1],
    layout=widgets.Layout(width='50%'),
    style={'description_width': 'initial'},
)

# Bare expression as the last line so the notebook renders the widget
selected_variable

Dropdown(description='Select the variable of interest', index=1, layout=Layout(width='50%'), options=('2m_dewp…

#### 3. Define Bounding Box Extents (Bbox) for the Dataset

In [6]:
# Bounding boxes in the order [North, West, South, East]. Restricting the area
# keeps the request within the file size limit. The coordinates were obtained
# with the BBox Extractor tool: https://str-ucture.github.io/bbox-extractor/
bbox_wgs84_deutschland = [56.0, 5.8, 47.2, 15.0]
bbox_wgs84_konstanz = [47.9, 8.9, 47.6, 9.3]

# Alternative: derive the extent from a shapefile instead of typing it in
import geopandas as gpd
import math

# Boundary shapefile (the file name suggests Germany; presumably stored in
# WGS84 - TODO confirm)
de_shapefile = r"./shapefiles/de_boundary.shp"
de_gdf = gpd.read_file(de_shapefile)
de_bounds = de_gdf.total_bounds

# Round the first two bounds down and the last two up to one decimal place,
# then widen the box by a further 0.1 on every side.
de_bounds_adjusted = [
    math.floor(de_bounds[0] * 10) / 10 - 0.1,
    math.floor(de_bounds[1] * 10) / 10 - 0.1,
    math.ceil(de_bounds[2] * 10) / 10 + 0.1,
    math.ceil(de_bounds[3] * 10) / 10 + 0.1,
]

# Reorder the adjusted bounds (elements 3, 0, 1, 2) into the
# [North, West, South, East] layout used by the boxes above.
bbox_de_bounds_adjusted = [de_bounds_adjusted[i] for i in (3, 0, 1, 2)]

In [7]:
from datetime import datetime

# Year selector. The options start at 1950 and extend to the current calendar
# year (never earlier than 2024), so newer years can be chosen without editing
# the notebook. The preselected year stays 2024.
# NOTE(review): data for the running year may still be incomplete on CDS.
selected_year = widgets.Dropdown(
    options=[str(year) for year in range(1950, max(2024, datetime.now().year) + 1)],
    value=str(2024),
    description="Select the year for downloading data:",
    disabled=False,
    style=dict(description_width='initial'),
    layout=widgets.Layout(width='50%'),
)

# Bare expression as the last line so the notebook renders the widget
selected_year

Dropdown(description='Select the year for downloading data:', index=74, layout=Layout(width='50%'), options=('…

#### 4. Define request parameters

In [8]:
# Request every hour of every day of the selected year for the selected
# variable, limited to the shapefile-derived bounding box.

dataset = "reanalysis-era5-land"
request = {
    "variable": selected_variable.value,
    "year": selected_year.value,
    # Months are 01-12 and days 01-31. Both ranges must start at 1, because
    # there is no month "0" or day "0".
    "month": [f"{month:02d}" for month in range(1, 13)],
    "day": [f"{day:02d}" for day in range(1, 32)],
    "time": [f"{hour:02d}:00" for hour in range(24)],
    "data_format": "netcdf",
    "download_format": "unarchived",
    "area": bbox_de_bounds_adjusted
}

In [9]:
# Run this cell to download the dataset:

def main_retrieve():
    """Download the requested dataset unless the target file already exists.

    The target name is built from the dataset name, the selected variable and
    the selected year and is placed in ``download_folder``. Uses the
    module-level ``client``, ``dataset`` and ``request``.

    The data is first written to a temporary ``.part`` file and only renamed
    to its final name after the download has finished. An interrupted download
    therefore never leaves a truncated ``.nc`` file that a later run would
    mistake for a complete one.
    """
    dataset_filename = f"{dataset}-{selected_variable.value}-{selected_year.value}.nc"
    dataset_filepath = os.path.join(download_folder, dataset_filename)

    # Download the dataset only if it has not been downloaded before
    if os.path.isfile(dataset_filepath):
        print("Dataset already downloaded.")
        return

    partial_filepath = dataset_filepath + ".part"
    try:
        # Download the dataset with the defined request parameters
        client.retrieve(dataset, request, partial_filepath)
        os.replace(partial_filepath, dataset_filepath)
    finally:
        # After a successful rename nothing is left to remove; after a failure
        # or interruption this discards the incomplete file.
        if os.path.exists(partial_filepath):
            os.remove(partial_filepath)

if __name__ == "__main__":
    client = main()
    main_retrieve()

2025-02-11 09:23:37,309 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.




Dataset already downloaded.


### 2.3 Extract the Zip folder

(**Note**: Because the request covers a single variable, CDS delivers one plain netCDF file rather than a zip archive, so the extraction step below is not needed and is left commented out.)

In [10]:
# import zipfile

# extract_folder = os.path.join(working_folder, f"{selected_variable.value}")
# os.makedirs(extract_folder, exist_ok=True)

# # Extract the zip file
# try:
#     if not os.listdir(extract_folder):
#         dataset_filename = f"{dataset}-{selected_variable.value}-{selected_year.value}.nc"
#         dataset_filepath = os.path.join(download_folder, dataset_filename)

#         with zipfile.ZipFile(dataset_filepath, 'r') as zip_ref:
#             zip_ref.extractall(extract_folder)
#             print(f"Successfully extracted files to: {extract_folder}")
#     else:
#         print("Folder is not empty. Skipping extraction.")
# except FileNotFoundError:
#     print(f"Error: The file {dataset_filepath} was not found.")
# except zipfile.BadZipFile:
#     print(f"Error: The file {dataset_filepath} is not a valid zip file.")
# except Exception as e:
#     print(f"An unexpected error occurred: {e}")

## 3. Read the netCDF file and print the metadata

In [11]:
import re
import pandas as pd
import netCDF4 as nc

# Matches "<dataset>-<variable>-<year>" as written by the download step, e.g.
# "reanalysis-era5-land-2m_temperature-2024.nc". The variable part accepts
# lower-case letters, digits and underscores, so names such as
# "skin_temperature" or "soil_temperature_level_1" are recognised as well as
# names that start with a height prefix like "2m_".
ERA5_FILENAME_PATTERN = re.compile(
    r"(?P<dataset>reanalysis-era5-land)-(?P<ds_variable>[a-z0-9_]+)-(?P<year>\d{4})"
)


def meta(filename):
    """Collect metadata for one downloaded NetCDF file.

    Args:
        filename: Name of a file inside ``download_folder``.

    Returns:
        dict with the keys ``filename``, ``path``, ``dataset``,
        ``ds_variable`` (variable name as used in the request),
        ``variable_name`` (name of the data variable inside the file) and
        ``year`` (four-digit string).

    Raises:
        ValueError: if the file name does not follow the expected pattern or
            the file contains no variable with more than one dimension.
    """
    match = ERA5_FILENAME_PATTERN.search(filename)
    if match is None:
        raise ValueError(
            f"File name does not match '<dataset>-<variable>-<year>': {filename!r}"
        )

    path = os.path.join(download_folder, filename)

    def get_nc_variable():
        # Pick the first variable spanning more than one dimension instead of
        # relying on a fixed position in the variable list.
        # NOTE(review): assumes coordinate/auxiliary variables have at most one
        # dimension, as in the listing shown in this notebook - confirm for
        # other files.
        with nc.Dataset(path, 'r') as nc_dataset:
            for name, variable in nc_dataset.variables.items():
                if len(variable.dimensions) > 1:
                    return name
        raise ValueError(f"No multi-dimensional data variable found in {path!r}")

    return dict(
        filename=filename,
        path=path,
        dataset=match.group('dataset'),
        ds_variable=match.group('ds_variable'),
        variable_name=get_nc_variable(),
        year=match.group('year')
    )

# Gather the metadata of every .nc file in the download folder, ordered by year
nc_files = sorted(
    (meta(name) for name in os.listdir(download_folder) if name.endswith('.nc')),
    key=lambda record: record['year'],
)
df_nc_files = pd.DataFrame.from_dict(nc_files)

# Limit the displayed column width so long file names are shortened
pd.options.display.max_colwidth = 24

# Show the first rows without the 'path' column
df_nc_files.head().loc[:, df_nc_files.columns != 'path']

Unnamed: 0,filename,dataset,ds_variable,variable_name,year
0,reanalysis-era5-land...,reanalysis-era5-land,2m_temperature,t2m,1950
1,reanalysis-era5-land...,reanalysis-era5-land,2m_temperature,t2m,1951
2,reanalysis-era5-land...,reanalysis-era5-land,2m_temperature,t2m,1952
3,reanalysis-era5-land...,reanalysis-era5-land,2m_temperature,t2m,1953
4,reanalysis-era5-land...,reanalysis-era5-land,2m_temperature,t2m,1954


### 3.1 Print unique Variable name(s) and Available Variables

In [12]:
import netCDF4 as nc

# Print, once per distinct data variable, the names of all variables stored in
# the corresponding NetCDF file.
seen_variables = set()
for i, nc_file in enumerate(nc_files):
    variable_name = nc_file['variable_name']

    # Each data variable is reported only for the first file that contains it
    if variable_name in seen_variables:
        continue

    # Open the NetCDF file in read mode; the context manager closes it again,
    # so no file handle is leaked per iteration
    with nc.Dataset(nc_file['path'], mode='r') as nc_dataset:
        # Copy the names while the file is still open
        variables_list = list(nc_dataset.variables.keys())

    print(f"{i+1:<2} {variable_name:<18}: Available variables: {list(variables_list)}")

    # Add the variable name to the seen set
    seen_variables.add(variable_name)

1  t2m               : Available variables: ['number', 'valid_time', 'latitude', 'longitude', 'expver', 't2m']


In [13]:
# Show a short metadata summary for each distinct data variable.
seen_variables = set()

for i, nc_file in enumerate(nc_files):
    variable_name = nc_file['variable_name']

    # Each data variable is summarised only for the first file that contains it
    if variable_name in seen_variables:
        continue

    # Read all metadata inside the context manager so the file is closed
    # afterwards instead of leaking one open handle per iteration
    with nc.Dataset(nc_file['path'], mode='r') as nc_dataset:
        variable_data = nc_dataset[variable_name]

        # Generate summary of the primary variable
        summary = {
            "Variable Name": variable_name,
            "Data Type": variable_data.dtype,
            "Shape": variable_data.shape,
            "Variable Info": f"{variable_name}({', '.join(variable_data.dimensions)})",
            "Units": getattr(variable_data, "units", "N/A"),
            "Long Name": getattr(variable_data, "long_name", "N/A"),
        }

    # Display dataset summary as a DataFrame for better visualization
    nc_summary = pd.DataFrame(list(summary.items()), columns=['Description', 'Remarks'])

    # Display the summary DataFrame
    print(f"{i+1}. {nc_file['variable_name']}:")
    display(nc_summary)

    # Add the variable name to the seen set
    seen_variables.add(variable_name)

    # Stop once a summary has been shown for a file at position 1 or later.
    # NOTE(review): presumably meant to cap the amount of output - confirm.
    if i >= 1:
        break

1. t2m:


Unnamed: 0,Description,Remarks
0,Variable Name,t2m
1,Data Type,float32
2,Shape,"(8759, 82, 96)"
3,Variable Info,"t2m(valid_time, lati..."
4,Units,K
5,Long Name,2 metre temperature


## 4. Export Dataset to CSV

In [14]:
import xarray as xr

# Open the NetCDF dataset using xarray
def netcdf_to_dataframe(nc_file, bounding_box=None):
    """Load the data variable of one NetCDF file into a pandas DataFrame.

    Args:
        nc_file: Metadata dict providing at least the keys ``'path'`` and
            ``'variable_name'``.
        bounding_box: Optional ``[West, South, East, North]`` extent. When
            given (and non-empty), only grid points inside the box, borders
            included, are kept.

    Returns:
        DataFrame indexed by ``(date, time, <latitude>, <longitude>)``. The
        full timestamp remains available in the ``valid_time`` column.
    """
    with xr.open_dataset(nc_file['path']) as ds:
        data = ds[nc_file['variable_name']]

        # The coordinates may be called latitude/longitude or lat/lon
        lat_key = 'latitude' if 'latitude' in ds.coords else 'lat'
        lon_key = 'longitude' if 'longitude' in ds.coords else 'lon'

        if bounding_box:
            lat = ds[lat_key]
            lon = ds[lon_key]
            # NOTE(review): the comparisons are exact; a coordinate stored
            # marginally outside a border value would be dropped - verify.
            inside = (
                (lat >= bounding_box[1]) & (lat <= bounding_box[3])
                & (lon >= bounding_box[0]) & (lon <= bounding_box[2])
            )
            data = data.where(inside, drop=True)

        frame = data.to_dataframe().reset_index()

        # Split the timestamp into separate date and time columns
        timestamps = pd.to_datetime(frame['valid_time'])
        frame['valid_time'] = timestamps
        frame['date'] = timestamps.dt.date
        frame['time'] = timestamps.dt.time

        return frame.set_index(['date', 'time', lat_key, lon_key])

### 4.1 Filter by Bounding Box, Create DataFrame and Export as merged CSV file

In [15]:
# Bounding boxes used to filter the data before export. The coordinates were
# obtained with the BBox Extractor tool:
# https://str-ucture.github.io/bbox-extractor/

# Konstanz, format: [North, West, South, East]
bbox_wgs84_konstanz = [
    47.9, 8.9, 47.6, 9.3,
]

# Konstanz, standard format: [West, South, East, North]
# NOTE(review): West (9.0) and North (47.8) differ from the box above
# (8.9 / 47.9) - confirm which extent is intended.
bbox_wgs84_konstanz_standard = [
    9.0, 47.6, 9.3, 47.8,
]

# Germany, standard format: [West, South, East, North]
bbox_wgs84_de_standard = [
    5.7, 47.1, 15.2, 55.2,
]

In [16]:
from tqdm.notebook import tqdm
import textwrap

# csv_filename = f"{nc_files[0]['variable_name']}_{nc_files[0]['year']}.csv.zip"
# csv_path = os.path.join(csv_folder, csv_filename)

# Convert the first file in nc_files to a DataFrame, keeping only the grid
# points inside the Konstanz bounding box ([West, South, East, North] format)
dataframes = netcdf_to_dataframe(nc_files[0], bounding_box=bbox_wgs84_konstanz_standard)
# Bare expression so the notebook renders the resulting DataFrame
dataframes

# dataframes.to_csv(csv_path, sep=',', encoding='utf8', compression='zip')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,valid_time,number,expver,t2m
date,time,latitude,longitude,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1950-01-01,01:00:00,47.8,9.1,1950-01-01 01:00:00,0,0001,270.765015
1950-01-01,01:00:00,47.8,9.2,1950-01-01 01:00:00,0,0001,270.690796
1950-01-01,01:00:00,47.8,9.3,1950-01-01 01:00:00,0,0001,270.634155
1950-01-01,01:00:00,47.7,9.1,1950-01-01 01:00:00,0,0001,271.120483
1950-01-01,01:00:00,47.7,9.2,1950-01-01 01:00:00,0,0001,271.468140
...,...,...,...,...,...,...,...
1950-12-31,23:00:00,47.8,9.2,1950-12-31 23:00:00,0,0001,267.417236
1950-12-31,23:00:00,47.8,9.3,1950-12-31 23:00:00,0,0001,267.421143
1950-12-31,23:00:00,47.7,9.1,1950-12-31 23:00:00,0,0001,267.548096
1950-12-31,23:00:00,47.7,9.2,1950-12-31 23:00:00,0,0001,267.382080


In [17]:
# from tqdm.notebook import tqdm
# import textwrap

# message_printed = False

# for i, nc_file in enumerate(tqdm(nc_files)):
#     csv_filename = f"{nc_files[i]['variable_name']}_{nc_files[i]['year']}.csv.zip"
#     csv_path = os.path.join(csv_folder, csv_filename)

#     if not os.path.isfile(csv_path):
#         # Assuming bbox_wgs84_de_standard is defined elsewhere
#         print(f"Creating {csv_filename}")
#         dataframe = netcdf_to_dataframe(nc_file)
#         dataframe.to_csv(csv_path, sep=',', encoding='utf8', compression='zip')
#         print(f"{csv_filename} created successfully.")

#     else:
#         if not message_printed:
#             print("Some files already exist. Skipping export for those.")
#             message_printed = True

# dataframe = pd.read_csv(csv_path).set_index(['date', 'time', 'latitude', 'longitude'])
# dataframe