In [1]:
### Set up path
import os

def _ensure_dir(path, exists_msg=None):
    """Create `path` (including parents) if it is missing.

    When the path already exists nothing is created and `exists_msg`, if
    given, is printed instead.
    """
    if os.path.exists(path):
        if exists_msg is not None:
            print(exists_msg)
    else:
        os.makedirs(path)


# NOTE: '__file__' is a quoted string here, not the module variable, so
# abspath() resolves it against the current working directory and `dir`
# ends up being that working directory.
dir = os.path.dirname(os.path.abspath('__file__')) # os.path.abspath('')

# Raw inputs live in ../data/raw relative to the working directory.
data_path = os.path.join(dir, '../data/raw')
_ensure_dir(data_path, "Directory for raw data exists.")

# Processed outputs sit next to the raw directory.
pro_data_path = os.path.join(data_path, '../processed')
_ensure_dir(pro_data_path, "Directory for processed data exists.")

# One raw sub-directory per variable; created silently when absent.
slp_path = os.path.join(data_path, 'slp')
_ensure_dir(slp_path)
z500_path = os.path.join(data_path, 'z500')
_ensure_dir(z500_path)

Directory for raw data exists.
Directory for processed data exists.


In [2]:
import xarray as xr
import matplotlib.pyplot as plt

# Time selection used by the data requests in this notebook.
years = list(map(str, range(1950, 1981)))  # 1950..1980 inclusive, to match the paper's period
months = ["%02d" % month_no for month_no in range(1, 13)]  # "01".."12"
days = ["%02d" % day_no for day_no in range(1, 32)]  # "01".."31"
times = ["00:00"]  # a single field per day, at midnight

# Bounding box as [North, West, South, East]; matches the paper's study region
area = [75, -65, 30, 45]

# Raw-data sub-directories, one per variable (sea-level pressure and 500 hPa geopotential).
slp_path, z500_path = (os.path.join(data_path, sub_dir) for sub_dir in ('slp', 'z500'))

## ERA5 - training

In [None]:
import cdsapi

# Initialize the CDS API client.
# Prerequisite: CDS API credentials must already be configured for this user
# (see the cdsapi set-up instructions); otherwise the client cannot be created.
c = cdsapi.Client()


def _download_era5_monthly(client, dataset, base_request, out_dir, file_suffix,
                           years, months, days, times, area):
    """Download one NetCDF file per (year, month) for a single ERA5 variable.

    Files are written to ``<out_dir>/<year>/<month>_<file_suffix>``. A month
    whose target file already exists is skipped, so the cell can be re-run to
    resume an interrupted session.

    Each download goes to a temporary ``.part`` file that is renamed to the
    final name only after ``retrieve`` returns. This prevents a transfer that
    was interrupted half-way from being mistaken for a complete file by the
    "already exists" check on the next run.

    Parameters
    ----------
    client : object exposing ``retrieve(dataset, request, target)``
    dataset : str
        Name of the CDS dataset to query.
    base_request : dict
        Variable-specific request keys; the time/area keys are added here.
    out_dir : str
        Directory that receives the per-year sub-directories.
    file_suffix : str
        Trailing part of each file name, e.g. ``"era5_slp.nc"``.
    years, months, days, times : list of str
        Time selection; one request is issued per (year, month) pair.
    area : list
        Bounding box passed through to the request unchanged.
    """
    for y in years:
        year_path = os.path.join(out_dir, y)
        os.makedirs(year_path, exist_ok=True)
        for m in months:
            dpath = os.path.join(year_path, f"{m}_{file_suffix}")
            if os.path.exists(dpath):
                continue  # already downloaded in an earlier run
            request = dict(base_request)
            request.update({
                "year": [y],
                "month": [m],
                "day": days,
                "time": times,
                "format": "netcdf",
                "area": area,
            })
            partial_path = dpath + ".part"
            client.retrieve(dataset, request, partial_path)
            # Publish under the final name only once the download has finished.
            os.replace(partial_path, dpath)


# Download ERA5 Sea-Level Pressure (SLP) - Single Levels
_download_era5_monthly(
    c,
    "reanalysis-era5-single-levels",
    {
        "variable": ["mean_sea_level_pressure"],
        "product_type": ["reanalysis"],
    },
    slp_path,
    "era5_slp.nc",
    years, months, days, times, area,
)

# Download ERA5 Geopotential at 500 hPa (Z500) - Pressure Levels
_download_era5_monthly(
    c,
    "reanalysis-era5-pressure-levels",
    {
        "variable": ["geopotential"],
        "pressure_level": ["500"],
        "product_type": ["reanalysis"],
    },
    z500_path,
    "era5_z500.nc",
    years, months, days, times, area,
)

2025-02-05 20:36:08,306 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-02-05 20:36:08,676 INFO Request ID is a46031b9-961e-4047-b7c8-40da4394527a
2025-02-05 20:36:08,762 INFO status has been updated to accepted
2025-02-05 20:36:13,863 INFO status has been updated to running
2025-02-05 20:36:30,442 INFO status has been updated to successful
2025-02-05 20:36:38,764 INFO Request ID is 850ad341-4c4e-4527-81b9-a53c23748bef         
2025-02-05 20:36:38,860 INFO status has been updated to accepted
2025-02-05 20:36:44,157 INFO status has been updated to running
2025-02-05 20:36:52,752 INFO status has been updated to successful
2025-02-05 20:36:55,298 INFO Request ID is 831611ed-e3f8-4813-ac53-9b94516b3bc9          
2025-02-05 20:36:55,408 INFO status has been updated to accepted
2025-02-05 20:37:00,389 INFO status has been updated to running
2025-02-05 20:37:08,972 INFO status has been updated to successful
2025-0

In [5]:
def merge_yearly_files(dirpath, years, months, type):
    """Merge each year's monthly NetCDF files into one yearly file.

    For every year ``y`` the files ``<dirpath>/<type>/<y>/<m>_era5_<type>.nc``
    (one per entry of `months`) are concatenated along the ``valid_time``
    dimension and written to ``<dirpath>/<type>/<y>_era5_<type>.nc``.
    The monthly input files are left in place; nothing is deleted.

    Parameters
    ----------
    dirpath : str
        Root directory that contains the ``<type>`` sub-directory.
    years : iterable of str
        Year directory names to process.
    months : iterable of str
        Month prefixes of the monthly files, in concatenation order.
    type : str
        Variable tag used in directory and file names. For ``'z500'`` the
        ``pressure_level`` dimension is squeezed out and dropped before
        saving. (The name shadows the builtin ``type`` inside this function;
        it is kept so existing callers continue to work.)
    """
    dirpath = os.path.join(dirpath, type)
    for y in years:
        year_path = os.path.join(dirpath, y)
        datasets = []
        try:
            for m in months:
                datasets.append(xr.open_dataset(os.path.join(year_path, f"{m}_era5_{type}.nc")))
            merged_ds = xr.concat(datasets, dim="valid_time")
            if type == 'z500':
                merged_ds = merged_ds.squeeze("pressure_level", drop=True)
            # Save the merged file
            output_file = os.path.join(dirpath, f"{y}_era5_{type}.nc")
            merged_ds.to_netcdf(output_file)
        finally:
            # Release the monthly file handles before moving on to the next
            # year, also when opening or writing fails part-way through.
            for ds in datasets:
                ds.close()

In [4]:
# Build the per-year files for both variables.
for var_name in ('slp', 'z500'):
    merge_yearly_files(data_path, years, months, var_name)

In [3]:
def merge_all_files(dirpath, years, type):
    """Merge the yearly NetCDF files of one variable into a single file.

    The files ``<dirpath>/<type>/<y>_era5_<type>.nc`` (one per entry of
    `years`) are concatenated along the ``valid_time`` dimension and written
    to ``<dirpath>/era5_<type>.nc``. The yearly input files are left in
    place; nothing is deleted.

    Parameters
    ----------
    dirpath : str
        Root directory containing the ``<type>`` sub-directory; the merged
        file is written directly into this directory.
    years : iterable of str
        Years to include, in concatenation order.
    type : str
        Variable tag used in directory and file names. (The name shadows the
        builtin ``type`` inside this function; it is kept so existing callers
        continue to work.)
    """
    datasets = []
    try:
        for y in years:
            datasets.append(xr.open_dataset(os.path.join(dirpath, type, f"{y}_era5_{type}.nc")))
        merged_ds = xr.concat(datasets, dim="valid_time")
        output_file = os.path.join(dirpath, f"era5_{type}.nc")
        merged_ds.to_netcdf(output_file)
    finally:
        # Release the yearly file handles, also when opening or writing fails.
        for ds in datasets:
            ds.close()

In [5]:
# Combine the yearly files into one file per variable.
for var_name in ('slp', 'z500'):
    merge_all_files(data_path, years, var_name)

In [None]:
# Grosswetterlagen Types
import pandas as pd
import glob

# Collect the monthly Grosswetterlagen text files in sorted (name) order.
gwl_pattern = os.path.join(data_path, "Grosswetterlagen", "gwl_*.txt")
file_list = glob.glob(gwl_pattern)
file_list.sort()

# Accumulates one [year, month, daily_types] record per parsed line.
gwl_data = list()

def is_leap_year(year):
    """Return True if `year` is a leap year under the Gregorian rules.

    Years divisible by 400 are leap years; other century years are not;
    any remaining year is a leap year when divisible by 4.
    """
    if year % 400 == 0:
        return True
    if year % 100 == 0:
        return False
    return year % 4 == 0


# Parse every monthly file into [year, month, daily_types] records.
for file in file_list:
    # The month number is the part between the last "_" and the first "."
    # that follows it (files are matched by the pattern gwl_*.txt).
    month = int(file.split("_")[-1].split(".")[0])

    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split()  # Split line into parts
            if not parts:
                # Blank (or whitespace-only) lines carry no record; without
                # this guard parts[0] would raise IndexError.
                continue
            year = int(parts[0])  # First element is the year
            # Middle elements are daily circulation types; the final column is
            # dropped. NOTE(review): confirm what that last column holds.
            daily_types = parts[1:-1]

            days_in_month = {1: 31, 2: 29 if is_leap_year(year) else 28, 3: 31, 4: 30, 5: 31, 6: 30, 
                             7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}
            
            # Sanity check: one circulation type per calendar day of that month.
            expected_days = days_in_month[month]
            assert len(daily_types) == expected_days, f"Unexpected number of days in {file} for year {year}"

            gwl_data.append([year, month, daily_types])


# One row per (year, month); `circ_type` holds that month's list of daily types.
gwl_columns = ["year", "month", "circ_type"]
df_gwl = pd.DataFrame(gwl_data, columns=gwl_columns)

# Persist the table in the processed-data directory (row index omitted).
gwl_csv_path = os.path.join(pro_data_path, "gwl_1940_1980.csv")
df_gwl.to_csv(gwl_csv_path, index=False)

# Preview the first rows as the cell output.
df_gwl.head(2)

   year  month                                          circ_type
0  1940      1  [HNA, HNA, SZ, SZ, SZ, HFA, HFA, HFA, HFA, HM,...
1  1941      1  [HFZ, HFZ, HFZ, HFA, HFA, HFA, HNA, HNA, HNA, ...
2  1942      1  [HM, HM, HM, TRM, TRM, TRM, NWA, NWA, NWA, NEA...
3  1943      1  [WZ, TM, TM, TM, HNFA, HNFA, HNFA, HM, HM, HM,...
4  1944      1  [NWZ, NWZ, NWZ, NWZ, NWA, NWA, NWA, WZ, WZ, WZ...


In [None]:
# Check classes in paper and our data
# Class labels used in the paper, kept in sorted order.
class_names = sorted([
    "WA", "WZ", "WS", "WW", "SWA", "SWZ", "NWA", "NWZ", "HM", "BM",
    "TM", "NA", "NZ", "HNA", "HNZ", "HB", "TRM", "NEA", "NEZ", "HFA",
    "HFZ", "HNFA", "HNFZ", "SEA", "SEZ", "SA", "SZ", "TB", "TRW", "U",
])

# Every distinct label that occurs anywhere in our data, also sorted.
circ_type = sorted(set().union(*df_gwl["circ_type"]))

# Report labels present on only one side of the comparison.
observed_labels = set(circ_type)
paper_labels = set(class_names)
print("Extra:", observed_labels - paper_labels)
print("Lack:", paper_labels - observed_labels)

Extra: {'U'}
Lack: set()


## CanESM2 - past & future comparison

historical-r1, historical-r3, historical-r5

r2i1p1, r4i1p1, r6i1p1, r8i1p1

Download from:

slp:
https://crd-data-donnees-rdc.ec.gc.ca/CCCMA/products/CanSISE/output/CCCma/CanESM2/historical-r1/day/atmos/psl/r2i1p1/

z500:
https://crd-data-donnees-rdc.ec.gc.ca/CCCMA/products/CanSISE/output/CCCma/CanESM2/historical-r1/day/atmos/zg/r2i1p1/

## Labels of HB circulation types

See the appendix in:
https://d-nb.info/98319906X/34

The labels were manually organized into data/raw/Grosswetterlagen.