In [1]:
import cdsapi
import os

# Download 2024 data from the CDS dataset named below, one NetCDF file per
# month, skipping any month whose file is already present in data_dir.

# Create the CDS API client
client = cdsapi.Client()

# Name of the CDS dataset to query
dataset = "reanalysis-era5-land"

# Variables to download
variables = [
    "total_precipitation",
    "total_evaporation",
    "runoff",
    "2m_temperature",
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
    "surface_pressure",
]

# Time resolution: every 6 hours
time_intervals = [
    "00:00", "06:00", "12:00", "18:00"
]

# Days "01".."31", built once because the list is the same for every request.
# NOTE(review): the same 31-day list is sent for shorter months too; presumably
# the service ignores day values that do not exist in the month - confirm.
days = [f"{day:02d}" for day in range(1, 32)]

# Data storage directory
data_dir = "/mnt/d/Parsing_data"

# Make sure the target directory exists
os.makedirs(data_dir, exist_ok=True)

# Download the data split by month.
# Months 1-11 only: range(1, 12) excludes December.
# NOTE(review): the original comment said "Jan to Dec"; switch to range(1, 13)
# if December is wanted as well.
for month in range(1, 12):
    # Final output file name for this month
    output_filename = os.path.join(data_dir, f"era5_2024_month_{month:02d}.nc")

    # A file at the final path is treated as a completed download
    if os.path.exists(output_filename):
        print(f"File for month {month} already exists, skipping download...")
        continue

    # Request parameters
    request = {
        "variable": variables,
        "year": "2024",
        "month": f"{month:02d}",  # Format as a two-digit month
        "day": days,
        "time": time_intervals,
        "format": "netcdf",  # Data format is NetCDF
    }

    # Download to a temporary ".part" path and rename only on success, so an
    # interrupted download can never be mistaken for a complete file by the
    # existence check above on a later run.
    partial_filename = output_filename + ".part"
    print(f"Downloading data for month {month}...")
    try:
        client.retrieve(dataset, request, partial_filename)
    except BaseException:
        # Also covers a kernel interrupt: drop the incomplete file, then re-raise.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        raise
    os.replace(partial_filename, output_filename)
    print(f"Data for month {month} has been saved as {output_filename}")


2024-11-24 18:36:16,168 INFO [2024-09-28T00:00:00] **Welcome to the New Climate Data Store (CDS)!** This new system is in its early days of full operations and still undergoing enhancements and fine tuning. Some disruptions are to be expected. Your 
[feedback](https://jira.ecmwf.int/plugins/servlet/desk/portal/1/create/202) is key to improve the user experience on the new CDS for the benefit of everyone. Thank you.
2024-11-24 18:36:16,169 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2024-11-24 18:36:16,170 INFO [2024-09-16T00:00:00] Remember that you need to have an ECMWF account to use the new CDS. **Your old CDS credentials will not work in new CDS!**


File for month 1 already exists, skipping download...
File for month 2 already exists, skipping download...
File for month 3 already exists, skipping download...
File for month 4 already exists, skipping download...
File for month 5 already exists, skipping download...
File for month 6 already exists, skipping download...
File for month 7 already exists, skipping download...
File for month 8 already exists, skipping download...
File for month 9 already exists, skipping download...
File for month 10 already exists, skipping download...
File for month 11 already exists, skipping download...


In [14]:
# # Check the target directory
# migrated_files = os.listdir(target_dir)
# # print(f"Files in {target_dir}: {migrated_files}")

Files in /mnt/d/Parsing_data: ['era5_2024_month_01.nc', 'era5_2024_month_02.nc', 'era5_2024_month_03.nc', 'era5_2024_month_04.nc', 'era5_2024_month_05.nc']


In [4]:
# Show the longitude coordinate values of `ds`.
# `ds` is not defined in this cell, so it must already exist in the kernel.
print("Longitude values: " + str(ds.longitude.values))


Longitude values: [ 0.   0.1  0.2 ... -0.3 -0.2 -0.1]


In [5]:
import xarray as xr
import os

# Crop each monthly file to a North America bounding box and save the subset
# as a separate NetCDF file in output_directory.

# Latitude / longitude bounds of the North America region
lat_min, lat_max = 10, 85  # latitude range
lon_min, lon_max = -170, -50  # longitude range

# Input and output locations
data_directory = "/mnt/d/Parsing_data"
output_directory = "/mnt/d/Reduced_data"
months = range(1, 12)  # months 1 through 11
file_template = f"{data_directory}/era5_2024_month_{{month:02d}}.nc"

# to_netcdf does not create missing parent directories, so create the output
# directory up front instead of failing on the first write.
os.makedirs(output_directory, exist_ok=True)

# Process each month's file in turn
for month in months:
    file_path = file_template.format(month=month)
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    # Open the dataset; the context manager closes the underlying file once
    # this month is done, including when the loop moves on via `continue`.
    with xr.open_dataset(file_path) as source_ds:
        ds = source_ds

        # Check the longitude range and wrap [0, 360] values into [-180, 180)
        print(f"Original longitude range: {ds.longitude.min().values} to {ds.longitude.max().values}")
        if ds.longitude.max() > 180:  # longitudes are given in [0, 360]
            ds = ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180))
            print(f"Converted longitude range: {ds.longitude.min().values} to {ds.longitude.max().values}")

        # Round longitudes to one decimal to remove floating-point error.
        # NOTE(review): after the wrap the coordinate is no longer sorted;
        # label slicing then presumably needs both bounds to match coordinate
        # values exactly, which is why rounding precedes the selection and
        # KeyError is handled below - confirm.
        ds = ds.assign_coords(longitude=ds.longitude.round(1))

        # Extract the North America region.
        # Latitude bounds are passed high-to-low, presumably because the
        # latitude coordinate is stored in descending order - confirm.
        try:
            subset_ds = ds.sel(
                latitude=slice(lat_max, lat_min),
                longitude=slice(lon_min, lon_max)
            )
        except KeyError as e:
            print(f"KeyError while slicing longitude: {e}")
            continue

        # Save the extracted subset while the source file is still open
        output_path = f"{output_directory}/era5_2024_month_{month:02d}_north_america.nc"
        subset_ds.to_netcdf(output_path)
        print(f"Saved North America subset for month {month} to {output_path}")


Original longitude range: 0.0 to 359.9
Converted longitude range: -179.90000000000595 to 179.9999999999941


PermissionError: [Errno 13] Permission denied: '/mnt/d/Reduced_data/era5_2024_month_01_north_america.nc'

In [13]:
# import shutil

# # Current (source) directory
# source_dir = "/home/siruwww/5550_final_project/Parsing_data"

# # Target directory
# target_dir = "/mnt/d/Parsing_data"

# # Move the entire folder
# shutil.move(source_dir, target_dir)
# print(f"Moved directory {source_dir} to {target_dir}")


Moved directory /home/siruwww/5550_final_project/Parsing_data to /mnt/d/Parsing_data


In [3]:
import xarray as xr
import os

# Sanity-check the monthly files: for each month report whether the file
# exists and can be opened, its first/last valid_time, its data variables and
# its size on disk, then print a summary of all results.

# Data directory and file naming template
data_directory = "/mnt/d/Parsing_data"  # Replace with the path where your data is stored
file_template = "era5_2024_month_{month:02d}.nc"  # File naming template
months = range(1, 12)  # Months from January to November 2024

# One tuple per month: (month, status, time_range, variables, size_in_MB)
file_check_results = []

# Check each month's data
for month in months:
    file_path = os.path.join(data_directory, file_template.format(month=month))
    if not os.path.exists(file_path):
        print(f"File missing: {file_path}")
        file_check_results.append((month, "Missing", None, None, None))
        continue

    try:
        # Open the dataset in a context manager so the file handle is released
        # before the next file is opened.
        with xr.open_dataset(file_path) as dataset:
            # First and last timestamps along valid_time, and the data variable names
            time_range = (str(dataset.valid_time[0].values), str(dataset.valid_time[-1].values))
            variables = list(dataset.data_vars.keys())
        size = os.path.getsize(file_path) / (1024 * 1024)  # File size in MB

        print(f"File {file_path} is OK: Time range {time_range}, Variables {variables}, Size {size:.2f} MB")
        file_check_results.append((month, "OK", time_range, variables, size))

    except Exception as e:
        # Deliberately broad: any failure to open or read marks the file as
        # "Error" and the check continues with the next month.
        print(f"Unable to read file {file_path}: {e}")
        file_check_results.append((month, "Error", None, None, None))

# Print summary of the checks
print("\nSummary of checks:")
for result in file_check_results:
    # Missing/unreadable files have no size; avoid printing "None MB"
    size_text = "n/a" if result[4] is None else f"{result[4]:.2f} MB"
    print(f"Month: {result[0]}, Status: {result[1]}, Time Range: {result[2]}, Variables: {result[3]}, Size: {size_text}")


File /mnt/d/Parsing_data/era5_2024_month_01.nc is OK: Time range ('2024-01-01T00:00:00.000000000', '2024-01-31T18:00:00.000000000'), Variables ['tp', 'e', 'ro', 't2m', 'u10', 'v10', 'sp'], Size 3218.32 MB
File /mnt/d/Parsing_data/era5_2024_month_02.nc is OK: Time range ('2024-02-01T00:00:00.000000000', '2024-02-29T18:00:00.000000000'), Variables ['tp', 'e', 'ro', 't2m', 'u10', 'v10', 'sp'], Size 3011.77 MB
File /mnt/d/Parsing_data/era5_2024_month_03.nc is OK: Time range ('2024-03-01T00:00:00.000000000', '2024-03-31T18:00:00.000000000'), Variables ['tp', 'e', 'ro', 't2m', 'u10', 'v10', 'sp'], Size 3220.62 MB
File /mnt/d/Parsing_data/era5_2024_month_04.nc is OK: Time range ('2024-04-01T00:00:00.000000000', '2024-04-30T18:00:00.000000000'), Variables ['tp', 'e', 'ro', 't2m', 'u10', 'v10', 'sp'], Size 3127.70 MB
File /mnt/d/Parsing_data/era5_2024_month_05.nc is OK: Time range ('2024-05-01T00:00:00.000000000', '2024-05-31T18:00:00.000000000'), Variables ['tp', 'e', 'ro', 't2m', 'u10', 'v10'