In [4]:
import calendar
import os

import cdsapi
import xarray as xr

### Data Download (ERA5-Land via the CDS API)

In [5]:
# Download 2024 ERA5-Land data for a small box around the US Capitol,
# one NetCDF file per month (January-November), into `output_dir`.

# Initialize CDS API client
client = cdsapi.Client()

# Dataset name
dataset = "reanalysis-era5-land"

# Year to download; also used in the output file names
year = 2024

# Variables: Add more related variables
variables = [
    "total_precipitation",  # Total Precipitation (m)
    "2m_temperature",       # 2m Temperature (K)
    "surface_solar_radiation_downwards",  # Surface solar radiation downwards (J/m^2)
    "soil_temperature_level_1",  # Soil temperature level 1 (K)
]

# Define a small geographic area for the Capitol region (DC)
# NOTE(review): the box is only 0.0002 degrees on each side; presumably the
# service snaps it to the nearest grid cell -- confirm against the returned file.
dc_extent = {
    "area": [38.8977, -77.0365, 38.8975, -77.0363],  # [North Latitude, West Longitude, South Latitude, East Longitude]
}

# Time resolution: 6 hours (four samples per day)
time_intervals = [
    "00:00", "06:00", "12:00", "18:00"
]

# Output directory for monthly data
output_dir = "DC_Month_Data"
os.makedirs(output_dir, exist_ok=True)

# Set to True to re-fetch months whose file already exists on disk
# (e.g. files that were downloaded earlier with different request options).
force_download = False

# Download data for each month from January to November
for month in range(1, 12):
    # Output file path for each month
    output_filename = os.path.join(output_dir, f"dc_weather_{year}_month_{month:02d}.nc")

    # Each request takes minutes, so skip months that are already on disk;
    # this keeps a full "Restart & Run All" feasible.
    if (
        not force_download
        and os.path.isfile(output_filename)
        and os.path.getsize(output_filename) > 0
    ):
        print(f"Data for month {month:02d} already exists at {output_filename}; skipping download.")
        continue

    print(f"Downloading data for month {month:02d}...")

    # Only request calendar days that exist in this month (no Feb 30, Apr 31, ...)
    days_in_month = calendar.monthrange(year, month)[1]

    # Request parameters
    # NOTE(review): the xarray "did not find a match in any ... IO backends"
    # error recorded in this notebook's output suggests the earlier downloads
    # were not plain NetCDF. "data_format"/"download_format" are the keys the
    # current CDS request form generates; "unarchived" asks for the file itself
    # rather than a zip archive -- confirm against the dataset's download page.
    request = {
        "data_format": "netcdf",
        "download_format": "unarchived",
        "product_type": "reanalysis",
        "variable": variables,
        "year": str(year),
        "month": f"{month:02d}",
        "day": [f"{day:02d}" for day in range(1, days_in_month + 1)],  # Days in the month
        "time": time_intervals,
        **dc_extent,  # Add geographic range
    }

    # Perform data retrieval
    client.retrieve(dataset, request, output_filename)
    print(f"Data for month {month:02d} saved to {output_filename}.")

2024-12-02 21:07:19,488 INFO [2024-09-28T00:00:00] **Welcome to the New Climate Data Store (CDS)!** This new system is in its early days of full operations and still undergoing enhancements and fine tuning. Some disruptions are to be expected. Your 
[feedback](https://jira.ecmwf.int/plugins/servlet/desk/portal/1/create/202) is key to improve the user experience on the new CDS for the benefit of everyone. Thank you.
2024-12-02 21:07:19,489 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2024-12-02 21:07:19,490 INFO [2024-09-16T00:00:00] Remember that you need to have an ECMWF account to use the new CDS. **Your old CDS credentials will not work in new CDS!**


Downloading data for month 01...


2024-12-02 21:07:20,139 INFO Request ID is c0382a0d-c430-427f-932e-f69022791beb
2024-12-02 21:07:20,527 INFO status has been updated to accepted
2024-12-02 21:07:25,135 INFO status has been updated to running
2024-12-02 21:09:12,385 INFO status has been updated to successful
                                                                                         

Data for month 01 saved to DC_Month_Data/dc_weather_2024_month_01.nc.
Downloading data for month 02...


2024-12-02 21:09:15,019 INFO Request ID is c576fbd9-8b14-4b49-80b1-4778bd89dfea
2024-12-02 21:09:15,225 INFO status has been updated to accepted
2024-12-02 21:09:24,309 INFO status has been updated to running
2024-12-02 21:11:08,258 INFO status has been updated to successful
                                                                                        

Data for month 02 saved to DC_Month_Data/dc_weather_2024_month_02.nc.
Downloading data for month 03...


2024-12-02 21:11:11,425 INFO Request ID is db3eedfb-e306-444e-b3d2-1d015ce71365
2024-12-02 21:11:11,574 INFO status has been updated to accepted
2024-12-02 21:11:18,450 INFO status has been updated to running
2024-12-02 21:13:05,405 INFO status has been updated to successful
                                                                               

Data for month 03 saved to DC_Month_Data/dc_weather_2024_month_03.nc.
Downloading data for month 04...


2024-12-02 21:13:07,300 INFO Request ID is 28122057-1ae3-48fa-a3f6-4997282bdd02
2024-12-02 21:13:07,458 INFO status has been updated to accepted
2024-12-02 21:13:13,074 INFO status has been updated to running
2024-12-02 21:15:02,094 INFO status has been updated to successful
                                                                                         

Data for month 04 saved to DC_Month_Data/dc_weather_2024_month_04.nc.
Downloading data for month 05...


2024-12-02 21:15:05,103 INFO Request ID is 8fd5fe60-e5bc-4675-a3b6-f619b17f44a5
2024-12-02 21:15:05,311 INFO status has been updated to accepted
2024-12-02 21:15:08,887 INFO status has been updated to running
2024-12-02 21:16:58,319 INFO status has been updated to successful
                                                                                         

Data for month 05 saved to DC_Month_Data/dc_weather_2024_month_05.nc.
Downloading data for month 06...


2024-12-02 21:17:02,041 INFO Request ID is f1451250-1a8c-4e7d-bc09-317c4958bd81
2024-12-02 21:17:02,241 INFO status has been updated to accepted
2024-12-02 21:17:11,283 INFO status has been updated to running
2024-12-02 21:18:18,224 INFO status has been updated to successful
                                                                                        

Data for month 06 saved to DC_Month_Data/dc_weather_2024_month_06.nc.
Downloading data for month 07...


2024-12-02 21:18:20,309 INFO Request ID is 45ac23a6-5535-4228-8fc3-90f469ceda3a
2024-12-02 21:18:20,520 INFO status has been updated to accepted
2024-12-02 21:18:26,058 INFO status has been updated to running
2024-12-02 21:21:09,372 INFO status has been updated to successful
                                                                                         

Data for month 07 saved to DC_Month_Data/dc_weather_2024_month_07.nc.
Downloading data for month 08...


2024-12-02 21:21:12,412 INFO Request ID is 331502d4-e378-405f-9660-b2950ca0096c
2024-12-02 21:21:12,557 INFO status has been updated to accepted
2024-12-02 21:21:18,099 INFO status has been updated to running
2024-12-02 21:24:00,414 INFO status has been updated to successful
                                                                                        

Data for month 08 saved to DC_Month_Data/dc_weather_2024_month_08.nc.
Downloading data for month 09...


2024-12-02 21:24:02,957 INFO Request ID is fabbea1d-20ec-429a-a645-a35b5bcd4242
2024-12-02 21:24:03,193 INFO status has been updated to accepted
2024-12-02 21:24:08,707 INFO status has been updated to running
2024-12-02 21:26:51,450 INFO status has been updated to successful
                                                                                         

Data for month 09 saved to DC_Month_Data/dc_weather_2024_month_09.nc.
Downloading data for month 10...


2024-12-02 21:26:53,473 INFO Request ID is 3a6e4db1-ad7d-4379-8807-616081808947
2024-12-02 21:26:53,627 INFO status has been updated to accepted
2024-12-02 21:26:58,941 INFO status has been updated to running
2024-12-02 21:31:07,293 INFO status has been updated to successful
                                                                                          

Data for month 10 saved to DC_Month_Data/dc_weather_2024_month_10.nc.
Downloading data for month 11...


2024-12-02 21:31:09,947 INFO Request ID is cb9f5354-fb4c-4ab6-980f-afe8ee7a51d9
2024-12-02 21:31:10,278 INFO status has been updated to accepted
2024-12-02 21:31:15,065 INFO status has been updated to running
2024-12-02 21:37:18,250 INFO status has been updated to successful
                                                                                         

Data for month 11 saved to DC_Month_Data/dc_weather_2024_month_11.nc.
Merging all monthly NetCDF files into a single dataset...




ValueError: did not find a match in any of xarray's currently installed IO backends ['netcdf4', 'h5netcdf', 'scipy']. Consider explicitly selecting one of the installed engines via the ``engine`` parameter, or installing additional IO dependencies, see:
https://docs.xarray.dev/en/stable/getting-started-guide/installing.html
https://docs.xarray.dev/en/stable/user-guide/io.html

In [2]:
# Flatten the combined NetCDF dataset into a CSV table.
nc_file_path = "DC_combined_precipitation_2024.nc"
output_csv_path = "data/DC_combined_precipitation_2024.csv"

# to_csv does not create missing folders, so make sure the target one exists.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

# Open the dataset in a context manager so the underlying file handle is
# released as soon as the values have been copied into the DataFrame.
with xr.open_dataset(nc_file_path) as ds:
    # reset_index() turns the dataset's index levels into ordinary columns
    df = ds.to_dataframe().reset_index()

df.to_csv(output_csv_path, index=False)

print(f"Data has been successfully saved to {output_csv_path}")

Data has been successfully saved to data/DC_combined_precipitation_2024.csv


In [37]:
# Flatten the combined NetCDF dataset into a CSV table, leaving out the
# coordinate/bookkeeping columns listed in `columns_to_drop`.
nc_file_path = "DC_combined_precipitation_2024.nc"
output_csv_path = "data/DC_combined_precipitation_2024_cleaned.csv"

# to_csv does not create missing folders, so make sure the target one exists.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

# Open the dataset in a context manager so the underlying file handle is
# released as soon as the values have been copied into the DataFrame.
with xr.open_dataset(nc_file_path) as ds:
    # reset_index() turns the dataset's index levels into ordinary columns
    df = ds.to_dataframe().reset_index()

# errors="ignore": columns from this list that are absent are skipped
# instead of raising, so the cell works whichever of them the file has.
columns_to_drop = ["latitude", "longitude", "number", "expver"]
df = df.drop(columns=columns_to_drop, errors="ignore")

df.to_csv(output_csv_path, index=False)

print(f"Cleaned data has been successfully saved to {output_csv_path}")

Cleaned data has been successfully saved to data/DC_combined_precipitation_2024_cleaned.csv


In [40]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Input (cleaned CSV) and output (standardized CSV) locations
file_path = "data/DC_combined_precipitation_2024_cleaned.csv"
processed_file_path = "data/processed_data.csv"

# Only these columns are standardized; all other columns are written back unchanged
columns_to_scale = ["tp", "t2m", "ssrd", "stl1"]

data = pd.read_csv(file_path)

# Standardization: fit the scaler on the selected columns, then replace
# those columns with their transformed values
scaler = StandardScaler()
scaler.fit(data[columns_to_scale])
data[columns_to_scale] = scaler.transform(data[columns_to_scale])

data.to_csv(processed_file_path, index=False)

print(f"Processed data saved to {processed_file_path}")

Processed data saved to data/processed_data.csv
