### This one works

In [20]:
import cdsapi
import json
import os
import pandas as pd
from datetime import datetime
from calendar import monthrange
import xarray as xr

# Load the configuration
with open("config.json", "r") as config_file:
    config = json.load(config_file)

locations = config["locations"]
variables = config["variables"]
times = config["time"]  # Get the timeslots from the configuration file

# Define the CDS API client
c = cdsapi.Client()

# Base directories for temporary and output files
base_temp_dir = "temp_weather_data"
base_output_dir = "output_weather_data"
os.makedirs(base_temp_dir, exist_ok=True)
os.makedirs(base_output_dir, exist_ok=True)

# Generate data for each month
for month in range(1, 12):  # Loop through months 1 to 12
    month_str = f"{month:02}"  # Ensure two-digit month format

    # Determine the number of days in the current month
    num_days = monthrange(2023, month)[1]
    days = [f"{day:02}" for day in range(1, num_days + 1)]

    print(f"Processing month: {month_str}...")

    # Create a directory for this month
    temp_dir = os.path.join(base_temp_dir, "2023", month_str)
    os.makedirs(temp_dir, exist_ok=True)

    # Process each location
    for location in locations:
        lat, lon, name = location["latitude"], location["longitude"], location["name"]
        file_name = os.path.join(temp_dir, f"{name}_2023-{month_str}.grib")
        
        if not os.path.exists(file_name):  # Avoid re-downloading if file exists
            print(f"Requesting data for {name} ({lat}, {lon}) for month {month_str}...")
            request = {
                "variable": variables,
                "year": "2023",
                "month": [month_str],
                "day": days,  # Dynamically calculated days
                "time": times,  # Time slots from configuration
                "format": "grib",
                "area": [lat + 0.1, lon - 0.1, lat - 0.1, lon + 0.1],
            }
            try:
                c.retrieve("reanalysis-era5-land", request, file_name)
                print(f"Data saved to {file_name}.")
            except Exception as e:
                print(f"Error retrieving data for {name}, {month_str}: {e}")
        else:
            print(f"File {file_name} already exists. Skipping download.")

# Consolidate all downloaded data
all_data = []

# Process each month directory
year_dir = os.path.join(base_temp_dir, "2023")
for month_dir in sorted(os.listdir(year_dir)):
    month_path = os.path.join(year_dir, month_dir)
    if os.path.isdir(month_path):
        print(f"Processing month directory: {month_path}...")
        for file in os.listdir(month_path):
            if file.endswith(".grib"):
                file_path = os.path.join(month_path, file)
                print(f"Processing file: {file_path}...")
                try:
                    ds = xr.open_dataset(file_path, engine="cfgrib")
                    df = ds.to_dataframe().reset_index()

                    # Extract location name from file
                    location_name = file.split("_2023-")[0]

                    # Add metadata
                    df["location_name"] = location_name

                    all_data.append(df)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

# Combine all data into a single DataFrame
if all_data:
    print("Combining all data into a single DataFrame...")
    combined_df = pd.concat(all_data, ignore_index=True)

    # Save combined data to CSV
    output_csv_file = os.path.join(base_output_dir, "weather_data_2023.csv")
    combined_df.to_csv(output_csv_file, index=False)
    print(f"Consolidated weather data saved to {output_csv_file}.")
else:
    print("No data to consolidate.")


2024-12-09 16:15:23,286 INFO [2024-09-28T00:00:00] **Welcome to the New Climate Data Store (CDS)!** This new system is in its early days of full operations and still undergoing enhancements and fine tuning. Some disruptions are to be expected. Your 
[feedback](https://jira.ecmwf.int/plugins/servlet/desk/portal/1/create/202) is key to improve the user experience on the new CDS for the benefit of everyone. Thank you.
2024-12-09 16:15:23,287 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2024-12-09 16:15:23,288 INFO [2024-09-16T00:00:00] Remember that you need to have an ECMWF account to use the new CDS. **Your old CDS credentials will not work in new CDS!**


Processing month: 01...
File temp_weather_data/2023/01/Madrid_2023-01.grib already exists. Skipping download.
File temp_weather_data/2023/01/Barcelona_2023-01.grib already exists. Skipping download.
File temp_weather_data/2023/01/Seville_2023-01.grib already exists. Skipping download.
File temp_weather_data/2023/01/Valencia_2023-01.grib already exists. Skipping download.
File temp_weather_data/2023/01/Castile-La Mancha (Wind)_2023-01.grib already exists. Skipping download.
File temp_weather_data/2023/01/Aragon (Wind)_2023-01.grib already exists. Skipping download.
File temp_weather_data/2023/01/Andalusia (Wind)_2023-01.grib already exists. Skipping download.
File temp_weather_data/2023/01/Escatrón-Chiprana-Samper (Solar)_2023-01.grib already exists. Skipping download.
File temp_weather_data/2023/01/Talasol Solar (Solar)_2023-01.grib already exists. Skipping download.
File temp_weather_data/2023/01/Talayuela Solar (Solar)_2023-01.grib already exists. Skipping download.
Processing month:

2024-12-09 16:15:23,792 INFO Request ID is 61ae0ccb-4d1d-49f5-abe0-db9633427aeb
2024-12-09 16:15:23,926 INFO status has been updated to accepted


KeyboardInterrupt: 

In [18]:
import os
import pandas as pd
import json

# Configuration
input_csv = "output_weather_data/weather_data_2023.csv"
output_json = "output_weather_data/weather_data_by_location_2023.json"
output_csv = "output_weather_data/consolidated_weather_data_by_time_2023.csv"

# Load the CSV file
print(f"Loading {input_csv}...")
df = pd.read_csv(input_csv)

# Convert time columns to datetime for easier processing
df["valid_time"] = pd.to_datetime(df["valid_time"])

# Initialize the main dictionary
weather_data = {}

# Group the data by location and valid_time
print("Processing data...")
grouped = df.groupby(["location_name", "valid_time"])

for (location, valid_time), group in grouped:
    # Initialize the location's data if not already present
    if location not in weather_data:
        weather_data[location] = []

    # Aggregate data for the time slot
    aggregated_data = {
        "valid_time": valid_time.isoformat(),
        "latitude": group["latitude"].mean(skipna=True),
        "longitude": group["longitude"].mean(skipna=True),
        "temperature": group["t2m"].mean(skipna=True),
        "solar_radiation": group["ssr"].mean(skipna=True),
        "wind_u_component": group["u10"].mean(skipna=True),
        "wind_v_component": group["v10"].mean(skipna=True),
        "surface_pressure": group["sp"].mean(skipna=True),
        "total_precipitation": group["tp"].mean(skipna=True),
    }
    weather_data[location].append(aggregated_data)

# Save the dictionary to a JSON file
print(f"Saving data by location to {output_json}...")
with open(output_json, "w") as json_file:
    json.dump(weather_data, json_file, indent=4)

# Consolidate data by valid_time
print("Consolidating data by time...")
consolidated_data = {}

# Process each location
for location, records in weather_data.items():
    for record in records:
        valid_time = record["valid_time"]

        # Ensure the valid_time exists in the consolidated data
        if valid_time not in consolidated_data:
            consolidated_data[valid_time] = {}

        # Flatten location-specific data into consolidated_data
        consolidated_data[valid_time].update({
            f"{location}_latitude": record["latitude"],
            f"{location}_longitude": record["longitude"],
            f"{location}_temperature": record["temperature"],
            f"{location}_solar_radiation": record["solar_radiation"],
            f"{location}_wind_u_component": record["wind_u_component"],
            f"{location}_wind_v_component": record["wind_v_component"],
            f"{location}_surface_pressure": record["surface_pressure"],
            f"{location}_total_precipitation": record["total_precipitation"],
        })

# Convert the consolidated dictionary into a DataFrame
df_consolidated = pd.DataFrame.from_dict(consolidated_data, orient="index").reset_index()

# Rename the index column to "valid_time"
df_consolidated.rename(columns={"index": "valid_time"}, inplace=True)

# Save the DataFrame to a CSV file
print(f"Saving consolidated data to {output_csv}...")
df_consolidated.to_csv(output_csv, index=False)

print("Processing complete.")


Loading output_weather_data/weather_data_2023.csv...
Processing data...
Saving data by location to output_weather_data/weather_data_by_location_2023.json...
Consolidating data by time...
Saving consolidated data to output_weather_data/consolidated_weather_data_by_time_2023.csv...
Processing complete.


### The current process is above

In [None]:
import os
import pandas as pd
import json

# Configuration
year = 2023
base_dir = "temp"

# Initialize the main dictionary
weather_data = {}

# Iterate through all months
for month in range(1, 13):
    month_dir = os.path.join(base_dir, str(year), f"{month:02d}")
    month_file = os.path.join(month_dir, "weather_data.csv")
    
    # Check if the month's file exists
    if not os.path.exists(month_file):
        print(f"Skipping {month_file}: File not found.")
        continue

    # Load the CSV file for the month
    print(f"Processing {month_file}...")
    df = pd.read_csv(month_file)

    # Convert time columns to datetime for easier processing
    df["valid_time"] = pd.to_datetime(df["valid_time"])

    # Group the data by location
    for location, group in df.groupby("location_name"):
        # Initialize the location's data if not already present
        if location not in weather_data:
            weather_data[location] = []

        # Drop duplicate time slots by keeping the first non-NaN row for each `valid_time`
        group = group.sort_values("valid_time").dropna(subset=["t2m", "ssr", "u10", "v10", "sp", "tp"], how='all')
        group = group.groupby("valid_time").first().reset_index()

        # Process each row in the location's data
        for _, row in group.iterrows():
            # Create a dictionary for the current time slot
            time_slot_data = {
                "valid_time": row["valid_time"].isoformat(),
                "latitude": row["latitude"],
                "longitude": row["longitude"],
                "temperature": row["t2m"],
                "solar_radiation": row["ssr"],
                "wind_u_component": row["u10"],
                "wind_v_component": row["v10"],
                "surface_pressure": row["sp"],
                "total_precipitation": row["tp"],
            }
            weather_data[location].append(time_slot_data)

# Save the dictionary to a JSON file
output_file = os.path.join(base_dir, f"{year}_weather_data_by_location.json")
with open(output_file, "w") as json_file:
    json.dump(weather_data, json_file, indent=4)

print(f"Weather data organized by location saved to {output_file}.")


Loading data from output_weather_data/weather_data_2023.csv...
Processing data...
Saving processed data to temp/2023_weather_data_by_location.json...


FileNotFoundError: [Errno 2] No such file or directory: 'temp/2023_weather_data_by_location.json'

In [None]:
import pandas as pd
import json

# Load the JSON file
input_file = "weather_data_by_location.json"
output_csv_file = "consolidated_weather_data_by_time.csv"

with open(input_file, "r") as json_file:
    weather_data = json.load(json_file)

# Create a dictionary to hold data for the DataFrame
consolidated_data = {}

# Process each location
for location, records in weather_data.items():
    for record in records:
        valid_time = record["valid_time"]

        # Ensure the valid_time exists in the consolidated data
        if valid_time not in consolidated_data:
            consolidated_data[valid_time] = {}

        # Flatten location-specific data into consolidated_data
        consolidated_data[valid_time].update({
            f"{location}_latitude": record["latitude"],
            f"{location}_longitude": record["longitude"],
            f"{location}_temperature": record["temperature"],
            f"{location}_solar_radiation": record["solar_radiation"],
            f"{location}_wind_u_component": record["wind_u_component"],
            f"{location}_wind_v_component": record["wind_v_component"],
            f"{location}_surface_pressure": record["surface_pressure"],
            f"{location}_total_precipitation": record["total_precipitation"],
        })

# Convert the consolidated dictionary into a DataFrame
df = pd.DataFrame.from_dict(consolidated_data, orient="index").reset_index()

# Rename the index column to "valid_time"
df.rename(columns={"index": "valid_time"}, inplace=True)

# Save the DataFrame to a CSV file
df.to_csv(output_csv_file, index=False)

print(f"Consolidated weather data by time saved to {output_csv_file}.")


Consolidated weather data by time saved to consolidated_weather_data_by_time.csv.


### validation shit I think

In [16]:
import os
import pandas as pd
import json

# Configuration
input_csv = "output_weather_data/weather_data_2023.csv"
output_json = "output_weather_data/weather_data_by_location_2023.json"
output_csv = "output_weather_data/consolidated_weather_data_by_time_2023.csv"

# Load the CSV file
print(f"Loading {input_csv}...")
df = pd.read_csv(input_csv)

# Convert time columns to datetime for easier processing
df["valid_time"] = pd.to_datetime(df["valid_time"])

# Initialize the main dictionary
weather_data = {}

# Group the data by location and valid_time
print("Processing data...")
grouped = df.groupby(["location_name", "valid_time"])

for (location, valid_time), group in grouped:
    # Initialize the location's data if not already present
    if location not in weather_data:
        weather_data[location] = []

    # Aggregate data for the time slot
    aggregated_data = {
        "valid_time": valid_time.isoformat(),
        "latitude": group["latitude"].mean(skipna=True),
        "longitude": group["longitude"].mean(skipna=True),
        "temperature": group["t2m"].mean(skipna=True),
        "solar_radiation": group["ssr"].mean(skipna=True),
        "wind_u_component": group["u10"].mean(skipna=True),
        "wind_v_component": group["v10"].mean(skipna=True),
        "surface_pressure": group["sp"].mean(skipna=True),
        "total_precipitation": group["tp"].mean(skipna=True),
    }
    weather_data[location].append(aggregated_data)

# Save the dictionary to a JSON file
print(f"Saving data by location to {output_json}...")
with open(output_json, "w") as json_file:
    json.dump(weather_data, json_file, indent=4)

# Consolidate data by valid_time
print("Consolidating data by time...")
consolidated_data = {}

# Process each location
for location, records in weather_data.items():
    for record in records:
        valid_time = record["valid_time"]

        # Ensure the valid_time exists in the consolidated data
        if valid_time not in consolidated_data:
            consolidated_data[valid_time] = {}

        # Flatten location-specific data into consolidated_data
        consolidated_data[valid_time].update({
            f"{location}_latitude": record["latitude"],
            f"{location}_longitude": record["longitude"],
            f"{location}_temperature": record["temperature"],
            f"{location}_solar_radiation": record["solar_radiation"],
            f"{location}_wind_u_component": record["wind_u_component"],
            f"{location}_wind_v_component": record["wind_v_component"],
            f"{location}_surface_pressure": record["surface_pressure"],
            f"{location}_total_precipitation": record["total_precipitation"],
        })

# Convert the consolidated dictionary into a DataFrame
df_consolidated = pd.DataFrame.from_dict(consolidated_data, orient="index").reset_index()

# Rename the index column to "valid_time"
df_consolidated.rename(columns={"index": "valid_time"}, inplace=True)

# Save the DataFrame to a CSV file
print(f"Saving consolidated data to {output_csv}...")
df_consolidated.to_csv(output_csv, index=False)

print("Processing complete.")


Loading output_weather_data/weather_data_2023.csv...
Processing data...
Saving data by location to output_weather_data/weather_data_by_location_2023.json...
Consolidating data by time...
Saving consolidated data to output_weather_data/consolidated_weather_data_by_time_2023.csv...
Processing complete.
