Combining the CSVs according to model name, SSP, and parameter

In [None]:
# ============================================================
# 📦 DEPENDENCIES
# ============================================================
import os
import glob
import pandas as pd
import re
from tqdm import tqdm

# === 📂 DIRECTORY SETUP ===
# input_folder is the directory scanned for "*.csv" inputs; output_folder is
# where the merged CSVs are written. Both are absolute Windows paths — edit
# them to match the local layout before running.
input_folder = r"D:\Global_Historical_climate_data\WorldClim_v1\Extracted_CSV_from_rasters"
output_folder = r"D:\Global_Historical_climate_data\WorldClim_v1\Combined_CSV_files"
# Create the destination (and any missing parents); an existing folder is not an error.
os.makedirs(output_folder, exist_ok=True)

# === 🧠 FILE NAME PARSER ===
# Example: wc2.1_2.5m_prec_ACCESS-CM2_ssp585_2081-2100.csv
# Named groups captured from a file name:
#   param    - one of prec, tmin, tmax (any other variable token does not match)
#   model    - a run of characters containing no underscore
#   ssp      - "ssp" followed by exactly three digits
#   duration - two four-digit years joined by a hyphen
# The leading lazy ".+?_" absorbs whatever prefix precedes the parameter token,
# and the pattern is anchored at both ends, so the name must end in ".csv".
filename_pattern = re.compile(
    r"^.+?_(?P<param>prec|tmin|tmax)_(?P<model>[^_]+)_(?P<ssp>ssp\d{3})_(?P<duration>\d{4}-\d{4})\.csv$"
)

# === 📦 STORAGE STRUCTURE ===
# Maps (model, ssp, param) -> list of DataFrames, one per accepted input file.
grouped_data = {}

# Columns every input CSV must provide. Built once because it does not vary
# from file to file.
expected = {"NAME", "LAT", "LONG"} | {f"Month{i}" for i in range(1, 13)}

# === 🔍 COLLECT & PROCESS FILES ===
# glob returns matches in arbitrary, file-system-dependent order; sorting keeps
# the log output and the concatenation order reproducible between runs.
csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))
print(f"🔎 Found {len(csv_files)} CSV files in {input_folder}\n")

for file in tqdm(csv_files, desc="Parsing files"):
    filename = os.path.basename(file)
    match = filename_pattern.match(filename)

    if not match:
        print(f"⚠️ Skipped unrecognized format: {filename}")
        continue

    param = match.group("param")
    model = match.group("model")
    ssp = match.group("ssp")
    duration = match.group("duration")

    # Only the read itself is guarded: one unreadable or malformed file is
    # reported and skipped (best effort) without aborting the whole run, and
    # errors from the later steps are no longer mislabelled as read failures.
    try:
        df = pd.read_csv(file)
    except Exception as e:
        print(f"❌ Failed to process {filename}: {e}")
        continue

    if not expected.issubset(df.columns):
        print(f"❌ Missing expected columns in {filename}")
        continue

    # Tag every row with the period parsed from the file name so rows from
    # different periods stay distinguishable after concatenation.
    df["Duration"] = duration

    key = (model, ssp, param)
    grouped_data.setdefault(key, []).append(df)

# === 💾 EXPORT COMBINED FILES ===
# Writes one CSV per (model, ssp, param) group, named "<model>_<ssp>_<param>.csv".
if not grouped_data:
    # Every input was skipped or rejected (or none was found); do not claim
    # a successful export when no file was written.
    print("\n⚠️ No valid input files were grouped — nothing to export.")
else:
    print("\n📤 Writing combined files to:", output_folder)

    for (model, ssp, param), df_list in grouped_data.items():
        # Stack the per-period frames, then order rows by NAME and, within
        # each NAME, by the "YYYY-YYYY" period string (which sorts
        # chronologically as text).
        combined = pd.concat(df_list, ignore_index=True).sort_values(
            by=["NAME", "Duration"]
        )

        output_filename = f"{model}_{ssp}_{param}.csv"
        output_path = os.path.join(output_folder, output_filename)
        # index=False: the positional index left over from sorting carries no meaning.
        combined.to_csv(output_path, index=False)

        print(f"✅ Saved {output_filename} — {len(combined)} rows")

    print("\n🎉 All combinations exported successfully!")
