Data extraction from TIFF to CSV

In [None]:
# ============================================================
# 📦 DEPENDENCIES
# ============================================================

import subprocess, sys
import rasterio
import pandas as pd
import os, glob, logging, traceback, concurrent.futures
from tqdm import tqdm
from pyproj import Transformer

# ============================================================
# ⚙️ CONFIGURATION OR USER SETTINGS (EDITABLE OPTIONS)
# ============================================================

# --- Paths (edit these three to point at your own data) ---
input_folder = r"D:\Global_Historical_climate_data\WorldClim_v1\Future_data_rasters"
output_folder = r"D:\Global_Historical_climate_data\WorldClim_v1\Extracted_CSV_from_rasters"
station_file = r"E:\Elbe\Models\Vector_output\Climate_stations_per_subbasian.csv"

# The output folder has to exist before the log file inside it is opened.
os.makedirs(output_folder, exist_ok=True)

# Both report files live next to the extracted CSVs.
log_file = os.path.join(output_folder, "extraction_log.txt")
missing_log = os.path.join(output_folder, "missing_band_values.txt")

# === 📝 LOGGING SETUP ===
# filemode="w" truncates the log file when the handler is created.
logging.basicConfig(level=logging.INFO, filename=log_file, filemode="w")

# === 📍 LOAD STATION DATA ===
# sep=None together with the python engine lets pandas detect the delimiter.
stations = pd.read_csv(station_file, engine="python", sep=None)
station_names, lats, lons = stations["NAME"], stations["LAT"], stations["LONG"]

# Coordinates are kept as (x, y) = (LONG, LAT) pairs.
original_coords = [(lon, lat) for lon, lat in zip(lons, lats)]

# ============================================================
# ⚙️ PROCESS FUNCTION
# ============================================================

def process_tiff(tiff_path):
    """Sample all stations from one 12-band raster and write them to a CSV.

    The station coordinates in ``original_coords`` are treated as EPSG:4326
    (x = LONG, y = LAT) and are reprojected when the raster uses another CRS.
    The CSV is named after the raster and holds one row per station with the
    columns NAME, LAT, LONG, Month1 ... Month12.

    Cells that the raster marks as nodata/masked are written as empty values
    instead of the raw nodata sentinel. When any value is missing (masked or
    NaN), one summary line is appended to ``missing_log``.

    Args:
        tiff_path: Path of the raster file to process.

    Returns:
        A ``(status, name)`` tuple where ``status`` is ``"SUCCESS"``,
        ``"SKIPPED"`` (the CSV already exists) or ``"ERROR"``. ``name`` is the
        raster's base name without extension, except for ``"ERROR"`` where it
        is the file name including the extension.

    Exceptions are not propagated: they are logged with their traceback,
    printed, and reported through the ``"ERROR"`` status.
    """
    try:
        basename = os.path.splitext(os.path.basename(tiff_path))[0]
        output_csv = os.path.join(output_folder, f"{basename}.csv")

        # An existing CSV means the raster was already handled in an earlier run.
        if os.path.exists(output_csv):
            return "SKIPPED", basename

        with rasterio.open(tiff_path) as src:
            if src.count != 12:
                raise ValueError(f"{basename} doesn't have 12 bands.")

            # Reproject coordinates if needed
            src_crs = src.crs
            if not src_crs:
                raise ValueError("Missing CRS in raster.")
            if src_crs.to_epsg() != 4326:
                transformer = Transformer.from_crs("EPSG:4326", src_crs, always_xy=True)
                coords = [transformer.transform(x, y) for x, y in original_coords]
            else:
                coords = original_coords

            # A single pass over the stations reads all 12 bands per point, so
            # each sample already is one station-wise row. masked=True makes
            # rasterio yield masked arrays; MaskedArray.tolist() turns masked
            # (nodata) entries into None, so the sentinel never reaches the CSV.
            rows = [sampled.tolist() for sampled in src.sample(coords, masked=True)]

        # None (masked) and NaN both count as missing.
        missing_count = sum(bool(pd.isna(value)) for row in rows for value in row)

        month_cols = [f"Month{m}" for m in range(1, 13)]
        df = pd.DataFrame(rows, columns=month_cols)
        df.insert(0, "LONG", lons)
        df.insert(0, "LAT", lats)
        df.insert(0, "NAME", station_names)

        # Write to a temporary name first and rename afterwards: a half-written
        # CSV would otherwise be mistaken for a finished one by the skip check.
        partial_csv = f"{output_csv}.part"
        df.to_csv(partial_csv, index=False)
        os.replace(partial_csv, output_csv)

        if missing_count > 0:
            with open(missing_log, "a") as m:
                m.write(f"{basename}: {missing_count} missing values\n")

        return "SUCCESS", basename

    except Exception as e:
        # logging.exception stores the traceback in the log file as well.
        logging.exception(f"[{tiff_path}] {e}")
        traceback.print_exc()
        return "ERROR", os.path.basename(tiff_path)

# === 🚀 RUN MULTITHREADING ===
# Collect every *.tif directly inside the input folder.
tiff_pattern = os.path.join(input_folder, "*.tif")
tiff_files = glob.glob(tiff_pattern)

print(f"\n🛰️  Starting extraction from {len(tiff_files)} TIFF files...\n")

# executor.map yields outcomes in input order; tqdm only wraps the iteration
# to draw the progress bar.
with concurrent.futures.ThreadPoolExecutor() as executor:
    outcomes = executor.map(process_tiff, tiff_files)
    results = list(tqdm(outcomes, total=len(tiff_files), desc="Processing"))

# === ✅ SUMMARY ===
# Group the reported names by their status.
success, skipped, errors = (
    [label for status, label in results if status == wanted]
    for wanted in ("SUCCESS", "SKIPPED", "ERROR")
)

print(f"\n✅ Completed: {len(success)} succeeded")
print(f"⏭️  Skipped:   {len(skipped)} (already exist)")
print(f"❌ Failed:    {len(errors)} with errors")
if errors:
    print("⚠️ Files with errors:")
    for failed_name in errors:
        print(f"   • {failed_name}")
