<a href="https://colab.research.google.com/github/simulate111/Climatic_Data/blob/main/turku_finland_meteorological_institute_FMI(2015_2024)10minutes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import time
from datetime import datetime, timedelta, timezone

# --- CONFIGURATION ---
# FMI station id (fmisid) used for temperature and wind: Turku Artukainen.
STATION_ID_WEATHER = "100949"

# FMI station id used for solar radiation: Kaarina Yltöinen, roughly 10 km
# from Turku. Artukainen often lacks solar sensors, so Kaarina is used instead.
# NOTE(review): the recorded run of this notebook reports 0 radiation records
# for every year shown - verify this station id and the "r_10min" parameter
# name against the FMI open data documentation.
STATION_ID_SOLAR = "100932"

# Calendar years to download: 2015 through 2024 inclusive.
YEARS = range(2015, 2025)
OUTPUT_FILE = "Turku_10Yr_10Min_Averages_2015-2024.csv"

# One entry per FMI stored query to run:
#   name    - label used in progress output (also checked for "Radiation"
#             when clamping negative values)
#   station - fmisid to query
#   query   - WFS stored query id
#   params  - comma-separated FMI parameter names to request
#   map     - FMI parameter name -> output column name
tasks = [
    {
        "name": "Weather",
        "station": STATION_ID_WEATHER,
        "query": "fmi::observations::weather::simple",
        "params": "t2m,ws_10min",
        "map": {
            "t2m": "Temperature_C",
            "ws_10min": "Wind_Speed_ms",
        },
    },
    {
        "name": "Radiation",
        "station": STATION_ID_SOLAR,
        "query": "fmi::observations::radiation::simple",
        # Global radiation, 10-minute average.
        "params": "r_10min",
        "map": {
            "r_10min": "Global_Radiation_Wm2",
        },
    },
]

def get_chunks(start_date, end_date, chunk_days=7):
    """Split a date range into consecutive request windows.

    FMI WFS limits how much a single response may cover, so a long range is
    requested piecewise.

    Args:
        start_date: Range start as "YYYY-MM-DD", taken as 00:00 UTC.
        end_date: Range end as "YYYY-MM-DD", taken as 00:00 UTC.
        chunk_days: Maximum length of one window in days (default 7).

    Returns:
        A list of (start, end) string pairs formatted as
        "YYYY-MM-DDTHH:MM:SSZ". Each window starts exactly where the previous
        one ended, and the final window is clipped to end_date. The list is
        empty when start_date is not before end_date.

    Raises:
        ValueError: If chunk_days is not positive, or if either date does not
            match "YYYY-MM-DD".
    """
    # A zero or negative step would never advance the cursor.
    if chunk_days <= 0:
        raise ValueError("chunk_days must be positive")

    date_format = "%Y-%m-%d"
    stamp_format = '%Y-%m-%dT%H:%M:%SZ'
    step = timedelta(days=chunk_days)

    range_start = datetime.strptime(start_date, date_format).replace(tzinfo=timezone.utc)
    range_end = datetime.strptime(end_date, date_format).replace(tzinfo=timezone.utc)

    chunks = []
    cursor = range_start
    while cursor < range_end:
        window_end = min(cursor + step, range_end)
        chunks.append((cursor.strftime(stamp_format), window_end.strftime(stamp_format)))
        cursor = window_end
    return chunks

def fetch_fmi_data(task, start_ch, end_ch):
    """Fetch one time window of observations from the FMI WFS endpoint.

    Args:
        task: Mapping with the keys 'query' (stored query id), 'station'
            (fmisid), 'params' (comma-separated parameter names), 'map'
            (FMI parameter name -> output column name) and 'name'.
        start_ch: Window start, sent to the service as "starttime".
        end_ch: Window end, sent to the service as "endtime".

    Returns:
        A list of dicts with the keys 'Time', 'Type' and 'Value', one per
        usable observation. The list is empty when the request fails, the
        server answers with a non-200 status, the body is not valid XML, or
        the response holds no usable values. Failures are reported on stdout
        instead of being raised, so one bad window does not abort a long run.
    """
    params = {
        "service": "WFS", "version": "2.0.0", "request": "getFeature",
        "storedquery_id": task['query'],
        "fmisid": task['station'],
        "parameters": task['params'],
        "starttime": start_ch, "endtime": end_ch
    }

    try:
        r = requests.get("http://opendata.fmi.fi/wfs", params=params, timeout=10)
    except requests.RequestException as e:
        print(f"    ! Error: {e}")
        return []

    # Report rejected requests explicitly; otherwise a bad station id or
    # parameter name is indistinguishable from "no observations".
    if r.status_code != 200:
        print(f"    ! Error: HTTP {r.status_code} ({task['name']}, {start_ch} to {end_ch})")
        return []

    try:
        root = ET.fromstring(r.content)
    except ET.ParseError as e:
        print(f"    ! Error: {e}")
        return []

    ns = {'wfs': 'http://www.opengis.net/wfs/2.0', 'BsWfs': 'http://xml.fmi.fi/schema/wfs/2.0'}
    clamp_negative = "Radiation" in task['name']

    rows = []
    for member in root.findall('.//wfs:member', ns):
        elm = member.find('.//BsWfs:BsWfsElement', ns)
        if elm is None:
            continue

        t = elm.findtext('BsWfs:Time', default=None, namespaces=ns)
        p = elm.findtext('BsWfs:ParameterName', default=None, namespaces=ns)
        val_text = elm.findtext('BsWfs:ParameterValue', default=None, namespaces=ns)

        # Skip incomplete records individually rather than losing the chunk.
        if not t or not p:
            continue
        if not val_text or val_text == 'NaN':
            continue

        # Parameters that were not asked for have no output column.
        column = task['map'].get(p)
        if column is None:
            continue

        try:
            v = float(val_text)
        except ValueError:
            continue

        # FMI solar is sometimes negative at night (sensor noise), clamp to 0
        if clamp_negative and v < 0:
            v = 0
        rows.append({'Time': t, 'Type': column, 'Value': v})
    return rows

# --- MAIN LOOP ---
# One pivoted, 10-minute-gridded DataFrame per year that returned data.
all_data_frames = []

print(f"Starting Extraction for Turku ({YEARS[0]}-{YEARS[-1]})...")

# Years are processed one at a time so only one year of raw rows is held
# before it is reduced to a pivoted frame.
for year in YEARS:
    print(f"\n--- Processing Year {year} ---")
    chunks = get_chunks(f"{year}-01-01", f"{year+1}-01-01")

    year_rows = []

    # Run every configured task (Weather + Radiation) over the same windows.
    for task in tasks:
        print(f"   > Fetching {task['name']}...", end=" ", flush=True)
        count = 0
        for i, (s_ch, e_ch) in enumerate(chunks):
            chunk_data = fetch_fmi_data(task, s_ch, e_ch)
            year_rows.extend(chunk_data)
            count += len(chunk_data)
            # Simple progress indicator: one dot per ten windows.
            if i % 10 == 0:
                print(".", end="", flush=True)
            # Short pause between requests.
            time.sleep(0.05)
        print(f" Done ({count} records)")

    if not year_rows:
        continue

    df = pd.DataFrame(year_rows)
    df['Time'] = pd.to_datetime(df['Time'])

    # Pivot to one column per measurement type, indexed by time. Consecutive
    # windows share their boundary instant, so 'last' collapses any record
    # that was returned twice.
    df_pivot = df.pivot_table(index='Time', columns='Type', values='Value', aggfunc='last')

    # The final window ends at 00:00 on 1 January of the following year. If
    # the service returns a sample at that instant it belongs to the next
    # year's frame; keeping it here would count it twice in the averages.
    df_pivot = df_pivot[df_pivot.index.year == year]
    if df_pivot.empty:
        continue

    # Resample to a strict 10-minute grid so both sources share timestamps.
    df_pivot = df_pivot.resample('10min').mean()

    # Fill at most 6 consecutive missing slots (1 hour) by time-weighted
    # interpolation. Note that with `limit` alone pandas also fills the
    # first 6 slots of a longer gap and leaves the remainder missing.
    df_pivot = df_pivot.interpolate(method='time', limit=6)

    all_data_frames.append(df_pivot)

# --- MERGE & AVERAGE ---
# Collapse all yearly frames into one "typical year": for every
# (month, day, hour, minute) slot, average the values across the years.
if all_data_frames:
    print("\nMerging all years...")
    master_df = pd.concat(all_data_frames)

    # Remaining missing radiation slots are treated as zero.
    # NOTE(review): this also zero-fills daytime gaps, which lowers the
    # averages - confirm that is intended.
    if 'Global_Radiation_Wm2' in master_df.columns:
        master_df['Global_Radiation_Wm2'] = master_df['Global_Radiation_Wm2'].fillna(0)

    print("Calculating 10-Year 10-Minute Averages...")

    # 1. Group by Month, Day, Hour, Minute of the index and average.
    # Timestamps are used as parsed (original note: FMI times are UTC).
    grouped = master_df.groupby([
        master_df.index.month,
        master_df.index.day,
        master_df.index.hour,
        master_df.index.minute
    ]).mean()

    grouped.index.names = ['Month', 'Day', 'Hour', 'Minute']
    df_avg = grouped.reset_index()

    # 2. Build a placeholder timestamp used only for ordering and display.
    # 2024 is a leap year, so 29 February is representable. The timestamp is
    # assembled from numeric components rather than parsed from concatenated
    # strings, so no date-format inference is involved.
    df_avg['Dummy_Timestamp'] = pd.to_datetime(
        pd.DataFrame({
            'year': 2024,
            'month': df_avg['Month'],
            'day': df_avg['Day'],
            'hour': df_avg['Hour'],
            'minute': df_avg['Minute'],
        }),
        errors='coerce'
    )
    df_avg = df_avg.dropna(subset=['Dummy_Timestamp']).sort_values('Dummy_Timestamp')

    # 3. Format Output
    df_avg['Display_Date'] = df_avg['Dummy_Timestamp'].dt.strftime('%m-%d')
    df_avg['Display_Time'] = df_avg['Dummy_Timestamp'].dt.strftime('%H:%M')

    # Keep only the expected columns that actually received data, and say so
    # when one is absent instead of silently writing a narrower CSV.
    cols = ['Display_Date', 'Display_Time', 'Temperature_C', 'Wind_Speed_ms', 'Global_Radiation_Wm2']
    final_cols = [c for c in cols if c in df_avg.columns]
    missing_cols = [c for c in cols if c not in df_avg.columns]
    if missing_cols:
        print(f"Warning: no data for column(s): {', '.join(missing_cols)}")

    final_output = df_avg[final_cols]

    print(f"Success! Generated {len(final_output)} rows.")
    print(final_output.head())
    final_output.to_csv(OUTPUT_FILE, index=False)
else:
    print("No data retrieved.")

Starting Extraction for Turku (2015-2024)...

--- Processing Year 2015 ---
   > Fetching Weather... ...... Done (104956 records)
   > Fetching Radiation... ...... Done (0 records)

--- Processing Year 2016 ---
   > Fetching Weather... ...... Done (105298 records)
   > Fetching Radiation... ...... Done (0 records)

--- Processing Year 2017 ---
   > Fetching Weather... ...... Done (104994 records)
   > Fetching Radiation... ...... Done (0 records)

--- Processing Year 2018 ---
   > Fetching Weather... ...... Done (105113 records)
   > Fetching Radiation... ...... Done (0 records)

--- Processing Year 2019 ---
   > Fetching Weather... ...... Done (104741 records)
   > Fetching Radiation... ...... Done (0 records)

--- Processing Year 2020 ---
   > Fetching Weather... ...... Done (104784 records)
   > Fetching Radiation... ...... Done (0 records)

--- Processing Year 2021 ---
   > Fetching Weather... ...... Done (104979 records)
   > Fetching Radiation... ...... Done (0 records)

--- Proce