In [None]:
# Step 1: Extract and Load Weather Data from ECA&D Archive

In [None]:
import zipfile
import os
import pandas as pd
from glob import glob
from io import StringIO

In [None]:
# Define paths
zip_path = "../data/ECA_blended_custom.zip"
extract_path = "../data/eca_extracted"

In [None]:
# Ensure extraction directory exists
os.makedirs(extract_path, exist_ok=True)

In [None]:
# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [None]:
# Function to load a single station file
def load_station_file(filepath):
    """Load one station text file into a DataFrame.

    The file is scanned for the first line that, once stripped, starts with
    ``SOUID``; that line is treated as the column header of a comma-separated
    table running to the end of the file. Everything before it is ignored.
    A ``STAID`` column is added, taken from the text between ``STAID`` and the
    first ``.`` in the file's base name.

    Parameters
    ----------
    filepath : str
        Path to the station file; its base name must contain
        ``STAID<digits>.`` (for example ``FG_STAID000001.txt``).

    Returns
    -------
    pandas.DataFrame or None
        The parsed table plus an integer ``STAID`` column, or ``None`` (after
        printing a message) when no header line is found or pandas cannot
        parse the table.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    try:
        header_index = next(i for i, line in enumerate(lines) if line.strip().startswith("SOUID"))
    except StopIteration:
        print(f"Skipping {filepath} — header not found.")
        return None

    # Slice FROM the header line (not after it): read_csv takes its column
    # names from the first row it is given, so the header must be included.
    data_str = ''.join(lines[header_index:])

    try:
        df = pd.read_csv(StringIO(data_str), skipinitialspace=True)
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None

    # Header cells are space-padded; make sure no padding survives in the names.
    df.columns = df.columns.str.strip()

    staid = os.path.basename(filepath).split("STAID")[1].split(".")[0]
    df["STAID"] = int(staid)
    return df

In [None]:
# Load and concatenate all station data
# glob() gives no ordering guarantee (it depends on the file system), so sort
# the paths to keep the row order of the combined table reproducible.
station_files = sorted(glob(os.path.join(extract_path, "FG_STAID*.txt")))

In [None]:
all_dfs = []
for f in station_files:
    df = load_station_file(f)
    if df is not None:
        all_dfs.append(df)

In [None]:
if not all_dfs:
    raise RuntimeError("No valid station data could be loaded.")

In [None]:
all_data = pd.concat(all_dfs, ignore_index=True)

In [None]:
# Preview data
print(all_data.head())
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
all_data.info()

In [None]:
# Replace missing values (-9999) with NaN and scale FG (0.1 m/s -> m/s)
# Cast to float BEFORE substituting the sentinel and use a float NaN: putting
# pd.NA into the column first turns it into object dtype, which
# astype('float') cannot convert (TypeError on NAType).
# Note: this cell rescales FG in place, so run it only once per load.
all_data['FG'] = all_data['FG'].astype('float').replace(-9999.0, float('nan')) / 10.0

In [None]:
# Convert DATE column to datetime format
all_data['DATE'] = pd.to_datetime(all_data['DATE'], format='%Y%m%d', errors='coerce')

In [None]:
# Drop rows with invalid dates
all_data = all_data.dropna(subset=['DATE'])

In [None]:
# Optional: Keep only valid quality codes (Q_FG == 0)
all_data = all_data[all_data['Q_FG'] == 0]

In [None]:
# Save cleaned dataset for future use
output_path = "../data/cleaned_wind_speed_all_stations.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
all_data.to_csv(output_path, index=False)

In [None]:
print(f"\nCleaned dataset saved to: {output_path}")