# In [None]:
import pandas as pd

# List your CSV files - adjust paths/names as needed
files = [
    'data/raw/Actual Generation per Production Type_202001010000-202101010000.csv',
    'data/raw/Actual Generation per Production Type_202101010000-202201010000.csv',
    'data/raw/Actual Generation per Production Type_202201010000-202301010000.csv',
    'data/raw/Actual Generation per Production Type_202301010000-202401010000.csv',
    'data/raw/Actual Generation per Production Type_202401010000-202501010000.csv'
]

# Raw export header -> column name used in the processed master file.
COLUMN_MAP = {
    'MTU': 'timestamp',
    'Area': 'country',
    'Solar - Actual Aggregated [MW]': 'solar_mw',
    'Wind Onshore - Actual Aggregated [MW]': 'wind_onshore_mw',
    'Wind Offshore - Actual Aggregated [MW]': 'wind_offshore_mw',
    'Hydro Pumped Storage - Actual Aggregated [MW]': 'hydro_pumped_gen_mw',
    'Hydro Pumped Storage - Actual Consumption [MW]': 'hydro_pumped_cons_mw',
    'Hydro Water Reservoir - Actual Aggregated [MW]': 'hydro_reservoir_mw',
}

# Measurement columns: everything except the timestamp and country keys.
VALUE_COLUMNS = [
    name for name in COLUMN_MAP.values() if name not in ('timestamp', 'country')
]

# Width of the output bins. A Timedelta is used instead of the 'H' string
# alias, which pandas 2.2 deprecated in favour of 'h'.
HOURLY = pd.Timedelta(hours=1)


def build_renewables_master(raw):
    """Turn concatenated raw generation exports into an hourly table.

    Steps: normalise header whitespace, keep and rename the columns listed in
    ``COLUMN_MAP``, parse the start of each ``MTU`` interval as the timestamp,
    coerce the measurement columns to numbers (missing/non-numeric -> 0),
    derive ``hydro_pumped_net_mw`` and average everything into hourly bins
    separately for each country.

    Args:
        raw: DataFrame holding the raw export columns named in ``COLUMN_MAP``.
            It is not modified.

    Returns:
        DataFrame with columns ``timestamp``, ``country``, the measurement
        columns and ``hydro_pumped_net_mw``, ordered by country then time.

    Raises:
        KeyError: if any column named in ``COLUMN_MAP`` is absent.
    """
    df = raw.copy()

    # Collapse repeated/stray whitespace in header names so that e.g. a
    # doubled space before the dash does not break the column selection.
    df.columns = [' '.join(str(col).split()) for col in df.columns]

    missing = [col for col in COLUMN_MAP if col not in df.columns]
    if missing:
        raise KeyError(f"Missing expected columns in raw export: {missing}")

    df = df[list(COLUMN_MAP)].rename(columns=COLUMN_MAP)

    # The MTU cell holds "<start> - <end> ..."; only the start is kept.
    # NOTE(review): dayfirst=True assumes the export's DD.MM.YYYY date layout
    # (without it, days 1-12 are read month-first) - confirm against the raw
    # files. ISO-formatted dates are unaffected by the flag.
    # NOTE(review): timestamps remain naive; if the export is in a
    # DST-observing local time, the repeated autumn hour ends up averaged
    # into a single bin - confirm this is acceptable.
    interval_start = df['timestamp'].str.split(' - ').str[0]
    df['timestamp'] = pd.to_datetime(interval_start, dayfirst=True)

    # Coerce text placeholders to NaN first, otherwise the column stays
    # object-typed and the arithmetic/mean below fails; then fill with 0.
    for col in VALUE_COLUMNS:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Hydro net for pumped storage
    df['hydro_pumped_net_mw'] = df['hydro_pumped_gen_mw'] - df['hydro_pumped_cons_mw']

    numeric_cols = VALUE_COLUMNS + ['hydro_pumped_net_mw']

    # Resample each country on its own, over numeric columns only.
    # Resampler.mean() on a frame that still contains the string 'country'
    # column raises TypeError in pandas >= 2.0 (and silently dropped the
    # column before that), and a single global resample would average
    # different countries together.
    frames = []
    for country, part in df.groupby('country', sort=True, dropna=False):
        hourly = part.set_index('timestamp')[numeric_cols].resample(HOURLY).mean()
        hourly.insert(0, 'country', country)
        frames.append(hourly.reset_index())

    if not frames:
        # Empty input: return an empty table with the expected layout.
        return pd.DataFrame(columns=['timestamp', 'country', *numeric_cols])

    return pd.concat(frames, ignore_index=True)


# The guard is true both in a notebook cell and when run as a script, so
# `files`, `dfs` and `df` remain available as globals afterwards; it only
# keeps the file I/O from running when the helpers above are imported.
if __name__ == "__main__":
    # Load CSVs
    dfs = [pd.read_csv(f) for f in files]

    # Concatenate and transform
    df = build_renewables_master(pd.concat(dfs, ignore_index=True))

    # Save merged CSV
    df.to_csv('data/processed/spain_renewables_master_2020_2024.csv', index=False)
    print(df.head())
    # info() prints its report itself and returns None, so it is not wrapped
    # in print().
    df.info()