In [None]:
# Convert 'measure_date_CET' to UTC and add a new column 'measure_date_utc'
import pandas as pd
from pathlib import Path
from pandas.api.types import is_datetime64_any_dtype

# Locate the input CSV: prefer the absolute path, fall back to a workspace search.
# NOTE(review): the primary file is 'da.csv' but the fallback looks for
# 'pre_germany.csv' -- confirm that both refer to the intended input.
fn = Path('/home/renga/Desktop/neoen_data/renga_work/data/sunnic/prices_upto_oct/da.csv')
if not fn.exists():
    fn = next(Path('.').rglob('pre_germany.csv'), None)
    if fn is None:
        raise FileNotFoundError('pre_germany.csv not found in workspace')

print('Using file:', fn)
df = pd.read_csv(fn)
col = 'measure_date_CET'
if col not in df.columns:
    raise KeyError(f"Column '{col}' not found in {fn}")

# Fast path: parse the whole column at once.
try:
    # `infer_datetime_format` is deprecated in pandas 2.x (format inference is
    # the default there), so it is intentionally not passed.
    times = pd.to_datetime(df[col].astype(str).str.strip(), errors='coerce')
    if not is_datetime64_any_dtype(times):
        raise ValueError('Vectorized parse did not produce datetimelike dtype')

    if times.dt.tz is None:
        # Naive wall-clock values: interpret them as Europe/Berlin local time.
        try:
            # 'infer' resolves the repeated autumn hour from the row order and
            # raises when the order does not allow that.
            times_local = times.dt.tz_localize('Europe/Berlin', ambiguous='infer', nonexistent='shift_forward')
        except Exception:
            # Deliberate best-effort: mark ambiguous / non-existent wall times as NaT.
            times_local = times.dt.tz_localize('Europe/Berlin', ambiguous='NaT', nonexistent='NaT')
    else:
        # The strings already carried an offset; only the conversion is needed.
        times_local = times
    times_utc = times_local.dt.tz_convert('UTC')
    df['measure_date_utc'] = times_utc.astype(str)

except Exception:
    # Fallback: parse row by row with dateutil (more tolerant of messy inputs).
    from datetime import timedelta, timezone
    from dateutil import parser

    utc_tz = timezone.utc
    try:
        import pytz
        berlin_tz = pytz.timezone('Europe/Berlin')
    except Exception:
        pytz = None
        try:
            # Standard-library tz database (Python 3.9+); DST-aware, unlike a fixed offset.
            from zoneinfo import ZoneInfo
            berlin_tz = ZoneInfo('Europe/Berlin')
        except Exception:
            berlin_tz = None

    def parse_row_to_utc(s):
        """Parse one cell and return it as a UTC ISO string, or 'NaT' on failure.

        Naive timestamps are interpreted as Europe/Berlin local time; timestamps
        that already carry an offset are simply converted to UTC.
        """
        if pd.isna(s):
            return 'NaT'
        s = str(s).strip()
        if s == '':
            return 'NaT'
        try:
            dt = parser.parse(s)
        except Exception:
            return 'NaT'

        if dt.tzinfo is None:
            if pytz is not None:
                try:
                    # is_dst=None makes pytz raise on ambiguous / non-existent times ...
                    dt = berlin_tz.localize(dt, is_dst=None)
                except Exception:
                    try:
                        # ... in which case standard (winter) time is assumed.
                        dt = berlin_tz.localize(dt, is_dst=False)
                    except Exception:
                        return 'NaT'
            elif berlin_tz is not None:
                # zoneinfo path: attaching the zone is enough, DST is resolved on conversion.
                dt = dt.replace(tzinfo=berlin_tz)
            else:
                # Last resort when no tz database is available: fixed CET (UTC+1).
                # This is one hour off during summer time.
                dt = dt.replace(tzinfo=timezone(timedelta(hours=1)))

        try:
            # isoformat renders the UTC offset as +00:00
            return dt.astimezone(utc_tz).isoformat(sep=' ')
        except Exception:
            return 'NaT'

    df['measure_date_utc'] = df[col].apply(parse_row_to_utc)

out = fn.with_name(fn.stem + '_utc' + fn.suffix)
df.to_csv(out, index=False)
print('Saved file with UTC column to', out)


In [None]:
# Upsample DA prices (hourly -> 30-minute) and save a new CSV
import pandas as pd
from pathlib import Path

fn = Path('/home/renga/Desktop/neoen_data/renga_work/data/sunnic/Longs_germany_utc.csv')
if not fn.exists():
    raise FileNotFoundError(f'{fn} not found')

print('Loading', fn)
df = pd.read_csv(fn)

# Detect the date column: well-known names first, then anything that looks like date/time.
candidates = ['measure_date', 'measure_date_utc', 'timestamp', 'time']
date_col = next((c for c in candidates if c in df.columns), None)
if date_col is None:
    date_col = next(
        (c for c in df.columns if 'date' in c.lower() or 'time' in c.lower()),
        None,
    )
if date_col is None:
    raise KeyError('No date/time column found in Longs file')

print('Using date column:', date_col)

# Parse straight to tz-aware UTC: naive values are taken as UTC (the file name
# suggests UTC) and values carrying an offset are converted. Unparseable cells become NaT.
ser = pd.to_datetime(df[date_col].astype(str).str.strip(), errors='coerce', utc=True)

# Index the data by the parsed times; the original date column is dropped so it
# is not duplicated after reset_index.
df2 = df.drop(columns=[date_col])
df2.index = pd.DatetimeIndex(ser)

# reindex() raises on duplicate labels, and NaT labels are not usable timestamps,
# so both are removed up front (first occurrence wins).
bad_rows = int(df2.index.isna().sum())
if bad_rows:
    print(f'Dropping {bad_rows} row(s) with unparseable timestamps')
    df2 = df2[df2.index.notna()]
dup_rows = int(df2.index.duplicated(keep='first').sum())
if dup_rows:
    print(f'Dropping {dup_rows} row(s) with duplicate timestamps (keeping first)')
    df2 = df2[~df2.index.duplicated(keep='first')]
if len(df2) == 0:
    raise ValueError(f'No parseable timestamps in column {date_col!r} of {fn}')
df2 = df2.sort_index()

# Build the 30-minute grid. The input is hourly, so the last stamp stands for a
# whole hour: extend the end by 30 minutes so that hour also gets both half-hour rows.
src_step = pd.Timedelta(hours=1)
tgt_step = pd.Timedelta(minutes=30)
start = df2.index.min()
end = df2.index.max() + (src_step - tgt_step)
# '30min' replaces the deprecated '30T' alias; tz comes from the tz-aware start/end.
full_idx = pd.date_range(start=start, end=end, freq='30min')

# Reindex onto the grid and forward-fill so each hourly value carries to its half-hour.
# NOTE: ffill also fills gaps that were already missing in the hourly input.
df_30 = df2.reindex(full_idx).ffill()

# Turn the index back into a 'measure_date' column (timezone info preserved).
df_30 = df_30.rename_axis('measure_date').reset_index()

out = fn.with_name(fn.stem + '_30min' + fn.suffix)
df_30.to_csv(out, index=False)
print('Saved upsampled DA prices to', out)


In [None]:
# Merge Sunnic files on `measure_date` within the actual file's date range
import pandas as pd
from pathlib import Path
from pandas.api.types import is_datetime64_any_dtype

# Directory that holds every Sunnic input file.
base = Path('/home/renga/Desktop/neoen_data/renga_work/data/sunnic')
# Logical name -> CSV path. All entries are relative to `base`; joining an
# absolute path with `/` would silently discard `base`.
files = {
    'actual': base / 'actual_neuhardenberg_filled.csv',
    'Longs': base / 'Longs_germany_utc_30min.csv',
    'pre': base / 'pre_germany_utc.csv',
    'imbalance': base / 'imb_volumes.csv',
}

def load_and_standardize(fn, prefer_col='measure_date'):
    """Read a CSV and add a tz-aware UTC ``measure_date`` column.

    The timestamp source is the first of ``prefer_col``, ``'measure_date_utc'``,
    ``'measure_date_CET'`` that exists; if none does, the first column of the
    file is used.

    Timezone handling:
      * values that carry an offset are converted to UTC;
      * naive values from a column whose name ends in ``_cet`` (case-insensitive)
        are interpreted as Europe/Berlin local time and converted to UTC;
      * all other naive values are taken to be UTC already.
    Unparseable values become ``NaT``.

    Parameters
    ----------
    fn : path or file-like
        Anything accepted by ``pandas.read_csv``.
    prefer_col : str, default 'measure_date'
        Column to try first as the timestamp source.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``measure_date`` set (overwritten if it existed).

    Raises
    ------
    KeyError
        If the file has no columns at all.
    """
    df = pd.read_csv(fn)

    search_order = [prefer_col, 'measure_date_utc', 'measure_date_CET'] + list(df.columns)
    cand = next((c for c in search_order if c in df.columns), None)
    if cand is None:
        raise KeyError(f'No datetime-like column found in {fn}')

    ser = None
    if str(cand).lower().endswith('_cet'):
        # Local-time column: naive stamps must NOT be labelled UTC as-is.
        local = pd.to_datetime(df[cand], errors='coerce')
        if is_datetime64_any_dtype(local) and local.dt.tz is None:
            try:
                # 'infer' resolves the repeated autumn hour from the row order.
                local = local.dt.tz_localize('Europe/Berlin', ambiguous='infer', nonexistent='shift_forward')
            except Exception:
                # Best-effort: ambiguous / non-existent wall times become NaT.
                local = local.dt.tz_localize('Europe/Berlin', ambiguous='NaT', nonexistent='NaT')
            ser = local.dt.tz_convert('UTC')

    if ser is None:
        # utc=True localizes naive values to UTC and converts aware ones, and
        # always yields a tz-aware result (also for mixed offsets).
        ser = pd.to_datetime(df[cand], errors='coerce', utc=True)

    # Assign standardized column
    df['measure_date'] = ser
    return df

# Load the main file; its timestamp span defines the window for every other file.
main = load_and_standardize(files['actual'])
min_date = main['measure_date'].min()
max_date = main['measure_date'].max()
print('Main date range:', min_date, '->', max_date)

# Left-join each auxiliary file onto the main frame by 'measure_date'.
merged = main.copy()
for key in ['Longs', 'pre', 'imbalance']:
    f = files[key]
    if not f.exists():
        print(f'Warning: {f} not found, skipping')
        continue
    other = load_and_standardize(f)
    # Restrict to main date range (inclusive on both ends; NaT rows drop out here)
    other = other[other['measure_date'].between(min_date, max_date)]

    # A repeated key on the right side would duplicate main rows in a left join,
    # so keep only the first row per timestamp.
    dup_keys = int(other['measure_date'].duplicated(keep='first').sum())
    if dup_keys:
        print(f'Warning: {key} has {dup_keys} duplicate measure_date rows; keeping the first of each')
        other = other.drop_duplicates(subset='measure_date', keep='first')

    # Bring over every column of `other`; names that clash with columns already
    # in `merged` get a `_<key>` suffix rather than being dropped.
    cols_to_merge = [c for c in other.columns if c != 'measure_date']
    print(f'Merging {key}: {other.shape[0]} rows, columns: {cols_to_merge[:5]}{"..." if len(cols_to_merge)>5 else ""}')
    rows_before = len(merged)
    merged = merged.merge(other[['measure_date'] + cols_to_merge], on='measure_date', how='left', suffixes=(None, f'_{key}'))
    assert len(merged) == rows_before, f'left join with {key} changed the row count'

# Drop unwanted columns before saving (if present)
cols_to_drop = [c for c in ['measure_date_CET', 'measure_date_CET_pre'] if c in merged.columns]
if cols_to_drop:
    merged = merged.drop(columns=cols_to_drop)
    print('Dropped columns before saving:', cols_to_drop)

out = base / 'combined_sunnic.csv'
merged.to_csv(out, index=False)
print('Saved merged file to', out)
print('Merged shape:', merged.shape)
merged.head()


In [None]:
# Ensure 'measure_date' is formatted as 'YYYY-MM-DD HH:MM:SS+0000' (UTC)
import pandas as pd
from pathlib import Path

# Input: the merged Sunnic CSV.
fn = Path('/home/renga/Desktop/neoen_data/renga_work/data/sunnic/combined_sunnic.csv')
if not fn.exists():
    raise FileNotFoundError(f'{fn} not found')

df = pd.read_csv(fn)
if 'measure_date' not in df.columns:
    raise KeyError("Column 'measure_date' not found in the CSV")

# Two steps: parse to tz-aware UTC (bad values -> NaT), then render as text.
# %z prints the offset without a colon, i.e. '+0000'.
df['measure_date'] = pd.to_datetime(df['measure_date'], utc=True, errors='coerce')
df['measure_date'] = df['measure_date'].dt.strftime('%Y-%m-%d %H:%M:%S%z')

# Write next to the input as '<stem>_formatted<suffix>'.
out = fn.with_name(f'{fn.stem}_formatted{fn.suffix}')
df.to_csv(out, index=False)
print('Saved formatted file to', out)
df['measure_date'].head(10)


In [None]:
# Check '/data/sunnic/imbalance_volumes.csv' for values in 'Imbalance Volume'
from pathlib import Path
import pandas as pd

file_path = Path('/home/renga/Desktop/neoen_data/renga_work/data/sunnic/imbalance_volumes.csv')

df_iv = pd.read_csv(file_path)
col = 'Imbalance Volume'
print('File:', file_path)
print('Total rows:', len(df_iv))
if col in df_iv.columns:
    values = df_iv[col]
    # Entries that are not NaN
    non_null = values.notna().sum()
    # Entries whose text form is blank once surrounding whitespace is removed
    empty_strings = values.astype(str).str.strip().eq('').sum()
    # Entries equal to zero after numeric coercion (non-numeric -> NaN, never zero)
    zeros = pd.to_numeric(values, errors='coerce').eq(0).sum()
    print(f"Non-null '{col}':", non_null)
    print(f"Empty-string rows in '{col}':", empty_strings)
    print(f"Numeric zeros in '{col}':", zeros)
    # Peek at the first few entries
    print('\nSample values:')
    print(values.head(10))
else:
    print(f"Column '{col}' not found. Available columns: {df_iv.columns.tolist()}")

In [None]:
# Merge DA prices into combined_sunnic and fill missing Long
import pandas as pd
from pathlib import Path

combined_fn = Path('/home/renga/Desktop/neoen_data/renga_work/data/sunnic/combined_sunnic_da_filled_Short.csv')
da_fn = Path('/home/renga/Desktop/neoen_data/renga_work/data/sunnic/prices_upto_oct/pre.csv')

# Fail early if either input is missing.
for required in (combined_fn, da_fn):
    if not required.exists():
        raise FileNotFoundError(f'{required} not found')

print('Loading', combined_fn)
combined = pd.read_csv(combined_fn)
print('Loading', da_fn)
da = pd.read_csv(da_fn)

# Both frames must carry the join key. (Loop names chosen so the notebook-wide
# `df` variable is not rebound as a side effect.)
for frame, label in [(combined, 'combined'), (da, 'da')]:
    if 'measure_date' not in frame.columns:
        raise KeyError(f"Column 'measure_date' not found in {label} file")

# Parse measure_date to timezone-aware UTC datetimes; unparseable values become NaT.
combined['measure_date'] = pd.to_datetime(combined['measure_date'].astype(str).str.strip(), errors='coerce', utc=True)
da['measure_date'] = pd.to_datetime(da['measure_date'].astype(str).str.strip(), errors='coerce', utc=True)

# Find the DA price column: common names first, then the first numeric column.
price_candidates = ['Long', 'price', 'price_eur', 'value']
Long_col = next((c for c in price_candidates if c in da.columns), None)
if Long_col is None:
    numeric_cols = [c for c in da.select_dtypes(include=['number']).columns if c != 'measure_date']
    if not numeric_cols:
        raise KeyError('No suitable DA price column found in DA file')
    Long_col = numeric_cols[0]

print('Using DA price column:', Long_col)

# DA rows with an unparseable timestamp are excluded from the lookups: a NaT key
# must never be used to fill a combined row whose own timestamp is NaT.
da_valid = da.dropna(subset=['measure_date'])

# Lookup measure_date -> price. De-duplicate BEFORE sorting so that 'first'
# means first in file order.
da_map = da_valid.set_index('measure_date')[Long_col]
da_map = da_map[~da_map.index.duplicated(keep='first')].sort_index()

# Ensure combined has a Long column; float NaN keeps the column numeric
# (pd.NA would create an object-dtype column).
if 'Long' not in combined.columns:
    combined['Long'] = float('nan')

# Count missing before
missing_before = combined['Long'].isna().sum()
print('Missing Long before fill:', missing_before)

# First pass: fill gaps from exact timestamp matches.
mapped = combined['measure_date'].map(da_map)
combined['Long'] = combined['Long'].fillna(mapped)

# Second pass (only if needed): match after rounding both sides to 30 minutes.
still_missing = combined['Long'].isna().sum()
if still_missing > 0:
    print('Still missing after exact match:', still_missing)
    da_map_30 = da_valid.set_index(da_valid['measure_date'].dt.round('30min'))[Long_col]
    da_map_30 = da_map_30[~da_map_30.index.duplicated(keep='first')]
    mapped30 = combined['measure_date'].dt.round('30min').map(da_map_30)
    combined['Long'] = combined['Long'].fillna(mapped30)
    still_missing = combined['Long'].isna().sum()
    print('Still missing after 30-min fallback:', still_missing)

# Save filled file
out = combined_fn.with_name(combined_fn.stem + '_filled_Long' + combined_fn.suffix)
combined.to_csv(out, index=False)
print('Saved filled combined file to', out)
print('Filled missing Long:', missing_before - combined['Long'].isna().sum())

In [5]:
# Count NaN values per column for the ledger CSV
import pandas as pd
from pathlib import Path

# Ledger CSV to inspect.
fn = Path('/home/renga/Desktop/neoen_data/renga_work/combined_ledger_neoen_nan_to_0.csv')
df = pd.read_csv(fn)

# Absolute number of missing cells in each column.
nan_counts = df.isna().sum()
# Same figure as a share of all rows, in percent.
pct = nan_counts.div(len(df)).mul(100)

print('NaN counts per column:')
print(nan_counts)
print('\nNaN percentage per column:')
print(pct.round(2))

NaN counts per column:
run_time                     0
target_time                  0
lead_hours                   0
forecast_type                0
actual_MWh                   0
forecast_MWh                 0
delta_MWh                    0
da_price                     0
revenue_EUR                  0
applied_price_EUR_per_MWh    0
penalty_EUR                  0
net_revenue_EUR              0
forecast_file                0
dtype: int64

NaN percentage per column:
run_time                     0.0
target_time                  0.0
lead_hours                   0.0
forecast_type                0.0
actual_MWh                   0.0
forecast_MWh                 0.0
delta_MWh                    0.0
da_price                     0.0
revenue_EUR                  0.0
applied_price_EUR_per_MWh    0.0
penalty_EUR                  0.0
net_revenue_EUR              0.0
forecast_file                0.0
dtype: float64


In [4]:
# Replace all NaN values with 0 and save a new CSV
import pandas as pd
from pathlib import Path

fn = Path('/home/renga/Desktop/neoen_data/renga_work/combined_ledger_neoen.csv')
if not fn.exists():
    raise FileNotFoundError(f'{fn} not found')

df = pd.read_csv(fn)

# Every missing cell, in every column, becomes 0.
df_filled = df.fillna(value=0)

# Output sits beside the input as '<stem>_nan_to_0<suffix>'.
out = fn.with_name(f'{fn.stem}_nan_to_0{fn.suffix}')
df_filled.to_csv(out, index=False)
print('Saved filled file to', out)

# Sanity check: no missing cell should be left after the fill.
remaining_nans = int(df_filled.isna().to_numpy().sum())
print('Remaining NaNs (total):', remaining_nans)

Saved filled file to /home/renga/Desktop/neoen_data/renga_work/combined_ledger_neoen_nan_to_0.csv
Remaining NaNs (total): 0
