In [None]:
# Compute percentage of rows where DA price > Long price (saves a small summary and sample)
from pathlib import Path
import pandas as pd

fn = Path('/home/renga/Desktop/neoen_data/renga_work/data/neoen/grid_penalty/combined_with_utc.csv')
if not fn.exists():
    raise FileNotFoundError(f'{fn} not found')

df_check = pd.read_csv(fn)
cols = list(df_check.columns)

def find_first_matching(cols, candidates):
    lower_map = {c.lower(): c for c in cols}
    for cand in candidates:
        cand_l = cand.lower()
        for col_lower, original in lower_map.items():
            if cand_l in col_lower:
                return original
    return None

da_candidates = ['da_price', 'da_price_15min', 'price', 'price_eur', 'day_ahead']
long_candidates = ['long', 'positive_imbalance_settlement_price', 'long_price', 'positive_price']

da_col = find_first_matching(cols, da_candidates)
long_col = find_first_matching(cols, long_candidates)

print('Detected columns:')
print(' da_col ->', da_col)
print(' long_col ->', long_col)

if da_col is None or long_col is None:
    raise KeyError('Could not detect DA or Long price column. Check column names in the CSV.')

# Coerce to numeric and compute condition on rows where both are numeric
s_da = pd.to_numeric(df_check[da_col], errors='coerce')
s_long = pd.to_numeric(df_check[long_col], errors='coerce')

valid_mask = s_da.notna() & s_long.notna()
n_valid = int(valid_mask.sum())
if n_valid == 0:
    raise ValueError('No rows with both numeric DA and Long prices')

n_da_gt_long = int((s_da[valid_mask] > s_long[valid_mask]).sum())
pct = float(n_da_gt_long) / n_valid * 100.0

print(f'Rows with both prices numeric: {n_valid}')
print(f'Rows where DA price > Long price: {n_da_gt_long} ({pct:.2f}%)')

# Save summary and a small sample for inspection
out_dir = Path('/home/renga/Desktop/neoen_data/renga_work/outputs_same_tod')
out_dir.mkdir(parents=True, exist_ok=True)
summary = pd.DataFrame({'rows_valid': [n_valid], 'rows_da_gt_long': [n_da_gt_long], 'percent': [pct]})
out_path = out_dir / 'da_gt_long_summary.csv'
summary.to_csv(out_path, index=False)
print('Saved summary to', out_path)

sample = df_check.loc[valid_mask & (s_da > s_long), [da_col, long_col]].head(50)
sample_out = out_dir / 'da_gt_long_sample.csv'
sample.to_csv(sample_out, index=False)
print('Saved sample rows to', sample_out)

# Display a quick preview if running in a notebook environment
try:
    display(sample.head())
except Exception:
    print('Sample where DA > Long (first 10 rows):')

    print(sample.head(10).to_string(index=False))

Detected columns:
 da_col -> da_price
 long_col -> Long
Rows with both prices numeric: 8375
Rows where DA price > Long price: 4451 (53.15%)
Saved summary to /home/renga/Desktop/neoen_data/renga_work/outputs_same_tod/da_gt_long_summary.csv
Saved sample rows to /home/renga/Desktop/neoen_data/renga_work/outputs_same_tod/da_gt_long_sample.csv


Unnamed: 0,da_price,Long
4,18.92,13.24
16,5.9,-7.27
32,12.83,11.17
36,13.76,-3.59
40,17.95,0.58


In [None]:
CSV_PATH = '/home/renga/Desktop/neoen_data/renga_work/data/neoen/grid_penalty/combined_with_utc.csv'
df = pd.read_csv(CSV_PATH)
print("Loaded:", CSV_PATH, "shape:", df.shape)
df.head()


Loaded: /home/renga/Desktop/neoen_data/renga_work/data/grid_penalty/combined_with_utc.csv shape: (28028, 8)


Unnamed: 0,measure_date_CET,measure_date_UTC_str,measure_date,da_price,Long,Short,_hour_utc,da_price_15min
0,2025-01-01 00:00:00+01:00,2024-12-31 23:00:00+00:00,2024-12-31 23:00:00+00:00,12.36,20.34,23.28,2024-12-31 23:00:00+00:00,12.36
1,2025-01-01 00:15:00+01:00,2024-12-31 23:15:00+00:00,2024-12-31 23:15:00+00:00,,19.22,22.0,2024-12-31 23:00:00+00:00,12.36
2,2025-01-01 00:30:00+01:00,2024-12-31 23:30:00+00:00,2024-12-31 23:30:00+00:00,,32.13,36.79,2024-12-31 23:00:00+00:00,12.36
3,2025-01-01 00:45:00+01:00,2024-12-31 23:45:00+00:00,2024-12-31 23:45:00+00:00,,29.13,33.35,2024-12-31 23:00:00+00:00,12.36
4,2025-01-01 01:00:00+01:00,2025-01-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,18.92,13.24,15.16,2025-01-01 00:00:00+00:00,18.92


In [None]:
def find_first_matching(cols, candidates):
    lower_map = {c.lower(): c for c in cols}
    for cand in candidates:
        for col_lower, original in lower_map.items():
            if cand in col_lower:
                return original
    return None

cols = list(df.columns)

measure_candidates  = ["measure_date"]
da_candidates       = [ "da_price","da_price_15min"]
long_candidates     = ["Long","long", "positive_imbalance_settlement_price"]
short_candidates    = ["Short","short", "negative_imbalance_settlement_price"]
imbalance_candidates= ["imbalance"]

measure_col  = find_first_matching(cols, measure_candidates)
da_col       = find_first_matching(cols, da_candidates)
long_col     = find_first_matching(cols, long_candidates)
short_col    = find_first_matching(cols, short_candidates)
#imbalance_col= find_first_matching(cols, imbalance_candidates)

detected = {
    "measure": measure_col,
    "day_ahead_price": da_col,
    "long_price": long_col,
    "short_price": short_col,
    #"imbalance": imbalance_col,
}
print("Detected columns:", detected)

required = [da_col, long_col, short_col]
if any(v is None for v in required):
    raise ValueError("Missing one or more required columns. Adjust candidates or rename your columns.")


Detected columns: {'measure': 'measure_date_CET', 'day_ahead_price': 'da_price_15min', 'long_price': 'Long', 'short_price': 'Short'}


In [16]:
if measure_col is not None:
    df[measure_col] = pd.to_datetime(df[measure_col], errors="coerce")
    print("NaT in time column:", df[measure_col].isna().sum())

for c in [da_col, long_col, short_col]:
    df[c] = pd.to_numeric(df[c], errors="coerce")
    print(f"NaN in {c}:", df[c].isna().sum())

NaT in time column: 19572
NaN in da_price_15min: 0
NaN in Long: 1
NaN in Short: 2


In [12]:
def predicted_sign_row(da, lo, sh, equality_mode="strict"):
    if any(pd.isna(v) for v in [da, lo, sh]):
        return np.nan

    if equality_mode == "equals_as_gt":
        if (lo >= da) and (sh >= da) and ((lo > da) or (sh > da)):
            return 1
        if (lo <= da) and (sh <= da) and ((lo < da) or (sh < da)):
            return -1
        return 0

    if equality_mode == "equals_as_lt":
        if (lo <= da) and (sh <= da) and ((lo < da) or (sh < da)):
            return -1
        if (lo >= da) and (sh >= da) and ((lo > da) or (sh > da)):
            return 1
        return 0

    # strict: equals => indeterminate
    if (lo > da) and (sh > da):
        return 1
    if (lo < da) and (sh < da):
        return -1
    return 0  # mixed or any equality => indeterminate

df["predicted_sign"] = df.apply(
    lambda r: predicted_sign_row(r[da_col], r[long_col], r[short_col], EQUALITY_MODE),
    axis=1
)

# Actual sign from imbalance column (positive/negative/zero)
#df["actual_sign"] = np.sign(df[imbalance_col])

# Match / status
df["match"] = df["predicted_sign"] == df["actual_sign"]
df["status"] = np.select(
    [
        df["predicted_sign"].isna() | df["actual_sign"].isna(),
        df["predicted_sign"] == 0,
        df["match"]
    ],
    [
        "Missing/NaN",
        "Indeterminate (mixed/equal)",
        "Match"
    ],
    default="Mismatch"
)

df[[measure_col, da_col, long_col, short_col, "predicted_sign", "actual_sign", "status"]].head(3)


NameError: name 'EQUALITY_MODE' is not defined

In [None]:
total = len(df)
cnt = df["status"].value_counts(dropna=False)
print("--------Summary--------")
print(cnt.to_string())

indet = (df["status"] == "Indeterminate (mixed/equal)").sum()
missing = (df["status"] == "Missing/NaN").sum()
valid = total - indet - missing
if valid > 0:
    acc = (df["status"] == "Match").sum() / valid
    print(f"\nMatch rate (excluding Indeterminate & Missing): {acc:.2%}")
else:
    print("\nNo valid rows to score.")

print("\nCrosstab (Predicted vs Actual)")
print(pd.crosstab(df["predicted_sign"], df["actual_sign"]).rename_axis(index="Predicted", columns="Actual"))


--------Summary--------
status
Mismatch                       6607
Match                          1644
Indeterminate (mixed/equal)     294

Match rate (excluding Indeterminate & Missing): 19.92%

Crosstab (Predicted vs Actual)
Actual     -1.0   0.0   1.0
Predicted                  
-1          964    23  3550
 0          164     1   129
 1         3013    21   680


In [14]:
df_plot = df.copy()
if measure_col is not None and (START_DATE or END_DATE):
    mask = pd.Series(True, index=df_plot.index)
    if START_DATE:
        mask &= df_plot[measure_col] >= pd.to_datetime(START_DATE)
    if END_DATE:
        mask &= df_plot[measure_col] <= pd.to_datetime(END_DATE)
    df_plot = df_plot[mask]

if len(df_plot) > MAX_PLOT_POINTS:
    df_plot = df_plot.iloc[:MAX_PLOT_POINTS].copy()

print("Plotting rows:", len(df_plot))


NameError: name 'START_DATE' is not defined

In [13]:
if measure_col is None:
    print("No time column detected; skipping time series plot.")
else:
    temp = df_plot[[measure_col, da_col, long_col, short_col]].dropna().sort_values(measure_col)
    if temp.empty:
        print("Nothing to plot in selected window.")
    else:
        plt.figure()
        plt.plot(temp[measure_col], temp[da_col],  label=da_col)
        plt.plot(temp[measure_col], temp[long_col], label=long_col)
        plt.plot(temp[measure_col], temp[short_col],label=short_col)
        plt.legend()
        plt.title("DA vs Long vs Short Prices")
        plt.xlabel("Time")
        plt.ylabel("Price")
        plt.tight_layout()
        plt.show()


NameError: name 'df_plot' is not defined

In [None]:
# Compute percentage of rows where DA price > Long price
from pathlib import Path
import pandas as pd

fn = Path('/home/renga/Desktop/neoen_data/renga_work/data/sunnic/combined_sunnic_upto_oct.csv')

df_check = pd.read_csv(fn)
cols = list(df_check.columns)

def find_first_matching(cols, candidates):
    lower_map = {c.lower(): c for c in cols}
    for cand in candidates:
        cand_l = cand.lower()
        for col_lower, original in lower_map.items():
            if cand_l in col_lower:
                return original
    return None

da_candidates = ['da_price', 'da_price_15min', 'day_ahead']
long_candidates = ['long', 'positive_imbalance_settlement_price', 'long_price', 'positive_price']

da_col = find_first_matching(cols, da_candidates)
long_col = find_first_matching(cols, long_candidates)

print('Detected columns:')
print(' da_col ->', da_col)
print(' long_col ->', long_col)

if da_col is None or long_col is None:
    raise KeyError('Could not detect DA or Long price column. Check column names in the CSV.')

s_da = pd.to_numeric(df_check[da_col], errors='coerce')
s_long = pd.to_numeric(df_check[long_col], errors='coerce')

valid_mask = s_da.notna() & s_long.notna()
n_valid = int(valid_mask.sum())
if n_valid == 0:
    raise ValueError('No rows with both numeric DA and Long prices')

n_da_gt_long = int((s_da[valid_mask] > s_long[valid_mask]).sum())
pct = float(n_da_gt_long) / n_valid * 100.0

print(f'Rows with both prices numeric: {n_valid}')
print(f'Rows where DA price > Long price: {n_da_gt_long} ({pct:.2f}%)')


Detected columns:
 da_col -> da_price
 long_col -> Long
Rows with both prices numeric: 70379
Rows where DA price > Long price: 37809 (53.72%)


In [None]:
# End of notebook - placeholder cell (empty)
