# In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor

# =========================
# 1) Read the merged CSV and derive calendar fields
# =========================
input_csv = "merged_output2.csv"
df = pd.read_csv(input_csv)

# DATE_MILADI feeds both the calendar features and the train/test split,
# so stop immediately when the column is absent.
if 'DATE_MILADI' not in df:
    raise ValueError("DATE_MILADI column not found in merged_output2.csv")

# Values that cannot be parsed become NaT; those rows are discarded.
df['DATE_MILADI'] = pd.to_datetime(df['DATE_MILADI'], errors='coerce')
df = df.dropna(subset=['DATE_MILADI']).copy()

# Calendar features taken from the parsed date (dayofweek: Monday=0 ... Sunday=6).
for _part in ('year', 'month', 'dayofweek'):
    df[_part] = getattr(df['DATE_MILADI'].dt, _part)

# =========================
# 2) Chronological split: train on 2021-2023, predict 2024 onward
# =========================
_row_year = df['DATE_MILADI'].dt.year

train_mask = _row_year.between(2021, 2023)  # both bounds are inclusive
test_mask = _row_year.ge(2024)

train_df = df[train_mask].copy()
test_df = df[test_mask].copy()

# Both subsets are required: one to fit the model, one to predict on.
if any(subset.empty for subset in (train_df, test_df)):
    raise ValueError("Train or test subset is empty. Check date ranges and input data.")

# =========================
# 3) Keep only training rows with POWER >= 90
#    (this also discards POWER == 0 and rows where POWER is missing)
# =========================
target_col = 'POWER'
if target_col not in df:
    raise ValueError("POWER column not found in merged_output2.csv")

# The filter is applied to the training subset only; test rows are untouched.
_power_ok = train_df[target_col].ge(90)
train_df = train_df.loc[_power_ok].copy()
if train_df.empty:
    raise ValueError("Training data became empty after removing POWER<90. Check data quality.")

# =========================
# 4) Features & target
# =========================
# Columns that are never used as model inputs: the target itself and the
# date/timestamp columns listed here.
exclude_cols = {target_col, 'DATE_MILADI', 'DATE_SHAMSI', 'DATE_SHAM', 'TIMESTAMPS'}
candidate_cols = [c for c in df.columns if c not in exclude_cols]

# Route every candidate by dtype. The calendar columns (year, month,
# dayofweek) are integer columns taken from the .dt accessor, so they are
# picked up as numeric here without special handling.
numeric_cols = [c for c in candidate_cols if pd.api.types.is_numeric_dtype(df[c])]
categorical_cols = [c for c in candidate_cols if c not in numeric_cols]

# HOUR is always modelled as a numeric feature. When it was not read as a
# number it is moved to the numeric group and coerced; values that cannot be
# parsed become NaN and are left for the numeric imputer to fill.
hour_coerced = 'HOUR' in categorical_cols
if hour_coerced:
    df['HOUR'] = pd.to_numeric(df['HOUR'], errors='coerce')
    categorical_cols.remove('HOUR')
    numeric_cols.append('HOUR')

feature_cols = numeric_cols + categorical_cols

X_train = train_df[feature_cols].copy()
y_train = train_df[target_col].copy()
X_test = test_df[feature_cols].copy()

# train_df/test_df were copied from df before the coercion above, so their
# HOUR column still holds the raw values. Coerce it on the feature frames too;
# otherwise the numeric branch of the preprocessor receives non-numeric data.
# test_df itself is left untouched so the exported rows keep the raw HOUR.
if hour_coerced:
    for _features in (X_train, X_test):
        _features['HOUR'] = pd.to_numeric(_features['HOUR'], errors='coerce')

# =========================
# 5) Preprocessing and KNN model
# =========================
# KNN regressor: 5 neighbours, weighted by inverse distance, using all cores.
model = KNeighborsRegressor(n_neighbors=5, weights='distance', n_jobs=-1)

# Numeric features: fill gaps with the column median, then standardise
# (normalise the numeric data).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Send each column group through its own transformer; any column not listed
# in either group is dropped.
_column_routes = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocess = ColumnTransformer(transformers=_column_routes, remainder='drop')

pipeline = Pipeline([
    ('preprocess', preprocess),
    ('model', model),
])

# =========================
# 6) Fit on the training rows, then predict the test period
# =========================
# fit() returns the fitted pipeline, so predict() can be chained onto it.
declared_pred = pipeline.fit(X_train, y_train).predict(X_test)

# =========================
# 7) Write the 2024+ predictions to Excel
#    - DECLARED holds the model prediction
#    - dates are moved forward by one day
#    - rows where ebraz == 0 get DECLARED forced to 0
# =========================
output_df = test_df.assign(DECLARED=declared_pred)

# Move the dates one day forward.
# NOTE(review): only DATE_MILADI is shifted; year/month/dayofweek and any
# other date columns still describe the original day - confirm this is intended.
_one_day = pd.Timedelta(days=1)
output_df['DATE_MILADI'] = output_df['DATE_MILADI'] + _one_day

# Find the ebraz column regardless of letter case; the first match wins.
ebraz_col = next(
    (name for name in output_df.columns if name.lower() == 'ebraz'),
    None,
)
if ebraz_col is None:
    raise ValueError("ebraz column not found for zero-forcing DECLARED.")

# Wherever ebraz is 0 the prediction is overridden with 0.
output_df.loc[output_df[ebraz_col].eq(0), 'DECLARED'] = 0

# Same column order as the input frame, with DECLARED appended last.
ordered_cols = [*df.columns, 'DECLARED']
output_df = output_df.reindex(columns=ordered_cols)

output_excel = "merged_output2_knn_scaled_2024.xlsx"
output_df.to_excel(output_excel, index=False)

print(f"✅ Done. Saved KNN predictions (shifted to next day) to {output_excel}")


# Output: ✅ Done. Saved KNN predictions (shifted to next day) to merged_output2_knn_scaled_2024.xlsx
