In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# ===== Load Data =====
folder_path = Path("data")  # Make sure there's a 'data/' folder next to this script

building_years = []
for year in range(2015, 2020):
    df = pd.read_csv(folder_path / f'building_details_{year}.csv')  # FIXED
    df['year'] = year
    building_years.append(df)

building_all = pd.concat(building_years, ignore_index=True)
building_all = building_all.drop_duplicates(subset=['acct', 'year'], keep='first')
pivoted = building_all.pivot(index='acct', columns='year')
pivoted.columns = [f'{col}_{year}' for col, year in pivoted.columns]
pivoted = pivoted.reset_index()

train = pd.read_csv(folder_path / 'assessment_history_train.csv')  # FIXED
test = pd.read_csv(folder_path / 'assessment_history_test.csv')    # FIXED

train_merged = train.merge(pivoted, on='acct', how='left')
test_merged = test.merge(pivoted, on='acct', how='left')


  df = pd.read_csv(folder_path + f'building_details_{year}.csv')
  test = pd.read_csv(folder_path + 'assessment_history_test.csv')


In [3]:
acct_test = test_merged[['acct']].copy() if 'acct' in test_merged.columns else None

In [None]:
import numpy as np
import pandas as pd
import optuna
import logging
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

# === Setup Logging ===
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("OptunaBlender")
optuna.logging.set_verbosity(optuna.logging.INFO)

# === Load base model predictions ===
oof_xgb = np.load("oof_preds_xgbreg.npy")
ridge_oof = np.load("ridgecv_oof_preds.npy")
oof_lgb=np.load("oof_preds_lgbm.npy")
test_xgb = np.load("test_preds_xgbreg.npy")
ridge_test_preds = np.load("ridgecv_test_preds.npy")
test_lgb=np.load("test_preds_lgbm_shap.npy")
# === Targets and prediction stack ===
y_meta = train['TARGET'].values
X_base = np.vstack([oof_xgb, ridge_oof,oof_lgb]).T
X_test_base = np.vstack([test_xgb, ridge_test_preds,test_lgb]).T

# === Holdout split ===
X_train, X_holdout, y_train, y_holdout = train_test_split(X_base, y_meta, test_size=0.2, random_state=42)

# === Objective Function ===
def objective(trial):
    weights = [trial.suggest_float(f"w{i}", 0, 1) for i in range(X_train.shape[1])]
    weights = np.array(weights)
    weights /= weights.sum()  # normalize

    preds = X_holdout @ weights
    rmse = root_mean_squared_error(y_holdout, preds)

    logger.info(f"Trial {trial.number} | Weights: {np.round(weights, 3).tolist()} | RMSE: {rmse:,.4f}")
    return rmse

# === Run Study ===
logger.info(" Starting Optuna optimization for weighted blending...")
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# === Best weights ===
best_weights = np.array([study.best_trial.params[f"w{i}"] for i in range(X_base.shape[1])])
best_weights /= best_weights.sum()
logger.info(f" Best weights: {np.round(best_weights, 4)}")
logger.info(f" Best RMSE: {study.best_value:.4f}")

# === Final test prediction ===
meta_preds = X_test_base @ best_weights

# === Save predictions ===
np.save("test_preds_optuna_blended.npy", meta_preds)
account_ids = acct_test.values.ravel()
submission = pd.DataFrame({
    "ACCOUNT": account_ids,
    "TARGET": meta_preds
})
submission.to_csv("submission_optuna_blended.csv", index=False)
logger.info(" Saved: test_preds_optuna_blended.npy and submission_optuna_blended.csv")


2025-05-12 07:39:46,871 [INFO] 🔍 Starting Optuna optimization for weighted blending...
[I 2025-05-12 07:39:46,874] A new study created in memory with name: no-name-e87df36d-d44d-46e5-8414-1e7bbd57cc50
2025-05-12 07:39:46,904 [INFO] Trial 0 | Weights: [0.636, 0.327, 0.037] | RMSE: 36,950.4432
[I 2025-05-12 07:39:46,922] Trial 0 finished with value: 36950.443206137075 and parameters: {'w0': 0.8512974733714285, 'w1': 0.4382062860973963, 'w2': 0.050059893275311795}. Best is trial 0 with value: 36950.443206137075.
2025-05-12 07:39:46,927 [INFO] Trial 1 | Weights: [0.245, 0.239, 0.517] | RMSE: 36,368.3838
[I 2025-05-12 07:39:46,929] Trial 1 finished with value: 36368.38375484343 and parameters: {'w0': 0.461510781178444, 'w1': 0.44982409881404184, 'w2': 0.9736670048821278}. Best is trial 1 with value: 36368.38375484343.
2025-05-12 07:39:46,935 [INFO] Trial 2 | Weights: [0.335, 0.131, 0.535] | RMSE: 36,728.7778
[I 2025-05-12 07:39:46,936] Trial 2 finished with value: 36728.777840581635 and par

In [None]:
import numpy as np
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

# === Load OOF + Test Predictions ===
oof_xgb = np.load("oof_preds_xgbreg.npy")
test_xgb = np.load("test_preds_xgbreg.npy")
ridge_oof=np.load("ridgecv_oof_preds.npy")
ridge_test_preds=np.load("ridgecv_test_preds.npy")
oof_lgb=np.load("oof_preds_lgbm.npy")
test_lgb=np.load("test_preds_lgbm_shap.npy")

# === 3. Combine full meta-input feature set ===
X_meta = np.hstack([
    oof_xgb.reshape(-1, 1),
    ridge_oof.reshape(-1, 1),
    oof_lgb.reshape(-1,1)
])
y_meta = train['TARGET'].values

X_meta_test = np.hstack([
    test_xgb.reshape(-1, 1),
    ridge_test_preds.reshape(-1, 1),
    test_lgb.reshape(-1,1)
])

# === 4. Train ElasticNetCV meta-learner ===
meta_model = make_pipeline(
    StandardScaler(),
    ElasticNetCV(
        l1_ratio=[0.1, 0.5, 0.9, 1],
        alphas=np.logspace(-4, 2, 100),
        cv=3,
        max_iter=5000,
        n_jobs=-1
    )
)
meta_model.fit(X_meta, y_meta)

# === 5. Predict and evaluate (optional holdout split) ===
# You can skip this section if you're blending on full train
X_train, X_holdout, y_train, y_holdout = train_test_split(X_meta, y_meta, test_size=0.2, random_state=42)
meta_model.fit(X_train, y_train)
holdout_preds = meta_model.predict(X_holdout)
rmse = root_mean_squared_error(y_holdout, holdout_preds)
print(f"ElasticNetCV Blended Meta Holdout RMSE: {rmse:,.2f}")
best_alpha = meta_model.named_steps['elasticnetcv'].alpha_
print(f" Best alpha selected: {best_alpha}")
# === 6. Final predictions for test set ===
meta_preds = meta_model.predict(X_meta_test)

# === 7. Save blended test predictions ===
np.save("test_preds_elasticnet_blended.npy", meta_preds)
account_ids = acct_test.values.ravel() 
submission = pd.DataFrame({
    "ACCOUNT": account_ids,  # Replace with your actual ID column
    "TARGET": meta_preds
})
submission.to_csv("submission_elasticnet_blended.csv", index=False)
print(" ElasticNetCV blended stacking submission saved.")


📉 ElasticNetCV Blended Meta Holdout RMSE: 36,344.64
🔍 Best alpha selected: 0.01
📤 ElasticNetCV blended stacking submission saved.
