In [None]:
def get_model(model_name='xgb', random_state=42):
    """
    Return a new regressor configured with this project's fixed hyperparameters.

    Args:
        model_name: 'xgb' for an XGBRegressor or 'lgbm' for an LGBMRegressor.
        random_state: Seed forwarded to the model as its `random_state`.

    Returns:
        A freshly constructed regressor instance for the requested model.

    Raises:
        ValueError: If `model_name` is neither 'xgb' nor 'lgbm'.
    """
    if model_name == 'lgbm':
        # --- LightGBM config ---
        return LGBMRegressor(
            n_estimators=5000,
            learning_rate=0.0325,
            num_leaves=50,
            max_depth=-1,
            min_child_samples=20,
            subsample=0.65,
            colsample_bytree=0.85,
            reg_alpha=0.1,
            reg_lambda=0.1,
            objective='regression',
            metric='rmse',
            random_state=random_state,
            n_jobs=1,
            verbose=-1,
        )

    if model_name == 'xgb':
        # --- XGBoost config ---
        return XGBRegressor(
            n_estimators=2600,
            learning_rate=0.029244,
            max_depth=8,
            min_child_weight=6,
            subsample=0.6568,
            colsample_bytree=0.8655,
            reg_alpha=0.00458,
            reg_lambda=7.83e-05,
            n_jobs=1,
            random_state=random_state,
        )

    raise ValueError("Model not supported. Choose 'xgb' or 'lgbm'")

# Set up data & features: unpack the column groups and the selected feature
# list returned by get_feature_groups.
(num_cols, ord_cols, cyc_cols, sel_features) = get_feature_groups(
    train_df, TARGET, solar_lunar,
)

In [None]:
def train_and_evaluate(
    model_name, X, y, preprocessor,
    n_splits=5, seed=42, use_sqrt_target=False
):
    """
    Build the pipeline, run K-fold cross-validation, and print the fold scores.

    Args:
        model_name: Model key passed to `get_model`; also upper-cased in the logs.
        X: Features handed to `cross_validate`.
        y: Target handed to `cross_validate`.
        preprocessor: Transformer used as the first pipeline step.
        n_splits: Number of folds for the shuffled KFold.
        seed: Seed used for both the model and the KFold shuffle.
        use_sqrt_target: When true, wrap the model in a
            TransformedTargetRegressor (np.sqrt forward, np.square inverse).

    Returns:
        Tuple `(final_pipeline, mean_mse)`: the pipeline object that was given
        to `cross_validate`, and the mean test MSE across folds.
    """
    banner = '=' * 60
    rule = "-" * 60

    print(banner)
    print(f"üöÄ TRAINING STARTED: {model_name.upper()}")
    print(banner)

    # 1. Get the base model
    base_model = get_model(model_name, seed)

    # 2. Optionally wrap it so it trains on the square root of the target
    estimator = (
        TransformedTargetRegressor(
            regressor=base_model,
            func=np.sqrt,
            inverse_func=np.square,
        )
        if use_sqrt_target
        else base_model
    )

    # 3. Assemble the final pipeline
    final_pipeline = Pipeline(
        steps=[('preprocessor', preprocessor), ('model', estimator)]
    )

    # 4. Cross-validation
    print(f"Running {n_splits}-Fold CV...")
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    cv_output = cross_validate(
        final_pipeline,
        X,
        y,
        cv=splitter,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        return_train_score=False,
    )

    # 5. Reporting: the scorer yields negated MSE, so flip the sign
    fold_mse = -cv_output['test_score']

    print(rule)
    for fold_no, mse in enumerate(fold_mse, start=1):
        print(f"  Fold {fold_no} MSE: {mse:.6f}")

    print(rule)
    print(f"üèÜ {model_name.upper()} AVG MSE: {fold_mse.mean():.6f} (+/- {fold_mse.std():.6f})")
    print(rule)

    return final_pipeline, fold_mse.mean()

# Target vector and training feature matrix
y = train_df[TARGET]
X = train_df[sel_features]

# Build the preprocessor from the ordinal / cyclical / numerical column groups
preprocessor = build_preprocessor(num_cols, ord_cols, cyc_cols)

# Toggle for the sqrt-target wrapper; the same setting is used for both models
wrapper = True

pipeline_xgb, score_xgb = train_and_evaluate(
    'xgb', X, y, preprocessor,
    n_splits=N_SPLITS, seed=SEED, use_sqrt_target=wrapper,
)

pipeline_lgbm, score_lgbm = train_and_evaluate(
    'lgbm', X, y, preprocessor,
    n_splits=N_SPLITS, seed=SEED, use_sqrt_target=wrapper,
)

In [None]:
def plot_importance_final(pipeline, ord_cols, cyc_cols, num_cols, top_n=25):
    """
    Plot the `top_n` largest feature importances of a pipeline's 'model' step.

    Args:
        pipeline: Object exposing `named_steps['model']`; expected to be fitted.
        ord_cols: Ordinal column names (first in the assumed feature order).
        cyc_cols: Cyclical column names; each becomes `<name>_sin`, `<name>_cos`.
        num_cols: Numerical column names (last in the assumed feature order).
        top_n: Number of highest-importance rows to draw.

    Returns:
        None. Returns early, after printing an error, when the model exposes
        neither `feature_importances_` nor `booster_`.
    """
    # --- 1. UNWRAP THE MODEL ---
    model_step = pipeline.named_steps['model']

    # A fitted TransformedTargetRegressor keeps the inner model in `regressor_`
    is_wrapped = hasattr(model_step, 'regressor_')
    if is_wrapped:
        print("üì¶ Terdeteksi TransformedTargetRegressor. Mengambil inner model...")
    else:
        print("‚úÖ Model tidak dibungkus (Standard).")
    estimator = model_step.regressor_ if is_wrapped else model_step

    # --- 2. READ THE IMPORTANCE VALUES ---
    if hasattr(estimator, 'feature_importances_'):
        # scikit-learn style attribute
        scores = estimator.feature_importances_
    elif hasattr(estimator, 'booster_'):
        # Fallback through the underlying booster (LightGBM-style API)
        scores = estimator.booster_.feature_importance(importance_type='gain')
    else:
        print("‚ùå Error: Model tidak memiliki atribut feature_importances_")
        return

    # --- 3. BUILD FEATURE NAMES (ASSUMED PREPROCESSOR ORDER) ---
    # Order: ordinal -> cyclical (each expanded to sin/cos) -> numerical
    cyc_names = [
        f"{col}_{suffix}" for col in cyc_cols for suffix in ("sin", "cos")
    ]
    labels = [*ord_cols, *cyc_names, *num_cols]

    # --- 4. VALIDATE & PLOT ---
    n_scores = len(scores)
    n_labels = len(labels)
    print(f"üìä Model Features: {n_scores}")
    print(f"üìù Feature Names : {n_labels}")

    if n_labels == n_scores:
        print("‚úÖ MATCH! Nama fitur sinkron.")
    else:
        # Name list does not line up with the model; use placeholder names
        print("‚ö†Ô∏è Warning: Jumlah fitur tidak cocok! Menggunakan nama dummy.")
        labels = [f"Feature_{i}" for i in range(n_scores)]

    table = pd.DataFrame({'Feature': labels, 'Importance': scores})
    top_rows = table.sort_values(by='Importance', ascending=False).head(top_n)

    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=top_rows, palette='viridis')
    plt.title(f'Top {top_n} Feature Importance (Wrapper Supported)')
    plt.tight_layout()
    plt.show()

In [None]:
# 1. Refit the XGBoost pipeline on the full training set
print("‚è≥ Training ulang model pada seluruh dataset...")
pipeline_xgb.fit(X, y)
print("‚úÖ Model selesai dilatih!")

# 2. Only then plot the feature importance
plot_importance_final(pipeline_xgb, ord_cols, cyc_cols, num_cols)

In [None]:
# Refit the LightGBM pipeline on the full training set
print("‚è≥ Training ulang model pada seluruh dataset...")
pipeline_lgbm.fit(X, y)
print("‚úÖ Model selesai dilatih!")

# Plot its feature importance with the same column groups
plot_importance_final(
    pipeline_lgbm,
    ord_cols=ord_cols,
    cyc_cols=cyc_cols,
    num_cols=num_cols,
)

In [None]:
# --- 3. PREDICT ON THE TEST DATA ---
print("\nüîÆ Predicting Test Data...")
# Make sure test_df has already been through feature engineering.
# NOTE(review): training used train_df[sel_features] while the whole test_df is
# passed here — confirm the preprocessor selects its columns by name.
X_test = test_df

pred_xgb = pipeline_xgb.predict(X_test)
pred_lgbm = pipeline_lgbm.predict(X_test)

# --- 4. BLENDING (weighted average) ---
# The weights live in one place so the log line always matches the arithmetic
# (previously the message said 50/50 while the code used 0.45/0.55).
XGB_WEIGHT = 0.45
LGBM_WEIGHT = 0.55
print(f"‚öñÔ∏è  Blending: {XGB_WEIGHT:.0%} XGB + {LGBM_WEIGHT:.0%} LGBM")
final_pred = (XGB_WEIGHT * pred_xgb) + (LGBM_WEIGHT * pred_lgbm)

# Safety: predictions must not be negative
final_pred = np.maximum(final_pred, 0)

# --- 5. NIGHT MASKING (natural sunrise/sunset) ---
# Masking was disabled in the original cell (the assignment was commented out)
# even though the log claimed it was applied. Keep it off by default so the
# submission is unchanged; flip this flag to zero out night-time rows.
APPLY_NIGHT_MASK = False

# Work on a copy so the original test data is not modified
temp_test = test_df.copy()
temp_test['Timestamp'] = pd.to_datetime(temp_test['Timestamp'])

# Convert the 12-hour clock strings to time objects
sunrise_dt = pd.to_datetime(temp_test['sunrise'], format='%I:%M %p').dt.time
sunset_dt = pd.to_datetime(temp_test['sunset'], format='%I:%M %p').dt.time
current_time = temp_test['Timestamp'].dt.time

# A row is "night" when its time is before sunrise OR after sunset
is_night = [
    (curr < rise) or (curr > set_)
    for curr, rise, set_ in zip(current_time, sunrise_dt, sunset_dt)
]

if APPLY_NIGHT_MASK:
    print("üåë Applying Night Masking...")
    final_pred[np.asarray(is_night, dtype=bool)] = 0
else:
    print("Night masking disabled (APPLY_NIGHT_MASK = False); predictions left unchanged")

# --- 6. SAVE SUBMISSION ---
submission = pd.read_csv(SAMPLE_SUBMISSION)
submission['% Baseline'] = final_pred
submission.to_csv('submission.csv', index=False)

print("\nFile saved in submission.csv")
display(submission.head())