In [1]:
import pandas as pd
import numpy as np
import catboost as cb
import optuna
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_percentage_error
from itertools import combinations
import warnings
import time
import os
import gc

# --- Environment Setup ---
# Quieten noisy output before any training starts.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is a TensorFlow logging variable, but
# TensorFlow is not imported in the visible code — possibly a leftover; confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Suppress every Python warning for the whole process (including ones that
# may point at real problems, e.g. deprecations or dtype issues).
warnings.filterwarnings('ignore')
# Only show Optuna messages at WARNING level or above (hides per-trial logs).
optuna.logging.set_verbosity(optuna.logging.WARNING)

class CatBoostOptimizer:
    """Tune and train one CatBoost regressor per target column using Optuna.

    For every target column, ``fit`` runs an Optuna study whose objective is
    the mean K-fold MAPE of a ``CatBoostRegressor``; the best parameters are
    then used to train a final model on all rows. ``predict`` applies the same
    feature engineering and scaling and returns one prediction column per
    target.

    Attributes set by the instance:
        n_splits: number of K-fold splits used inside the objective.
        n_trials: number of Optuna trials per target.
        models: fitted model per target (``None`` where no trial completed).
        scalers: dict holding the fitted ``'feature_scaler'``.
        target_names: target column names captured in ``fit``.
        is_fitted: ``True`` once ``fit`` has finished.
    """

    # Parameters that are never tuned and must be identical for the
    # cross-validated models and the final model.
    _FIXED_PARAMS = {
        'task_type': 'GPU',
        'verbose': 0,
        'allow_writing_files': False,
    }

    def __init__(self, n_splits=5, n_trials=100):
        """Store the CV / search configuration and reset fitted state."""
        self.n_splits = n_splits
        self.n_trials = n_trials
        self.models = []
        self.scalers = {}
        self.target_names = None
        self.is_fitted = False
        print(f"CatBoost Optimizer configured for {self.n_trials} trials per target with {self.n_splits}-Fold CV.")

    def _create_hyper_advanced_features(self, df):
        """Return a copy of ``df`` with fraction-weighted property averages added.

        Expects columns ``Component{1..5}_fraction`` and
        ``Component{1..5}_Property{1..10}``. For each property ``p`` a column
        ``prop_{p}_weighted_avg`` is added, equal to the row-wise sum of
        ``fraction_c * property_c_p`` over the five components. All float64
        columns of the result are downcast to float32. ``df`` is not modified.
        """
        print("  - Generating final, memory-safe features...")
        df_features = df.copy()
        fractions = [f'Component{i}_fraction' for i in range(1, 6)]
        # Hoisted: the fraction matrix is the same for every property.
        fraction_values = df[fractions].values

        # Only the weighted averages are engineered; any further interactions
        # are left for the model to discover.
        for p in range(1, 11):
            prop_cols = [f'Component{c}_Property{p}' for c in range(1, 6)]
            df_features[f'prop_{p}_weighted_avg'] = np.sum(fraction_values * df[prop_cols].values, axis=1)

        for col in df_features.columns:
            if df_features[col].dtype == 'float64':
                df_features[col] = df_features[col].astype(np.float32)

        return df_features

    def _suggest_params(self, trial):
        """Sample one CatBoost parameter set from ``trial``.

        The search space is conditional:
          * ``boosting_type`` is always sampled AND placed in the returned
            dict, so the cross-validated model really uses it (and matches
            the final model built from ``study.best_trial.params``).
          * For ``'Plain'`` boosting a ``bootstrap_type`` is sampled.
            ``subsample`` is only sampled for Bernoulli / MVS; Bayesian
            bootstrap is tuned through ``bagging_temperature`` instead, so no
            trial has to be thrown away for an invalid combination.
          * For ``'Ordered'`` boosting no bootstrap options are added.
        """
        params = {
            'iterations': trial.suggest_int('iterations', 500, 4000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
            'depth': trial.suggest_int('depth', 4, 10),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 30.0, log=True),
            'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        }
        params.update(self._FIXED_PARAMS)

        params['boosting_type'] = trial.suggest_categorical('boosting_type', ['Plain', 'Ordered'])

        if params['boosting_type'] == 'Plain':
            params['bootstrap_type'] = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
            if params['bootstrap_type'] == 'Bayesian':
                params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 10.0)
            else:
                params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)

        return params

    @classmethod
    def _build_final_params(cls, best_params):
        """Turn a study's best trial parameters into final-model parameters.

        Adds 200 iterations on top of the tuned value (the final fit sees all
        rows and has no early stopping) and applies the same fixed parameters
        that were used during cross-validation.
        """
        final_params = dict(best_params)
        final_params['iterations'] = final_params.get('iterations', 2000) + 200
        final_params.update(cls._FIXED_PARAMS)
        return final_params

    def _objective(self, trial, X, y):
        """Optuna objective: mean MAPE over a shuffled K-fold split.

        Args:
            trial: Optuna trial used to sample parameters.
            X: feature DataFrame (indexed positionally via ``iloc``).
            y: target Series aligned positionally with ``X``.

        Returns:
            Mean of the per-fold ``mean_absolute_percentage_error`` values.
        """
        params = self._suggest_params(trial)

        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        mape_scores = []
        for train_idx, val_idx in kf.split(X, y):
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
            model = cb.CatBoostRegressor(**params)
            # NOTE(review): the validation fold drives early stopping and is
            # also the fold that is scored, so the reported MAPE is optimistic.
            model.fit(X.iloc[train_idx], y.iloc[train_idx], eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=0)
            preds = model.predict(X_val)
            mape_scores.append(mean_absolute_percentage_error(y_val, preds))
        return np.mean(mape_scores)

    def fit(self, X, y):
        """Tune and train one model per column of ``y``.

        Args:
            X: raw feature DataFrame (see ``_create_hyper_advanced_features``
                for the required columns).
            y: DataFrame with one column per target, row-aligned with ``X``.

        Returns:
            ``self``, with ``models``, ``scalers``, ``target_names`` populated
            and ``is_fitted`` set to ``True``.
        """
        print("====== STARTING ULTIMATE CATBOOST OPTIMIZATION ======")
        self.target_names = y.columns.tolist()
        X_featured = self._create_hyper_advanced_features(X)
        self.scalers['feature_scaler'] = RobustScaler()
        X_scaled = pd.DataFrame(self.scalers['feature_scaler'].fit_transform(X_featured), columns=X_featured.columns, dtype=np.float32)
        # Free the unscaled copy before the (memory-hungry) search starts.
        del X_featured
        gc.collect()

        self.models = []
        for i, target in enumerate(self.target_names):
            print(f"\n--- Optimizing for Target: {target} ({i+1}/{len(self.target_names)}) ---")
            y_target = y[target]
            study = optuna.create_study(direction='minimize')

            # No 'catch' argument: unexpected errors inside a trial fail loudly.
            study.optimize(
                lambda trial, y_target=y_target: self._objective(trial, X_scaled, y_target),
                n_trials=self.n_trials,
                n_jobs=1,
            )

            completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
            if not completed_trials:
                print(f"  - No trials completed successfully for {target}. This may be due to pruning. Please check parameter ranges or increase n_trials. Skipping.")
                # Placeholder keeps models aligned with target_names.
                self.models.append(None)
                continue

            print(f"  - Best MAPE for {target}: {study.best_value:.6f}")

            final_params = self._build_final_params(study.best_trial.params)
            final_model = cb.CatBoostRegressor(**final_params).fit(X_scaled, y_target)
            self.models.append(final_model)

        self.is_fitted = True
        print("\n====== ULTIMATE CATBOOST OPTIMIZATION COMPLETE ======")
        return self

    def predict(self, X):
        """Predict every target for ``X``.

        Args:
            X: raw feature DataFrame with the same columns used in ``fit``.

        Returns:
            DataFrame with one column per target name. Targets whose search
            produced no model are filled with zeros.

        Raises:
            RuntimeError: if called before ``fit`` has completed.
        """
        if not self.is_fitted:
            raise RuntimeError("Must fit before predicting.")
        print("\n====== GENERATING PREDICTIONS WITH OPTIMIZED MODELS ======")
        X_featured = self._create_hyper_advanced_features(X)
        X_scaled = pd.DataFrame(self.scalers['feature_scaler'].transform(X_featured), columns=X_featured.columns, dtype=np.float32)
        all_predictions = {}
        for target, model in zip(self.target_names, self.models):
            # Explicit None check: do not rely on the model object's truthiness.
            if model is not None:
                all_predictions[target] = model.predict(X_scaled)
            else:
                all_predictions[target] = np.zeros(len(X))
        return pd.DataFrame(all_predictions)

# =========================================================================
# Main Execution Block
# =========================================================================
def main():
    """Run the pipeline end to end.

    Loads the train/test CSVs, downcasts float64 columns to float32, tunes and
    trains one model per ``BlendProperty`` column, predicts the test set and
    writes ``submission.csv``. Returns early (after printing the error) if a
    CSV file cannot be found.
    """
    print("🚀 Starting Fuel Blending ML Pipeline: CatBoost MAX POWER")

    def _read_downcast(csv_path):
        # Read one CSV and shrink its float64 columns to float32.
        frame = pd.read_csv(csv_path)
        for name in frame.select_dtypes(include=['float64']).columns:
            frame[name] = frame[name].astype(np.float32)
        return frame

    try:
        train_df = _read_downcast('/kaggle/input/training/train.csv')
        test_df = _read_downcast('/kaggle/input/testing/test.csv')
    except FileNotFoundError as e:
        print(f"Error loading data: {e}. Please check file paths.")
        return

    # Partition the training columns: targets contain 'BlendProperty';
    # everything else is a feature unless its name contains 'ID'.
    target_columns, feature_columns = [], []
    for col in train_df.columns:
        if 'BlendProperty' in col:
            target_columns.append(col)
        elif 'ID' not in col:
            feature_columns.append(col)

    X_train = train_df[feature_columns]
    y_train = train_df[target_columns]
    X_test = test_df[feature_columns]

    # n_trials=2 is a quick smoke-test setting; use 100 for the final run.
    catboost_optimizer = CatBoostOptimizer(n_splits=5, n_trials=2)
    predictions = catboost_optimizer.fit(X_train, y_train).predict(X_test)

    # Use the test file's 'ID' column when present, otherwise its row index.
    id_frame = pd.DataFrame({'ID': test_df.get('ID', test_df.index)})
    submission = pd.concat([id_frame, predictions], axis=1)
    submission.to_csv('submission.csv', index=False)

    print("\n💾 Submission file 'submission.csv' saved successfully.")
    print(submission.head())

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

🚀 Starting Fuel Blending ML Pipeline: CatBoost MAX POWER
CatBoost Optimizer configured for 2 trials per target with 5-Fold CV.
  - Generating final, memory-safe features...

--- Optimizing for Target: BlendProperty1 (1/10) ---
  - Best MAPE for BlendProperty1: 8.391317

--- Optimizing for Target: BlendProperty2 (2/10) ---
  - Best MAPE for BlendProperty2: 0.595957

--- Optimizing for Target: BlendProperty3 (3/10) ---
  - Best MAPE for BlendProperty3: 1.191549

--- Optimizing for Target: BlendProperty4 (4/10) ---
  - Best MAPE for BlendProperty4: 0.396134

--- Optimizing for Target: BlendProperty5 (5/10) ---
  - Best MAPE for BlendProperty5: 0.326432

--- Optimizing for Target: BlendProperty6 (6/10) ---
  - Best MAPE for BlendProperty6: 0.772791

--- Optimizing for Target: BlendProperty7 (7/10) ---
  - Best MAPE for BlendProperty7: 1.613032

--- Optimizing for Target: BlendProperty8 (8/10) ---
  - Best MAPE for BlendProperty8: 0.801923

--- Optimizing for Target: BlendProperty9 (9/10) -