In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from scipy.stats import pearsonr
import warnings
import time

# Attempt to import cupy for GPU acceleration of data arrays
try:
    import cupy as cp
    CUPY_AVAILABLE = True
    print("CuPy found. GPU data acceleration is enabled.")
except ImportError:
    CUPY_AVAILABLE = False
    print("CuPy not found. Predictions will use CPU data, which may be slower.")

warnings.filterwarnings('ignore')

# Set device for GPU acceleration for PyTorch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using PyTorch device: {device}")

# =========================================================================
# Helper Classes
# =========================================================================

class FuelBlendingDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.FloatTensor(X.values if isinstance(X, pd.DataFrame) else X)
        self.y = torch.FloatTensor(y.values if isinstance(y, pd.DataFrame) else y) if y is not None else None
    def __len__(self): return len(self.X)
    def __getitem__(self, idx):
        return (self.X[idx], self.y[idx]) if self.y is not None else self.X[idx]

class MultiTargetModelWrapper:
    """A generic wrapper for multi-target models trained individually."""
    def __init__(self, models):
        self.models = models

    def predict(self, X):
        predictions = [model.predict(X) for model in self.models]
        if isinstance(predictions[0], np.ndarray):
            return np.column_stack(predictions)
        elif CUPY_AVAILABLE and isinstance(predictions[0], cp.ndarray):
            return cp.column_stack(predictions)
        return np.column_stack(predictions)

class AttentionLayer(nn.Module):
    def __init__(self, input_dim, attention_dim=64):
        super(AttentionLayer, self).__init__()
        self.attention = nn.Sequential(nn.Linear(input_dim, attention_dim), nn.Tanh(), nn.Linear(attention_dim, 1, bias=False))
    def forward(self, x):
        weights = torch.softmax(self.attention(x), dim=1)
        return x * weights

class AdvancedNeuralNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dims=[512, 256, 128, 64], dropout_rate=0.3):
        super(AdvancedNeuralNetwork, self).__init__()
        layers = [nn.Linear(input_dim, hidden_dims[0]), nn.BatchNorm1d(hidden_dims[0]), nn.ReLU(), nn.Dropout(dropout_rate)]
        for i in range(len(hidden_dims) - 1):
            layers.extend([nn.Linear(hidden_dims[i], hidden_dims[i+1]), nn.BatchNorm1d(hidden_dims[i+1]), nn.ReLU(), nn.Dropout(dropout_rate)])
        self.main_layers = nn.Sequential(*layers)
        self.attention = AttentionLayer(hidden_dims[-1])
        self.output_layers = nn.Sequential(nn.Linear(hidden_dims[-1], hidden_dims[-1]//2), nn.ReLU(), nn.Dropout(dropout_rate/2), nn.Linear(hidden_dims[-1]//2, output_dim))
        self.residual = nn.Linear(input_dim, output_dim)
    def forward(self, x):
        main_out = self.attention(self.main_layers(x))
        main_pred = self.output_layers(main_out)
        resid_pred = self.residual(x)
        alpha = torch.sigmoid(main_pred.mean(dim=1, keepdim=True))
        return alpha * main_pred + (1 - alpha) * resid_pred

# =========================================================================
# Main Predictor Class
# =========================================================================

class FuelBlendingPredictor:
    def __init__(self, use_gpu=True):
        self.device = device if use_gpu and torch.cuda.is_available() else torch.device('cpu')
        self.models = {}
        self.scalers = {}
        self.neural_net = None
        self.best_features = None
        self.target_names = None
        
    def create_advanced_features(self, df):
        df_features = df.copy()
        blend_cols = [col for col in df.columns if 'fraction' in col]
        if not blend_cols: return df_features # Return if no fraction columns found
        df_features['blend_entropy'] = -np.sum(df[blend_cols] * np.log(df[blend_cols] + 1e-8), axis=1)
        df_features['blend_gini'] = 1 - np.sum(df[blend_cols]**2, axis=1)
        return df_features
    
    def select_best_features(self, X, y, max_features=150):
        X = X.fillna(0) # Simple imputation for feature selection
        feature_scores = pd.Series(index=X.columns, dtype=float).fillna(0)
        mi_scores = mutual_info_regression(X, y.mean(axis=1), random_state=42)
        feature_scores += pd.Series(mi_scores, index=X.columns)
        f_selector = SelectKBest(score_func=f_regression, k='all')
        f_selector.fit(X, y.mean(axis=1))
        feature_scores += pd.Series(f_selector.scores_ / 1000, index=X.columns).fillna(0)
        selected_features = feature_scores.nlargest(max_features).index.tolist()
        print(f"Selected {len(selected_features)} features out of {len(X.columns)}")
        return selected_features
    
    def train_neural_network(self, X_train, y_train, X_val, y_val, epochs=250, batch_size=64, lr=0.001):
        print("Training Neural Network...")
        train_loader = DataLoader(FuelBlendingDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(FuelBlendingDataset(X_val, y_val), batch_size=batch_size, shuffle=False)
        self.neural_net = AdvancedNeuralNetwork(X_train.shape[1], y_train.shape[1]).to(self.device)
        criterion = nn.MSELoss()
        optimizer = optim.AdamW(self.neural_net.parameters(), lr=lr, weight_decay=1e-4)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=15, factor=0.5)
        best_val_loss = float('inf')
        patience_counter, patience = 0, 30
        
        for epoch in range(epochs):
            self.neural_net.train()
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(self.device), y_batch.to(self.device)
                optimizer.zero_grad()
                loss = criterion(self.neural_net(X_batch), y_batch)
                loss.backward()
                optimizer.step()
            
            self.neural_net.eval()
            val_loss = sum(criterion(self.neural_net(X.to(self.device)), y.to(self.device)).item() for X, y in val_loader) / len(val_loader)
            scheduler.step(val_loss)
            
            if val_loss < best_val_loss:
                best_val_loss, patience_counter = val_loss, 0
                torch.save(self.neural_net.state_dict(), 'best_neural_net.pth')
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping at epoch {epoch}")
                    break
        
        self.neural_net.load_state_dict(torch.load('best_neural_net.pth'))
        print(f"Neural Network training completed. Best validation loss: {best_val_loss:.6f}")
    
    def train_gradient_boosting_models(self, X_train, y_train, X_val, y_val):
        print("Training Gradient Boosting Models...")
        X_val_device = cp.asarray(X_val) if (self.device.type == 'cuda' and CUPY_AVAILABLE) else X_val

        # --- Per-target training for XGBoost ---
        print("Training xgb with per-target early stopping...")
        start_time = time.time()
        xgb_params = {'n_estimators': 2000, 'learning_rate': 0.05, 'max_depth': 7, 'subsample': 0.8,
                      'colsample_bytree': 0.8, 'random_state': 42, 'tree_method': 'hist',
                      'device': 'cuda' if torch.cuda.is_available() else 'cpu',
                      'early_stopping_rounds': 50} # Correct way to set early stopping
        xgb_models = []
        for i in range(y_train.shape[1]):
            model = xgb.XGBRegressor(**xgb_params)
            model.fit(X_train, y_train.iloc[:, i],
                      eval_set=[(X_val, y_val.iloc[:, i])], verbose=False)
            xgb_models.append(model)
        self.models['xgb'] = MultiTargetModelWrapper(xgb_models)
        print(f"xgb training completed in {time.time() - start_time:.2f}s.")

        # --- Per-target training for CatBoost ---
        print("Training catboost with per-target early stopping...")
        start_time = time.time()
        cb_params = {'iterations': 2000, 'learning_rate': 0.05, 'depth': 7, 'l2_leaf_reg': 3,
                     'random_seed': 42, 'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
                     'verbose': 0, 'allow_writing_files': False, 'early_stopping_rounds': 50}
        cb_models = []
        for i in range(y_train.shape[1]):
            model = cb.CatBoostRegressor(**cb_params)
            model.fit(X_train, y_train.iloc[:, i],
                      eval_set=[(X_val, y_val.iloc[:, i])], verbose=False)
            cb_models.append(model)
        self.models['catboost'] = MultiTargetModelWrapper(cb_models)
        print(f"catboost training completed in {time.time() - start_time:.2f}s.")

        # --- Per-target training for LightGBM ---
        print("Training lgb with per-target early stopping...")
        start_time = time.time()
        lgb_params = {'n_estimators': 2500, 'learning_rate': 0.05, 'num_leaves': 40, 'max_depth': 8,
                      'max_bin': 128, 'subsample': 0.8, 'colsample_bytree': 0.8, 'random_state': 42,
                      'device': 'gpu' if torch.cuda.is_available() else 'cpu', 'verbose': -1}
        lgb_models = []
        for i in range(y_train.shape[1]):
            model = lgb.LGBMRegressor(**lgb_params)
            model.fit(X_train, y_train.iloc[:, i],
                      eval_set=[(X_val, y_val.iloc[:, i])],
                      callbacks=[lgb.early_stopping(50, verbose=False)])
            lgb_models.append(model)
        self.models['lgb'] = MultiTargetModelWrapper(lgb_models)
        print(f"lgb training completed in {time.time() - start_time:.2f}s.")

        # --- Training for RandomForest ---
        print("Training rf...")
        start_time = time.time()
        rf = RandomForestRegressor(n_estimators=150, max_depth=12, min_samples_leaf=3, random_state=42, n_jobs=-1)
        self.models['rf'] = MultiOutputRegressor(rf, n_jobs=-1).fit(X_train, y_train)
        print(f"rf training completed in {time.time() - start_time:.2f}s.")
    
    def create_ensemble_predictions(self, X):
        predictions, weights = [], []
        use_cupy = self.device.type == 'cuda' and CUPY_AVAILABLE
        X_device = cp.asarray(X) if use_cupy else X

        if self.neural_net:
            self.neural_net.eval()
            with torch.no_grad():
                nn_pred = self.neural_net(torch.FloatTensor(X.values).to(self.device)).cpu().numpy()
                predictions.append(nn_pred)
                weights.append(0.35)
        
        for name, model in self.models.items():
            is_gpu_model = name in ['xgb', 'lgb', 'catboost']
            pred_device = model.predict(X_device if (use_cupy and is_gpu_model) else X)
            pred = pred_device.get() if (use_cupy and isinstance(pred_device, cp.ndarray)) else pred_device
            predictions.append(pred)
            weights.append(0.20 if is_gpu_model else 0.05)
        
        return pd.DataFrame(np.average(predictions, axis=0, weights=np.array(weights)/np.sum(weights)), columns=self.target_names)
    
    def fit(self, X, y):
        self.target_names = y.columns.tolist()
        X_processed = self.create_advanced_features(X)
        self.best_features = self.select_best_features(X_processed, y)
        X_selected = X_processed[self.best_features]
        
        self.scalers['feature_scaler'] = RobustScaler()
        X_scaled = pd.DataFrame(self.scalers['feature_scaler'].fit_transform(X_selected), columns=self.best_features)
        self.scalers['target_scaler'] = RobustScaler()
        y_scaled = pd.DataFrame(self.scalers['target_scaler'].fit_transform(y), columns=y.columns)
        
        X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)
        
        self.train_neural_network(X_train, y_train, X_val, y_val)
        self.train_gradient_boosting_models(X_train, y_train, X_val, y_val)
        
        val_preds_scaled = self.create_ensemble_predictions(X_val)
        val_preds_orig = self.scalers['target_scaler'].inverse_transform(val_preds_scaled)
        y_val_orig = self.scalers['target_scaler'].inverse_transform(y_val)
        final_mape = mean_absolute_percentage_error(y_val_orig, val_preds_orig)
        print(f"\n🎯 Final Ensemble MAPE on Validation Set: {final_mape:.6f}")
        return self
    
    def predict(self, X):
        X_processed = self.create_advanced_features(X)
        X_selected = X_processed[self.best_features]
        X_scaled = pd.DataFrame(self.scalers['feature_scaler'].transform(X_selected), columns=self.best_features)
        preds_scaled = self.create_ensemble_predictions(X_scaled)
        return pd.DataFrame(self.scalers['target_scaler'].inverse_transform(preds_scaled), columns=self.target_names)

# =========================================================================
# Main Execution Block
# =========================================================================

def main():
    print("🚀 Starting Fuel Blending ML Pipeline")
    try:
        # Updated data loading paths
        train_df = pd.read_csv('/kaggle/input/training/train.csv')
        test_df = pd.read_csv('/kaggle/input/testing/test.csv')
        sample_submission = pd.read_csv('/kaggle/input/samplesubmission/sample_solution.csv')
        
    except FileNotFoundError as e:
        print(f"Error loading data: {e}")
        print("Please ensure the Kaggle dataset is correctly mounted at /kaggle/input/")
        return
        
    target_columns = [col for col in train_df.columns if 'BlendProperty' in col]
    if not target_columns:
        # Fallback for different naming conventions
        target_columns = sample_submission.columns.drop('ID').tolist()

    feature_columns = [col for col in train_df.columns if col not in target_columns and 'ID' not in col]
    
    # Align test set columns with training set columns
    test_features = test_df[feature_columns]

    model = FuelBlendingPredictor(use_gpu=True)
    model.fit(train_df[feature_columns], train_df[target_columns])
    
    predictions = model.predict(test_features)
    
    submission_id_col = 'ID' if 'ID' in test_df.columns else test_df.index.name
    if submission_id_col not in test_df:
        submission = pd.DataFrame({'ID': test_df.index})
    else:
        submission = pd.DataFrame({'ID': test_df[submission_id_col]})
        
    submission = pd.concat([submission, predictions], axis=1)
        
    submission.to_csv('submission.csv', index=False)
    print("\n💾 Submission file 'submission.csv' saved successfully.")
    print(submission.head())

if __name__ == "__main__":
    main()

CuPy found. GPU data acceleration is enabled.
Using PyTorch device: cuda
🚀 Starting Fuel Blending ML Pipeline
Selected 57 features out of 57
Training Neural Network...
Early stopping at epoch 174
Neural Network training completed. Best validation loss: 0.065875
Training Gradient Boosting Models...
Training xgb with per-target early stopping...
xgb training completed in 18.04s.
Training catboost with per-target early stopping...
catboost training completed in 232.27s.
Training lgb with per-target early stopping...




lgb training completed in 26.01s.
Training rf...
rf training completed in 21.61s.


CatBoostError: Invalid data type=<class 'cupy.ndarray'> : must be list, numpy.ndarray, pandas.Series, pandas.DataFrame, scipy.sparse matrix, catboost.FeaturesData or catboost.Pool