# In [None]:
# ==== Cell 1: Imports & Configuration ==== #
import os
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
import lightgbm as lgb
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import pickle
import json
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Kaggle evaluation
import kaggle_evaluation.mitsui_inference_server

# ==== Enhanced Configuration ==== #
class Config:
    """Static configuration for the Mitsui Commodity Prediction Challenge pipeline.

    Holds the number of targets, the random seed, the LightGBM parameter sets,
    the feature-engineering windows and the per-strategy model budgets, plus
    two helpers that choose values depending on which paths exist on disk.
    """
    NUM_TARGET_COLUMNS = 424
    RANDOM_STATE = 42
    CV_FOLDS = 3

    # Environment-specific model parameters.
    # 'early_stopping_rounds' is intentionally NOT set: every LightGBM fit in
    # this file is called as fit(X, y) with no eval_set, and LightGBM raises
    # when early stopping is requested without a validation set. Keeping the
    # key made every LightGBM/stacking fit fail and fall back to Ridge.
    LGBM_PARAMS_KAGGLE = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'num_leaves': 31,
        'max_depth': 6,
        'random_state': RANDOM_STATE,
        'verbose': -1,
        'n_jobs': 1,
        'force_row_wise': True,
        'feature_fraction': 0.8,
    }

    LGBM_PARAMS_LOCAL = {
        'n_estimators': 200,
        'learning_rate': 0.05,
        'num_leaves': 64,
        'random_state': RANDOM_STATE,
        'verbose': -1,
        'n_jobs': -1,
        'force_row_wise': True,
    }

    # Feature engineering parameters
    ROLLING_WINDOWS = [3, 5, 10, 20]
    LAG_PERIODS = [1, 2, 3]

    # Adaptive training configuration
    MAX_COMPLEX_MODELS = 5      # Rank cutoff below which a target gets a stacking model
    MAX_SIMPLE_MODELS = 50      # Rank cutoff below which a target gets plain LightGBM
    MIN_SAMPLES_REQUIRED = 100  # Minimum aligned rows needed to train a target

    @staticmethod
    def get_data_path():
        """Return the dataset directory, printing which environment was detected.

        Returns:
            The Kaggle competition input directory when it exists, otherwise
            the relative ``dataset`` directory.
        """
        kaggle_path = Path('/kaggle/input/mitsui-commodity-prediction-challenge')
        local_path = Path("dataset")

        if kaggle_path.exists():
            print("🔧 Kaggle environment detected")
            return kaggle_path

        print("🔧 Local development environment detected")
        return local_path

    @staticmethod
    def get_lgbm_params():
        """Return a fresh copy of the environment-appropriate LightGBM parameters.

        The Kaggle set is chosen when ``/kaggle`` exists (note: a broader check
        than the competition directory used by ``get_data_path``). A copy is
        returned so callers cannot mutate the shared class-level dictionaries.
        """
        if Path('/kaggle').exists():
            return dict(Config.LGBM_PARAMS_KAGGLE)
        return dict(Config.LGBM_PARAMS_LOCAL)

# Module-level configuration instance referenced by the classes and functions below.
CFG = Config()
# Dataset directory, resolved once at import time; get_data_path() also prints
# which environment (Kaggle or local) it detected.
data_path = CFG.get_data_path()

# ==== Enhanced Feature Engineering Pipeline ==== #
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that appends rolling, volatility and lag features.

    Every column except ``date_id`` is treated as a feature column. ``fit``
    records per-column summary statistics in ``fitted_stats_``; these are kept
    for inspection only and are not used when features are generated.

    NOTE(review): features are computed solely from the rows passed to each
    ``transform`` call; no history is carried between calls. Inputs shorter
    than a window therefore produce NaN for that feature, which ends up as 0.
    """

    def __init__(self, rolling_windows=None, lag_periods=None, enable_heavy_features=True):
        """Store the feature settings.

        Args:
            rolling_windows: Window sizes for rolling mean/std; falls back to
                ``CFG.ROLLING_WINDOWS`` when empty or None.
            lag_periods: Shift sizes for lag features; falls back to
                ``CFG.LAG_PERIODS`` when empty or None.
            enable_heavy_features: Whether to add skew/kurtosis and the
                regime indicator columns.
        """
        self.rolling_windows = rolling_windows or CFG.ROLLING_WINDOWS
        self.lag_periods = lag_periods or CFG.LAG_PERIODS
        self.enable_heavy_features = enable_heavy_features
        self.fitted_stats_ = {}
        self.feature_names_ = []
        # Explicit flag: fitted_stats_ exists from construction onwards, so its
        # mere presence cannot tell whether fit() has been called.
        self._is_fitted = False

    def fit(self, X, y=None):
        """Record mean, std and quartiles of every non-``date_id`` column.

        Statistics are rebuilt from scratch on each call so that refitting on
        different columns does not leave stale entries behind.

        Returns:
            self
        """
        feature_cols = [c for c in X.columns if c != 'date_id']

        stats = {}
        for col in feature_cols:
            column = X[col]
            stats[col] = {
                'mean': column.mean(),
                'std': column.std(),
                'quantiles': column.quantile([0.25, 0.5, 0.75]).to_dict()
            }

        self.feature_names_ = feature_cols
        self.fitted_stats_ = stats
        self._is_fitted = True
        return self

    def transform(self, X):
        """Return ``X`` with the engineered feature columns appended.

        Raises:
            ValueError: If ``fit`` has not been called on this instance.
        """
        if not getattr(self, '_is_fitted', False):
            raise ValueError("FeatureEngineer must be fitted before transform")

        # _create_features builds a new frame and never writes into X,
        # so no defensive copy is needed here.
        return self._create_features(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its transformed version."""
        return self.fit(X, y).transform(X)

    def _create_features(self, df):
        """Build the engineered columns for ``df`` and return a new, finite frame.

        New columns are collected first and attached with a single concat
        instead of being inserted one at a time, which avoids fragmenting the
        frame. Generated names are assumed not to collide with existing
        columns. Columns whose feature computation raises keep whatever was
        produced before the error and a warning is printed.

        Returns:
            A DataFrame holding the original columns followed by the new
            ones, with NaN and +/-inf replaced by 0.
        """
        print("🔧 Creating advanced features...")

        feature_count_before = len(df.columns)
        feature_cols = [c for c in df.columns if c != 'date_id']

        generated = {}
        for col in feature_cols:
            series = df[col]
            try:
                # Rolling statistics
                for window in self.rolling_windows:
                    roller = series.rolling(window)
                    generated[f'{col}_rolling_mean_{window}'] = roller.mean()
                    generated[f'{col}_rolling_std_{window}'] = roller.std()

                # Volatility measures
                generated[f'{col}_annual_vol_20'] = series.rolling(20).std() * np.sqrt(252)
                generated[f'{col}_pct_change'] = series.pct_change()

                # Lag features
                for lag in self.lag_periods:
                    generated[f'{col}_lag_{lag}'] = series.shift(lag)

                if self.enable_heavy_features:
                    roll10 = series.rolling(10)

                    # Higher order statistics
                    generated[f'{col}_rolling_skew_10'] = roll10.skew()
                    generated[f'{col}_rolling_kurt_10'] = roll10.kurt()

                    # Market regime indicators. The volatility threshold is the
                    # 75% quantile of the frame being transformed, not of the
                    # data seen in fit().
                    roll_mean = roll10.mean()
                    roll_vol = roll10.std()
                    generated[f'{col}_regime_trend_up'] = (roll_mean > roll_mean.shift(1)).astype(int)
                    generated[f'{col}_regime_high_vol'] = (roll_vol > roll_vol.quantile(0.75)).astype(int)

            except Exception as e:
                print(f"Warning: Error creating features for {col}: {e}")
                continue

        if generated:
            extra = pd.concat(list(generated.values()), axis=1, keys=list(generated.keys()))
            df = pd.concat([df, extra], axis=1)

        # pct_change yields +/-inf after a zero value; fillna alone would leave
        # those in place and the downstream linear models reject non-finite input.
        df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

        feature_count_after = len(df.columns)
        features_added = feature_count_after - feature_count_before
        print(f"✅ Feature engineering completed: {features_added} features added")

        return df

# ==== Enhanced Model Management ==== #
class AdaptiveModelManager:
    """Chooses a model family per target and keeps the trained models.

    Attributes are plain dictionaries keyed by target name:
    ``models`` (fitted estimators), ``feature_columns`` (column order used at
    fit time) and ``model_strategies`` (strategy chosen by
    ``select_model_strategies``).
    """

    def __init__(self):
        self.models = {}
        self.feature_columns = {}
        self.model_strategies = {}

    def calculate_target_importance(self, train_labels_df):
        """Return one importance score per ``target_*`` column, in column order.

        The score is the column's sample variance. An undefined variance
        (all-NaN column or fewer than two values) is reported as 0.0 so the
        scores can always be ranked.
        """
        target_columns = [col for col in train_labels_df.columns if col.startswith('target_')]

        importance_scores = []
        for target in target_columns:
            variance = train_labels_df[target].var()
            importance_scores.append(0.0 if pd.isna(variance) else float(variance))

        return importance_scores

    def select_model_strategies(self, target_columns, importance_scores):
        """Map each target to 'stacking', 'lightgbm' or 'linear' by importance rank.

        Targets are ranked by descending score (ties keep their input order).
        The first ``CFG.MAX_COMPLEX_MODELS`` ranks get 'stacking', ranks below
        ``CFG.MAX_SIMPLE_MODELS`` get 'lightgbm', and the rest get 'linear'.
        The mapping is also stored in ``self.model_strategies``.
        """
        def rank_key(pair):
            score = pair[1]
            # NaN compares unequal to itself; rank it last instead of letting
            # it make the sort order undefined.
            return float('-inf') if score != score else score

        ranked = sorted(zip(target_columns, importance_scores), key=rank_key, reverse=True)

        strategies = {}
        for rank, (target, _score) in enumerate(ranked):
            if rank < CFG.MAX_COMPLEX_MODELS:
                strategies[target] = 'stacking'
            elif rank < CFG.MAX_SIMPLE_MODELS:
                strategies[target] = 'lightgbm'
            else:
                strategies[target] = 'linear'

        self.model_strategies = strategies
        return strategies

    def _register(self, model, X, target_name):
        """Store a fitted model together with the feature columns it was trained on."""
        self.models[target_name] = model
        self.feature_columns[target_name] = X.columns.tolist()
        return model

    def train_stacking_model(self, X, y, target_name):
        """Fit a LinearRegression + LightGBM stack with a Ridge final estimator.

        On any error the failure is printed and a linear model is trained
        instead. Returns the model that was registered.
        """
        try:
            estimators = [
                ('lr', LinearRegression()),
                ('lgb', lgb.LGBMRegressor(**CFG.get_lgbm_params()))
            ]

            model = StackingRegressor(
                estimators=estimators,
                final_estimator=Ridge(alpha=1.0),
                cv=CFG.CV_FOLDS,
                n_jobs=1
            )

            model.fit(X, y)
            return self._register(model, X, target_name)

        except Exception as e:
            print(f"❌ Stacking failed for {target_name}: {e}")
            return self.train_linear_model(X, y, target_name)

    def train_lightgbm_model(self, X, y, target_name):
        """Fit a LightGBM regressor; on any error print it and fall back to linear."""
        try:
            model = lgb.LGBMRegressor(**CFG.get_lgbm_params())
            model.fit(X, y)
            return self._register(model, X, target_name)

        except Exception as e:
            print(f"❌ LightGBM failed for {target_name}: {e}")
            return self.train_linear_model(X, y, target_name)

    def train_linear_model(self, X, y, target_name):
        """Fit and register a Ridge regression (also used as the fallback model).

        Errors from the fit are not caught here and propagate to the caller.
        """
        model = Ridge(alpha=1.0, random_state=CFG.RANDOM_STATE)
        model.fit(X, y)
        return self._register(model, X, target_name)

def calculate_normalized_correlation_metric(y_true, y_pred):
    """Return |Spearman correlation| damped by the relative residual volatility.

    The score is ``abs(rho) / (1 + std(y_true - y_pred) / (std(y_true) + 1e-8))``.
    Inputs are converted to flat float arrays first, so lists, arrays and
    pandas objects are compared position by position.

    Args:
        y_true: Observed values (any array-like of numbers).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The score as a float; 0.0 when the inputs are non-numeric, have
        different lengths or fewer than two values, or when the correlation
        is undefined (NaN).
    """
    try:
        true_vals = np.asarray(y_true, dtype=float).ravel()
        pred_vals = np.asarray(y_pred, dtype=float).ravel()
    except (TypeError, ValueError):
        return 0.0

    if true_vals.size < 2 or true_vals.size != pred_vals.size:
        return 0.0

    try:
        base_corr, _ = spearmanr(true_vals, pred_vals)
    except ValueError:
        return 0.0

    if np.isnan(base_corr):
        return 0.0

    residuals = true_vals - pred_vals
    volatility_factor = np.std(residuals) / (np.std(true_vals) + 1e-8)

    return float(abs(base_corr) / (1 + volatility_factor))

# ==== Stabilization Function ==== #
def _stabilize_and_detie_rows(out_df, date_ids=None):
    """Ensure no flat rows in predictions"""
    out_df = out_df.astype(np.float32)
    out_df[:] = np.nan_to_num(out_df.values, nan=0.0, posinf=0.0, neginf=0.0)
    n_rows, n_cols = out_df.shape
    
    if date_ids is None:
        date_ids = np.zeros(n_rows, dtype=int)
        
    vals = out_df.to_numpy(np.float32)
    row_stds = np.std(vals, axis=1)
    flat_mask = row_stds < 1e-15
    
    if np.any(flat_mask):
        for r_idx in np.where(flat_mask)[0]:
            rng = np.random.default_rng(int(date_ids[r_idx]) + 131071)
            noise = rng.normal(loc=0.0, scale=1.0, size=n_cols).astype(np.float32)
            scale = (1.0 + abs(float(np.mean(vals[r_idx])))) * 1e-6
            vals[r_idx] = vals[r_idx] + noise * scale
        out_df.iloc[:, :] = vals
        
    return out_df

# ==== Enhanced Pipeline Predictor ==== #
class PipelinePredictor:
    """End-to-end pipeline: feature engineering, per-target training, prediction."""

    def __init__(self):
        self.feature_pipeline = None
        self.model_manager = AdaptiveModelManager()
        self.is_fitted = False
        # Targets whose prediction already failed once, so each is reported once.
        self._failed_targets = set()

    def fit(self, train_df, train_labels_df):
        """Fit the feature pipeline and one model per ``target_*`` column.

        Rows of features and labels are matched by DataFrame index after
        dropping NaN labels. NOTE(review): this assumes both frames share the
        same row order; they are not joined on ``date_id``.

        Targets with fewer than ``CFG.MIN_SAMPLES_REQUIRED`` aligned rows are
        skipped. A target whose training raises is reported and skipped so
        the remaining targets are still trained; ``predict`` has a fallback
        for targets without a model.
        """
        print("🚀 Fitting prediction pipeline...")

        # Fit feature engineering pipeline
        self.feature_pipeline = FeatureEngineer(
            rolling_windows=[3, 5, 10],  # Optimized for Kaggle
            lag_periods=[1, 2],
            enable_heavy_features=False  # Disabled for speed
        )

        # Fit feature pipeline and transform training data; NaNs are filled
        # once here rather than once per target inside the loop.
        X_train = self.feature_pipeline.fit_transform(train_df)
        X_train = X_train.drop(columns=['date_id']).fillna(0)

        # Calculate target importance and select strategies
        target_columns = [col for col in train_labels_df.columns if col.startswith('target_')]
        importance_scores = self.model_manager.calculate_target_importance(train_labels_df)
        model_strategies = self.model_manager.select_model_strategies(target_columns, importance_scores)

        print(f"📋 Training models for {len(target_columns)} targets")
        print(f"   - Stacking models: {sum(1 for s in model_strategies.values() if s == 'stacking')}")
        print(f"   - LightGBM models: {sum(1 for s in model_strategies.values() if s == 'lightgbm')}")
        print(f"   - Linear models: {sum(1 for s in model_strategies.values() if s == 'linear')}")

        trainers = {
            'stacking': self.model_manager.train_stacking_model,
            'lightgbm': self.model_manager.train_lightgbm_model,
        }

        # Train models with adaptive strategy
        for target in target_columns:
            strategy = model_strategies.get(target, 'linear')
            trainer = trainers.get(strategy, self.model_manager.train_linear_model)

            y = train_labels_df[target].dropna()
            common_idx = X_train.index.intersection(y.index)
            if len(common_idx) < CFG.MIN_SAMPLES_REQUIRED:
                continue

            X_aligned = X_train.loc[common_idx]
            y_aligned = y.loc[common_idx]

            try:
                trainer(X_aligned, y_aligned, target)
            except Exception as e:
                # Keep going: one bad target must not discard every other model.
                print(f"❌ Training failed for {target}: {e}")

        self.is_fitted = True
        print("✅ Pipeline fitting completed")

    def predict(self, test: pl.DataFrame, *label_lags) -> pl.DataFrame:
        """Predict all ``CFG.NUM_TARGET_COLUMNS`` targets for one test batch.

        Args:
            test: Polars frame with a ``date_id`` column plus feature columns.
            *label_lags: Accepted for interface compatibility; not used.

        Returns:
            Polars frame with columns ``target_0`` .. ``target_{N-1}``.
            Targets without a trained model get ``row mean of features * 0.01``;
            targets whose prediction raises get small Gaussian noise (the
            error is printed the first time it happens for that target).

        Raises:
            ValueError: If ``fit`` has not completed.
        """
        if not self.is_fitted:
            raise ValueError("Pipeline must be fitted before prediction")

        test_df = test.to_pandas()
        n_rows = len(test_df)

        # Transform test data using fitted pipeline
        X_test = self.feature_pipeline.transform(test_df)
        X_test = X_test.drop(columns=['date_id'])

        models = self.model_manager.models
        feature_columns = self.model_manager.feature_columns

        # Generate predictions
        predictions = np.zeros((n_rows, CFG.NUM_TARGET_COLUMNS))

        # Consecutive targets usually share one column list, so the column
        # slice and the no-model fallback are computed once and reused.
        cached_cols = None
        cached_X = None
        fallback_pred = None

        for i in range(CFG.NUM_TARGET_COLUMNS):
            target_name = f"target_{i}"

            try:
                if target_name in models:
                    feature_cols = feature_columns[target_name]
                    if cached_cols is None or feature_cols != cached_cols:
                        cached_X = X_test[feature_cols]
                        cached_cols = feature_cols
                    predictions[:, i] = models[target_name].predict(cached_X)
                else:
                    # Simple fallback prediction
                    if fallback_pred is None:
                        fallback_pred = np.mean(X_test.values, axis=1) * 0.01
                    predictions[:, i] = fallback_pred

            except Exception as e:
                if target_name not in self._failed_targets:
                    self._failed_targets.add(target_name)
                    print(f"⚠️ Prediction failed for {target_name}, using random fallback: {e}")
                predictions[:, i] = np.random.normal(0, 0.001, n_rows)

        # Create and stabilize output
        out_df = pd.DataFrame(predictions, columns=[f"target_{i}" for i in range(CFG.NUM_TARGET_COLUMNS)])
        out_df = _stabilize_and_detie_rows(out_df, test_df.get('date_id'))

        return pl.DataFrame(out_df)

# ==== Kaggle Submission Manager ==== #
class KaggleSubmissionManager:
    """Owns the predictor used for submission and builds it on first use."""

    def __init__(self):
        self.predictor = None
        self.initialization_attempted = False

    def initialize_for_submission(self):
        """Build and fit the predictor once; later calls do nothing.

        Any exception raised while loading data or fitting is printed and a
        fallback predictor is installed instead. The attempt flag is set in
        every case.
        """
        if not self.initialization_attempted:
            try:
                print("🚀 Initializing Kaggle submission pipeline...")
                self.predictor = PipelinePredictor()

                # Read the training features and labels from the data directory
                features_frame = pd.read_csv(data_path / 'train.csv')
                labels_frame = pd.read_csv(data_path / 'train_labels.csv')

                # Train the whole pipeline
                self.predictor.fit(features_frame, labels_frame)

                print("✅ Kaggle submission pipeline initialized successfully")

            except Exception as init_error:
                print(f"❌ Initialization failed: {init_error}")
                # Swap in the minimal stand-in predictor
                self.predictor = self._create_fallback_predictor()

            finally:
                self.initialization_attempted = True

    def _create_fallback_predictor(self):
        """Return an object whose ``predict`` yields small Gaussian noise per target."""
        class FallbackPredictor:
            def predict(self, test, *args):
                row_count = len(test)
                noise = np.random.normal(0, 0.001, (row_count, CFG.NUM_TARGET_COLUMNS))
                column_names = [f"target_{i}" for i in range(CFG.NUM_TARGET_COLUMNS)]
                noise_frame = pd.DataFrame(noise, columns=column_names)
                return pl.DataFrame(noise_frame)

        return FallbackPredictor()

    def predict(self, test, *label_lags):
        """Initialize on first use, then delegate to the active predictor."""
        if self.initialization_attempted:
            pass
        else:
            self.initialize_for_submission()

        active_predictor = self.predictor
        return active_predictor.predict(test, *label_lags)

# Global submission manager: the single module-level instance used by the
# predict() function below. It initializes itself on its first predict() call.
submission_manager = KaggleSubmissionManager()

def predict(
    test: pl.DataFrame,
    label_lags_1_batch: pl.DataFrame,
    label_lags_2_batch: pl.DataFrame,
    label_lags_3_batch: pl.DataFrame,
    label_lags_4_batch: pl.DataFrame,
) -> pl.DataFrame:
    """Entry point handed to the inference server.

    Forwards the test batch and the four label-lag batches, in order, to the
    module-level submission manager and returns its result.
    """
    lag_batches = (
        label_lags_1_batch,
        label_lags_2_batch,
        label_lags_3_batch,
        label_lags_4_batch,
    )
    return submission_manager.predict(test, *lag_batches)

# Kaggle Inference Server: wraps the module-level predict() callback.
inference_server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)

# When KAGGLE_IS_COMPETITION_RERUN is set (non-empty) the server is started via
# serve(); otherwise the local gateway is run against the resolved data
# directory. Both branches execute at import time.
if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    inference_server.serve()
else:
    inference_server.run_local_gateway((str(data_path),))