In [1]:
# Block 1: Imports and Setup
import numpy as np
import pandas as pd
from pathlib import Path
import lightgbm as lgb
import joblib
import json
import os
import sys
import time
import optuna
import mlflow
import mlflow.lightgbm
import random
from typing import Any, Dict, Tuple
from datetime import datetime
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from sklearn.metrics import roc_auc_score
import yaml
# Add project root to Python path
# NOTE: the root is derived from the current working directory (four levels up),
# so it depends on where the notebook kernel was started.
project_root = str(Path().absolute().parent.parent.parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)
# Prepend the same root to the PYTHONPATH environment variable, keeping any existing value.
os.environ["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")
# NOTE(review): presumably turns off Arrow's S3 filesystem support - confirm against the data loader.
os.environ["ARROW_S3_DISABLE"] = "1"

# Project-local import, placed after the sys.path update above.
from utils.logger import ExperimentLogger
# Initialize logger
# `experiment_name` is reused below for MLflow setup and by later cells.
experiment_name = "lightgbm_soccer_prediction"
logger = ExperimentLogger(experiment_name)

# Remaining project-local imports (also placed after the sys.path update).
from utils.create_evaluation_set import setup_mlflow_tracking
from models.StackedEnsemble.utils.metrics import calculate_metrics
from models.StackedEnsemble.shared.data_loader import DataLoader

# Configure MLflow tracking for this experiment; the returned value is kept in `mlruns_dir`.
mlruns_dir = setup_mlflow_tracking(experiment_name)

# Load data
# The six splits become notebook-level globals. In this file the test split is used
# as the early-stopping set and the eval split for threshold search and metrics
# (see train_model).
dataloader = DataLoader()
X_train, y_train, X_test, y_test, X_eval, y_eval = dataloader.load_data()


2025-02-26 21:13:39,280 | INFO     | lightgbm_soccer_prediction | Initialized ExperimentLogger for lightgbm_soccer_prediction at logs\lightgbm_soccer_prediction
Project root create_evaluation_set: c:\Users\szita\Documents\TheDrawCode
Project root mlflow_utils: c:\Users\szita\Documents\TheDrawCode
2025-02-26 21:13:39,481 | INFO     | lightgbm_soccer_prediction | Setting up MLflow tracking for experiment: lightgbm_soccer_prediction
mlflow local_path_uri: c:/Users/szita/Documents/TheDrawCode/mlruns
2025-02-26 21:13:39,596 | INFO     | lightgbm_soccer_prediction | Using existing experiment: lightgbm_soccer_prediction experiment_id: 202086942006118315
2025-02-26 21:13:39,596 | INFO     | lightgbm_soccer_prediction | MLflow tracking configured successfully at: c:/Users/szita/Documents/TheDrawCode/mlruns
2025-02-26 21:13:39,604 | INFO     | lightgbm_soccer_prediction | Loading data splits according to ensemble strategy
2025-02-26 21:13:39,604 | INFO     | lightgbm_soccer_prediction | Returnin

In [2]:
# Block 2: Configuration Loading
def load_hyperparameter_space():
    """Return the search space used for LightGBM hyperparameter tuning.

    Every entry maps a parameter name to a spec holding ``type`` (``'float'``
    or ``'int'``) and the ``low``/``high`` bounds; some float specs also carry
    ``log: True``.

    Returns:
        dict | None: The search space (insertion order is the order in which
        parameters are listed below), or ``None`` if building it raised, in
        which case the error is logged.
    """
    def float_range(low, high, log=False):
        # The 'log' key is only emitted when requested, so specs that never
        # had it stay without it.
        spec = {'type': 'float', 'low': low, 'high': high}
        if log:
            spec['log'] = True
        return spec

    def int_range(low, high):
        return {'type': 'int', 'low': low, 'high': high}

    try:
        return {
            'learning_rate': float_range(0.11, 0.125, log=True),
            'num_leaves': int_range(46, 52),
            'max_depth': int_range(4, 5),
            'min_child_samples': int_range(158, 172),
            'feature_fraction': float_range(0.71, 0.75),
            'bagging_fraction': float_range(0.54, 0.57),
            'bagging_freq': int_range(6, 8),
            'reg_alpha': float_range(9.8, 10.3, log=True),
            'reg_lambda': float_range(7.8, 8.2, log=True),
            'min_split_gain': float_range(0.13, 0.14, log=True),
            'early_stopping_rounds': int_range(550, 600),
            'path_smooth': float_range(0.001, 0.05, log=True),
            'cat_smooth': float_range(1.0, 50.0, log=True),
            'max_bin': int_range(255, 511),
        }
    except Exception as e:
        logger.error(f"Error creating hyperparameter space: {str(e)}")
        return None


# Module-level copy of the search space for use by later cells.
hyperparameter_space = load_hyperparameter_space()


In [3]:
# Block 3: Model Creation
def create_model(**kwargs):
    """Build an unfitted ``lgb.LGBMClassifier``.

    A small set of defaults (binary objective, logloss + AUC metrics, all
    cores, silent output) is merged with ``kwargs``; caller-supplied values
    win on conflict.

    Args:
        **kwargs: Constructor arguments forwarded to ``lgb.LGBMClassifier``.

    Returns:
        The constructed classifier.

    Raises:
        Exception: Whatever the constructor raises, after it has been logged.
    """
    try:
        defaults = {
            'objective': 'binary',
            'metric': ['binary_logloss', 'auc'],
            'n_jobs': -1,
            'verbose': -1
        }
        # Later mapping wins, so kwargs override the defaults.
        return lgb.LGBMClassifier(**{**defaults, **kwargs})

    except Exception as e:
        logger.error(f"Error creating LightGBM model: {str(e)}")
        raise


In [4]:
# Block 4: Data Conversion
def convert_to_model_format(X: pd.DataFrame, y: pd.Series = None):
    """Prepare features and target for LightGBM.

    The features are passed through untouched. The target is cast to ``int``
    only when it is a pandas ``Series``; any other value (including ``None``)
    is returned as-is.

    Args:
        X: Feature table; must not be ``None``.
        y: Optional target.

    Returns:
        tuple: ``(X, y)`` with ``y`` possibly cast to integer dtype.

    Raises:
        ValueError: If ``X`` is ``None``.
    """
    if X is None:
        raise ValueError("The feature dataset X must not be None.")

    # isinstance() is False for None, so no separate None check is needed.
    target = y.astype(int) if isinstance(y, pd.Series) else y
    return X, target


In [5]:
# Block 5: Training Function
def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, **kwargs):
    """Fit a LightGBM classifier with early stopping and score it.

    The test split is the early-stopping validation set; the eval split is
    used for the probability predictions handed to ``optimize_threshold``.

    Args:
        X_train, y_train: Training data passed to ``model.fit``.
        X_test, y_test: Validation data monitored by the early-stopping callback.
        X_eval, y_eval: Data used for threshold search / final metrics.
        **kwargs: Model parameters. ``early_stopping_rounds`` (default 100) is
            removed and given to the early-stopping callback; the rest go to
            ``create_model``.

    Returns:
        tuple: ``(model, metrics)`` where ``metrics`` is whatever
        ``optimize_threshold`` returns.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        # Not forwarded to create_model: it is consumed by the callback below.
        patience = kwargs.pop('early_stopping_rounds', 100)

        model = create_model(**kwargs)

        stopper = lgb.early_stopping(stopping_rounds=patience)
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            callbacks=[stopper]
        )

        # Column 1 holds the positive-class probability.
        eval_probabilities = model.predict_proba(X_eval)[:, 1]
        return model, optimize_threshold(model, y_eval, eval_probabilities)

    except Exception as e:
        logger.error(f"Error training LightGBM model: {str(e)}")
        raise


In [6]:
def optimize_threshold(model, y_true: np.ndarray, y_prob: np.ndarray) -> Dict[str, float]:
    """Pick the decision threshold that maximises precision with recall >= 15%.

    51 evenly spaced thresholds in [0.3, 0.8] are tried. Among those whose
    recall is at least 0.15, the first one with the highest precision is kept;
    if none qualifies the threshold stays at 0.5. The metrics at the chosen
    threshold are then computed from the same ``y_true`` / ``y_prob`` that were
    passed in, using the same formulas as ``evaluate``.

    Previously the final metrics were recomputed from the notebook globals
    ``X_eval`` / ``y_eval`` instead of the arguments, and the error path
    returned a bare ``0.5`` that broke callers expecting a metrics dict.

    Args:
        model: Unused; kept so existing call sites remain valid.
        y_true: Ground-truth binary labels (array-like of 0/1).
        y_prob: Predicted positive-class probabilities, same length as ``y_true``.

    Returns:
        dict: ``precision``, ``recall``, ``f1``, ``auc``, ``brier_score`` and
        ``threshold``. If anything fails, the error is logged and a dict with
        zeroed metrics (``brier_score`` 1.0) and the best threshold found so
        far (0.5 if the search itself failed) is returned.
    """
    best_threshold = 0.5
    try:
        labels = np.asarray(y_true)
        scores = np.asarray(y_prob)
        eps = 1e-10  # keeps the ratios defined when a denominator is zero
        best_precision = 0.0

        def confusion_counts(threshold):
            # True positives, false positives and false negatives at `threshold`.
            y_pred = (scores >= threshold).astype(int)
            tp = np.sum((labels == 1) & (y_pred == 1))
            fp = np.sum((labels == 0) & (y_pred == 1))
            fn = np.sum((labels == 1) & (y_pred == 0))
            return tp, fp, fn

        for threshold in np.linspace(0.3, 0.8, 51):
            tp, fp, fn = confusion_counts(threshold)
            precision = tp / (tp + fp + eps)
            recall = tp / (tp + fn + eps)

            # Strict '>' keeps the lowest threshold among equally precise ones.
            if recall >= 0.15 and precision > best_precision:
                best_precision = precision
                best_threshold = float(threshold)

        logger.info(f"Optimized threshold: {best_threshold:.3f} with precision: {best_precision:.3f}")

        tp, fp, fn = confusion_counts(best_threshold)
        return {
            'precision': tp / (tp + fp + eps),
            'recall': tp / (tp + fn + eps),
            'f1': 2 * tp / (2 * tp + fp + fn + eps),
            'auc': roc_auc_score(labels, scores),
            'brier_score': np.mean((scores - labels) ** 2),
            'threshold': best_threshold
        }

    except Exception as e:
        logger.error(f"Error optimizing threshold: {str(e)}")
        # Same shape as the success path so callers can always use dict access.
        return {
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0,
            'auc': 0.0,
            'brier_score': 1.0,
            'threshold': best_threshold
        }


In [7]:
def evaluate(model, X: Any, y: Any, best_threshold: float) -> Dict[str, float]:
    """Score a fitted model on ``(X, y)`` at a fixed decision threshold.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Features to score.
        y: True binary labels aligned with ``X``.
        best_threshold: Probabilities ``>=`` this value are predicted as class 1.

    Returns:
        dict: ``precision``, ``recall``, ``f1``, ``auc``, ``brier_score`` and
        ``threshold``. If scoring raises, the error is logged and a dict with
        zeroed metrics (``brier_score`` 1.0) is returned instead.

    Raises:
        RuntimeError: If ``model`` is ``None``.
    """
    if model is None:
        raise RuntimeError("Model must be trained before evaluation")

    try:
        # Column 1 is the positive-class probability.
        y_prob = model.predict_proba(X)[:, 1]
        y_pred = (y_prob >= best_threshold).astype(int)

        actual_positive = (y == 1)
        predicted_positive = (y_pred == 1)

        tp = np.sum(actual_positive & predicted_positive)
        fp = np.sum((y == 0) & predicted_positive)
        fn = np.sum(actual_positive & (y_pred == 0))

        # Small constant keeps the ratios finite when a denominator is zero.
        eps = 1e-10
        return {
            'precision': tp / (tp + fp + eps),
            'recall': tp / (tp + fn + eps),
            'f1': 2 * tp / (2 * tp + fp + fn + eps),
            'auc': roc_auc_score(y, y_prob),
            'brier_score': np.mean((y_prob - y) ** 2),
            'threshold': best_threshold
        }

    except Exception as e:
        logger.error(f"Error in model evaluation: {str(e)}")
        return {
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0,
            'auc': 0.0,
            'brier_score': 1.0,
            'threshold': best_threshold
        }


In [8]:
# Block 6: Prediction Functions
def predict(model, X, threshold=0.5):
    """Return hard 0/1 predictions from a fitted model.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Features to score; must support ``len()`` for the fallback path.
        threshold: Probabilities ``>=`` this value are labelled 1.

    Returns:
        numpy.ndarray: Integer class labels. If prediction raises, the error
        is logged and an all-zero integer array of length ``len(X)`` is
        returned. The fallback used to be a float array, so the label dtype
        differed between the success and failure paths.

    Raises:
        RuntimeError: If ``model`` is ``None``.
    """
    if model is None:
        raise RuntimeError("Model must be trained before prediction")

    try:
        # Column 1 is the positive-class probability.
        probas = model.predict_proba(X)[:, 1]
        return (probas >= threshold).astype(int)

    except Exception as e:
        logger.error(f"Error in model prediction: {str(e)}")
        # Same dtype as the success path so downstream consumers see consistent labels.
        return np.zeros(len(X), dtype=int)

def predict_proba(model, X):
    """Return positive-class probabilities from a fitted model.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Features to score; must support ``len()`` for the fallback path.

    Returns:
        numpy.ndarray: Second column of ``model.predict_proba(X)``. If
        prediction raises, the error is logged and an all-zero array of length
        ``len(X)`` is returned.

    Raises:
        RuntimeError: If ``model`` is ``None``.
    """
    if model is None:
        raise RuntimeError("Model must be trained before prediction")

    try:
        class_probabilities = model.predict_proba(X)
        return class_probabilities[:, 1]

    except Exception as e:
        logger.error(f"Error in probability prediction: {str(e)}")
        return np.zeros(len(X))


In [9]:
# Block 7: Model Persistence
def _json_default(value):
    """Fallback encoder for values ``json`` cannot serialise natively.

    numpy scalars are unwrapped to Python scalars, numpy arrays become lists
    and anything else is stored as its ``str()`` representation.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


def save_model(model, path, threshold=0.5):
    """Persist a model with joblib and write its metadata next to it.

    The model is dumped to ``path``; ``threshold.json`` in the same directory
    receives the threshold, the model type and ``model.get_params()``.

    The metadata is serialised to a string before the file is opened, so a
    serialisation failure no longer leaves a truncated ``threshold.json``.
    Parameter values that JSON cannot encode (numpy scalars, arbitrary
    objects) are converted by ``_json_default`` instead of aborting the save.

    Args:
        model: Fitted model exposing ``get_params()``.
        path: Destination of the model file; parent directories are created.
        threshold: Decision threshold stored alongside the model.

    Raises:
        RuntimeError: If ``model`` is ``None``.
        Exception: Any failure while saving is logged and re-raised.
    """
    if model is None:
        raise RuntimeError("No model to save")

    try:
        # Create directory if it doesn't exist
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        # Save model
        joblib.dump(model, path)

        # Serialise first: if this raises, no half-written file is left behind.
        metadata = json.dumps({
            'threshold': threshold,
            'model_type': 'lightgbm',
            'params': model.get_params()
        }, indent=2, default=_json_default)

        threshold_path = path.parent / "threshold.json"
        with open(threshold_path, 'w') as f:
            f.write(metadata)

        logger.info(f"Model saved to {path}")

    except Exception as e:
        logger.error(f"Error saving model: {str(e)}")
        raise

def load_model(path):
    """Load a joblib-saved model and its stored decision threshold.

    The threshold is read from ``threshold.json`` in the same directory as the
    model file; 0.5 is used when that file or its ``threshold`` key is missing.

    Args:
        path: Location of the model file.

    Returns:
        tuple: ``(model, threshold)``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        Exception: Any failure while loading is logged and re-raised.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"No model file found at {path}")

    try:
        # NOTE: joblib.load unpickles the file; only use it on trusted model files.
        model = joblib.load(path)

        threshold = 0.5
        sidecar = path.parent / "threshold.json"
        if sidecar.exists():
            with open(sidecar, 'r') as f:
                threshold = json.load(f).get('threshold', 0.5)

        logger.info(f"Model loaded from {path}")
        return model, threshold

    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise


In [10]:
# Block 8: Feature Importance
def get_feature_importance(model):
    """Return the model's feature importances as a ranked table.

    Args:
        model: Fitted model exposing ``feature_importances_`` and
            ``feature_name_``.

    Returns:
        pandas.DataFrame: Columns ``feature`` and ``importance``, sorted by
        importance in descending order with a fresh 0..n-1 index. If reading
        the attributes or building the table raises, the error is logged and
        an empty frame with the same two columns is returned.
    """
    try:
        scores = model.feature_importances_
        names = model.feature_name_

        ranked = pd.DataFrame({'feature': names, 'importance': scores}).sort_values(
            'importance',
            ascending=False
        )
        return ranked.reset_index(drop=True)

    except Exception as e:
        logger.error(f"Error getting feature importance: {str(e)}")
        return pd.DataFrame(columns=['feature', 'importance'])


In [11]:
# Block 9: Hyperparameter Optimization
def optimize_hyperparameters(X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space):
    """Run an Optuna study that tunes LightGBM for precision.

    Each trial samples the parameters described by ``hyperparameter_space``,
    trains via ``train_model`` and scores the trial as the eval-set precision,
    or 0.0 when recall is below 0.15. The study maximises that score with a
    TPE sampler (seed 42) and a median pruner, for up to 800 trials or 7200
    seconds.

    A trial that raises ``optuna.TrialPruned`` is now re-raised so the study
    records it as pruned. Previously the generic ``except Exception`` caught
    it, and the trial was logged as failed and scored 0.0.

    Args:
        X_train, y_train: Training split.
        X_test, y_test: Split used for early stopping inside ``train_model``.
        X_eval, y_eval: Split used for threshold search and trial metrics.
        hyperparameter_space: Mapping of parameter name to a spec with
            ``type`` (``'float'``/``'int'``), ``low``, ``high`` and optional
            ``log``. A falsy value falls back to ``load_hyperparameter_space()``.
            Specs with any other ``type`` are ignored.

    Returns:
        dict: The study's best sampled parameters merged with the fixed
        settings used for every trial.

    Raises:
        Exception: Failures creating or running the study are logged and
            re-raised. Failures inside a single trial are logged and that
            trial scores 0.0.
    """
    logger.info("Starting hyperparameter optimization")

    if not hyperparameter_space:
        hyperparameter_space = load_hyperparameter_space()

    def fixed_params():
        # Settings shared by every trial and by the returned best parameters.
        # A fresh dict (and list) is built per call so no state is shared.
        return {
            'objective': 'binary',
            'metric': ['binary_logloss', 'auc'],
            'verbose': -1,
            'n_jobs': -1,
            'random_state': 19,
            'device': 'cpu'
        }

    def objective(trial):
        try:
            params = fixed_params()

            # Add hyperparameters from config
            for param_name, param_config in hyperparameter_space.items():
                if param_config['type'] == 'float':
                    params[param_name] = trial.suggest_float(
                        param_name,
                        param_config['low'],
                        param_config['high'],
                        log=param_config.get('log', False)
                    )
                elif param_config['type'] == 'int':
                    params[param_name] = trial.suggest_int(
                        param_name,
                        param_config['low'],
                        param_config['high']
                    )

            # Train model and get metrics
            model, metrics = train_model(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                **params
            )

            recall = metrics.get('recall', 0.0)
            precision = metrics.get('precision', 0.0)

            # Report the single (final) value so the pruner can see it.
            trial.report(precision, step=1)
            if trial.should_prune():
                raise optuna.TrialPruned()

            # Optimize for precision while maintaining minimum recall
            score = precision if recall >= 0.15 else 0.0

            logger.info(f"Trial {trial.number}:")
            logger.info(f"  Params: {params}")
            logger.info(f"  Score: {score}")

            for metric_name, metric_value in metrics.items():
                trial.set_user_attr(metric_name, metric_value)
            return score

        except optuna.TrialPruned:
            # Must reach Optuna untouched; it is a control-flow signal, not a failure.
            raise
        except Exception as e:
            logger.error(f"Trial failed: {str(e)}")
            return 0.0

    try:
        study = optuna.create_study(
            study_name='lightgbm_optimization',
            direction='maximize',
            sampler=TPESampler(seed=42),
            pruner=MedianPruner(
                n_startup_trials=5,
                n_warmup_steps=2,
                interval_steps=1
            )
        )

        study.optimize(
            objective,
            n_trials=800,
            timeout=7200,
            show_progress_bar=True
        )

        best_params = study.best_params
        best_params.update(fixed_params())

        logger.info(f"Best trial value: {study.best_value}")
        logger.info(f"Best parameters found: {best_params}")
        return best_params

    except Exception as e:
        logger.error(f"Error in hyperparameter optimization: {str(e)}")
        raise


In [12]:
# Block 11: Main Training Function
def hypertune_lightgbm(experiment_name: str) -> float:
    """Tune, retrain and log a LightGBM model inside one MLflow run.

    Works on the notebook-level data splits (``X_train``, ``X_test``,
    ``X_eval`` and their targets). Within the run it logs dataset sizes and
    tags, runs ``optimize_hyperparameters``, retrains with the best parameters
    via ``train_model``, and logs the resulting metrics (prefixed ``final_``)
    and parameters.

    Args:
        experiment_name: Accepted for interface compatibility; not used in the
            body.

    Returns:
        float: Precision of the final model (0.0 if the metric is absent).

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        run_label = f"lightgbm_base_{datetime.now().strftime('%Y%m%d_%H%M')}"
        with mlflow.start_run(run_name=run_label):
            # Dataset info
            dataset_info = {
                "train_samples": len(X_train),
                "test_samples": len(X_test),
                "eval_samples": len(X_eval),
                "features": X_train.shape[1]
            }
            mlflow.log_params(dataset_info)

            run_tags = {
                "model_type": "lightgbm_base",
                "training_mode": "global",
                "cpu_only": True
            }
            mlflow.set_tags(run_tags)

            search_space = load_hyperparameter_space()

            logger.info("Starting hyperparameter optimization")
            best_params = optimize_hyperparameters(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                hyperparameter_space=search_space
            )

            logger.info("Training final model with best parameters")
            model, metrics = train_model(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                **best_params
            )

            for name, value in metrics.items():
                mlflow.log_metric(f"final_{name}", value)

            mlflow.log_params(best_params)

            precision = metrics.get('precision', 0.0)
            logger.info(f"Training completed with precision: {precision:.4f}")

            return precision

    except Exception as e:
        logger.error(f"Error in training main: {str(e)}")
        raise


In [13]:
def train_with_precision_target(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    X_eval: np.ndarray,
    y_eval: np.ndarray,
    logger: "ExperimentLogger",
    target_precision: float = 0.48,
    max_seed: int = 800) -> Tuple[Any, float, float, Dict[str, Any]]:
    """Retrain with successive random seeds until a precision target is met.

    A fixed parameter set is trained once per seed in ``range(1, max_seed)``
    via ``train_model``. The first run whose precision reaches
    ``target_precision`` is returned immediately; otherwise the best run seen
    is returned after all seeds have been tried.

    Changes from the earlier version:
      * Each run gets its own parameter dict. Previously the best parameters
        aliased a dict whose ``random_state`` was overwritten on every
        iteration, so the returned seed was the last one tried rather than
        the best one.
      * The return annotation now matches the 4-tuple actually returned.
      * The surrounding ``while`` loop, which could run at most once, is gone.
      * The target and seed bound are parameters whose defaults reproduce the
        old constants.

    Args:
        X_train, y_train: Training split.
        X_test, y_test: Split used for early stopping inside ``train_model``.
        X_eval, y_eval: Split used for threshold search and metrics.
        logger: Logger exposing ``info`` and ``error``.
        target_precision: Precision at which the search stops early.
        max_seed: Exclusive upper bound of the seeds tried (starting at 1).

    Returns:
        tuple: ``(model, precision, recall, params)`` of the run that hit the
        target, or of the best run otherwise. If every run failed this is
        ``(None, 0.0, 0.0, None)``.
    """
    best_precision = 0.0
    best_recall = 0.0
    best_params = None
    best_seed = 0
    best_model = None

    # Set basic parameters
    base_params = {
        'learning_rate': 0.11444068792330053,
        'num_leaves': 52,
        'max_depth': 4,
        'min_child_samples': 171,
        'feature_fraction': 0.7263512433582645,
        'bagging_fraction': 0.5499561443083391,
        'bagging_freq': 7,
        'reg_alpha': 9.967002635494612,
        'reg_lambda': 8.013583096360682,
        'min_split_gain': 0.13431182805041417,
        'objective': 'binary',
        'metric': ['binary_logloss', 'auc'],
        'verbose': -1,
        'n_jobs': -1,
        'device': 'cpu',
        'early_stopping_rounds': 587
    }

    for random_seed in range(1, max_seed):
        logger.info(f"Using sequential random seed: {random_seed}")

        # Set all random seeds
        # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
        # so setting it here probably only affects child processes - confirm.
        os.environ['PYTHONHASHSEED'] = str(random_seed)
        np.random.seed(random_seed)
        random.seed(random_seed)

        # Per-run copy: keeps the winning run's random_state intact.
        run_params = {**base_params, 'random_state': random_seed}

        try:
            model, metrics = train_model(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                **run_params
            )
            precision = metrics['precision']
            recall = metrics['recall']
        except Exception as e:
            logger.error(f"Error training with seed {random_seed}: {str(e)}")
            continue

        # Update best model if precision improved
        if precision > best_precision:
            best_precision = precision
            best_params = run_params
            best_seed = random_seed
            best_recall = recall
            best_model = model
            logger.info(f"New best precision: {precision:.4f} with seed {best_seed}")

        # Check if target precision reached
        if precision >= target_precision:
            logger.info(f"Target precision achieved: {precision:.4f}")
            return model, precision, recall, run_params

        logger.info(
            f"Current precision: {precision:.4f}, "
            f"target: {target_precision:.4f}, highest precision: {best_precision:.4f}, "
            f"best seed: {best_seed}"
        )

    logger.info(f"Target precision not reached, using best seed: {best_seed}")
    return best_model, best_precision, best_recall, best_params


In [14]:
def log_to_mlflow(model: object, precision: float, recall: float, params: dict, experiment_name: str) -> str:
    """Record a trained model, its parameters and its metrics in MLflow.

    Sets up tracking for ``experiment_name``, opens a new run, logs ``params``
    plus the precision/recall metrics, and logs the model under a
    timestamp-based registered-model name. The model signature is inferred
    from the first row of the notebook-level ``X_train`` and the prediction
    ``predict`` returns for it.

    Args:
        model: Trained LightGBM model.
        precision: Precision value to log.
        recall: Recall value to log.
        params: Model parameters used for training.
        experiment_name: Name of the MLflow experiment.

    Returns:
        str: ID of the MLflow run that was created.
    """
    from utils.create_evaluation_set import setup_mlflow_tracking

    # Only the side effect of configuring tracking is needed here.
    setup_mlflow_tracking(experiment_name)

    run_label = f"lightgbm_base_{datetime.now().strftime('%Y%m%d_%H%M')}"
    with mlflow.start_run(run_name=run_label) as run:

        mlflow.log_params(params)
        mlflow.log_metrics({
            "precision": precision,
            "recall": recall
        })

        # Timestamped name for the registered model.
        registry_name = f"lightgbm_{datetime.now().strftime('%Y%m%d_%H%M')}"

        # Single-row DataFrame used as the signature's input example.
        input_example = pd.DataFrame(X_train.iloc[0]).T
        signature = mlflow.models.infer_signature(
            input_example,
            predict(model, input_example)
        )

        mlflow.lightgbm.log_model(
            model,
            artifact_path="model",
            registered_model_name=registry_name,
            signature=signature
        )

        run_id = run.info.run_id
        logger.info(f"Run ID: {run_id}")
        return run_id


In [15]:
def train_seed_model():
    """Run the seed search on the notebook-level splits and log the result.

    Calls ``train_with_precision_target`` with the global data splits and
    logger, prints the achieved precision, and hands the model to
    ``log_to_mlflow`` unless no model was produced.
    """
    outcome = train_with_precision_target(
        X_train, y_train,
        X_test, y_test,
        X_eval, y_eval,
        logger
    )
    model, precision, recall, best_params = outcome
    print(f"Training completed with precision: {precision:.4f}")

    # Nothing to log when the search produced no model.
    if model is None:
        return
    log_to_mlflow(model, precision, recall, best_params, experiment_name)


In [16]:
# Block 12: Run Training
if __name__ == "__main__":
    # Entry point: run the tuning pipeline and report the final precision.
    # Any failure is reported on stdout instead of propagating.
    try:
        precision = hypertune_lightgbm(experiment_name)
        print(f"Training completed with precision: {precision:.4f}")

        # Alternative entry point (seed search with fixed parameters):
        # train_seed_model()

    except Exception as exc:
        print(f"Training failed: {exc}")


2025-02-26 21:13:52,866 | INFO     | lightgbm_soccer_prediction | Starting hyperparameter optimization
2025-02-26 21:13:52,868 | INFO     | lightgbm_soccer_prediction | Starting hyperparameter optimization


[I 2025-02-26 21:13:52,870] A new study created in memory with name: lightgbm_optimization


  0%|          | 0/400 [00:00<?, ?it/s]

Training until validation scores don't improve for 551 rounds
Did not meet early stopping. Best iteration is:
[30]	valid_0's binary_logloss: 0.563137	valid_0's auc: 0.604581
2025-02-26 21:13:53,938 | INFO     | lightgbm_soccer_prediction | Optimized threshold: 0.320 with precision: 0.371
2025-02-26 21:13:53,952 | INFO     | lightgbm_soccer_prediction | Trial 0:
2025-02-26 21:13:53,954 | INFO     | lightgbm_soccer_prediction |   Params: {'objective': 'binary', 'metric': ['binary_logloss', 'auc'], 'verbose': -1, 'n_jobs': -1, 'random_state': 19, 'device': 'cpu', 'learning_rate': 0.11539477686914112, 'num_leaves': 52, 'max_depth': 5, 'min_child_samples': 166, 'feature_fraction': 0.7162407456176975, 'bagging_fraction': 0.5446798356100861, 'bagging_freq': 6, 'reg_alpha': 10.231637322523959, 'reg_lambda': 8.038043825060273, 'min_split_gain': 0.13700374665949525, 'early_stopping_rounds': 551, 'path_smooth': 0.044447541666908126, 'cat_smooth': 25.95942550311264, 'max_bin': 309}
2025-02-26 21:1