# XGBoost Model Implementation with CPU Optimization

This notebook implements an XGBoost-based model for soccer match draw prediction with CPU optimization. The implementation includes:

- Model creation and configuration
- Training with early stopping
- Threshold optimization
- Hyperparameter tuning
- Model evaluation
- MLflow integration for experiment tracking

## Setup and Imports

In [314]:
# Block 1: Imports and Setup
import numpy as np
import pandas as pd
from pathlib import Path
import xgboost as xgb
import joblib
import json
import os
import sys
import time
import optuna
import mlflow
import mlflow.xgboost
import random
from typing import Any, Dict, Tuple
from datetime import datetime
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from sklearn.metrics import roc_auc_score
import yaml

# Add project root to Python path.
# The root is taken as four directory levels above the current working
# directory, so this cell depends on where the kernel was started.
project_root = str(Path().absolute().parent.parent.parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)
# Prepend the root to PYTHONPATH as well (inherited by any child processes).
os.environ["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")
# NOTE(review): presumably disables Arrow's S3 filesystem support — confirm.
os.environ["ARROW_S3_DISABLE"] = "1"
# Configure Git executable path if available
git_executable = os.environ.get("GIT_PYTHON_GIT_EXECUTABLE")
if git_executable and os.path.exists(git_executable):
    import git
    git.refresh(git_executable)

# Project imports must come after the sys.path manipulation above.
from utils.logger import ExperimentLogger
experiment_name = "xgboost_soccer_prediction"
# Module-wide logger used by every function in this notebook.
logger = ExperimentLogger(experiment_name)

from utils.create_evaluation_set import setup_mlflow_tracking
from models.StackedEnsemble.utils.metrics import calculate_metrics
from models.StackedEnsemble.shared.data_loader import DataLoader
from models.ensemble.data_utils import balance_and_clean_dataset

# Load data
# The train/test/eval splits below are module-level globals that the training,
# tuning and logging functions read directly.
mlruns_dir = setup_mlflow_tracking(experiment_name)
dataloader = DataLoader()
X_train, y_train, X_test, y_test, X_eval, y_eval = dataloader.load_data()
# Optional class balancing (currently disabled):
# X_train, y_train = balance_and_clean_dataset(X_train, y_train)
# X_test, y_test = balance_and_clean_dataset(X_test, y_test)
# Minimum recall a threshold / trial must reach for its precision to count.
min_recall = 0.20
# Number of Optuna trials per hypertuning session.
n_trials = 500
# Fixed XGBoost settings shared by every model built in this notebook;
# tuned hyperparameters are layered on top of these.
base_params = {
            'objective': 'binary:logistic',
            'eval_metric': ['auc', 'logloss', 'error'],
            'tree_method': 'hist',
            'n_jobs': -1,
            'verbosity': 0
        }


2025-03-02 22:45:47,312 | INFO     | xgboost_soccer_prediction | Setting up MLflow tracking for experiment: xgboost_soccer_prediction
mlflow local_path_uri: c:/Users/szita/Documents/TheDrawCode/mlruns
2025-03-02 22:45:47,443 | INFO     | xgboost_soccer_prediction | Using existing experiment: xgboost_soccer_prediction experiment_id: 600562561289637747
2025-03-02 22:45:47,447 | INFO     | xgboost_soccer_prediction | MLflow tracking configured successfully at: c:/Users/szita/Documents/TheDrawCode/mlruns
2025-03-02 22:45:47,449 | INFO     | xgboost_soccer_prediction | Loading data splits according to ensemble strategy
2025-03-02 22:45:47,453 | INFO     | xgboost_soccer_prediction | Returning features common to all models
2025-03-02 22:45:47,455 | INFO     | xgboost_soccer_prediction | Loaded 102 selected features
2025-03-02 22:45:47,540 | INFO     | xgboost_soccer_prediction | Loaded training data from parquet: c:\Users\szita\Documents\TheDrawCode\data\api_training_final.parquet
2025-03-02

In [315]:
# Block 2: Configuration Loading
def load_hyperparameter_space():
    """Build the search-space specification consumed by the Optuna objective.

    Returns:
        dict | None: Maps each hyperparameter name to a spec dict with the
        keys ``'type'`` (``'float'`` or ``'int'``), ``'low'``, ``'high'`` and,
        for log-scaled floats, ``'log': True``. Returns ``None`` (after
        logging the error) if building the specification fails.
    """
    def _float_spec(low, high, log_scale=False):
        # 'log' is only present for log-scaled ranges; consumers default it to False.
        spec = {'type': 'float', 'low': low, 'high': high}
        if log_scale:
            spec['log'] = True
        return spec

    def _int_spec(low, high):
        return {'type': 'int', 'low': low, 'high': high}

    try:
        # Insertion order matters: it is the order in which parameters are
        # suggested when the space is iterated.
        return {
            'learning_rate': _float_spec(0.001, 0.05, log_scale=True),
            'max_depth': _int_spec(4, 10),
            'min_child_weight': _int_spec(100, 400),
            'subsample': _float_spec(0.5, 1.0),
            'colsample_bytree': _float_spec(0.6, 1.0),
            'reg_alpha': _float_spec(0.001, 10.0, log_scale=True),
            'reg_lambda': _float_spec(5.0, 20.0, log_scale=True),
            'gamma': _float_spec(0.5, 7.0),
            'early_stopping_rounds': _int_spec(300, 1000),
            'scale_pos_weight': _float_spec(3.0, 7.0),
        }
    except Exception as e:
        logger.error(f"Error creating hyperparameter space: {str(e)}")
        return None

hyperparameter_space = load_hyperparameter_space()


In [316]:
# Block 3: Model Creation
def create_model(model_params):
    """Create an XGBoost classifier from the shared base settings plus overrides.

    Args:
        model_params (dict): Hyperparameters layered on top of the module-level
            ``base_params``; on key collisions ``model_params`` wins.

    Returns:
        xgb.XGBClassifier: An unfitted classifier configured with the merged
        parameters.

    Raises:
        Exception: Any error raised while merging parameters or constructing
            the classifier is logged and re-raised.
    """
    try:
        # Merge into a NEW dict. Aliasing `base_params` and calling update()
        # would permanently leak one model's hyperparameters into the shared
        # defaults used by every later model.
        params = {**base_params, **model_params}

        return xgb.XGBClassifier(**params)

    except Exception as e:
        logger.error(f"Error creating XGBoost model: {str(e)}")
        raise


In [317]:
# Block 4: Data Conversion
def convert_to_model_format(X: pd.DataFrame, y: pd.Series = None):
    """Wrap features (and optionally labels) in an ``xgb.DMatrix``.

    Args:
        X: Feature data; must not be ``None``.
        y: Optional labels attached to the matrix as ``label``.

    Returns:
        xgb.DMatrix built from ``X`` (with ``label=y`` when ``y`` is given).

    Raises:
        ValueError: If ``X`` is ``None``.
        Exception: Any DMatrix construction error is logged and re-raised.
    """
    if X is None:
        raise ValueError("The feature dataset X must not be None.")

    try:
        return xgb.DMatrix(X) if y is None else xgb.DMatrix(X, label=y)
    except Exception as e:
        logger.error(f"Error converting data to DMatrix: {str(e)}")
        raise


In [318]:
# Block 6: Prediction Functions
def predict(model, X, threshold=0.5):
    """Return hard 0/1 predictions by thresholding the positive-class probability.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X: Feature rows to score.
        threshold: Probability at or above which a row is labelled 1.

    Returns:
        Integer array of 0/1 labels. If scoring fails, the error is logged and
        an all-zero array of length ``len(X)`` is returned instead.

    Raises:
        RuntimeError: If ``model`` is ``None``.
    """
    if model is None:
        raise RuntimeError("Model must be trained before prediction")

    try:
        positive_scores = model.predict_proba(X)[:, 1]
        labels = (positive_scores >= threshold).astype(int)
    except Exception as e:
        logger.error(f"Error in model prediction: {str(e)}")
        return np.zeros(len(X))
    return labels

def predict_proba(model, X):
    """Return the positive-class probability for every row of ``X``.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X: Feature rows to score.

    Returns:
        Column 1 of ``model.predict_proba(X)``. If scoring fails, the error is
        logged and an all-zero array of length ``len(X)`` is returned instead.

    Raises:
        RuntimeError: If ``model`` is ``None``.
    """
    if model is None:
        raise RuntimeError("Model must be trained before prediction")

    try:
        class_probabilities = model.predict_proba(X)
        return class_probabilities[:, 1]
    except Exception as e:
        logger.error(f"Error in probability prediction: {str(e)}")
        return np.zeros(len(X))


In [319]:
def evaluate(model, X: Any, y: Any, best_threshold: float) -> Dict[str, float]:
    """Score a fitted model on ``(X, y)`` at a fixed decision threshold.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X: Feature rows to score.
        y: True binary labels aligned with ``X``.
        best_threshold: Probability at or above which a row is predicted 1.

    Returns:
        Dict with ``precision``, ``recall``, ``f1``, ``auc``, ``brier_score``
        and ``threshold``. If anything fails during scoring, the error is
        logged and a worst-case dict (zeros, brier 1.0) is returned.

    Raises:
        RuntimeError: If ``model`` is ``None``.
    """
    if model is None:
        raise RuntimeError("Model must be trained before evaluation")

    try:
        y_prob = model.predict_proba(X)[:, 1]
        predicted_positive = y_prob >= best_threshold
        actual_positive = (y == 1)

        # Confusion-matrix counts needed for precision / recall / F1.
        tp = np.sum(actual_positive & predicted_positive)
        fp = np.sum((y == 0) & predicted_positive)
        fn = np.sum(actual_positive & ~predicted_positive)

        # The 1e-10 terms guard against division by zero when a count is empty.
        return {
            'precision': tp / (tp + fp + 1e-10),
            'recall': tp / (tp + fn + 1e-10),
            'f1': 2 * tp / (2 * tp + fp + fn + 1e-10),
            'auc': roc_auc_score(y, y_prob),
            'brier_score': np.mean((y_prob - y) ** 2),
            'threshold': best_threshold
        }

    except Exception as e:
        logger.error(f"Error in model evaluation: {str(e)}")
        return {
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0,
            'auc': 0.0,
            'brier_score': 1.0,
            'threshold': best_threshold
        }


In [320]:
def optimize_threshold(model, y_true: np.ndarray, y_prob: np.ndarray) -> Dict[str, float]:
    """Pick the highest-precision threshold that keeps recall >= ``min_recall``.

    Scans 51 evenly spaced thresholds in [0.3, 0.8] over ``y_prob`` and keeps
    the one with the best precision among those whose recall is at least the
    module-level ``min_recall``. If no threshold qualifies, 0.5 is used.

    Args:
        model: Fitted estimator, passed on to ``evaluate``.
        y_true: True binary labels aligned with ``y_prob``.
        y_prob: Positive-class probabilities used for the threshold search.

    Returns:
        The metrics dict produced by ``evaluate`` at the chosen threshold
        (keys ``precision``, ``recall``, ``f1``, ``auc``, ``brier_score``,
        ``threshold``). On failure the error is logged and a worst-case dict
        with the same keys and ``threshold`` 0.5 is returned, so callers can
        always treat the result as a dict.
    """
    try:
        best_threshold = 0.5
        best_precision = 0.0

        for threshold in np.linspace(0.3, 0.8, 51):
            y_pred = (y_prob >= threshold).astype(int)

            tp = np.sum((y_true == 1) & (y_pred == 1))
            fp = np.sum((y_true == 0) & (y_pred == 1))
            fn = np.sum((y_true == 1) & (y_pred == 0))

            # 1e-10 avoids division by zero when nothing is predicted positive.
            precision = tp / (tp + fp + 1e-10)
            recall = tp / (tp + fn + 1e-10)

            # Only thresholds meeting the recall floor compete on precision.
            if recall >= min_recall and precision > best_precision:
                best_precision = precision
                best_threshold = threshold

        logger.info(f"Optimized threshold: {best_threshold:.3f} with precision: {best_precision:.3f}")
        # NOTE(review): final metrics are computed on the module-level
        # X_eval / y_eval rather than on the y_true / y_prob arguments; the two
        # agree only when the caller passes predictions for X_eval.
        return evaluate(model, X_eval, y_eval, best_threshold)

    except Exception as e:
        logger.error(f"Error optimizing threshold: {str(e)}")
        # Same shape as evaluate()'s failure result; returning a bare float
        # here would break callers that index into the metrics dict.
        return {
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0,
            'auc': 0.0,
            'brier_score': 1.0,
            'threshold': 0.5
        }


In [321]:
# Block 5: Training Function
def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params):
    """Fit an XGBoost classifier and score it on the evaluation split.

    Args:
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Passed to ``fit`` as the single ``eval_set`` entry.
        X_eval, y_eval: Split whose predictions feed ``optimize_threshold``.
        model_params (dict): Parameters forwarded to ``create_model``.

    Returns:
        tuple: ``(model, metrics)`` where ``metrics`` is whatever
        ``optimize_threshold`` returns for the evaluation split.

    Raises:
        Exception: Any error during model creation, fitting or scoring is
            logged and re-raised.
    """
    try:
        model = create_model(model_params)

        # The test split is the monitoring set handed to fit().
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            verbose=False
        )

        eval_probabilities = model.predict_proba(X_eval)[:, 1]
        return model, optimize_threshold(model, y_eval, eval_probabilities)

    except Exception as e:
        logger.error(f"Error training XGBoost model: {str(e)}")
        raise


In [322]:
# Block 7: Model Save/Load
def save_model(model, path, threshold=None):
    """Persist a model with joblib and, optionally, its decision threshold.

    Args:
        model: Fitted model; must expose ``get_params()`` when a threshold is
            saved.
        path: Destination file for the joblib dump. Parent directories are
            created if missing.
        threshold: Decision threshold to store in ``threshold.json`` next to
            the model file. ``None`` (the default) skips the sidecar file; any
            other value, including ``0.0``, is written.

    Raises:
        Exception: Any error while writing is logged and re-raised.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    try:
        joblib.dump(model, path)

        # Compare against None explicitly: a plain truthiness check would
        # silently drop a legitimate threshold of 0.0.
        if threshold is not None:
            threshold_path = path.parent / "threshold.json"
            with open(threshold_path, 'w') as f:
                json.dump({
                    'threshold': threshold,
                    'model_type': 'xgboost',
                    'params': model.get_params()
                }, f, indent=2, default=str)  # default=str: stringify values JSON cannot encode

        logger.info(f"Model saved to {path}")

    except Exception as e:
        logger.error(f"Error saving model: {str(e)}")
        raise

def load_model(path):
    """Load a joblib-saved model together with its stored decision threshold.

    Args:
        path: Location of the joblib model file.

    Returns:
        tuple: ``(model, threshold)``. The threshold is read from
        ``threshold.json`` in the same directory; it falls back to 0.5 when
        the file or its ``'threshold'`` key is missing.

    Raises:
        FileNotFoundError: If no file exists at ``path``.
        Exception: Any error while reading is logged and re-raised.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"No model file found at {path}")

    try:
        model = joblib.load(path)

        # Start from the default and override only if a sidecar file exists.
        threshold = 0.5
        sidecar = path.parent / "threshold.json"
        if sidecar.exists():
            with open(sidecar, 'r') as f:
                stored = json.load(f)
                threshold = stored.get('threshold', 0.5)

        logger.info(f"Model loaded from {path}")
        return model, threshold

    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise


In [323]:
# Block 8: Hyperparameter Tuning
def objective(trial):
    """Optuna objective: train one model and score it by constrained precision.

    Samples every hyperparameter described by ``load_hyperparameter_space()``,
    trains a model via ``train_model`` on the module-level data splits and
    returns its evaluation precision, or 0.0 when recall is below the
    module-level ``min_recall``.

    Args:
        trial (optuna.trial.Trial): Trial used to sample hyperparameters and to
            store every returned metric as a user attribute.

    Returns:
        float: Precision if ``recall >= min_recall``, otherwise 0.0.

    Raises:
        Exception: Any error during sampling or training is logged and
            re-raised.
    """
    try:
        # Work on a copy: writing sampled values straight into `base_params`
        # would leak this trial's hyperparameters into the shared defaults.
        params = dict(base_params)

        hyperparameter_space = load_hyperparameter_space()
        for param_name, param_config in hyperparameter_space.items():
            if param_config['type'] == 'float':
                params[param_name] = trial.suggest_float(
                    param_name,
                    param_config['low'],
                    param_config['high'],
                    log=param_config.get('log', False)  # linear scale unless the spec asks for log
                )
            elif param_config['type'] == 'int':
                params[param_name] = trial.suggest_int(
                    param_name,
                    param_config['low'],
                    param_config['high']
                )
            elif param_config['type'] == 'categorical':
                params[param_name] = trial.suggest_categorical(
                    param_name,
                    param_config['choices']
                )

        model, metrics = train_model(
            X_train, y_train,
            X_test, y_test,
            X_eval, y_eval,
            params
        )

        recall = metrics.get('recall', 0.0)
        precision = metrics.get('precision', 0.0)

        # Precision only counts when the recall floor is met.
        score = precision if recall >= min_recall else 0.0

        logger.info(f"Trial {trial.number}:")
        logger.info(f"  Params: {params}")
        logger.info(f"  Score: {score}")

        for metric_name, metric_value in metrics.items():
            trial.set_user_attr(metric_name, metric_value)
        return score

    except Exception as e:
        logger.error(f"Error in trial {trial.number}: {str(e)}")
        raise


In [324]:
# Block 9: Hypertuning Function
def hypertune_xgboost(experiment_name: str) -> Tuple[float, Dict[str, Any]]:
    """Run one Optuna study inside an MLflow run and retrain with the best trial.

    Optimizes ``objective`` for up to the module-level ``n_trials`` trials (or
    10000 seconds, whichever comes first), then trains a final model with the
    best trial's hyperparameters layered over ``base_params``.

    Args:
        experiment_name: Accepted for interface compatibility; not read by
            this function.

    Returns:
        tuple: ``(precision, params)`` — the final model's evaluation precision
        and the full parameter dict it was trained with.

    Raises:
        Exception: Any error during optimization or final training is logged
            and re-raised.
    """
    try:
        # One timestamp so the study and the MLflow run share the same name.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M')

        study = optuna.create_study(
            study_name=f"xgboost_optimization_{timestamp}",
            direction="maximize",
            sampler=optuna.samplers.TPESampler(
                n_startup_trials=20,
                prior_weight=0.4
            )
        )

        with mlflow.start_run(run_name=f"xgboost_optimization_{timestamp}"):
            mlflow.log_params({
                "train_samples": len(X_train),
                "test_samples": len(X_test),
                "eval_samples": len(X_eval),
                "features": X_train.shape[1]
            })

            mlflow.set_tags({
                "model_type": "xgboost_base",
                "optimization": "optuna",
                "cpu_only": True
            })

            best_score = -float('inf')  # any completed trial beats this
            best_params: Dict[str, Any] = {}

            def callback(study, trial):
                """Track the best completed trial seen so far."""
                nonlocal best_score
                nonlocal best_params
                logger.info(f"Current best score: {best_score:.4f}")
                # trial.value is None for trials that did not complete; comparing
                # None with a float would raise TypeError.
                if trial.value is not None and trial.value > best_score:
                    best_score = trial.value
                    best_params = dict(trial.params)
                    logger.info(f"New best score found in trial {trial.number}: {best_score:.4f}")

            study.optimize(
                objective,
                n_trials=n_trials,
                timeout=10000,  # seconds (~2.8 hours)
                show_progress_bar=True,
                callbacks=[callback]
            )

            if not best_params:
                logger.info("No completed trial found; falling back to base parameters")

            # Base settings first, best-trial values last: the tuned
            # hyperparameters must win on any key present in both dicts.
            final_params = {**base_params, **best_params}

            logger.info(f"Best trial value: {best_score}")
            logger.info(f"Best Parameters: {final_params}")
            logger.info("Training final model with best parameters")
            final_model, final_metrics = train_model(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                final_params
            )

            logger.info(f"Training completed with precision: {final_metrics['precision']:.4f}")
            return final_metrics['precision'], final_params

    except Exception as e:
        logger.error(f"Error in hyperparameter optimization: {str(e)}")
        raise


In [325]:
def train_with_precision_target(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    X_eval: np.ndarray,
    y_eval: np.ndarray,
    logger: ExperimentLogger,
    target_precision: float = 0.48,
    max_seeds: int = 799) -> Tuple[Any, float, float, Dict[str, Any]]:
    """Retrain with successive random seeds until a precision target is hit.

    Trains one model per seed (1..``max_seeds``) with a fixed hyperparameter
    set, keeping the model with the highest evaluation precision. Stops early
    as soon as a seed reaches ``target_precision``.

    Args:
        X_train, y_train: Training split.
        X_test, y_test: Split passed to ``train_model`` as the eval set.
        X_eval, y_eval: Split on which precision/recall are measured.
        logger: Logger used for progress and error messages.
        target_precision: Precision at which the search stops early.
        max_seeds: Number of sequential seeds to try before giving up.

    Returns:
        tuple: ``(best_model, best_precision, best_recall, best_params)``.
        If no seed trained successfully, ``best_model`` and ``best_params``
        are ``None`` and both scores are 0.
    """
    best_precision = 0
    best_recall = 0
    best_params = None
    best_seed = 0
    best_model = None

    # Base parameters from previous optimization
    params = {
        'tree_method': 'hist',  # Required for CPU-only training per project rules
        'objective': 'binary:logistic',
        'eval_metric': 'aucpr',
        'verbosity': 0,  # XGBoost's logging knob ('verbose' is not an XGBoost parameter)
        'n_jobs': -1,
        'learning_rate': 0.023063607691447444,
        'max_depth': 10,
        'min_child_weight': 100,
        'subsample': 0.4118438147817545,
        'colsample_bytree': 0.7517099195364563,
        'reg_alpha': 0.031549778623225215,
        'reg_lambda': 7.079771696753476,
        'gamma': 0.8306756479665651,
        'early_stopping_rounds': 316,
        'scale_pos_weight': 2.52367628440707,
    }

    for random_seed in range(1, max_seeds + 1):
        logger.info(f"Using sequential random seed: {random_seed}")

        # Set all random seeds
        os.environ['PYTHONHASHSEED'] = str(random_seed)
        np.random.seed(random_seed)
        random.seed(random_seed)
        params['random_state'] = random_seed

        try:
            # train_model takes the parameter dict as ONE positional argument;
            # splatting it with ** raises TypeError on every seed. A copy is
            # passed so downstream code cannot alter this seed's settings.
            model, metrics = train_model(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                dict(params)
            )
            precision = metrics['precision']
            recall = metrics['recall']

            if precision > best_precision:
                best_precision = precision
                best_recall = recall
                best_params = params.copy()
                best_seed = random_seed
                best_model = model
                logger.info(f"New best precision: {precision:.4f} with seed {best_seed}")

            if precision >= target_precision:
                logger.info(f"Target precision achieved: {precision:.4f}")
                return best_model, best_precision, best_recall, best_params

            logger.info(
                f"Current precision: {precision:.4f}, "
                f"target: {target_precision:.4f}, highest precision: {best_precision:.4f}, "
                f"best seed: {best_seed}"
            )

        except Exception as e:
            # Best-effort search: a failing seed is logged and skipped.
            logger.error(f"Error training with seed {random_seed}: {str(e)}")
            continue

        # Drop the reference so a non-best model can be garbage collected.
        model = None

    # Every seed was tried without reaching the target: return the best found.
    logger.info(f"Target precision not reached, using best seed: {best_seed}")
    return best_model, best_precision, best_recall, best_params


In [326]:
def log_to_mlflow(model: object, precision: float, recall: float, params: dict, experiment_name: str) -> str:
    """Log a trained model with its parameters and metrics to MLflow.

    Args:
        model: Trained XGBoost model to log and register.
        precision (float): Precision logged as a run metric.
        recall (float): Recall logged as a run metric.
        params (dict): Model parameters logged as run parameters.
        experiment_name (str): Name of the MLflow experiment to log under.

    Returns:
        str: The ID of the MLflow run that was created.
    """
    # Called for its tracking-setup effect; the returned value is not needed here.
    setup_mlflow_tracking(experiment_name)

    # One timestamp so the run name and the registered model name always match,
    # even if the clock rolls over to the next minute mid-function.
    model_name = f"xgboost_base_{datetime.now().strftime('%Y%m%d_%H%M')}"

    with mlflow.start_run(run_name=model_name) as run:
        mlflow.log_params(params)

        mlflow.log_metrics({
            "precision": precision,
            "recall": recall
        })

        # Single-row example taken from the module-level training frame.
        # iloc[[0]] keeps each column's dtype; building it from a row Series
        # and transposing would coerce all columns to one common dtype and
        # skew the inferred signature.
        input_example = X_train.iloc[[0]]

        pred = predict(model, input_example)
        signature = mlflow.models.infer_signature(
            model_input=input_example,
            model_output=pred
        )

        mlflow.xgboost.log_model(
            xgb_model=model,
            artifact_path="model",
            registered_model_name=model_name,
            signature=signature
        )

        run_id = run.info.run_id
        logger.info(f"Run ID: {run_id}")
        return run_id


In [327]:
def train_seed_model():
    """Run the seed search on the module-level splits and log the winner.

    Calls ``train_with_precision_target`` with the notebook's global data and
    logger, prints the resulting precision, and logs the model to MLflow when
    one was produced.
    """
    outcome = train_with_precision_target(
        X_train, y_train,
        X_test, y_test,
        X_eval, y_eval,
        logger
    )
    model, precision, recall, best_params = outcome
    print(f"Training completed with precision: {precision:.4f}")

    # Nothing to log when no seed produced a model.
    if model is None:
        return
    log_to_mlflow(model, precision, recall, best_params, experiment_name)


In [328]:
if __name__ == "__main__":
    # train_seed_model()
    # Repeat the hypertuning session five times, keeping the highest precision
    # and the parameters that produced it.
    best_precision, best_params, best_run_id = 0, {}, None

    for i in range(5):
        logger.info(f"Starting hypertuning run {i+1}/5")
        precision, params = hypertune_xgboost(experiment_name)
        logger.info(f"Run {i+1} completed with precision: {precision:.4f}")

        # Only a strictly better run replaces the current best.
        if precision > best_precision:
            best_precision, best_params = precision, params

    logger.info(f"Best precision: {best_precision:.4f} and best Parameters {best_params}")


2025-03-02 22:46:18,845 | INFO     | xgboost_soccer_prediction | Starting hypertuning run 1/5


[I 2025-03-02 22:46:18,849] A new study created in memory with name: xgboost_optimization_20250302_2246


  0%|          | 0/500 [00:00<?, ?it/s]

2025-03-02 22:46:22,224 | INFO     | xgboost_soccer_prediction | Optimized threshold: 0.700 with precision: 0.340
2025-03-02 22:46:22,278 | INFO     | xgboost_soccer_prediction | Trial 0:
2025-03-02 22:46:22,284 | INFO     | xgboost_soccer_prediction |   Params: {'objective': 'binary:logistic', 'eval_metric': ['auc', 'logloss', 'error'], 'tree_method': 'hist', 'n_jobs': -1, 'verbosity': 0, 'learning_rate': 0.008992711236478315, 'max_depth': 4, 'min_child_weight': 140, 'subsample': 0.7614330188267323, 'colsample_bytree': 0.9521778408204591, 'reg_alpha': 0.0025740742375474027, 'reg_lambda': 14.275779930124505, 'gamma': 0.5544819531149592, 'early_stopping_rounds': 898, 'scale_pos_weight': 6.058064369752594}
2025-03-02 22:46:22,290 | INFO     | xgboost_soccer_prediction |   Score: 0.3403416557161182
[I 2025-03-02 22:46:22,297] Trial 0 finished with value: 0.3403416557161182 and parameters: {'learning_rate': 0.008992711236478315, 'max_depth': 4, 'min_child_weight': 140, 'subsample': 0.76143

AttributeError: 'FrozenTrial' object has no attribute 'score'